From 54a09861c45b5f6b580e70e3bbfd811d7f52e472 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 23:59:28 +0100 Subject: [PATCH] Use model cache with basic annotator --- .../docspell/analysis/TextAnalyser.scala | 7 +- .../analysis/nlp/BasicCRFAnnotator.scala | 89 ++++++++++++------- .../docspell/analysis/nlp/PipelineCache.scala | 20 +++++ .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 12 ++- 4 files changed, 88 insertions(+), 40 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 6c8e6cff..38491c3a 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -33,12 +33,7 @@ object TextAnalyser { blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource - .liftF( - PipelineCache(cfg.clearStanfordPipelineInterval)( - StanfordNerAnnotator.makePipeline, - StanfordNerAnnotator.clearPipelineCaches[F] - ) - ) + .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval)) .map(cache => new TextAnalyser[F] { def annotate( diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index 5823fba2..a6fb6af0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -1,17 +1,22 @@ package docspell.analysis.nlp -import docspell.common._ -import edu.stanford.nlp.ie.AbstractSequenceClassifier -import edu.stanford.nlp.ie.crf.CRFClassifier -import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} -import org.log4s.getLogger - import java.net.URL +import java.util.concurrent.atomic.AtomicReference import java.util.zip.GZIPInputStream import scala.jdk.CollectionConverters._ import scala.util.Using +import cats.Applicative +import cats.effect.BracketThrow + +import docspell.common._ + +import edu.stanford.nlp.ie.AbstractSequenceClassifier +import edu.stanford.nlp.ie.crf.CRFClassifier +import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger + /** This is only using the CRFClassifier without building an analysis * pipeline. The ner-classifier cannot use results from POS-tagging * etc. and is therefore not as good as the [[StanfordNerAnnotator]]. @@ -20,16 +25,20 @@ import scala.util.Using object BasicCRFAnnotator { private[this] val logger = getLogger - lazy val germanNerClassifier = makeClassifier(Language.German) - lazy val englishNerClassifier = makeClassifier(Language.English) - lazy val frenchNerClassifier = makeClassifier(Language.French) + // assert correct resource names + List(Language.French, Language.German, Language.English).foreach(classifierResource) - def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { - val nerClassifier = lang match { - case Language.English => englishNerClassifier - case Language.German => germanNerClassifier - case Language.French => frenchNerClassifier - } + type Annotator = AbstractSequenceClassifier[CoreLabel] + + def nerAnnotate[F[_]: BracketThrow]( + cacheKey: String, + cache: PipelineCache[F, Annotator] + )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = + cache + .obtain(cacheKey, settings) + .use(crf => Applicative[F].pure(nerAnnotate(crf)(text))) + + def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] = nerClassifier .classify(text) .asScala @@ -42,34 +51,54 @@ object BasicCRFAnnotator { .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) }) .toVector - } - private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + private def makeClassifier(lang: Language): Annotator = { logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") val ner = classifierResource(lang) Using(new GZIPInputStream(ner.openStream())) { in => - CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] + CRFClassifier.getClassifier(in).asInstanceOf[Annotator] }.fold(throw _, identity) } private def classifierResource(lang: Language): URL = { - def check(u: URL): URL = - if (u == null) sys.error(s"NER model url not found for language ${lang.name}") - else u + def check(name: String): URL = + Option(getClass.getResource(name)) match { + case None => + sys.error(s"NER model resource '$name' not found for language ${lang.name}") + case Some(url) => url + } check(lang match { case Language.French => - getClass.getResource( - "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz" case Language.German => - getClass.getResource( - "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" case Language.English => - getClass.getResource( - "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" }) } + + final class Cache { + private[this] lazy val germanNerClassifier = makeClassifier(Language.German) + private[this] lazy val englishNerClassifier = makeClassifier(Language.English) + private[this] lazy val frenchNerClassifier = makeClassifier(Language.French) + + def forLang(language: Language): Annotator = + language match { + case Language.French => frenchNerClassifier + case Language.German => germanNerClassifier + case Language.English => englishNerClassifier + } + } + + object Cache { + + private[this] val cacheRef = new AtomicReference[Cache](new Cache) + + def getAnnotator(language: Language): Annotator = + cacheRef.get().forLang(language) + + def clearCache(): Unit = + cacheRef.set(new Cache) + } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 61598f9a..2b567548 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -1,12 +1,16 @@ package docspell.analysis.nlp import scala.concurrent.duration.{Duration => _, _} + import cats.Applicative import cats.data.Kleisli import cats.effect._ import cats.effect.concurrent.Ref import cats.implicits._ + import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP import org.log4s.getLogger /** Creating the StanfordCoreNLP pipeline is quite expensive as it @@ -45,6 +49,22 @@ object PipelineCache { cacheClear <- CacheClearing.create(data, clearInterval, release) } yield new Impl[F, A](data, creator, cacheClear) + def full[F[_]: Concurrent: Timer]( + clearInterval: Duration + ): F[PipelineCache[F, StanfordCoreNLP]] = + apply(clearInterval)( + StanfordNerAnnotator.makePipeline, + StanfordNerAnnotator.clearPipelineCaches + ) + + def basic[F[_]: Concurrent: Timer]( + clearInterval: Duration + ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] = + apply(clearInterval)( + settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang), + Sync[F].delay(BasicCRFAnnotator.Cache.clearCache()) + ) + final private class Impl[F[_]: Sync, A]( data: Ref[F, Map[String, Entry[A]]], creator: StanfordNerSettings => A, diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala index bffc6744..0abab7e9 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -5,9 +5,12 @@ import docspell.files.TestFiles import docspell.common._ object BaseCRFAnnotatorSuite extends SimpleTestSuite { + + def annotate(language: Language): String => Vector[NerLabel] = + BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language)) + test("find english ner labels") { - val labels = - BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText) + val labels = annotate(Language.English)(TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -39,11 +42,11 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { NerLabel("Jeter", NerTag.Person, 1123, 1128) ) assertEquals(labels, expect) + BasicCRFAnnotator.Cache.clearCache() } test("find german ner labels") { - val labels = - BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText) + val labels = annotate(Language.German)(TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), @@ -59,5 +62,6 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { NerLabel("Mustermann", NerTag.Person, 509, 519) ) assertEquals(labels, expect) + BasicCRFAnnotator.Cache.clearCache() } }