Use model cache with basic annotator

This commit is contained in:
Eike Kettner 2021-01-13 23:59:28 +01:00
parent a77f67d73a
commit 54a09861c4
4 changed files with 88 additions and 40 deletions

View File

@ -33,12 +33,7 @@ object TextAnalyser {
blocker: Blocker blocker: Blocker
): Resource[F, TextAnalyser[F]] = ): Resource[F, TextAnalyser[F]] =
Resource Resource
.liftF( .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval))
PipelineCache(cfg.clearStanfordPipelineInterval)(
StanfordNerAnnotator.makePipeline,
StanfordNerAnnotator.clearPipelineCaches[F]
)
)
.map(cache => .map(cache =>
new TextAnalyser[F] { new TextAnalyser[F] {
def annotate( def annotate(

View File

@ -1,17 +1,22 @@
package docspell.analysis.nlp package docspell.analysis.nlp
import docspell.common._
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import java.net.URL import java.net.URL
import java.util.concurrent.atomic.AtomicReference
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import scala.jdk.CollectionConverters._ import scala.jdk.CollectionConverters._
import scala.util.Using import scala.util.Using
import cats.Applicative
import cats.effect.BracketThrow
import docspell.common._
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
/** This is only using the CRFClassifier without building an analysis /** This is only using the CRFClassifier without building an analysis
* pipeline. The ner-classifier cannot use results from POS-tagging * pipeline. The ner-classifier cannot use results from POS-tagging
* etc. and is therefore not as good as the [[StanfordNerAnnotator]]. * etc. and is therefore not as good as the [[StanfordNerAnnotator]].
@ -20,16 +25,20 @@ import scala.util.Using
object BasicCRFAnnotator { object BasicCRFAnnotator {
private[this] val logger = getLogger private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German) // assert correct resource names
lazy val englishNerClassifier = makeClassifier(Language.English) List(Language.French, Language.German, Language.English).foreach(classifierResource)
lazy val frenchNerClassifier = makeClassifier(Language.French)
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { type Annotator = AbstractSequenceClassifier[CoreLabel]
val nerClassifier = lang match {
case Language.English => englishNerClassifier def nerAnnotate[F[_]: BracketThrow](
case Language.German => germanNerClassifier cacheKey: String,
case Language.French => frenchNerClassifier cache: PipelineCache[F, Annotator]
} )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
cache
.obtain(cacheKey, settings)
.use(crf => Applicative[F].pure(nerAnnotate(crf)(text)))
def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
nerClassifier nerClassifier
.classify(text) .classify(text)
.asScala .asScala
@ -42,34 +51,54 @@ object BasicCRFAnnotator {
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
}) })
.toVector .toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { private def makeClassifier(lang: Language): Annotator = {
logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
val ner = classifierResource(lang) val ner = classifierResource(lang)
Using(new GZIPInputStream(ner.openStream())) { in => Using(new GZIPInputStream(ner.openStream())) { in =>
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] CRFClassifier.getClassifier(in).asInstanceOf[Annotator]
}.fold(throw _, identity) }.fold(throw _, identity)
} }
private def classifierResource(lang: Language): URL = { private def classifierResource(lang: Language): URL = {
def check(u: URL): URL = def check(name: String): URL =
if (u == null) sys.error(s"NER model url not found for language ${lang.name}") Option(getClass.getResource(name)) match {
else u case None =>
sys.error(s"NER model resource '$name' not found for language ${lang.name}")
case Some(url) => url
}
check(lang match { check(lang match {
case Language.French => case Language.French =>
getClass.getResource( "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
"/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
)
case Language.German => case Language.German =>
getClass.getResource( "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
)
case Language.English => case Language.English =>
getClass.getResource( "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
)
}) })
} }
/** Holds one NER classifier per supported language.
  *
  * Each classifier is created through `makeClassifier` on first
  * request and is then reused for as long as this instance lives.
  */
final class Cache {
  private[this] lazy val english: Annotator = makeClassifier(Language.English)
  private[this] lazy val german: Annotator  = makeClassifier(Language.German)
  private[this] lazy val french: Annotator  = makeClassifier(Language.French)

  /** Returns the classifier for the given language, creating it if
    * it has not been requested from this instance before.
    */
  def forLang(language: Language): Annotator =
    language match {
      case Language.English => english
      case Language.German  => german
      case Language.French  => french
    }
}
/** Holder of the currently active [[Cache]] instance. */
object Cache {
  private[this] val current = new AtomicReference[Cache](new Cache)

  /** Looks up the classifier for `language` in the active cache. */
  def getAnnotator(language: Language): Annotator = {
    val active = current.get()
    active.forLang(language)
  }

  /** Swaps the active cache for a fresh instance, so classifiers are
    * created anew the next time they are requested.
    */
  def clearCache(): Unit =
    current.set(new Cache)
}
} }

View File

@ -1,12 +1,16 @@
package docspell.analysis.nlp package docspell.analysis.nlp
import scala.concurrent.duration.{Duration => _, _} import scala.concurrent.duration.{Duration => _, _}
import cats.Applicative import cats.Applicative
import cats.data.Kleisli import cats.data.Kleisli
import cats.effect._ import cats.effect._
import cats.effect.concurrent.Ref import cats.effect.concurrent.Ref
import cats.implicits._ import cats.implicits._
import docspell.common._ import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger import org.log4s.getLogger
/** Creating the StanfordCoreNLP pipeline is quite expensive as it /** Creating the StanfordCoreNLP pipeline is quite expensive as it
@ -45,6 +49,22 @@ object PipelineCache {
cacheClear <- CacheClearing.create(data, clearInterval, release) cacheClear <- CacheClearing.create(data, clearInterval, release)
} yield new Impl[F, A](data, creator, cacheClear) } yield new Impl[F, A](data, creator, cacheClear)
/** Creates a cache holding complete `StanfordCoreNLP` pipelines.
  *
  * Pipelines are built with `StanfordNerAnnotator.makePipeline`;
  * `StanfordNerAnnotator.clearPipelineCaches` is passed on as the
  * release action.
  *
  * @param clearInterval forwarded unchanged to the general constructor
  */
def full[F[_]: Concurrent: Timer](
    clearInterval: Duration
): F[PipelineCache[F, StanfordCoreNLP]] = {
  val build: StanfordNerSettings => StanfordCoreNLP =
    StanfordNerAnnotator.makePipeline
  apply(clearInterval)(build, StanfordNerAnnotator.clearPipelineCaches[F])
}
/** Creates a cache for the NER-only classifiers of `BasicCRFAnnotator`.
  *
  * Entries are looked up by the language of the given settings in
  * `BasicCRFAnnotator.Cache`; the release action clears that cache.
  *
  * @param clearInterval forwarded unchanged to the general constructor
  */
def basic[F[_]: Concurrent: Timer](
    clearInterval: Duration
): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] = {
  val lookup: StanfordNerSettings => BasicCRFAnnotator.Annotator =
    settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang)
  // Suspended, so the cache is only cleared when the effect is run.
  val release: F[Unit] =
    Sync[F].delay(BasicCRFAnnotator.Cache.clearCache())
  apply(clearInterval)(lookup, release)
}
final private class Impl[F[_]: Sync, A]( final private class Impl[F[_]: Sync, A](
data: Ref[F, Map[String, Entry[A]]], data: Ref[F, Map[String, Entry[A]]],
creator: StanfordNerSettings => A, creator: StanfordNerSettings => A,

View File

@ -5,9 +5,12 @@ import docspell.files.TestFiles
import docspell.common._ import docspell.common._
object BaseCRFAnnotatorSuite extends SimpleTestSuite { object BaseCRFAnnotatorSuite extends SimpleTestSuite {
/** Returns a function that annotates text with the classifier
  * obtained from `BasicCRFAnnotator.Cache` for the given language.
  */
def annotate(language: Language): String => Vector[NerLabel] = {
  val classifier = BasicCRFAnnotator.Cache.getAnnotator(language)
  BasicCRFAnnotator.nerAnnotate(classifier)
}
test("find english ner labels") { test("find english ner labels") {
val labels = val labels = annotate(Language.English)(TestFiles.letterENText)
BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)
val expect = Vector( val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11), NerLabel("Jeter", NerTag.Person, 6, 11),
@ -39,11 +42,11 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite {
NerLabel("Jeter", NerTag.Person, 1123, 1128) NerLabel("Jeter", NerTag.Person, 1123, 1128)
) )
assertEquals(labels, expect) assertEquals(labels, expect)
BasicCRFAnnotator.Cache.clearCache()
} }
test("find german ner labels") { test("find german ner labels") {
val labels = val labels = annotate(Language.German)(TestFiles.letterDEText)
BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)
val expect = Vector( val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14), NerLabel("Mustermann", NerTag.Person, 4, 14),
@ -59,5 +62,6 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite {
NerLabel("Mustermann", NerTag.Person, 509, 519) NerLabel("Mustermann", NerTag.Person, 509, 519)
) )
assertEquals(labels, expect) assertEquals(labels, expect)
BasicCRFAnnotator.Cache.clearCache()
} }
} }