Use model cache with basic annotator
commit 54a09861c4
parent a77f67d73a
TextAnalyser.scala
@@ -33,12 +33,7 @@ object TextAnalyser {
       blocker: Blocker
   ): Resource[F, TextAnalyser[F]] =
     Resource
-      .liftF(
-        PipelineCache(cfg.clearStanfordPipelineInterval)(
-          StanfordNerAnnotator.makePipeline,
-          StanfordNerAnnotator.clearPipelineCaches[F]
-        )
-      )
+      .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval))
       .map(cache =>
         new TextAnalyser[F] {
           def annotate(
BasicCRFAnnotator.scala
@@ -1,17 +1,22 @@
 package docspell.analysis.nlp
 
-import docspell.common._
-import edu.stanford.nlp.ie.AbstractSequenceClassifier
-import edu.stanford.nlp.ie.crf.CRFClassifier
-import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
-import org.log4s.getLogger
-
 import java.net.URL
+import java.util.concurrent.atomic.AtomicReference
 import java.util.zip.GZIPInputStream
 
 import scala.jdk.CollectionConverters._
 import scala.util.Using
 
+import cats.Applicative
+import cats.effect.BracketThrow
+
+import docspell.common._
+
+import edu.stanford.nlp.ie.AbstractSequenceClassifier
+import edu.stanford.nlp.ie.crf.CRFClassifier
+import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
+import org.log4s.getLogger
+
 /** This is only using the CRFClassifier without building an analysis
   * pipeline. The ner-classifier cannot use results from POS-tagging
   * etc. and is therefore not as good as the [[StanfordNerAnnotator]].
@@ -20,16 +25,20 @@ import scala.util.Using
 object BasicCRFAnnotator {
   private[this] val logger = getLogger
 
-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
-  lazy val frenchNerClassifier  = makeClassifier(Language.French)
+  // assert correct resource names
+  List(Language.French, Language.German, Language.English).foreach(classifierResource)
 
-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-      case Language.French  => frenchNerClassifier
-    }
+  type Annotator = AbstractSequenceClassifier[CoreLabel]
+
+  def nerAnnotate[F[_]: BracketThrow](
+      cacheKey: String,
+      cache: PipelineCache[F, Annotator]
+  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .use(crf => Applicative[F].pure(nerAnnotate(crf)(text)))
+
+  def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
     nerClassifier
       .classify(text)
       .asScala
@@ -42,34 +51,54 @@ object BasicCRFAnnotator {
           .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
       })
       .toVector
-  }
 
-  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
+  private def makeClassifier(lang: Language): Annotator = {
     logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
     val ner = classifierResource(lang)
     Using(new GZIPInputStream(ner.openStream())) { in =>
-      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
+      CRFClassifier.getClassifier(in).asInstanceOf[Annotator]
     }.fold(throw _, identity)
   }
 
   private def classifierResource(lang: Language): URL = {
-    def check(u: URL): URL =
-      if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
-      else u
+    def check(name: String): URL =
+      Option(getClass.getResource(name)) match {
+        case None =>
+          sys.error(s"NER model resource '$name' not found for language ${lang.name}")
+        case Some(url) => url
+      }
 
     check(lang match {
       case Language.French =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
-        )
+        "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
       case Language.German =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
-        )
+        "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
      case Language.English =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
-        )
+        "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
     })
   }
+
+  final class Cache {
+    private[this] lazy val germanNerClassifier  = makeClassifier(Language.German)
+    private[this] lazy val englishNerClassifier = makeClassifier(Language.English)
+    private[this] lazy val frenchNerClassifier  = makeClassifier(Language.French)
+
+    def forLang(language: Language): Annotator =
+      language match {
+        case Language.French  => frenchNerClassifier
+        case Language.German  => germanNerClassifier
+        case Language.English => englishNerClassifier
+      }
+  }
+
+  object Cache {
+
+    private[this] val cacheRef = new AtomicReference[Cache](new Cache)
+
+    def getAnnotator(language: Language): Annotator =
+      cacheRef.get().forLang(language)
+
+    def clearCache(): Unit =
+      cacheRef.set(new Cache)
+  }
 }
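In the hunk above, the new Cache class holds one lazily created classifier per language, and the companion object Cache keeps the current instance in an AtomicReference: clearing the cache simply swaps in a fresh, empty Cache so previously built models become unreachable and can be garbage collected. The following is a minimal, self-contained sketch of that pattern with placeholder model types (not docspell's actual classifiers), only to illustrate the lazy-per-language plus atomic-swap idea:

import java.util.concurrent.atomic.AtomicReference

object LazyModelCacheSketch {

  // stand-in for the expensive CRF classifiers; each is built at most once per Cache
  final class Cache {
    private lazy val german: String  = load("de")
    private lazy val english: String = load("en")

    def forLang(lang: String): String =
      lang match {
        case "de" => german
        case _    => english
      }

    private def load(lang: String): String = s"model-$lang"
  }

  // the current cache lives in an AtomicReference; clearing swaps in a fresh, empty
  // instance instead of mutating the old one
  private val ref = new AtomicReference[Cache](new Cache)

  def getModel(lang: String): String = ref.get().forLang(lang)

  def clear(): Unit = ref.set(new Cache)
}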
PipelineCache.scala
@@ -1,12 +1,16 @@
 package docspell.analysis.nlp
 
 import scala.concurrent.duration.{Duration => _, _}
 
 import cats.Applicative
 import cats.data.Kleisli
 import cats.effect._
 import cats.effect.concurrent.Ref
 import cats.implicits._
 
 import docspell.common._
 
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
 import org.log4s.getLogger
 
 /** Creating the StanfordCoreNLP pipeline is quite expensive as it
@@ -45,6 +49,22 @@ object PipelineCache {
       cacheClear <- CacheClearing.create(data, clearInterval, release)
     } yield new Impl[F, A](data, creator, cacheClear)
 
+  def full[F[_]: Concurrent: Timer](
+      clearInterval: Duration
+  ): F[PipelineCache[F, StanfordCoreNLP]] =
+    apply(clearInterval)(
+      StanfordNerAnnotator.makePipeline,
+      StanfordNerAnnotator.clearPipelineCaches
+    )
+
+  def basic[F[_]: Concurrent: Timer](
+      clearInterval: Duration
+  ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] =
+    apply(clearInterval)(
+      settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang),
+      Sync[F].delay(BasicCRFAnnotator.Cache.clearCache())
+    )
+
   final private class Impl[F[_]: Sync, A](
       data: Ref[F, Map[String, Entry[A]]],
       creator: StanfordNerSettings => A,
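The new full and basic constructors above both delegate to apply, differing only in how an annotator is created and how its caches are cleared. Callers then obtain an entry from the cache as a Resource keyed by a cache key and settings, and run the annotation inside use, as the new BasicCRFAnnotator.nerAnnotate[F] does. Below is a hedged sketch of that call shape using simplified stand-in types (a toy Cache trait instead of docspell's PipelineCache, and plain strings instead of StanfordNerSettings):

import cats.effect.{IO, Resource}

object PipelineCacheUsageSketch {

  // simplified stand-in for PipelineCache[F, A]: obtain yields the cached value for
  // a key; the real implementation memoizes entries and clears them on a timer
  trait Cache[A] {
    def obtain(key: String, settings: String): Resource[IO, A]
  }

  // toy cache that recreates the value on every call, enough to show the call shape
  val cache: Cache[String] = new Cache[String] {
    def obtain(key: String, settings: String): Resource[IO, String] =
      Resource.liftF(IO(s"annotator-for-$settings"))
  }

  // mirrors the shape of BasicCRFAnnotator.nerAnnotate[F]: resolve the annotator
  // from the cache, then run the (pure) annotation inside use
  def annotate(text: String): IO[String] =
    cache.obtain("collective-1", "lang=de").use(ann => IO(s"$ann labels '$text'"))
}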
BaseCRFAnnotatorSuite.scala
@@ -5,9 +5,12 @@ import docspell.files.TestFiles
 import docspell.common._
 
 object BaseCRFAnnotatorSuite extends SimpleTestSuite {
+
+  def annotate(language: Language): String => Vector[NerLabel] =
+    BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
+
   test("find english ner labels") {
-    val labels =
-      BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)
+    val labels = annotate(Language.English)(TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -39,11 +42,11 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite {
       NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )
     assertEquals(labels, expect)
+    BasicCRFAnnotator.Cache.clearCache()
   }
 
   test("find german ner labels") {
-    val labels =
-      BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)
+    val labels = annotate(Language.German)(TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
@@ -59,5 +62,6 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite {
       NerLabel("Mustermann", NerTag.Person, 509, 519)
     )
     assertEquals(labels, expect)
+    BasicCRFAnnotator.Cache.clearCache()
   }
 }