Allow configuring stanford-ner and cache based on collective

2025-06-21 18:08:25 +00:00 · 2020-08-24 00:56:25 +02:00
parent 4e7c00c345
commit 8628a0a8b3
11 changed files with 271 additions and 117 deletions
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@ -5,12 +5,19 @@ import cats.implicits._

 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._

 trait TextAnalyser[F[_]] {

-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]

 }
 object TextAnalyser {
@ -22,43 +29,47 @@ object TextAnalyser {
  }

  def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
-      def annotate(
-          logger: Logger[F],
-          lang: Language,
-          text: String
-      ): F[TextAnalyser.Result] =
-        for {
-          input <- textLimit(logger, text)
-          tags0 <- stanfordNer(lang, input)
-          tags1 <- contactNer(input)
-          dates <- dateNer(lang, input)
-          list  = tags0 ++ tags1
-          spans = NerLabelSpan.build(list)
-        } yield Result(spans ++ list, dates)
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
+          def annotate(
+              logger: Logger[F],
+              settings: StanfordSettings,
+              cacheKey: Ident,
+              text: String
+          ): F[TextAnalyser.Result] =
+            for {
+              input <- textLimit(logger, text)
+              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags1 <- contactNer(input)
+              dates <- dateNer(settings.lang, input)
+              list  = tags0 ++ tags1
+              spans = NerLabelSpan.build(list)
+            } yield Result(spans ++ list, dates)

-      private def textLimit(logger: Logger[F], text: String): F[String] =
-        if (text.length <= cfg.maxLength) text.pure[F]
-        else
-          logger.info(
-            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
-              s" Analysing only first ${cfg.maxLength} characters."
-          ) *> text.take(cfg.maxLength).pure[F]
+          private def textLimit(logger: Logger[F], text: String): F[String] =
+            if (text.length <= cfg.maxLength) text.pure[F]
+            else
+              logger.info(
+                s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+                  s" Analysing only first ${cfg.maxLength} characters."
+              ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          StanfordNerClassifier.nerAnnotate(lang)(text)
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
+
+          private def contactNer(text: String): F[Vector[NerLabel]] =
+            Sync[F].delay {
+              Contact.annotate(text)
+            }
+
+          private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+            Sync[F].delay {
+              DateFind.findDates(text, lang).toVector
+            }
        }
-
-      private def contactNer(text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          Contact.annotate(text)
-        }
-
-      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
-        Sync[F].delay {
-          DateFind.findDates(text, lang).toVector
-        }
-    })
+      )

 }
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
@ -0,0 +1,90 @@
+package docspell.analysis.nlp
+
+import cats.Applicative
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+
+import docspell.common._
+
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.log4s.getLogger
+
+/** Creating the StanfordCoreNLP pipeline is quite expensive as it
+  * involves IO and initializing large objects.
+  *
+  * Therefore, the instances are cached; sharing them is possible because they are thread-safe.
+  *
+  * **This is an internal API**
+  */
+trait PipelineCache[F[_]] {
+
+  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+
+}
+
+object PipelineCache {
+  private[this] val logger = getLogger
+
+  def none[F[_]: Applicative]: PipelineCache[F] =
+    new PipelineCache[F] {
+      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+        makeClassifier(settings).pure[F]
+    }
+
+  def apply[F[_]: Sync](): F[PipelineCache[F]] =
+    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
+
+  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
+      extends PipelineCache[F] {
+
+    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      for {
+        id  <- makeSettingsId(settings)
+        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
+      } yield nlp
+
+    private def getOrCreate(
+        key: String,
+        id: String,
+        cache: Map[String, Entry],
+        settings: StanfordSettings
+    ): (Map[String, Entry], StanfordCoreNLP) =
+      cache.get(key) match {
+        case Some(entry) =>
+          if (entry.id == id) (cache, entry.value)
+          else {
+            logger.info(
+              s"StanfordNLP settings changed for key $key. Creating new classifier"
+            )
+            val nlp = makeClassifier(settings)
+            val e   = Entry(id, nlp)
+            (cache.updated(key, e), nlp)
+          }
+
+        case None =>
+          val nlp = makeClassifier(settings)
+          val e   = Entry(id, nlp)
+          (cache.updated(key, e), nlp)
+      }
+
+    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+      val base = settings.copy(regexNer = None).toString
+      val size: F[Long] =
+        settings.regexNer match {
+          case Some(p) =>
+            File.size(p)
+          case None =>
+            0L.pure[F]
+        }
+      size.map(len => s"$base-$len")
+    }
+
+  }
+  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
+    new StanfordCoreNLP(Properties.forSettings(settings))
+  }
+
+  private case class Entry(id: String, value: StanfordCoreNLP)
+}
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
@ -3,6 +3,7 @@ package docspell.analysis.nlp
 import java.util.{Properties => JProps}

 import docspell.analysis.nlp.Properties.Implicits._
+import docspell.common._

 object Properties {

@ -13,6 +14,19 @@ object Properties {
    p
  }

+  def forSettings(settings: StanfordSettings): JProps = {
+    val regexNerFile = settings.regexNer
+      .map(p => p.normalize().toAbsolutePath().toString())
+    settings.lang match {
+      case Language.German =>
+        Properties.nerGerman(regexNerFile, settings.highRecall)
+      case Language.English =>
+        Properties.nerEnglish(regexNerFile)
+      case Language.French =>
+        Properties.nerFrench(regexNerFile, settings.highRecall)
+    }
+  }
+
  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators"                  -> "tokenize,ssplit,mwt,pos,lemma,ner",
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
@ -1,45 +1,39 @@
 package docspell.analysis.nlp

-import java.util.{Properties => JProps}
-
 import scala.jdk.CollectionConverters._

+import cats.Applicative
+import cats.implicits._
+
 import docspell.common._

 import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
-import org.log4s.getLogger

 object StanfordNerClassifier {
-  private[this] val logger = getLogger

-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
-  lazy val frenchNerClassifier  = makeClassifier(Language.French)
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, as it involves loading large model
+    * files. The classifiers are thread-safe and so they are cached.
+    * The `cacheKey` defines the "slot" where classifiers are stored
+    * and retrieved. If for a given `cacheKey` the `settings` change,
+    * a new classifier must be created. It will then replace the
+    * previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))

-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-      case Language.French  => frenchNerClassifier
-    }
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
    val doc = new CoreDocument(text)
    nerClassifier.annotate(doc)
-
    doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
  }

-  private def makeClassifier(lang: Language): StanfordCoreNLP = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    new StanfordCoreNLP(classifierProperties(lang))
-  }
-
-  private def classifierProperties(lang: Language): JProps =
-    lang match {
-      case Language.German =>
-        Properties.nerGerman(None, false)
-      case Language.English =>
-        Properties.nerEnglish(None)
-      case Language.French =>
-        Properties.nerFrench(None, false)
-    }
 }
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala
@ -0,0 +1,22 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import docspell.common._
+
+/** Settings for configuring the stanford NER pipeline.
+  *
+  * The language is mandatory; only the provided ones are supported.
+  * The `highRecall` setting only applies to non-English languages. For
+  * non-English languages the English classifier is run as a second
+  * classifier and if `highRecall` is true, then it will be used to
+  * tag untagged tokens. This may lead to a lot of false positives,
+  * but since English is omnipresent in other languages, too, it
+  * depends on the use case whether this is useful or not.
+  *
+  * The `regexNer` setting allows specifying a text file as described here:
+  * https://nlp.stanford.edu/software/regexner.html. This will be used
+  * as a last step to tag untagged tokens using the provided list of
+  * regexps.
+  */
+case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])