From aa937797bed2411d8bea6a6f8fa80fa0e30a866b Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Thu, 14 Jan 2021 00:55:19 +0100
Subject: [PATCH] Choose nlp mode in config file

---
 .../docspell/analysis/TextAnalyser.scala      | 42 +++++++++++++++----
 .../analysis/TextAnalysisConfig.scala         |  8 +++-
 .../main/scala/docspell/common/NlpMode.scala  | 23 ++++++++++
 .../docspell/common/config/Implicits.scala    |  3 ++
 .../joex/src/main/resources/reference.conf    | 29 ++++++++++---
 .../src/main/scala/docspell/joex/Config.scala |  9 ++--
 6 files changed, 95 insertions(+), 19 deletions(-)
 create mode 100644 modules/common/src/main/scala/docspell/common/NlpMode.scala

diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
index 38491c3a..a9234027 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -1,14 +1,17 @@
 package docspell.analysis
 
+import cats.Applicative
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
-import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
+import docspell.analysis.nlp._
 import docspell.common._
 
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+
 trait TextAnalyser[F[_]] {
 
   def annotate(
@@ -33,8 +36,8 @@ object TextAnalyser {
       blocker: Blocker
   ): Resource[F, TextAnalyser[F]] =
     Resource
-      .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval))
-      .map(cache =>
+      .liftF(Nlp(cfg.nlpConfig))
+      .map(stanfordNer =>
         new TextAnalyser[F] {
           def annotate(
               logger: Logger[F],
@@ -44,7 +47,7 @@ object TextAnalyser {
           ): F[TextAnalyser.Result] =
             for {
               input <- textLimit(logger, text)
-              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
               tags1 <- contactNer(input)
               dates <- dateNer(settings.lang, input)
               list  = tags0 ++ tags1
@@ -62,10 +65,6 @@ object TextAnalyser {
                   s" Analysing only first ${cfg.maxLength} characters."
               ) *> text.take(cfg.maxLength).pure[F]
 
-          private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
-              : F[Vector[NerLabel]] =
-            StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
-
           private def contactNer(text: String): F[Vector[NerLabel]] =
             Sync[F].delay {
               Contact.annotate(text)
@@ -78,4 +77,50 @@ object TextAnalyser {
         }
       )
 
+  /** Builds the named-entity recognition function that matches the
+    * configured [[NlpMode]].
+    */
+  private object Nlp {
+
+    /** Creates the NER function for `cfg.mode`.
+      *
+      * For `Full` and `Basic`, a pipeline cache is created from
+      * `cfg.clearInterval` and the returned function annotates through it.
+      * For `Disabled`, no cache is created and the returned function always
+      * yields an empty vector.
+      */
+    def apply[F[_]: Concurrent: Timer: BracketThrow](
+        cfg: TextAnalysisConfig.NlpConfig
+    ): F[Input => F[Vector[NerLabel]]] =
+      cfg.mode match {
+        case NlpMode.Full =>
+          PipelineCache.full(cfg.clearInterval).map(cache => full(cache))
+        case NlpMode.Basic =>
+          PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache))
+        case NlpMode.Disabled =>
+          Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
+      }
+
+    /** Arguments of a single annotation call: the cache key, the NER
+      * settings and the text to annotate.
+      */
+    final case class Input(key: Ident, settings: StanfordNerSettings, text: String)
+
+    /** Annotates `input.text` with `StanfordNerAnnotator`, passing
+      * `input.key.id` and the given cache.
+      */
+    def full[F[_]: BracketThrow](
+        cache: PipelineCache[F, StanfordCoreNLP]
+    )(input: Input): F[Vector[NerLabel]] =
+      StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
+
+    /** Annotates `input.text` with `BasicCRFAnnotator`, passing
+      * `input.key.id` and the given cache.
+      */
+    def basic[F[_]: BracketThrow](
+        cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
+    )(input: Input): F[Vector[NerLabel]] =
+      BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
+
+  }
 }
diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
index 2dbfbfc4..abc92043 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
@@ -1,10 +1,26 @@
 package docspell.analysis
 
+import docspell.analysis.TextAnalysisConfig.NlpConfig
 import docspell.analysis.classifier.TextClassifierConfig
 import docspell.common._
 
+/** Configuration of the text analysis.
+  *
+  * @param maxLength longer texts are cut to this many characters before
+  *   they are analysed
+  * @param nlpConfig selects the NLP mode and the cache clear interval
+  * @param classifier configuration for the classifier (`TextClassifierConfig`)
+  */
 case class TextAnalysisConfig(
     maxLength: Int,
-    clearStanfordPipelineInterval: Duration,
+    nlpConfig: NlpConfig,
     classifier: TextClassifierConfig
 )
+
+object TextAnalysisConfig {
+
+  /** NLP settings: `mode` chooses the NLP variant, `clearInterval` is the
+    * interval given to the pipeline cache.
+    */
+  case class NlpConfig(clearInterval: Duration, mode: NlpMode)
+}
diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala
new file mode 100644
index 00000000..36ebf7db
--- /dev/null
+++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala
@@ -0,0 +1,36 @@
+package docspell.common
+
+import java.util.Locale
+
+/** Selects how much of the Stanford NLP machinery is used for text analysis. */
+sealed trait NlpMode { self: Product =>
+
+  /** The name of this mode, taken from the case object's `productPrefix`. */
+  def name: String =
+    self.productPrefix
+}
+object NlpMode {
+  case object Full     extends NlpMode
+  case object Basic    extends NlpMode
+  case object Disabled extends NlpMode
+
+  /** Parses a mode from its name, ignoring case.
+    *
+    * The name is lower-cased with `Locale.ROOT` so that parsing does not
+    * depend on the JVM default locale (e.g. with a Turkish locale "BASIC"
+    * is lower-cased with a dotless i and would not match).
+    *
+    * @return the mode, or a `Left` with an error message for unknown names
+    */
+  def fromString(name: String): Either[String, NlpMode] =
+    name.toLowerCase(Locale.ROOT) match {
+      case "full"     => Right(Full)
+      case "basic"    => Right(Basic)
+      case "disabled" => Right(Disabled)
+      case _          => Left(s"Unknown nlp-mode: $name")
+    }
+
+  /** Like [[fromString]], but throws a `RuntimeException` for unknown names. */
+  def unsafeFromString(name: String): NlpMode =
+    fromString(name).fold(sys.error, identity)
+}
diff --git a/modules/common/src/main/scala/docspell/common/config/Implicits.scala b/modules/common/src/main/scala/docspell/common/config/Implicits.scala
index c99c430a..9dab40dc 100644
--- a/modules/common/src/main/scala/docspell/common/config/Implicits.scala
+++ b/modules/common/src/main/scala/docspell/common/config/Implicits.scala
@@ -44,6 +44,10 @@ object Implicits {
   implicit val priorityReader: ConfigReader[Priority] =
     ConfigReader[String].emap(reason(Priority.fromString))
 
+  /** Reads an [[NlpMode]] from a config string via `NlpMode.fromString`. */
+  implicit val nlpModeReader: ConfigReader[NlpMode] =
+    ConfigReader[String].emap(reason(NlpMode.fromString))
+
   def reason[A: ClassTag](
       f: String => Either[String, A]
   ): String => Either[FailureReason, A] =
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 4aeb5a1b..583b40b1 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -277,12 +277,27 @@ docspell.joex {
     # files.
     working-dir = ${java.io.tmpdir}"/docspell-analysis"
 
-    # The StanfordCoreNLP library caches language models which
-    # requires quite some amount of memory. Setting this interval to a
-    # positive duration, the cache is cleared after this amount of
-    # idle time. Set it to 0 to disable it if you have enough memory,
-    # processing will be faster.
-    clear-stanford-nlp-interval = "15 minutes"
+    nlp-config {
+      # The StanfordCoreNLP library caches language models, which
+      # requires quite a lot of memory. If this interval is set to a
+      # positive duration, the cache is cleared after that amount of
+      # idle time. Set it to 0 to disable clearing if you have enough
+      # memory; processing will then be faster.
+      #
+      # This only has an effect if mode != disabled.
+      clear-interval = "15 minutes"
+
+      # The mode for configuring NLP models. Currently 3 are available:
+      #
+      # 1. full - builds the complete pipeline, run with -Xmx1500M or more
+      # 2. basic - builds only the ner annotator, run with -Xmx600M or more
+      # 3. disabled - doesn't use any stanford-nlp feature
+      #
+      # The basic variant does quite a good job for German and
+      # English. It might be worse for French, always depending on the
+      # type of text that is analysed.
+      mode = full
+    }
 
     regex-ner {
       # Whether to enable custom NER annotation. This uses the address
@@ -295,6 +310,8 @@ docspell.joex {
       #
       # This setting might be moved to the collective settings in the
       # future.
+      #
+      # Note, this is only relevant if nlp-config.mode = full.
       enabled = true
 
       # The NER annotation uses a file of patterns that is derived from
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 8fba3582..5b2bccc5 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -4,7 +4,8 @@ import java.nio.file.Path
 
 import cats.data.NonEmptyList
 
-import docspell.analysis.{TextAnalysisConfig, classifier}
+import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.classifier.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -59,7 +60,7 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      clearStanfordNlpInterval: Duration,
+      nlpConfig: TextAnalysisConfig.NlpConfig,
       regexNer: RegexNer,
       classification: Classification
   ) {
@@ -67,8 +68,8 @@ object Config {
     def textAnalysisConfig: TextAnalysisConfig =
       TextAnalysisConfig(
         maxLength,
-        clearStanfordNlpInterval,
-        classifier.TextClassifierConfig(
+        nlpConfig,
+        TextClassifierConfig(
           workingDir,
           NonEmptyList
             .fromList(classification.classifiers)