Choose nlp mode in config file

2025-08-05 02:24:52 +00:00 · 2021-01-14 00:55:19 +01:00
parent 54a09861c4
commit aa937797be
6 changed files with 95 additions and 19 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -277,12 +277,27 @@ docspell.joex {
    # files.
    working-dir = ${java.io.tmpdir}"/docspell-analysis"

-    # The StanfordCoreNLP library caches language models which
-    # requires quite some amount of memory. Setting this interval to a
-    # positive duration, the cache is cleared after this amount of
-    # idle time. Set it to 0 to disable it if you have enough memory,
-    # processing will be faster.
-    clear-stanford-nlp-interval = "15 minutes"
+    nlp-config {
+      # The StanfordCoreNLP library caches language models, which
+      # requires a considerable amount of memory. If this interval is
+      # set to a positive duration, the cache is cleared after that much
+      # idle time. Set it to 0 to disable clearing if you have enough
+      # memory; processing will then be faster.
+      #
+      # This only has an effect if mode != disabled.
+      clear-interval = "15 minutes"
+
+      # The mode for configuring NLP models. Currently 3 modes are available:
+      #
+      # 1. full - builds the complete pipeline; run with -Xmx1500M or more
+      # 2. basic - builds only the NER annotator; run with -Xmx600M or more
+      # 3. disabled - doesn't use any stanford-nlp feature
+      #
+      # The basic variant does quite a good job for German and
+      # English. It might be worse for French, always depending on the
+      # type of text that is analysed.
+      mode = full
+    }

    regex-ner {
      # Whether to enable custom NER annotation. This uses the address
@@ -295,6 +310,8 @@ docspell.joex {
      #
      # This setting might be moved to the collective settings in the
      # future.
+      #
+      # Note, this is only relevant if nlp-config.mode = full.
      enabled = true

      # The NER annotation uses a file of patterns that is derived from
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -4,7 +4,8 @@ import java.nio.file.Path

 import cats.data.NonEmptyList

-import docspell.analysis.{TextAnalysisConfig, classifier}
+import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.classifier.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -59,7 +60,7 @@ object Config {
  case class TextAnalysis(
      maxLength: Int,
      workingDir: Path,
-      clearStanfordNlpInterval: Duration,
+      nlpConfig: TextAnalysisConfig.NlpConfig,
      regexNer: RegexNer,
      classification: Classification
  ) {
@@ -67,8 +68,8 @@ object Config {
    def textAnalysisConfig: TextAnalysisConfig =
      TextAnalysisConfig(
        maxLength,
-        clearStanfordNlpInterval,
-        classifier.TextClassifierConfig(
+        nlpConfig,
+        TextClassifierConfig(
          workingDir,
          NonEmptyList
            .fromList(classification.classifiers)