Reorganize nlp pipeline and add nlp-unsupported language Italian
Improves and reorganizes how nlp pipelines are set up. Users can now choose from several options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. Support for such languages then consists of text extraction and simple regex-ner processing.
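For illustration, here is a minimal sketch of how such a setup could look for a language without pre-built stanford-nlp models (e.g. Italian). It uses the option names that appear in the diff below; the enclosing sections are omitted and the concrete values are illustrative assumptions, not defaults taken from the commit.

    # Hypothetical excerpt of the joex configuration; option names are
    # taken from the diff below, values are illustrative assumptions.
    nlp-config {
      # No stanford-nlp models exist for this language, so only the
      # address-book based regex NER runs; text extraction is unaffected.
      mode = regexonly

      regex-ner {
        # Upper bound on how many address book entries are turned into
        # regex patterns, to keep memory usage in check.
        max-entries = 1000
      }
    }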
@@ -277,7 +277,39 @@ docspell.joex {
# files.
working-dir = ${java.io.tmpdir}"/docspell-analysis"

nlp-config {
nlp {
# The mode for configuring NLP models:
#
# 1. full – builds the complete pipeline
# 2. basic - builds only the ner annotator
# 3. regexonly - matches each entry in your address book via regexps
# 4. disabled - doesn't use any stanford-nlp feature
#
# The full and basic variants rely on pre-built language models
# that are available for only 3 languages at the moment: German,
# English and French.
#
# Memory usage varies greatly among the languages. German has
# quite large models that require about 1G of heap. So joex should
# run with -Xmx1500M at least when using mode=full.
#
# The basic variant does a quite good job for German and
# English. It might be worse for French, always depending on the
# type of text that is analysed. Joex should run with about 600M
# heap, here again the German language uses the most.
#
# The regexonly variant doesn't depend on a language. It roughly
# works by converting all entries in your address book into
# regexps and matches each one against the text. This can get
# memory intensive, too, when the address book grows large. This
# is included in the full and basic by default, but can be used
# independently by setting mode=regexonly.
#
# When mode=disabled, the whole nlp pipeline is disabled and you
# won't get any suggestions, only what the classifier returns (if
# enabled).
mode = full

# The StanfordCoreNLP library caches language models which
# requires quite some amount of memory. Setting this interval to a
# positive duration, the cache is cleared after this amount of
@@ -287,37 +319,28 @@ docspell.joex {
# This only has an effect if mode != disabled.
clear-interval = "15 minutes"

# The mode for configuring NLP models. Currently 3 are available:
#
# 1. full – builds the complete pipeline, run with -Xmx1500M or more
# 2. basic - builds only the ner annotator, run with -Xmx600M or more
# 3. disabled - doesn't use any stanford-nlp feature
#
# The basic variant does a quite good job for German and
# English. It might be worse for French, always depending on the
# type of text that is analysed.
mode = full
}
regex-ner {
# Whether to enable custom NER annotation. This uses the
# address book of a collective as input for NER tagging (to
# automatically find correspondent and concerned entities). If
# the address book is large, this can be quite memory
# intensive and also makes text analysis much slower. But it
# improves accuracy and can be used independently of the
# language. If this is set to 0, it is effectively disabled
# and NER tagging uses only statistical models (that also work
# quite well, but are restricted to the languages mentioned
# above).
#
# Note, this is only relevant if nlp-config.mode is not
# "disabled".
max-entries = 1000

regex-ner {
# Whether to enable custom NER annotation. This uses the address
# book of a collective as input for NER tagging (to automatically
# find correspondent and concerned entities). If the address book
# is large, this can be quite memory intensive and also makes text
# analysis slower. But it greatly improves accuracy. If this is
# false, NER tagging uses only statistical models (that also work
# quite well).
#
# This setting might be moved to the collective settings in the
# future.
#
# Note, this is only relevant if nlp-config.mode = full.
enabled = true

# The NER annotation uses a file of patterns that is derived from
# a collective's address book. This is how long this file will be
# kept until a check for a state change is done.
file-cache-time = "1 minute"
# The NER annotation uses a file of patterns that is derived
# from a collective's address book. This is how long this data
# will be kept until a check for a state change is done.
file-cache-time = "1 minute"
}
}
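For comparison, a memory-constrained setup that keeps the stanford-nlp pipeline but builds only the smaller ner annotator might look roughly like the following sketch. It reuses the option names from the diff above; the heap hint is the rough figure from the comments, and the concrete values are assumptions.

    # Hypothetical excerpt; per the comments above, joex should run with
    # about -Xmx600M or more in this mode.
    nlp-config {
      # basic builds only the ner annotator instead of the full pipeline.
      mode = basic

      # Drop cached language models after some idle time to free memory.
      clear-interval = "15 minutes"

      regex-ner {
        # Address book patterns still apply on top of the statistical
        # models; lower this if the address book grows very large.
        max-entries = 500
      }
    }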
# Settings for doing document classification.