Use collective data with NER annotation

2025-06-24 19:38:24 +00:00 · 2020-08-24 23:25:57 +02:00
parent de5b33c40d
commit 3473cbb773
12 changed files with 413 additions and 76 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -248,6 +248,29 @@ docspell.joex {
    # should suffice. Default is 10000, which is about 2-3 pages
    # (just a rough guess, of course).
    max-length = 10000
+
+    # A working directory for the analyser to store temporary/working
+    # files.
+    working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+    regex-ner {
+      # Whether to enable custom NER annotation. This uses the address
+      # book of a collective as input for NER tagging (to automatically
+      # find correspondent and concerned entities). If the address book
+      # is large, this can be quite memory intensive and also makes text
+      # analysis slower. But it greatly improves accuracy. If this is
+      # false, NER tagging uses only statistical models (that also work
+      # quite well).
+      #
+      # This setting might be moved to the collective settings in the
+      # future.
+      enabled = true
+
+      # The NER annotation uses a file of patterns that is derived from
+      # a collective's address book. This is how long the file is
+      # kept before it is checked for a state change.
+      file-cache-time = "1 minute"
+    }
  }

  # Configuration for converting files into PDFs.