Add classifier settings

2025-08-05 02:24:52 +00:00 · 2020-08-28 22:17:49 +02:00
parent 53fdb100ab
commit 8c4f2e702b
17 changed files with 649 additions and 56 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -271,6 +271,50 @@ docspell.joex {
      # file will be kept until a check for a state change is done.
      file-cache-time = "1 minute"
    }
+
+    # Settings for doing document classification.
+    #
+    # This works by learning from existing documents. A collective can
+    # specify a tag category and the system will try to predict a tag
+    # from this category for new incoming documents.
+    #
+    # This requires a statistical model that is computed from all
+    # existing documents. This process is run periodically as
+    # configured by the collective. It may require a lot of memory,
+    # depending on the amount of data.
+    #
+    # It utilises this NLP library: https://nlp.stanford.edu/.
+    classification {
+      # Whether to enable classification globally. Each collective can
+      # decide to disable it. If it is disabled here, no collective
+      # can use classification.
+      enabled = true
+
+      # If concerned with memory consumption, this restricts the
+      # number of items to consider. More are better for training. A
+      # negative value or zero means to train on all items.
+      item-count = 0
+
+      # These settings are used to configure the classifier. If
+      # multiple are given, they are all tried and the "best" is
+      # chosen at the end. See
+      # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
+      # for more info about these settings. The settings are almost
+      # identical to them, as they yielded best results with *my*
+      # dataset.
+      #
+      # Enclose regexps in triple quotes.
+      classifiers = [
+        { "useSplitWords" = "true"
+          "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
+          "splitWordsIgnoreRegexp" = """\s+"""
+          "useSplitPrefixSuffixNGrams" = "true"
+          "maxNGramLeng" = "4"
+          "minNGramLeng" = "1"
+          "splitWordShape" = "chris4"
+        }
+      ]
+    }
  }

  # Configuration for converting files into PDFs.
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@ -57,7 +57,8 @@ object Config {
  case class TextAnalysis(
      maxLength: Int,
      workingDir: Path,
-      regexNer: RegexNer
+      regexNer: RegexNer,
+      classification: Classification
  ) {

    def textAnalysisConfig: TextAnalysisConfig =
@ -68,4 +69,10 @@ object Config {
  }

  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
+
+  case class Classification(
+      enabled: Boolean,
+      itemCount: Int,
+      classifiers: List[Map[String, String]]
+  )
 }