From afbe9554b6b14c60d6e4395ca18397c05a21b15c Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Wed, 2 Sep 2020 22:23:08 +0200
Subject: [PATCH] Update joex nixos module

---
 nix/module-joex.nix | 68 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index d550c2d3..7619711f 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -95,6 +95,21 @@ let
         enabled = true;
         file-cache-time = "1 minute";
       };
+      classification = { # default values referenced by the text-analysis.classification options
+        enabled = true; # classification is switched on globally by default
+        item-count = 0; # zero or negative: no cap, all items are used for training (per the option description)
+        classifiers = [ # list of classifier configurations; this default holds a single entry
+          { "useSplitWords" = "true"; # every value in this set is written as a string, including booleans and numbers
+            "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.''; # indented string, so the backslashes stay literal
+            "splitWordsIgnoreRegexp" = ''\s+'';
+            "useSplitPrefixSuffixNGrams" = "true";
+            "maxNGramLeng" = "4"; # NOTE(review): keys look like Stanford ColumnDataClassifier property names - confirm spelling against its javadoc
+            "minNGramLeng" = "1";
+            "splitWordShape" = "chris4";
+            "intern" = "true";
+          }
+        ];
+      };
       working-dir = "/tmp/docspell-analysis";
     };
     processing = {
@@ -736,6 +751,59 @@ in {
               default = defaults.text-analysis.regex-ner;
               description = "";
             };
+
+            classification = mkOption { # option group; defaults come from defaults.text-analysis.classification
+              type = types.submodule({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.text-analysis.classification.enabled;
+                    description = ''
+                      Whether to enable classification globally. Each collective can
+                      decide to disable it. If it is disabled here, no collective
+                      can use classification.
+                    '';
+                  };
+                  item-count = mkOption {
+                    type = types.int;
+                    default = defaults.text-analysis.classification.item-count;
+                    description = ''
+                      If concerned with memory consumption, this restricts the
+                      number of items to consider. More are better for training. A
+                      negative value or zero means to train on all items.
+                    '';
+                  };
+                  classifiers = mkOption {
+                    type = types.listOf types.attrs; # each list entry is a free-form attribute set
+                    default = defaults.text-analysis.classification.classifiers;
+                    description = ''
+                      These settings are used to configure the classifier. If
+                      multiple are given, they are all tried and the "best" is
+                      chosen at the end. See
+                      https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+                      for more info about these settings. The settings here yielded
+                      good results with *my* dataset.
+                    '';
+                  };
+
+                };
+              });
+              default = defaults.text-analysis.classification;
+              description = ''
+                Settings for doing document classification.
+
+                This works by learning from existing documents. A collective can
+                specify a tag category and the system will try to predict a tag
+                from this category for new incoming documents.
+
+                This requires a statistical model that is computed from all
+                existing documents. This process is run periodically as
+                configured by the collective. It may require a lot of memory,
+                depending on the amount of data.
+
+                It utilises this NLP library: https://nlp.stanford.edu/.
+              '';
+            };
           };
         });
         default = defaults.text-analysis;