Update documentation
@@ -286,16 +286,13 @@ docspell.joex {
 # 4. disabled - doesn't use any stanford-nlp feature
 #
 # The full and basic variants rely on pre-build language models
-# that are available for only 3 lanugages at the moment: German,
-# English and French.
-#
-# Memory usage varies greatly among the languages. German has
-# quite large models, that require about 1G heap. So joex should
-# run with -Xmx1500M at least when using mode=full.
+# that are available for only a few languages. Memory usage
+# varies among the languages. So joex should run with -Xmx1400M
+# at least when using mode=full.
 #
 # The basic variant does a quite good job for German and
 # English. It might be worse for French, always depending on the
-# type of text that is analysed. Joex should run with about 600M
+# type of text that is analysed. Joex should run with about 500M
 # heap, here again lanugage German uses the most.
 #
 # The regexonly variant doesn't depend on a language. It roughly
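As a rough orientation only, the mode and heap guidance in this hunk could translate into joex settings along the lines of the sketch below. Only mode=full and the heap numbers appear in this commit; the enclosing text-analysis/nlp block names and the JAVA_OPTS hint are assumptions about the surrounding config, not part of the diff.

docspell.joex {
  text-analysis {
    nlp {
      # one of: full, basic, regexonly, disabled (see the comments above)
      mode = full
    }
  }
}
# assumption: with mode=full the joex JVM would be started with at least
# -Xmx1400M (for example via JAVA_OPTS); mode=basic gets by with roughly 500M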
@@ -349,25 +346,23 @@ docspell.joex {
 
 # Settings for doing document classification.
 #
-# This works by learning from existing documents. A collective can
-# specify a tag category and the system will try to predict a tag
-# from this category for new incoming documents.
-#
-# This requires a satstical model that is computed from all
-# existing documents. This process is run periodically as
-# configured by the collective. It may require a lot of memory,
-# depending on the amount of data.
+# This works by learning from existing documents. This requires a
+# satstical model that is computed from all existing documents.
+# This process is run periodically as configured by the
+# collective. It may require more memory, depending on the amount
+# of data.
 #
 # It utilises this NLP library: https://nlp.stanford.edu/.
 classification {
-  # Whether to enable classification globally. Each collective can
-  # decide to disable it. If it is disabled here, no collective
-  # can use classification.
+  # enable/disable auto-tagging. The classifier is also used for
+  # finding correspondents and concerned entities, if enabled
+  # here.
   enabled = true
 
   # If concerned with memory consumption, this restricts the
   # number of items to consider. More are better for training. A
-  # negative value or zero means no train on all items.
+  # negative value or zero means to train on all items.
   item-count = 0
 
   # These settings are used to configure the classifier. If
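If memory consumption is the concern this hunk describes, the classification block might be tuned as in the sketch below. The enabled and item-count keys are taken from the diff itself; the enclosing text-analysis block name and the value 600 are illustrative assumptions, not part of this commit.

docspell.joex {
  text-analysis {
    classification {
      # auto-tagging; also used for correspondents/concerned entities if enabled
      enabled = true
      # cap the training set to bound memory; zero or negative trains on all items
      item-count = 600
    }
  }
}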