From fdb46da26d7be6ce457a5b8dbe4f104b198d034f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 21 Apr 2020 23:33:15 +0200 Subject: [PATCH 1/7] Add french language and upgrade stanford-nlp to 4.0.0 --- README.md | 2 +- .../docspell/analysis/date/DateFind.scala | 1 + .../analysis/nlp/LabelConverter.scala | 25 +++++ .../docspell/analysis/nlp/Properties.scala | 97 +++++++++++++++++++ .../analysis/nlp/StanfordNerClassifier.scala | 54 ++++------- .../analysis/nlp/TextAnalyserSuite.scala | 24 +++-- .../main/scala/docspell/common/Language.scala | 7 +- .../main/scala/docspell/ftssolr/Field.scala | 3 + .../scala/docspell/ftssolr/SolrQuery.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 8 ++ modules/webapp/src/main/elm/Data/Language.elm | 12 ++- project/Dependencies.scala | 13 ++- project/NerModelsPlugin.scala | 15 ++- 13 files changed, 208 insertions(+), 54 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala diff --git a/README.md b/README.md index 88928bef..6ad5e9e1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell) +[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell) [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org) 
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt) [![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 86fea719..f2170d31 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -54,6 +54,7 @@ object DateFind { val p = lang match { case Language.English => p2.or(p0).or(p1) case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) } p.read(parts).toOption } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala new file mode 100644 index 00000000..c32a532d --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala @@ -0,0 +1,25 @@ +package docspell.analysis.nlp + +import docspell.common.{NerLabel, NerTag} + +import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel} + +object LabelConverter { + + private def tagFromLabel[A <: CoreAnnotation[String]]( + label: CoreLabel, + annot: Class[A] + ): Option[NerTag] = { + val tag = label.get(annot) + Option(tag).flatMap(s => NerTag.fromString(s).toOption) + } + + def findTag(label: CoreLabel): Option[NerTag] = + tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation]) + .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation])) + + def toNerLabel(label: CoreLabel): Option[NerLabel] = + findTag(label).map(t => + NerLabel(label.word(), t, label.beginPosition(), label.endPosition()) + ) +} diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala new file mode 100644 index 00000000..75ee7040 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -0,0 +1,97 @@ +package docspell.analysis.nlp + +import java.util.{Properties => JProps} + +import docspell.analysis.nlp.Properties.Implicits._ + +object Properties { + + def apply(ps: (String, String)*): JProps = { + val p = new JProps() + for ((k, v) <- ps) + p.setProperty(k, v) + p + } + + def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "de", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently + "ner.useSUTime" -> "false", //only english, unused in docspell + "ner.language" -> "de", + "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + def nerEnglish(regexNerMappingFile: Option[String]): JProps = + Properties( + "annotators" -> "tokenize,ssplit,pos,lemma,ner", + "tokenize.language" -> "en", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "en", + "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile) + + def 
nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "fr", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv", + "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger", + "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "de", + "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + object Implicits { + implicit final class JPropsOps(val p: JProps) extends AnyVal { + + def set(name: String, value: Option[String]): JProps = + value match { + case Some(v) => + p.setProperty(name, v) + p + case None => + p + } + + def change(name: String, f: String => String): JProps = + Option(p.getProperty(name)) match { + case Some(current) => + p.setProperty(name, f(current)) + p + case None => + p + } + + def withRegexNer(mappingFile: Option[String]): JProps = + set("regexner.mapping", mappingFile) + .change( + "annotators", + v => if (mappingFile.isDefined) v + ",regexner" else v + ) + + def withHighRecall(flag: Boolean): JProps = { + if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL") + else p.setProperty("ner.combinationMode", "NORMAL") + p + } + } + } +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 094abcca..32c165f5 100644 --- 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,16 +1,12 @@ package docspell.analysis.nlp -import java.net.URL -import java.util.zip.GZIPInputStream +import java.util.{Properties => JProps} import scala.jdk.CollectionConverters._ -import scala.util.Using import docspell.common._ -import edu.stanford.nlp.ie.AbstractSequenceClassifier -import edu.stanford.nlp.ie.crf.CRFClassifier -import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} import org.log4s.getLogger object StanfordNerClassifier { @@ -18,48 +14,32 @@ object StanfordNerClassifier { lazy val germanNerClassifier = makeClassifier(Language.German) lazy val englishNerClassifier = makeClassifier(Language.English) + lazy val frenchNerClassifier = makeClassifier(Language.French) def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { val nerClassifier = lang match { case Language.English => englishNerClassifier case Language.German => germanNerClassifier + case Language.French => frenchNerClassifier } - nerClassifier - .classify(text) - .asScala - .flatMap(a => a.asScala) - .collect(Function.unlift { label => - val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) - NerTag - .fromString(Option(tag).getOrElse("")) - .toOption - .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) - }) - .toVector + val doc = new CoreDocument(text) + nerClassifier.annotate(doc) + + doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + private def makeClassifier(lang: Language): StanfordCoreNLP = { logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...") - val ner = classifierResource(lang) - Using(new GZIPInputStream(ner.openStream())) { in => - 
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] - }.fold(throw _, identity) + new StanfordCoreNLP(classifierProperties(lang)) } - private def classifierResource(lang: Language): URL = { - def check(u: URL): URL = - if (u == null) sys.error(s"NER model url not found for language ${lang.name}") - else u - - check(lang match { + private def classifierProperties(lang: Language): JProps = + lang match { case Language.German => - getClass.getResource( - "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz" - ) + Properties.nerGerman(None, false) case Language.English => - getClass.getResource( - "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz" - ) - }) - } + Properties.nerEnglish(None) + case Language.French => + Properties.nerFrench(None, false) + } } diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index c851edce..b7c083a1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -12,22 +12,30 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), - NerLabel("Treesville", NerTag.Person, 27, 37), + NerLabel("Elm", NerTag.Misc, 17, 20), + NerLabel("Ave.", NerTag.Misc, 21, 25), + NerLabel("Treesville", NerTag.Misc, 27, 37), NerLabel("Derek", NerTag.Person, 68, 73), NerLabel("Jeter", NerTag.Person, 74, 79), - NerLabel("Treesville", NerTag.Location, 95, 105), + NerLabel("Elm", NerTag.Misc, 85, 88), + NerLabel("Ave.", NerTag.Misc, 89, 93), + NerLabel("Treesville", NerTag.Person, 95, 105), + NerLabel("Leaf", NerTag.Organization, 144, 148), + NerLabel("Chief", NerTag.Organization, 150, 155), + NerLabel("of", NerTag.Organization, 156, 158), NerLabel("Syrup", 
NerTag.Organization, 159, 164), NerLabel("Production", NerTag.Organization, 165, 175), NerLabel("Old", NerTag.Organization, 176, 179), NerLabel("Sticky", NerTag.Organization, 180, 186), NerLabel("Pancake", NerTag.Organization, 187, 194), NerLabel("Company", NerTag.Organization, 195, 202), - NerLabel("Maple", NerTag.Location, 207, 212), - NerLabel("Lane", NerTag.Location, 213, 217), - NerLabel("Forest", NerTag.Location, 219, 225), + NerLabel("Maple", NerTag.Organization, 207, 212), + NerLabel("Lane", NerTag.Organization, 213, 217), + NerLabel("Forest", NerTag.Organization, 219, 225), NerLabel("Hemptown", NerTag.Location, 239, 247), - NerLabel("Little", NerTag.Organization, 347, 353), - NerLabel("League", NerTag.Organization, 354, 360), + NerLabel("Leaf", NerTag.Person, 276, 280), + NerLabel("Little", NerTag.Misc, 347, 353), + NerLabel("League", NerTag.Misc, 354, 360), NerLabel("Derek", NerTag.Person, 1117, 1122), NerLabel("Jeter", NerTag.Person, 1123, 1128) ) @@ -40,7 +48,7 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), - NerLabel("Lilienweg", NerTag.Location, 16, 25), + NerLabel("Lilienweg", NerTag.Person, 16, 25), NerLabel("Max", NerTag.Person, 77, 80), NerLabel("Mustermann", NerTag.Person, 81, 91), NerLabel("Lilienweg", NerTag.Location, 93, 102), diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 7d836347..92c32f4b 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -27,7 +27,12 @@ object Language { val iso3 = "eng" } - val all: List[Language] = List(German, English) + case object French extends Language { + val iso2 = "fr" + val iso3 = "fra" + } + + val all: List[Language] = List(German, English, French) def fromString(str: String): Either[String, Language] = { val lang = 
str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 6031cd61..2306a44d 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -23,6 +23,7 @@ object Field { val content = Field("content") val content_de = Field("content_de") val content_en = Field("content_en") + val content_fr = Field("content_fr") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -33,6 +34,8 @@ object Field { Field.content_de case Language.English => Field.content_en + case Language.French => + Field.content_fr } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index e07e9c36..1e3b09b3 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -39,6 +39,7 @@ object SolrQuery { Field.content, Field.content_de, Field.content_en, + Field.content_fr, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 932519c8..efb94a09 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -80,6 +80,8 @@ object SolrSetup { addTextField(l.some)(Field.content_de) case l @ Language.English => addTextField(l.some)(Field.content_en) + case l @ Language.French => + addTextField(l.some)(Field.content_fr) } cmds0 *> cmds1 *> cntLang *> ().pure[F] @@ -105,6 +107,9 @@ object SolrSetup { case Some(Language.English) => run(DeleteField.command(DeleteField(field))).attempt *> 
run(AddField.command(AddField.textEN(field))) + case Some(Language.French) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textFR(field))) } } } @@ -138,6 +143,9 @@ object SolrSetup { def textEN(field: Field): AddField = AddField(field, "text_en", true, true, false) + + def textFR(field: Field): AddField = + AddField(field, "text_fr", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 6704ec3e..40fe5eb2 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -10,6 +10,7 @@ module Data.Language exposing type Language = German | English + | French fromString : String -> Maybe Language @@ -20,6 +21,9 @@ fromString str = else if str == "eng" || str == "en" || str == "english" then Just English + else if str == "fra" || str == "fr" || str == "french" then + Just French + else Nothing @@ -33,6 +37,9 @@ toIso3 lang = English -> "eng" + French -> + "fra" + toName : Language -> String toName lang = @@ -43,7 +50,10 @@ toName lang = English -> "English" + French -> + "French" + all : List Language all = - [ German, English ] + [ German, English, French ] diff --git a/project/Dependencies.scala b/project/Dependencies.scala index ddcfa155..7ab0e4ad 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -31,7 +31,7 @@ object Dependencies { val PostgresVersion = "42.2.16" val PureConfigVersion = "0.13.0" val Slf4jVersion = "1.7.30" - val StanfordNlpVersion = "3.9.2" + val StanfordNlpVersion = "4.0.0" val TikaVersion = "1.24.1" val YamuscaVersion = "0.6.2" val SwaggerUIVersion = "3.32.3" @@ -135,11 +135,16 @@ object Dependencies { ) val stanfordNlpModels = Seq( + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models"), ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) .classifier("models-german"), - 
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier( - "models-english" - ) + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models-french"), + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier( + "models-english" + ) ) val tika = Seq( diff --git a/project/NerModelsPlugin.scala b/project/NerModelsPlugin.scala index cb658615..8d8fbb2c 100644 --- a/project/NerModelsPlugin.scala +++ b/project/NerModelsPlugin.scala @@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin { } private val nerModels = List( - "german.conll.germeval2014.hgc_175m_600.crf.ser.gz", - "english.all.3class.distsim.crf.ser.gz" + "german.distsim.crf.ser.gz", + "english.conll.4class.distsim.crf.ser.gz", + "french-wikiner-4class.crf.ser.gz", + "french-mwt-statistical.tsv", + "french-mwt.tagger", + "french-mwt.tsv", + "german-mwt.tsv", + "german-ud.tagger", + "german-ud.tagger.props", + "french-ud.tagger", + "french-ud.tagger.props", + "english-left3words-distsim.tagger", + "english-left3words-distsim.tagger.props" ) } From 14f646f6a2135e135f1500e50f9eeaf884adefcf Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 23 Aug 2020 10:25:04 +0200 Subject: [PATCH 2/7] Make new coursier cache available to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index fb6d1e7d..4d750d05 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ cache: - $HOME/.ivy2/cache - $HOME/.sbt/boot - $HOME/.coursier/cache + - $HOME/.cache/coursier - sysconfcpus install: From 4e7c00c3457e02f86651cd86b5c2b50eb5b5835a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 23 Aug 2020 17:40:37 +0200 Subject: [PATCH 3/7] Don't ignore updates for stanford-nlp anymore --- .scala-steward.conf | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .scala-steward.conf diff --git a/.scala-steward.conf b/.scala-steward.conf deleted file mode 100644 index 2bbb5c09..00000000 --- a/.scala-steward.conf 
+++ /dev/null @@ -1,3 +0,0 @@ -updates.ignore = [ - { groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" } -] \ No newline at end of file From 8628a0a8b3d3eb2efc9a58a2f3c61b2fe5c1b190 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 00:56:25 +0200 Subject: [PATCH 4/7] Allow configuring stanford-ner and cache based on collective --- .../docspell/analysis/TextAnalyser.scala | 83 +++++++++-------- .../docspell/analysis/nlp/PipelineCache.scala | 90 +++++++++++++++++++ .../docspell/analysis/nlp/Properties.scala | 14 +++ .../analysis/nlp/StanfordNerClassifier.scala | 50 +++++------ .../analysis/nlp/StanfordSettings.scala | 22 +++++ .../src/main/scala/docspell/common/File.scala | 3 + .../scala/docspell/joex/JoexAppImpl.scala | 22 ++--- .../docspell/joex/process/ItemHandler.scala | 14 +-- .../docspell/joex/process/ProcessItem.scala | 19 ++-- .../docspell/joex/process/ReProcessItem.scala | 16 ++-- .../docspell/joex/process/TextAnalysis.scala | 55 ++++++------ 11 files changed, 271 insertions(+), 117 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 443fd47d..75d07eef 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -5,12 +5,19 @@ import cats.implicits._ import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind +import docspell.analysis.nlp.PipelineCache import docspell.analysis.nlp.StanfordNerClassifier +import docspell.analysis.nlp.StanfordSettings import docspell.common._ trait TextAnalyser[F[_]] { - def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result] + def annotate( + 
logger: Logger[F], + settings: StanfordSettings, + cacheKey: Ident, + text: String + ): F[TextAnalyser.Result] } object TextAnalyser { @@ -22,43 +29,47 @@ object TextAnalyser { } def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] = - Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] { - def annotate( - logger: Logger[F], - lang: Language, - text: String - ): F[TextAnalyser.Result] = - for { - input <- textLimit(logger, text) - tags0 <- stanfordNer(lang, input) - tags1 <- contactNer(input) - dates <- dateNer(lang, input) - list = tags0 ++ tags1 - spans = NerLabelSpan.build(list) - } yield Result(spans ++ list, dates) + Resource + .liftF(PipelineCache[F]()) + .map(cache => + new TextAnalyser[F] { + def annotate( + logger: Logger[F], + settings: StanfordSettings, + cacheKey: Ident, + text: String + ): F[TextAnalyser.Result] = + for { + input <- textLimit(logger, text) + tags0 <- stanfordNer(cacheKey, settings, input) + tags1 <- contactNer(input) + dates <- dateNer(settings.lang, input) + list = tags0 ++ tags1 + spans = NerLabelSpan.build(list) + } yield Result(spans ++ list, dates) - private def textLimit(logger: Logger[F], text: String): F[String] = - if (text.length <= cfg.maxLength) text.pure[F] - else - logger.info( - s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." + - s" Analysing only first ${cfg.maxLength} characters." - ) *> text.take(cfg.maxLength).pure[F] + private def textLimit(logger: Logger[F], text: String): F[String] = + if (text.length <= cfg.maxLength) text.pure[F] + else + logger.info( + s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." + + s" Analysing only first ${cfg.maxLength} characters." 
+ ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] = - Sync[F].delay { - StanfordNerClassifier.nerAnnotate(lang)(text) + private def stanfordNer(key: Ident, settings: StanfordSettings, text: String) + : F[Vector[NerLabel]] = + StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) + + private def contactNer(text: String): F[Vector[NerLabel]] = + Sync[F].delay { + Contact.annotate(text) + } + + private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] = + Sync[F].delay { + DateFind.findDates(text, lang).toVector + } } - - private def contactNer(text: String): F[Vector[NerLabel]] = - Sync[F].delay { - Contact.annotate(text) - } - - private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] = - Sync[F].delay { - DateFind.findDates(text, lang).toVector - } - }) + ) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala new file mode 100644 index 00000000..9787563f --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -0,0 +1,90 @@ +package docspell.analysis.nlp + +import cats.Applicative +import cats.effect._ +import cats.effect.concurrent.Ref +import cats.implicits._ + +import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP +import org.log4s.getLogger + +/** Creating the StanfordCoreNLP pipeline is quite expensive as it + * involves IO and initializing large objects. + * + * Therefore, the instances are cached, because they are thread-safe. 
+ * + * **This is an internal API** + */ +trait PipelineCache[F[_]] { + + def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] + +} + +object PipelineCache { + private[this] val logger = getLogger + + def none[F[_]: Applicative]: PipelineCache[F] = + new PipelineCache[F] { + def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] = + makeClassifier(settings).pure[F] + } + + def apply[F[_]: Sync](): F[PipelineCache[F]] = + Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F])) + + final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]]) + extends PipelineCache[F] { + + def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] = + for { + id <- makeSettingsId(settings) + nlp <- data.modify(cache => getOrCreate(key, id, cache, settings)) + } yield nlp + + private def getOrCreate( + key: String, + id: String, + cache: Map[String, Entry], + settings: StanfordSettings + ): (Map[String, Entry], StanfordCoreNLP) = + cache.get(key) match { + case Some(entry) => + if (entry.id == id) (cache, entry.value) + else { + logger.info( + s"StanfordNLP settings changed for key $key. 
Creating new classifier" + ) + val nlp = makeClassifier(settings) + val e = Entry(id, nlp) + (cache.updated(key, e), nlp) + } + + case None => + val nlp = makeClassifier(settings) + val e = Entry(id, nlp) + (cache.updated(key, e), nlp) + } + + private def makeSettingsId(settings: StanfordSettings): F[String] = { + val base = settings.copy(regexNer = None).toString + val size: F[Long] = + settings.regexNer match { + case Some(p) => + File.size(p) + case None => + 0L.pure[F] + } + size.map(len => s"$base-$len") + } + + } + private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = { + logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") + new StanfordCoreNLP(Properties.forSettings(settings)) + } + + private case class Entry(id: String, value: StanfordCoreNLP) +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 75ee7040..314f04fb 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -3,6 +3,7 @@ package docspell.analysis.nlp import java.util.{Properties => JProps} import docspell.analysis.nlp.Properties.Implicits._ +import docspell.common._ object Properties { @@ -13,6 +14,19 @@ object Properties { p } + def forSettings(settings: StanfordSettings): JProps = { + val regexNerFile = settings.regexNer + .map(p => p.normalize().toAbsolutePath().toString()) + settings.lang match { + case Language.German => + Properties.nerGerman(regexNerFile, settings.highRecall) + case Language.English => + Properties.nerEnglish(regexNerFile) + case Language.French => + Properties.nerFrench(regexNerFile, settings.highRecall) + } + } + def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = Properties( "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 32c165f5..424396e5 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,45 +1,39 @@ package docspell.analysis.nlp -import java.util.{Properties => JProps} - import scala.jdk.CollectionConverters._ +import cats.Applicative +import cats.implicits._ + import docspell.common._ import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} -import org.log4s.getLogger object StanfordNerClassifier { - private[this] val logger = getLogger - lazy val germanNerClassifier = makeClassifier(Language.German) - lazy val englishNerClassifier = makeClassifier(Language.English) - lazy val frenchNerClassifier = makeClassifier(Language.French) + /** Runs named entity recognition on the given `text`. + * + * This uses the classifier pipeline from stanford-nlp, see + * https://nlp.stanford.edu/software/CRF-NER.html. Creating these + * classifiers is quite expensive, it involves loading large model + * files. The classifiers are thread-safe and so they are cached. + * The `cacheKey` defines the "slot" where classifiers are stored + * and retrieved. If for a given `cacheKey` the `settings` change, + * a new classifier must be created. It will then replace the + * previous one. 
+ */ + def nerAnnotate[F[_]: Applicative]( + cacheKey: String, + cache: PipelineCache[F] + )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] = + cache + .obtain(cacheKey, settings) + .map(crf => runClassifier(crf, text)) - def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { - val nerClassifier = lang match { - case Language.English => englishNerClassifier - case Language.German => germanNerClassifier - case Language.French => frenchNerClassifier - } + def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { val doc = new CoreDocument(text) nerClassifier.annotate(doc) - doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - private def makeClassifier(lang: Language): StanfordCoreNLP = { - logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...") - new StanfordCoreNLP(classifierProperties(lang)) - } - - private def classifierProperties(lang: Language): JProps = - lang match { - case Language.German => - Properties.nerGerman(None, false) - case Language.English => - Properties.nerEnglish(None) - case Language.French => - Properties.nerFrench(None, false) - } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala new file mode 100644 index 00000000..c2f6f98c --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala @@ -0,0 +1,22 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import docspell.common._ + +/** Settings for configuring the stanford NER pipeline. + * + * The language is mandatory, only the provided ones are supported. + * The `highRecall` only applies for non-English languages. For + * non-English languages the english classifier is run as second + * classifier and if `highRecall` is true, then it will be used to + * tag untagged tokens. 
This may lead to a lot of false positives, + * but since English is omnipresent in other languages, too it + * depends on the use case for whether this is useful or not. + * + * The `regexNer` allows to specify a text file as described here: + * https://nlp.stanford.edu/software/regexner.html. This will be used + * as a last step to tag untagged tokens using the provided list of + * regexps. + */ +case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index e9596fa8..0efc552a 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -55,6 +55,9 @@ object File { def exists[F[_]: Sync](file: Path): F[Boolean] = Sync[F].delay(Files.exists(file)) + def size[F[_]: Sync](file: Path): F[Long] = + Sync[F].delay(Files.size(file)) + def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] = Sync[F].delay(Files.exists(file) && Files.size(file) > minSize) diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index bc415446..dcea79df 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -6,6 +6,7 @@ import cats.effect._ import cats.implicits._ import fs2.concurrent.SignallingRef +import docspell.analysis.TextAnalyser import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient @@ -80,14 +81,15 @@ object JoexAppImpl { for { httpClient <- BlazeClientBuilder[F](clientEC).resource client = JoexClient(httpClient) - store <- Store.create(cfg.jdbc, connectEC, blocker) - queue <- JobQueue(store) - pstore <- PeriodicTaskStore.create(store) - nodeOps <- ONode(store) - joex <- OJoex(client, store) - upload <- OUpload(store, queue, 
cfg.files, joex) - fts <- createFtsClient(cfg)(httpClient) - itemOps <- OItem(store, fts, queue, joex) + store <- Store.create(cfg.jdbc, connectEC, blocker) + queue <- JobQueue(store) + pstore <- PeriodicTaskStore.create(store) + nodeOps <- ONode(store) + joex <- OJoex(client, store) + upload <- OUpload(store, queue, cfg.files, joex) + fts <- createFtsClient(cfg)(httpClient) + itemOps <- OItem(store, fts, queue, joex) + analyser <- TextAnalyser.create[F](cfg.textAnalysis) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) sch <- SchedulerBuilder(cfg.scheduler, blocker, store) @@ -95,14 +97,14 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler.newItem[F](cfg, itemOps, fts), + ItemHandler.newItem[F](cfg, itemOps, fts, analyser), ItemHandler.onCancel[F] ) ) .withTask( JobTask.json( ReProcessItemArgs.taskName, - ReProcessItem[F](cfg, fts), + ReProcessItem[F](cfg, fts, analyser), ReProcessItem.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 4da8f779..240e7f54 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -5,6 +5,7 @@ import cats.effect._ import cats.implicits._ import fs2.Stream +import docspell.analysis.TextAnalyser import docspell.backend.ops.OItem import docspell.common.{ItemState, ProcessItemArgs} import docspell.ftsclient.FtsClient @@ -29,11 +30,12 @@ object ItemHandler { def newItem[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] ): Task[F, Args, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) - .flatMap(safeProcess[F](cfg, itemOps, fts)) + .flatMap(safeProcess[F](cfg, itemOps, fts, analyser)) .map(_ => ()) def itemStateTask[F[_]: Sync, A]( @@ 
-51,11 +53,12 @@ object ItemHandler { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({ + ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({ case Right(d) => Task.pure(d) case Left(ex) => @@ -65,7 +68,8 @@ object ItemHandler { .andThen(_ => Sync[F].raiseError(ex)) }) case false => - ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created)) + ProcessItem[F](cfg, itemOps, fts, analyser)(data) + .flatMap(itemStateTask(ItemState.Created)) } private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 9b4d050f..cd76e095 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -2,6 +2,7 @@ package docspell.joex.process import cats.effect._ +import docspell.analysis.TextAnalyser import docspell.backend.ops.OItem import docspell.common.ProcessItemArgs import docspell.ftsclient.FtsClient @@ -13,25 +14,28 @@ object ProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ExtractArchive(item) .flatMap(Task.setProgress(20)) - .flatMap(processAttachments0(cfg, fts, (40, 60, 80))) + .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80))) .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] 
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - processAttachments0[F](cfg, fts, (30, 60, 90))(item) + processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item) def analysisOnly[F[_]: Sync]( - cfg: Config + cfg: Config, + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](cfg.textAnalysis)(item) + TextAnalysis[F](analyser)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) @@ -39,12 +43,13 @@ object ProcessItem { private def processAttachments0[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], + analyser: TextAnalyser[F], progress: (Int, Int, Int) )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(progress._2)) - .flatMap(analysisOnly[F](cfg)) + .flatMap(analysisOnly[F](cfg, analyser)) .flatMap(Task.setProgress(progress._3)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 8f5e11f2..53282539 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -4,6 +4,7 @@ import cats.data.OptionT import cats.effect._ import cats.implicits._ +import docspell.analysis.TextAnalyser import docspell.common._ import docspell.ftsclient.FtsClient import docspell.joex.Config @@ -19,10 +20,11 @@ object ReProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] ): Task[F, Args, Unit] = loadItem[F] - .flatMap(safeProcess[F](cfg, fts)) + .flatMap(safeProcess[F](cfg, fts, analyser)) .map(_ => ()) def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] = @@ -70,6 +72,7 @@ object ReProcessItem { def 
processFiles[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], + analyser: TextAnalyser[F], data: ItemData ): Task[F, Args, ItemData] = { @@ -91,7 +94,7 @@ object ReProcessItem { getLanguage[F].flatMap { lang => ProcessItem - .processAttachments[F](cfg, fts)(data) + .processAttachments[F](cfg, fts, analyser)(data) .contramap[Args](convertArgs(lang)) } } @@ -109,11 +112,12 @@ object ReProcessItem { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - processFiles[F](cfg, fts, data).attempt + processFiles[F](cfg, fts, analyser, data).attempt .flatMap({ case Right(d) => Task.pure(d) @@ -123,7 +127,7 @@ object ReProcessItem { ).andThen(_ => Sync[F].raiseError(ex)) }) case false => - processFiles[F](cfg, fts, data) + processFiles[F](cfg, fts, analyser, data) } private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 5e31e2d9..625738ef 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,9 +1,10 @@ package docspell.joex.process -import cats.effect.Sync +import cats.effect._ import cats.implicits._ -import docspell.analysis.{TextAnalyser, TextAnalysisConfig} +import docspell.analysis.TextAnalyser +import docspell.analysis.nlp.StanfordSettings import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Task @@ -12,36 +13,40 @@ import docspell.store.records.RAttachmentMeta object TextAnalysis { def apply[F[_]: Sync]( - cfg: TextAnalysisConfig + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - 
TextAnalyser.create[F](cfg).use { analyser => - for { - _ <- ctx.logger.info("Starting text analysis") - s <- Duration.stopTime[F] - t <- - item.metas.toList - .traverse( - annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser) - ) - _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") - _ <- t.traverse(m => - ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels)) - ) - e <- s - _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") - v = t.toVector - } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) - } + for { + _ <- ctx.logger.info("Starting text analysis") + s <- Duration.stopTime[F] + t <- + item.metas.toList + .traverse( + annotateAttachment[F](ctx.args, ctx.logger, analyser) + ) + _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") + _ <- t.traverse(m => + ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels)) + ) + e <- s + _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") + v = t.toVector + } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) } def annotateAttachment[F[_]: Sync]( - lang: Language, + args: ProcessItemArgs, logger: Logger[F], analyser: TextAnalyser[F] - )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = + )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { + val settings = StanfordSettings(args.meta.language, false, None) for { - labels <- analyser.annotate(logger, lang, rm.content.getOrElse("")) + labels <- analyser.annotate( + logger, + settings, + args.meta.collective, + rm.content.getOrElse("") + ) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) - + } } From 96d2f948f2af5a0e11859fc1101ced9de36d862b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 14:35:56 +0200 Subject: [PATCH 5/7] Use collective's addressbook to configure regexner --- .../analysis/nlp/TextAnalyserSuite.scala | 9 ++- 
.../src/main/scala/docspell/common/File.scala | 4 ++ .../docspell/joex/process/TextAnalysis.scala | 65 +++++++++++++++++-- .../docspell/store/queries/QCollective.scala | 15 +++++ 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index b7c083a1..b22093f1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -3,12 +3,17 @@ package docspell.analysis.nlp import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ +import edu.stanford.nlp.pipeline.StanfordCoreNLP object TextAnalyserSuite extends SimpleTestSuite { + lazy val germanClassifier = + new StanfordCoreNLP(Properties.nerGerman(None, false)) + lazy val englishClassifier = + new StanfordCoreNLP(Properties.nerEnglish(None)) test("find english ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText) + StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find german ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText) + StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 0efc552a..2d5cfb8a 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -1,6 +1,7 @@ 
package docspell.common import java.io.IOException +import java.nio.charset.StandardCharsets import java.nio.file._ import java.nio.file.attribute.BasicFileAttributes import java.util.concurrent.atomic.AtomicInteger @@ -87,4 +88,7 @@ object File { def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid + + def writeString[F[_]: Sync](file: Path, content: String): F[Path] = + Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 625738ef..9ee3850c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,13 +1,18 @@ package docspell.joex.process +import java.nio.file.Paths + import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.split.TextSplitter import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates +import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task +import docspell.store.queries.QCollective import docspell.store.records.RAttachmentMeta object TextAnalysis { @@ -22,7 +27,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx.args, ctx.logger, analyser) + annotateAttachment[F](ctx, analyser) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -35,18 +40,64 @@ object TextAnalysis { } def annotateAttachment[F[_]: Sync]( - args: ProcessItemArgs, - logger: Logger[F], + ctx: Context[F, ProcessItemArgs], analyser: TextAnalyser[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordSettings(args.meta.language, false, 
None) + val settings = StanfordSettings(ctx.args.meta.language, false, None) for { + names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) + temp <- File.mkTempFile(Paths.get("."), "textanalysis") + _ <- File.writeString(temp, mkNerConfig(names)) + sett = settings.copy(regexNer = Some(temp)) labels <- analyser.annotate( - logger, - settings, - args.meta.collective, + ctx.logger, + sett, + ctx.args.meta.collective, rm.content.getOrElse("") ) + _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") + val tokens = + TextSplitter + .splitToken(str, delims) + .toVector + .take(3) + .map(w => s"(?i)${w.toLower.value}") + + tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } } diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index 2dc94e05..80b40207 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -1,5 +1,6 @@ package docspell.store.queries +import cats.data.OptionT 
import fs2.Stream import docspell.common.ContactKind @@ -11,6 +12,20 @@ import doobie._ import doobie.implicits._ object QCollective { + + case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String]) + object Names { + val empty = Names(Vector.empty, Vector.empty, Vector.empty) + } + + def allNames(collective: Ident): ConnectionIO[Names] = + (for { + orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name)) + pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name)) + equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name)) + } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))) + .getOrElse(Names.empty) + case class TagCount(tag: RTag, count: Int) case class InsightData( From de5b33c40ddc50cd901eaf6e2d8d2f8f07290562 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 16:09:11 +0200 Subject: [PATCH 6/7] Add `updated` column to some tables --- .../restserver/conv/Conversions.scala | 16 ++++-- .../restserver/routes/EquipmentRoutes.scala | 8 +-- .../mariadb/V1.9.0__updated_column.sql | 29 ++++++++++ .../postgresql/V1.9.0__updated_column.sql | 29 ++++++++++ .../docspell/store/records/REquipment.scala | 35 ++++++++---- .../store/records/ROrganization.scala | 39 ++++++++------ .../docspell/store/records/RPerson.scala | 53 +++++++++++++------ 7 files changed, 156 insertions(+), 53 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index f2f131f0..539ec3eb 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -341,6 +341,7 @@ trait 
Conversions { v.address.city, v.address.country, v.notes, + now, now ) } yield OOrganization.OrgAndContacts(org, cont) @@ -353,6 +354,7 @@ trait Conversions { def contacts(oid: Ident) = v.contacts.traverse(c => newContact(c, oid.some, None)) for { + now <- Timestamp.current[F] cont <- contacts(v.id) org = ROrganization( v.id, @@ -363,7 +365,8 @@ trait Conversions { v.address.city, v.address.country, v.notes, - v.created + v.created, + now ) } yield OOrganization.OrgAndContacts(org, cont) } @@ -398,6 +401,7 @@ trait Conversions { v.address.country, v.notes, v.concerning, + now, now ) } yield OOrganization.PersonAndContacts(org, cont) @@ -410,6 +414,7 @@ trait Conversions { def contacts(pid: Ident) = v.contacts.traverse(c => newContact(c, None, pid.some)) for { + now <- Timestamp.current[F] cont <- contacts(v.id) org = RPerson( v.id, @@ -421,7 +426,8 @@ trait Conversions { v.address.country, v.notes, v.concerning, - v.created + v.created, + now ) } yield OOrganization.PersonAndContacts(org, cont) } @@ -536,11 +542,11 @@ trait Conversions { def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] = timeId.map({ case (id, now) => - REquipment(id, cid, e.name, now) + REquipment(id, cid, e.name, now, now) }) - def changeEquipment(e: Equipment, cid: Ident): REquipment = - REquipment(e.id, cid, e.name, e.created) + def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] = + Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now)) // idref diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala index edfc7521..a8db67ba 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala @@ -39,10 +39,10 @@ object EquipmentRoutes { case req @ PUT -> Root => for { - data <- 
req.as[Equipment] - equip = changeEquipment(data, user.account.collective) - res <- backend.equipment.update(equip) - resp <- Ok(basicResult(res, "Equipment updated.")) + data <- req.as[Equipment] + equip <- changeEquipment(data, user.account.collective) + res <- backend.equipment.update(equip) + resp <- Ok(basicResult(res, "Equipment updated.")) } yield resp case DELETE -> Root / Ident(id) => diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql new file mode 100644 index 00000000..72b6b152 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql @@ -0,0 +1,29 @@ +-- organization +ALTER TABLE `organization` +ADD COLUMN (`updated` timestamp); + +UPDATE `organization` SET `updated` = `created`; + +ALTER TABLE `organization` +MODIFY `updated` timestamp NOT NULL; + +-- person +ALTER TABLE `person` +MODIFY `created` timestamp; + +ALTER TABLE `person` +ADD COLUMN (`updated` timestamp); + +UPDATE `person` SET `updated` = `created`; + +ALTER TABLE `person` +MODIFY `updated` timestamp NOT NULL; + +-- equipment +ALTER TABLE `equipment` +ADD COLUMN (`updated` timestamp); + +UPDATE `equipment` SET `updated` = `created`; + +ALTER TABLE `equipment` +MODIFY `updated` timestamp NOT NULL; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql new file mode 100644 index 00000000..34c57718 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql @@ -0,0 +1,29 @@ +-- organization +ALTER TABLE "organization" +ADD COLUMN "updated" timestamp; + +UPDATE "organization" SET "updated" = "created"; + +ALTER TABLE "organization" +ALTER COLUMN "updated" SET NOT NULL; + +-- person +ALTER TABLE "person" ALTER COLUMN "created" + TYPE timestamp 
USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp); + +ALTER TABLE "person" +ADD COLUMN "updated" timestamp; + +UPDATE "person" SET "updated" = "created"; + +ALTER TABLE "person" +ALTER COLUMN "updated" SET NOT NULL; + +-- equipment +ALTER TABLE "equipment" +ADD COLUMN "updated" timestamp; + +UPDATE "equipment" SET "updated" = "created"; + +ALTER TABLE "equipment" +ALTER COLUMN "updated" SET NOT NULL; diff --git a/modules/store/src/main/scala/docspell/store/records/REquipment.scala b/modules/store/src/main/scala/docspell/store/records/REquipment.scala index 78d2e7f8..3a7f6d2f 100644 --- a/modules/store/src/main/scala/docspell/store/records/REquipment.scala +++ b/modules/store/src/main/scala/docspell/store/records/REquipment.scala @@ -7,7 +7,13 @@ import docspell.store.impl._ import doobie._ import doobie.implicits._ -case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {} +case class REquipment( + eid: Ident, + cid: Ident, + name: String, + created: Timestamp, + updated: Timestamp +) {} object REquipment { @@ -18,25 +24,32 @@ object REquipment { val cid = Column("cid") val name = Column("name") val created = Column("created") - val all = List(eid, cid, name, created) + val updated = Column("updated") + val all = List(eid, cid, name, created, updated) } import Columns._ def insert(v: REquipment): ConnectionIO[Int] = { - val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}") + val sql = + insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}") sql.update.run } def update(v: REquipment): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(eid.is(v.eid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name) + def sql(now: Timestamp) = + updateRow( + table, + and(eid.is(v.eid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- Timestamp.current[ConnectionIO] + n <- 
sql(now).update.run + } yield n } def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = { diff --git a/modules/store/src/main/scala/docspell/store/records/ROrganization.scala b/modules/store/src/main/scala/docspell/store/records/ROrganization.scala index 17fe4845..8eb07e29 100644 --- a/modules/store/src/main/scala/docspell/store/records/ROrganization.scala +++ b/modules/store/src/main/scala/docspell/store/records/ROrganization.scala @@ -19,7 +19,8 @@ case class ROrganization( city: String, country: String, notes: Option[String], - created: Timestamp + created: Timestamp, + updated: Timestamp ) {} object ROrganization { @@ -38,7 +39,8 @@ object ROrganization { val country = Column("country") val notes = Column("notes") val created = Column("created") - val all = List(oid, cid, name, street, zip, city, country, notes, created) + val updated = Column("updated") + val all = List(oid, cid, name, street, zip, city, country, notes, created, updated) } import Columns._ @@ -47,26 +49,31 @@ object ROrganization { val sql = insertRow( table, all, - fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}" + fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}" ) sql.update.run } def update(v: ROrganization): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(oid.is(v.oid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name), - street.setTo(v.street), - zip.setTo(v.zip), - city.setTo(v.city), - country.setTo(v.country), - notes.setTo(v.notes) + def sql(now: Timestamp) = + updateRow( + table, + and(oid.is(v.oid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + street.setTo(v.street), + zip.setTo(v.zip), + city.setTo(v.city), + country.setTo(v.country), + notes.setTo(v.notes), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- Timestamp.current[ConnectionIO] + n <- sql(now).update.run + } yield n } def 
existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] = diff --git a/modules/store/src/main/scala/docspell/store/records/RPerson.scala b/modules/store/src/main/scala/docspell/store/records/RPerson.scala index eb9a9872..0c2bdcd9 100644 --- a/modules/store/src/main/scala/docspell/store/records/RPerson.scala +++ b/modules/store/src/main/scala/docspell/store/records/RPerson.scala @@ -20,7 +20,8 @@ case class RPerson( country: String, notes: Option[String], concerning: Boolean, - created: Timestamp + created: Timestamp, + updated: Timestamp ) {} object RPerson { @@ -40,7 +41,20 @@ object RPerson { val notes = Column("notes") val concerning = Column("concerning") val created = Column("created") - val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created) + val updated = Column("updated") + val all = List( + pid, + cid, + name, + street, + zip, + city, + country, + notes, + concerning, + created, + updated + ) } import Columns._ @@ -49,27 +63,32 @@ object RPerson { val sql = insertRow( table, all, - fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}" + fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}" ) sql.update.run } def update(v: RPerson): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(pid.is(v.pid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name), - street.setTo(v.street), - zip.setTo(v.zip), - city.setTo(v.city), - country.setTo(v.country), - concerning.setTo(v.concerning), - notes.setTo(v.notes) + def sql(now: Timestamp) = + updateRow( + table, + and(pid.is(v.pid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + street.setTo(v.street), + zip.setTo(v.zip), + city.setTo(v.city), + country.setTo(v.country), + concerning.setTo(v.concerning), + notes.setTo(v.notes), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- 
Timestamp.current[ConnectionIO] + n <- sql(now).update.run + } yield n } def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] = From 3473cbb773b6ee11f92e523f6920b684ac82912b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 23:25:57 +0200 Subject: [PATCH 7/7] Use collective data with NER annotation --- .../main/scala/docspell/common/Duration.scala | 9 + .../src/main/scala/docspell/common/File.scala | 10 ++ .../joex/src/main/resources/reference.conf | 23 +++ .../src/main/scala/docspell/joex/Config.scala | 20 ++- .../scala/docspell/joex/JoexAppImpl.scala | 8 +- .../docspell/joex/analysis/NerFile.scala | 99 +++++++++++ .../docspell/joex/analysis/RegexNerFile.scala | 164 ++++++++++++++++++ .../docspell/joex/process/ItemHandler.scala | 13 +- .../docspell/joex/process/ProcessItem.scala | 19 +- .../docspell/joex/process/ReProcessItem.scala | 16 +- .../docspell/joex/process/TextAnalysis.scala | 62 +------ nix/module-joex.nix | 46 +++++ 12 files changed, 413 insertions(+), 76 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index f154a292..1c290c95 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -20,6 +20,12 @@ case class Duration(nanos: Long) { def hours: Long = minutes / 60 + def >(other: Duration): Boolean = + nanos > other.nanos + + def <(other: Duration): Boolean = + nanos < other.nanos + def toScala: FiniteDuration = FiniteDuration(nanos, TimeUnit.NANOSECONDS) @@ -62,6 +68,9 @@ object Duration { def nanos(n: Long): Duration = Duration(n) + def between(start: Timestamp, end: Timestamp): Duration = + apply(JDur.between(start.value, end.value)) + def stopTime[F[_]: Sync]: 
F[F[Duration]] = for { now <- Timestamp.current[F] diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 2d5cfb8a..572291c5 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -12,6 +12,10 @@ import cats.effect._ import cats.implicits._ import fs2.Stream +import docspell.common.syntax.all._ + +import io.circe.Decoder + object File { def mkDir[F[_]: Sync](dir: Path): F[Path] = @@ -91,4 +95,10 @@ object File { def writeString[F[_]: Sync](file: Path, content: String): F[Path] = Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) + + def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit + d: Decoder[A] + ): F[A] = + readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow + } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index bd0de234..115d2893 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -248,6 +248,29 @@ docspell.joex { # should suffice. Default is 10000, which are about 2-3 pages # (just a rough guess, of course). max-length = 10000 + + # A working directory for the analyser to store temporary/working + # files. + working-dir = ${java.io.tmpdir}"/docspell-analysis" + + regex-ner { + # Whether to enable custom NER annotation. This uses the address + # book of a collective as input for NER tagging (to automatically + # find correspondent and concerned entities). If the address book + # is large, this can be quite memory intensive and also makes text + # analysis slower. But it greatly improves accuracy. If this is + # false, NER tagging uses only statistical models (that also work + # quite well). + # + # This setting might be moved to the collective settings in the + # future. 
+ enabled = true + + # The NER annotation uses a file of patterns that is derived from + a collective's address book. This is how long this + file will be kept until a check for a state change is done. + file-cache-time = "1 minute" + } } # Configuration for converting files into PDFs. diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 3625ffb1..cb6bb9f3 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -1,11 +1,14 @@ package docspell.joex +import java.nio.file.Path + import docspell.analysis.TextAnalysisConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig import docspell.extract.ExtractConfig import docspell.ftssolr.SolrConfig +import docspell.joex.analysis.RegexNerFile import docspell.joex.hk.HouseKeepingConfig import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig} import docspell.store.JdbcConfig @@ -20,7 +23,7 @@ case class Config( userTasks: Config.UserTasks, houseKeeping: HouseKeepingConfig, extraction: ExtractConfig, - textAnalysis: TextAnalysisConfig, + textAnalysis: Config.TextAnalysis, convert: ConvertConfig, sendMail: MailSendConfig, files: Files, @@ -50,4 +53,19 @@ object Config { } case class Processing(maxDueDateYears: Int) + + case class TextAnalysis( + maxLength: Int, + workingDir: Path, + regexNer: RegexNer + ) { + + def textAnalysisConfig: TextAnalysisConfig = + TextAnalysisConfig(maxLength) + + def regexNerFileConfig: RegexNerFile.Config = + RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) + } + + case class RegexNer(enabled: Boolean, fileCacheTime: Duration) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index dcea79df..2fa94c25 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -11,6 +11,7 @@ import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftssolr.SolrFtsClient +import docspell.joex.analysis.RegexNerFile import docspell.joex.fts.{MigrationTask, ReIndexTask} import docspell.joex.hk._ import docspell.joex.notify._ @@ -89,7 +90,8 @@ object JoexAppImpl { upload <- OUpload(store, queue, cfg.files, joex) fts <- createFtsClient(cfg)(httpClient) itemOps <- OItem(store, fts, queue, joex) - analyser <- TextAnalyser.create[F](cfg.textAnalysis) + analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig) + regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) sch <- SchedulerBuilder(cfg.scheduler, blocker, store) @@ -97,14 +99,14 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler.newItem[F](cfg, itemOps, fts, analyser), + ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer), ItemHandler.onCancel[F] ) ) .withTask( JobTask.json( ReProcessItemArgs.taskName, - ReProcessItem[F](cfg, fts, analyser), + ReProcessItem[F](cfg, fts, analyser, regexNer), ReProcessItem.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala new file mode 100644 index 00000000..f7abe029 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala @@ -0,0 +1,99 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.split.TextSplitter +import docspell.common._ +import docspell.store.queries.QCollective + +import io.circe.generic.semiauto._ +import io.circe.{Decoder, Encoder} + +case class NerFile(collective: Ident, 
updated: Timestamp, creation: Timestamp) { + def nerFilePath(directory: Path): Path = + NerFile.nerFilePath(directory, collective) + + def jsonFilePath(directory: Path) = + NerFile.jsonFilePath(directory, collective) +} + +object NerFile { + implicit val jsonDecoder: Decoder[NerFile] = + deriveDecoder[NerFile] + + implicit val jsonEncoder: Encoder[NerFile] = + deriveEncoder[NerFile] + + private def nerFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.txt") + + private def jsonFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.json") + + def find[F[_]: Sync: ContextShift]( + collective: Ident, + directory: Path, + blocker: Blocker + ): F[Option[NerFile]] = { + val file = jsonFilePath(directory, collective) + File.existsNonEmpty[F](file).flatMap { + case true => + File + .readJson[F, NerFile](file, blocker) + .map(_.some) + case false => + (None: Option[NerFile]).pure[F] + } + } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter + .split(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .map(w => s"(?i)${w}") + val tokens = + TextSplitter + .splitToken(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .take(3) + .map(w => s"(?i)${w}") + + tokens.map(t => Pattern(t, 
weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala new file mode 100644 index 00000000..570fc659 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala @@ -0,0 +1,164 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.effect.concurrent.Semaphore +import cats.implicits._ + +import docspell.common._ +import docspell.common.syntax.all._ +import docspell.store.Store +import docspell.store.queries.QCollective +import docspell.store.records.REquipment +import docspell.store.records.ROrganization +import docspell.store.records.RPerson + +import io.circe.syntax._ +import org.log4s.getLogger + +/** Maintains a custom regex-ner file per collective for stanford's + * regexner annotator. + */ +trait RegexNerFile[F[_]] { + + def makeFile(collective: Ident): F[Option[Path]] + +} + +object RegexNerFile { + private[this] val logger = getLogger + + case class Config(enabled: Boolean, directory: Path, minTime: Duration) + + def apply[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F] + ): Resource[F, RegexNerFile[F]] = + for { + dir <- File.withTempDir[F](cfg.directory, "regexner-") + writer <- Resource.liftF(Semaphore(1)) + } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer) + + final private class Impl[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F], + writer: Semaphore[F] //TODO allow parallelism per collective + ) extends RegexNerFile[F] { + + def makeFile(collective: Ident): F[Option[Path]] = + if (cfg.enabled) doMakeFile(collective) + else (None: Option[Path]).pure[F] + + def doMakeFile(collective: Ident): F[Option[Path]] = + for { + now <- Timestamp.current[F] + existing <- NerFile.find[F](collective, cfg.directory, blocker) + result 
<- existing match { + case Some(nf) => + val dur = Duration.between(nf.creation, now) + if (dur > cfg.minTime) + logger.fdebug( + s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state." + ) *> updateFile( + collective, + now, + Some(nf) + ) + else nf.nerFilePath(cfg.directory).some.pure[F] + case None => + updateFile(collective, now, None) + } + } yield result + + private def updateFile( + collective: Ident, + now: Timestamp, + current: Option[NerFile] + ): F[Option[Path]] = + for { + lastUpdate <- store.transact(Sql.latestUpdate(collective)) + result <- lastUpdate match { + case None => + (None: Option[Path]).pure[F] + case Some(lup) => + current match { + case Some(cur) => + val nerf = + if (cur.updated == lup) + logger.fdebug(s"No state change detected.") *> updateTimestamp( + cur, + now + ) *> cur.pure[F] + else + logger.fdebug( + s"There have been state changes for collective '${collective.id}'. Reload NER file." + ) *> createFile(lup, collective, now) + nerf.map(_.nerFilePath(cfg.directory).some) + case None => + createFile(lup, collective, now) + .map(_.nerFilePath(cfg.directory).some) + } + } + } yield result + + private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] = + writer.withPermit(for { + file <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- File.mkDir(file.getParent) + _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2) + } yield ()) + + private def createFile( + lastUpdate: Timestamp, + collective: Ident, + now: Timestamp + ): F[NerFile] = { + def update(nf: NerFile, text: String): F[Unit] = + writer.withPermit(for { + jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'") + _ <- File.mkDir(jsonFile.getParent) + _ <- File.writeString(nf.nerFilePath(cfg.directory), text) + _ <- File.writeString(jsonFile, nf.asJson.spaces2) + } yield ()) + + for { + _ <- logger.finfo(s"Generating custom NER file for collective 
'${collective.id}'") + names <- store.transact(QCollective.allNames(collective)) + nerFile = NerFile(collective, lastUpdate, now) + _ <- update(nerFile, NerFile.mkNerConfig(names)) + } yield nerFile + } + } + + object Sql { + import doobie._ + import doobie.implicits._ + import docspell.store.impl.Implicits._ + import docspell.store.impl.Column + + def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = { + def max(col: Column, table: Fragment, cidCol: Column): Fragment = + selectSimple(col.max ++ fr"as t", table, cidCol.is(collective)) + + val sql = + List( + max( + ROrganization.Columns.updated, + ROrganization.table, + ROrganization.Columns.cid + ), + max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid), + max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid) + ) + .reduce(_ ++ fr"UNION ALL" ++ _) + + selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty) + .query[Timestamp] + .option + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 240e7f54..acbf810b 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -10,6 +10,7 @@ import docspell.backend.ops.OItem import docspell.common.{ItemState, ProcessItemArgs} import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task import docspell.store.queries.QItem import docspell.store.records.RItem @@ -31,11 +32,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] ): Task[F, Args, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) - .flatMap(safeProcess[F](cfg, itemOps, fts, analyser)) + .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, 
regexNer)) .map(_ => ()) def itemStateTask[F[_]: Sync, A]( @@ -54,11 +56,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({ + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({ case Right(d) => Task.pure(d) case Left(ex) => @@ -68,7 +71,7 @@ object ItemHandler { .andThen(_ => Sync[F].raiseError(ex)) }) case false => - ProcessItem[F](cfg, itemOps, fts, analyser)(data) + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data) .flatMap(itemStateTask(ItemState.Created)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index cd76e095..7b8b6431 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -7,6 +7,7 @@ import docspell.backend.ops.OItem import docspell.common.ProcessItemArgs import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task object ProcessItem { @@ -15,11 +16,12 @@ object ProcessItem { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ExtractArchive(item) .flatMap(Task.setProgress(20)) - .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80))) + .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80))) .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) @@ -27,15 +29,17 @@ object ProcessItem { def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: 
FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item) + processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item) def analysisOnly[F[_]: Sync]( cfg: Config, - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](analyser)(item) + TextAnalysis[F](analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) @@ -44,12 +48,13 @@ object ProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], progress: (Int, Int, Int) )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(progress._2)) - .flatMap(analysisOnly[F](cfg, analyser)) + .flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(Task.setProgress(progress._3)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 53282539..bf6d2467 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser import docspell.common._ import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.RAttachment @@ -21,10 +22,11 @@ object ReProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: 
RegexNerFile[F] ): Task[F, Args, Unit] = loadItem[F] - .flatMap(safeProcess[F](cfg, fts, analyser)) + .flatMap(safeProcess[F](cfg, fts, analyser, regexNer)) .map(_ => ()) def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] = @@ -73,6 +75,7 @@ object ReProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], data: ItemData ): Task[F, Args, ItemData] = { @@ -94,7 +97,7 @@ object ReProcessItem { getLanguage[F].flatMap { lang => ProcessItem - .processAttachments[F](cfg, fts, analyser)(data) + .processAttachments[F](cfg, fts, analyser, regexNer)(data) .contramap[Args](convertArgs(lang)) } } @@ -113,11 +116,12 @@ object ReProcessItem { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - processFiles[F](cfg, fts, analyser, data).attempt + processFiles[F](cfg, fts, analyser, regexNer, data).attempt .flatMap({ case Right(d) => Task.pure(d) @@ -127,7 +131,7 @@ object ReProcessItem { ).andThen(_ => Sync[F].raiseError(ex)) }) case false => - processFiles[F](cfg, fts, analyser, data) + processFiles[F](cfg, fts, analyser, regexNer, data) } private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 9ee3850c..abbb6870 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,24 +1,22 @@ package docspell.joex.process -import java.nio.file.Paths - import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings -import docspell.analysis.split.TextSplitter import docspell.common._ +import 
docspell.joex.analysis.RegexNerFile import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task -import docspell.store.queries.QCollective import docspell.store.records.RAttachmentMeta object TextAnalysis { def apply[F[_]: Sync]( - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { @@ -27,7 +25,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx, analyser) + annotateAttachment[F](ctx, analyser, nerFile) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -41,63 +39,19 @@ object TextAnalysis { def annotateAttachment[F[_]: Sync]( ctx: Context[F, ProcessItemArgs], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { val settings = StanfordSettings(ctx.args.meta.language, false, None) for { - names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) - temp <- File.mkTempFile(Paths.get("."), "textanalysis") - _ <- File.writeString(temp, mkNerConfig(names)) - sett = settings.copy(regexNer = Some(temp)) + customNer <- nerFile.makeFile(ctx.args.meta.collective) + sett = settings.copy(regexNer = customNer) labels <- analyser.annotate( ctx.logger, sett, ctx.args.meta.collective, rm.content.getOrElse("") ) - _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } - - def mkNerConfig(names: QCollective.Names): String = { - val orgs = names.org - .flatMap(Pattern(3)) - .distinct - .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) - - val pers = - names.pers - .flatMap(Pattern(2)) - .distinct - .map(_.toRow("PERSON", "LOCATION,MISC")) - - val equips = - names.equip - .flatMap(Pattern(1)) - .distinct - .map(_.toRow("MISC", "LOCATION")) - - (orgs ++ 
pers ++ equips).mkString("\n") - } - - case class Pattern(value: String, weight: Int) { - def toRow(tag: String, overrideTags: String): String = - s"$value\t$tag\t$overrideTags\t$weight" - } - - object Pattern { - def apply(weight: Int)(str: String): Vector[Pattern] = { - val delims = " \t\n\r".toSet - val words = - TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") - val tokens = - TextSplitter - .splitToken(str, delims) - .toVector - .take(3) - .map(w => s"(?i)${w.toLower.value}") - - tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) - } - } } diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 6e16581f..d550c2d3 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -91,6 +91,11 @@ let }; text-analysis = { max-length = 10000; + regex-ner = { + enabled = true; + file-cache-time = "1 minute"; + }; + working-dir = "/tmp/docspell-analysis"; }; processing = { max-due-date-years = 10; @@ -689,7 +694,48 @@ in { (a rough guess). ''; }; + working-dir = mkOption { + type = types.str; + default = defaults.text-analysis.working-dir; + description = '' + A working directory for the analyser to store temporary/working + files. + ''; + }; + regex-ner = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.text-analysis.regex-ner.enabled; + description = '' + Whether to enable custom NER annotation. This uses the address + book of a collective as input for NER tagging (to automatically + find correspondent and concerned entities). If the address book + is large, this can be quite memory intensive and also makes text + analysis slower. But it greatly improves accuracy. If this is + false, NER tagging uses only statistical models (that also work + quite well). + + This setting might be moved to the collective settings in the + future. 
+ ''; + }; + file-cache-time = mkOption { + type = types.str; + default = defaults.text-analysis.ner-file-cache-time; + description = '' + The NER annotation uses a file of patterns that is derived from + a collective's address book. This is is the time how long this + file will be kept until a check for a state change is done. + ''; + }; + }; + }); + default = defaults.text-analysis.regex-ner; + description = ""; + }; }; }); default = defaults.text-analysis;