Merge pull request #238 from eikek/stanford-nlp4

Stanford nlp4
mergify[bot] 2020-08-25 19:02:43 +00:00 committed by GitHub
commit 31544240fb
38 changed files with 1040 additions and 219 deletions

View File

@@ -1,3 +0,0 @@
-updates.ignore = [
-  { groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" }
-]

View File

@@ -10,6 +10,7 @@ cache:
 - $HOME/.ivy2/cache
 - $HOME/.sbt/boot
 - $HOME/.coursier/cache
+- $HOME/.cache/coursier
 - sysconfcpus
 install:

View File

@@ -1,6 +1,6 @@
 <img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>
-[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell)
+[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell)
 [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org)
 [![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
 [![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell)

View File

@@ -5,12 +5,19 @@ import cats.implicits._
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._

 trait TextAnalyser[F[_]] {
-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]
 }

 object TextAnalyser {
@@ -22,43 +29,47 @@ object TextAnalyser {
   }

   def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
-      def annotate(
-          logger: Logger[F],
-          lang: Language,
-          text: String
-      ): F[TextAnalyser.Result] =
-        for {
-          input <- textLimit(logger, text)
-          tags0 <- stanfordNer(lang, input)
-          tags1 <- contactNer(input)
-          dates <- dateNer(lang, input)
-          list  = tags0 ++ tags1
-          spans = NerLabelSpan.build(list)
-        } yield Result(spans ++ list, dates)
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
+          def annotate(
+              logger: Logger[F],
+              settings: StanfordSettings,
+              cacheKey: Ident,
+              text: String
+          ): F[TextAnalyser.Result] =
+            for {
+              input <- textLimit(logger, text)
+              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags1 <- contactNer(input)
+              dates <- dateNer(settings.lang, input)
+              list  = tags0 ++ tags1
+              spans = NerLabelSpan.build(list)
+            } yield Result(spans ++ list, dates)

-      private def textLimit(logger: Logger[F], text: String): F[String] =
-        if (text.length <= cfg.maxLength) text.pure[F]
-        else
-          logger.info(
-            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
-              s" Analysing only first ${cfg.maxLength} characters."
-          ) *> text.take(cfg.maxLength).pure[F]
+          private def textLimit(logger: Logger[F], text: String): F[String] =
+            if (text.length <= cfg.maxLength) text.pure[F]
+            else
+              logger.info(
+                s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+                  s" Analysing only first ${cfg.maxLength} characters."
+              ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          StanfordNerClassifier.nerAnnotate(lang)(text)
-        }
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)

-      private def contactNer(text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          Contact.annotate(text)
-        }
+          private def contactNer(text: String): F[Vector[NerLabel]] =
+            Sync[F].delay {
+              Contact.annotate(text)
+            }

-      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
-        Sync[F].delay {
-          DateFind.findDates(text, lang).toVector
-        }
-    })
+          private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+            Sync[F].delay {
+              DateFind.findDates(text, lang).toVector
+            }
+        }
+      )
 }

View File

@@ -54,6 +54,7 @@ object DateFind {
     val p = lang match {
       case Language.English => p2.or(p0).or(p1)
       case Language.German  => p1.or(p0).or(p2)
+      case Language.French  => p1.or(p0).or(p2)
     }
     p.read(parts).toOption
   }

View File

@@ -0,0 +1,25 @@
package docspell.analysis.nlp

import docspell.common.{NerLabel, NerTag}

import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}

object LabelConverter {

  private def tagFromLabel[A <: CoreAnnotation[String]](
      label: CoreLabel,
      annot: Class[A]
  ): Option[NerTag] = {
    val tag = label.get(annot)
    Option(tag).flatMap(s => NerTag.fromString(s).toOption)
  }

  def findTag(label: CoreLabel): Option[NerTag] =
    tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
      .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))

  def toNerLabel(label: CoreLabel): Option[NerLabel] =
    findTag(label).map(t =>
      NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
    )
}
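
As an illustration (not part of the commit), a minimal sketch of what this converter does. The `"PERSON"` tag string is an assumption about what `NerTag.fromString` accepts; the word and offsets are invented:

```scala
import docspell.analysis.nlp.LabelConverter
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}

// Build a CoreLabel by hand and convert it. Assumes NerTag.fromString
// recognizes Stanford's "PERSON" tag string.
val label = new CoreLabel()
label.setWord("Jeter")
label.setBeginPosition(6)
label.setEndPosition(11)
label.set(classOf[CoreAnnotations.NamedEntityTagAnnotation], "PERSON")

// Some(NerLabel("Jeter", NerTag.Person, 6, 11)) if the tag parses
println(LabelConverter.toNerLabel(label))
```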

View File

@@ -0,0 +1,90 @@
package docspell.analysis.nlp

import cats.Applicative
import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._

import docspell.common._

import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger

/** Creating the StanfordCoreNLP pipeline is quite expensive as it
  * involves IO and initializing large objects.
  *
  * Therefore the instances are cached; sharing them is safe because
  * they are thread-safe.
  *
  * **This is an internal API**
  */
trait PipelineCache[F[_]] {

  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]

}

object PipelineCache {
  private[this] val logger = getLogger

  def none[F[_]: Applicative]: PipelineCache[F] =
    new PipelineCache[F] {
      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
        makeClassifier(settings).pure[F]
    }

  def apply[F[_]: Sync](): F[PipelineCache[F]] =
    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))

  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
      extends PipelineCache[F] {

    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
      for {
        id  <- makeSettingsId(settings)
        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
      } yield nlp

    private def getOrCreate(
        key: String,
        id: String,
        cache: Map[String, Entry],
        settings: StanfordSettings
    ): (Map[String, Entry], StanfordCoreNLP) =
      cache.get(key) match {
        case Some(entry) =>
          if (entry.id == id) (cache, entry.value)
          else {
            logger.info(
              s"StanfordNLP settings changed for key $key. Creating new classifier"
            )
            val nlp = makeClassifier(settings)
            val e   = Entry(id, nlp)
            (cache.updated(key, e), nlp)
          }
        case None =>
          val nlp = makeClassifier(settings)
          val e   = Entry(id, nlp)
          (cache.updated(key, e), nlp)
      }

    private def makeSettingsId(settings: StanfordSettings): F[String] = {
      val base = settings.copy(regexNer = None).toString
      val size: F[Long] =
        settings.regexNer match {
          case Some(p) =>
            File.size(p)
          case None =>
            0L.pure[F]
        }
      size.map(len => s"$base-$len")
    }
  }

  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
    new StanfordCoreNLP(Properties.forSettings(settings))
  }

  private case class Entry(id: String, value: StanfordCoreNLP)
}
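
To illustrate the contract of this cache, a minimal sketch (not from the commit) using cats-effect `IO`, assuming the Stanford model jars are on the classpath: a second `obtain` with unchanged settings returns the cached pipeline, while changed settings under the same key would rebuild it.

```scala
import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

val settings = StanfordSettings(Language.English, highRecall = false, regexNer = None)

val demo: IO[Boolean] =
  for {
    cache <- PipelineCache[IO]()
    nlp1  <- cache.obtain("collective-1", settings) // creates the pipeline
    nlp2  <- cache.obtain("collective-1", settings) // served from the cache
  } yield nlp1 eq nlp2 // true: the same StanfordCoreNLP instance
```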

View File

@@ -0,0 +1,111 @@
package docspell.analysis.nlp

import java.util.{Properties => JProps}

import docspell.analysis.nlp.Properties.Implicits._
import docspell.common._

object Properties {

  def apply(ps: (String, String)*): JProps = {
    val p = new JProps()
    for ((k, v) <- ps)
      p.setProperty(k, v)
    p
  }

  def forSettings(settings: StanfordSettings): JProps = {
    val regexNerFile = settings.regexNer
      .map(p => p.normalize().toAbsolutePath().toString())
    settings.lang match {
      case Language.German =>
        Properties.nerGerman(regexNerFile, settings.highRecall)
      case Language.English =>
        Properties.nerEnglish(regexNerFile)
      case Language.French =>
        Properties.nerFrench(regexNerFile, settings.highRecall)
    }
  }

  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "de",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
      "ner.useSUTime" -> "false", //only english, unused in docspell
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
      "tokenize.language" -> "en",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "en",
      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile)

  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "fr",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
      "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  object Implicits {
    implicit final class JPropsOps(val p: JProps) extends AnyVal {

      def set(name: String, value: Option[String]): JProps =
        value match {
          case Some(v) =>
            p.setProperty(name, v)
            p
          case None =>
            p
        }

      def change(name: String, f: String => String): JProps =
        Option(p.getProperty(name)) match {
          case Some(current) =>
            p.setProperty(name, f(current))
            p
          case None =>
            p
        }

      def withRegexNer(mappingFile: Option[String]): JProps =
        set("regexner.mapping", mappingFile)
          .change(
            "annotators",
            v => if (mappingFile.isDefined) v + ",regexner" else v
          )

      def withHighRecall(flag: Boolean): JProps = {
        if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
        else p.setProperty("ner.combinationMode", "NORMAL")
        p
      }
    }
  }
}
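
A small sketch of how these builders compose (not from the commit): `forSettings` picks the per-language properties and the implicit ops adjust them.

```scala
import docspell.analysis.nlp.{Properties, StanfordSettings}
import docspell.common.Language

// German pipeline with high-recall combination mode and no regexner file
val props = Properties.forSettings(
  StanfordSettings(Language.German, highRecall = true, regexNer = None)
)
assert(props.getProperty("ner.combinationMode") == "HIGH_RECALL")
// With a mapping file, withRegexNer would set "regexner.mapping" and
// append ",regexner" to the "annotators" list.
```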

View File

@@ -1,65 +1,39 @@
 package docspell.analysis.nlp

-import java.net.URL
-import java.util.zip.GZIPInputStream
-
 import scala.jdk.CollectionConverters._
-import scala.util.Using
+
+import cats.Applicative
+import cats.implicits._

 import docspell.common._

-import edu.stanford.nlp.ie.AbstractSequenceClassifier
-import edu.stanford.nlp.ie.crf.CRFClassifier
-import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
-import org.log4s.getLogger
+import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}

 object StanfordNerClassifier {
-  private[this] val logger = getLogger

-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, it involves loading large model
+    * files. The classifiers are thread-safe and so they are cached.
+    * The `cacheKey` defines the "slot" where classifiers are stored
+    * and retrieved. If for a given `cacheKey` the `settings` change,
+    * a new classifier must be created. It will then replace the
+    * previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))

-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-    }
-    nerClassifier
-      .classify(text)
-      .asScala
-      .flatMap(a => a.asScala)
-      .collect(Function.unlift { label =>
-        val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
-        NerTag
-          .fromString(Option(tag).getOrElse(""))
-          .toOption
-          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
-      })
-      .toVector
-  }
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
+    val doc = new CoreDocument(text)
+    nerClassifier.annotate(doc)
+    doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
+  }

-  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    val ner = classifierResource(lang)
-    Using(new GZIPInputStream(ner.openStream())) { in =>
-      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
-    }.fold(throw _, identity)
-  }
-
-  private def classifierResource(lang: Language): URL = {
-    def check(u: URL): URL =
-      if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
-      else u
-    check(lang match {
-      case Language.German =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
-        )
-      case Language.English =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
-        )
-    })
-  }
 }
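
A usage sketch of the new entry point (not from the commit; assumes the English models are on the classpath). `PipelineCache.none` builds a fresh pipeline instead of caching, which keeps the example self-contained:

```scala
import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordNerClassifier, StanfordSettings}
import docspell.common.{Language, NerLabel}

val labels: IO[Vector[NerLabel]] =
  StanfordNerClassifier.nerAnnotate[IO](
    "some-cache-key",
    PipelineCache.none[IO]
  )(
    StanfordSettings(Language.English, highRecall = false, regexNer = None),
    "Derek Jeter lives in Treesville."
  )
```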

View File

@@ -0,0 +1,22 @@
package docspell.analysis.nlp

import java.nio.file.Path

import docspell.common._

/** Settings for configuring the stanford NER pipeline.
  *
  * The language is mandatory; only the provided ones are supported.
  * The `highRecall` option only applies to non-English languages:
  * there the English classifier runs as a second classifier, and if
  * `highRecall` is true, it is also used to tag tokens the first
  * classifier left untagged. This may produce many false positives,
  * but since English terms are ubiquitous in other languages, whether
  * it is useful depends on the use case.
  *
  * The `regexNer` option allows specifying a text file as described
  * here: https://nlp.stanford.edu/software/regexner.html. It is used
  * as a last step to tag remaining untagged tokens using the given
  * list of regexps.
  */
case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])

View File

@@ -3,31 +3,44 @@ package docspell.analysis.nlp
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import edu.stanford.nlp.pipeline.StanfordCoreNLP

 object TextAnalyserSuite extends SimpleTestSuite {
+  lazy val germanClassifier =
+    new StanfordCoreNLP(Properties.nerGerman(None, false))
+  lazy val englishClassifier =
+    new StanfordCoreNLP(Properties.nerEnglish(None))

   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
+      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
-      NerLabel("Treesville", NerTag.Person, 27, 37),
+      NerLabel("Elm", NerTag.Misc, 17, 20),
+      NerLabel("Ave.", NerTag.Misc, 21, 25),
+      NerLabel("Treesville", NerTag.Misc, 27, 37),
       NerLabel("Derek", NerTag.Person, 68, 73),
       NerLabel("Jeter", NerTag.Person, 74, 79),
-      NerLabel("Treesville", NerTag.Location, 95, 105),
+      NerLabel("Elm", NerTag.Misc, 85, 88),
+      NerLabel("Ave.", NerTag.Misc, 89, 93),
+      NerLabel("Treesville", NerTag.Person, 95, 105),
+      NerLabel("Leaf", NerTag.Organization, 144, 148),
+      NerLabel("Chief", NerTag.Organization, 150, 155),
+      NerLabel("of", NerTag.Organization, 156, 158),
       NerLabel("Syrup", NerTag.Organization, 159, 164),
       NerLabel("Production", NerTag.Organization, 165, 175),
       NerLabel("Old", NerTag.Organization, 176, 179),
       NerLabel("Sticky", NerTag.Organization, 180, 186),
       NerLabel("Pancake", NerTag.Organization, 187, 194),
       NerLabel("Company", NerTag.Organization, 195, 202),
-      NerLabel("Maple", NerTag.Location, 207, 212),
-      NerLabel("Lane", NerTag.Location, 213, 217),
-      NerLabel("Forest", NerTag.Location, 219, 225),
+      NerLabel("Maple", NerTag.Organization, 207, 212),
+      NerLabel("Lane", NerTag.Organization, 213, 217),
+      NerLabel("Forest", NerTag.Organization, 219, 225),
       NerLabel("Hemptown", NerTag.Location, 239, 247),
-      NerLabel("Little", NerTag.Organization, 347, 353),
-      NerLabel("League", NerTag.Organization, 354, 360),
+      NerLabel("Leaf", NerTag.Person, 276, 280),
+      NerLabel("Little", NerTag.Misc, 347, 353),
+      NerLabel("League", NerTag.Misc, 354, 360),
       NerLabel("Derek", NerTag.Person, 1117, 1122),
       NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )
@@ -36,11 +49,11 @@ object TextAnalyserSuite extends SimpleTestSuite {
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
+      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
-      NerLabel("Lilienweg", NerTag.Location, 16, 25),
+      NerLabel("Lilienweg", NerTag.Person, 16, 25),
       NerLabel("Max", NerTag.Person, 77, 80),
       NerLabel("Mustermann", NerTag.Person, 81, 91),
       NerLabel("Lilienweg", NerTag.Location, 93, 102),

View File

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {

   def hours: Long = minutes / 60

+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@@ -62,6 +68,9 @@ object Duration {
   def nanos(n: Long): Duration =
     Duration(n)

+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]
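
A tiny sketch of the new helpers (illustrative values, not from the commit):

```scala
import docspell.common.Duration

val shorter = Duration.nanos(1000L)
val longer  = Duration.nanos(5000L)
assert(longer > shorter && shorter < longer)

// Duration.between measures the span between two timestamps, e.g.
// Duration.between(createdAt, Timestamp.now) in the cache-expiry check below.
```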

View File

@@ -1,6 +1,7 @@
 package docspell.common

 import java.io.IOException
+import java.nio.charset.StandardCharsets
 import java.nio.file._
 import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.atomic.AtomicInteger
@@ -11,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
+
+import io.circe.Decoder
+
 object File {

   def mkDir[F[_]: Sync](dir: Path): F[Path] =
@@ -55,6 +60,9 @@ object File {
   def exists[F[_]: Sync](file: Path): F[Boolean] =
     Sync[F].delay(Files.exists(file))

+  def size[F[_]: Sync](file: Path): F[Long] =
+    Sync[F].delay(Files.size(file))
+
   def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
     Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
@@ -84,4 +92,13 @@ object File {

   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
     readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
+
+  def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
+    Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
 }
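
A sketch of the new helpers round-tripping a small JSON file (not from the commit; cats-effect 2 style with a `Blocker`, the path is illustrative):

```scala
import java.nio.file.Paths

import cats.effect.{Blocker, ContextShift, IO}
import docspell.common.File

implicit val cs: ContextShift[IO] =
  IO.contextShift(scala.concurrent.ExecutionContext.global)

val name: IO[String] =
  Blocker[IO].use { blocker =>
    val file = Paths.get("/tmp/demo.json") // illustrative path
    for {
      _ <- File.writeString[IO](file, """{"name":"demo"}""")
      m <- File.readJson[IO, Map[String, String]](file, blocker)
    } yield m("name")
  }
```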

View File

@@ -27,7 +27,12 @@ object Language {
     val iso3 = "eng"
   }

-  val all: List[Language] = List(German, English)
+  case object French extends Language {
+    val iso2 = "fr"
+    val iso3 = "fra"
+  }
+
+  val all: List[Language] = List(German, English, French)

   def fromString(str: String): Either[String, Language] = {
     val lang = str.toLowerCase

View File

@@ -23,6 +23,7 @@ object Field {
   val content    = Field("content")
   val content_de = Field("content_de")
   val content_en = Field("content_en")
+  val content_fr = Field("content_fr")
   val itemName   = Field("itemName")
   val itemNotes  = Field("itemNotes")
   val folderId   = Field("folder")
@@ -33,6 +34,8 @@ object Field {
       Field.content_de
     case Language.English =>
       Field.content_en
+    case Language.French =>
+      Field.content_fr
   }

   implicit val jsonEncoder: Encoder[Field] =

View File

@@ -39,6 +39,7 @@ object SolrQuery {
       Field.content,
       Field.content_de,
       Field.content_en,
+      Field.content_fr,
       Field.itemName,
       Field.itemNotes,
       Field.attachmentName

View File

@@ -80,6 +80,8 @@ object SolrSetup {
           addTextField(l.some)(Field.content_de)
         case l @ Language.English =>
           addTextField(l.some)(Field.content_en)
+        case l @ Language.French =>
+          addTextField(l.some)(Field.content_fr)
       }

     cmds0 *> cmds1 *> cntLang *> ().pure[F]
@@ -105,6 +107,9 @@ object SolrSetup {
         case Some(Language.English) =>
           run(DeleteField.command(DeleteField(field))).attempt *>
             run(AddField.command(AddField.textEN(field)))
+        case Some(Language.French) =>
+          run(DeleteField.command(DeleteField(field))).attempt *>
+            run(AddField.command(AddField.textFR(field)))
       }
     }
   }
@@ -138,6 +143,9 @@ object SolrSetup {
     def textEN(field: Field): AddField =
       AddField(field, "text_en", true, true, false)

+    def textFR(field: Field): AddField =
+      AddField(field, "text_fr", true, true, false)
   }

   case class DeleteField(name: Field)

View File

@@ -248,6 +248,29 @@ docspell.joex {
       # should suffice. Default is 10000, which are about 2-3 pages
       # (just a rough guess, of course).
       max-length = 10000
+
+      # A working directory for the analyser to store temporary/working
+      # files.
+      working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the address
+        # book of a collective as input for NER tagging (to automatically
+        # find correspondent and concerned entities). If the address book
+        # is large, this can be quite memory intensive and also makes text
+        # analysis slower. But it greatly improves accuracy. If this is
+        # false, NER tagging uses only statistical models (that also work
+        # quite well).
+        #
+        # This setting might be moved to the collective settings in the
+        # future.
+        enabled = true
+
+        # The NER annotation uses a file of patterns that is derived from
+        # a collective's address book. This is the time for how long this
+        # file is kept until a check for a state change is done.
+        file-cache-time = "1 minute"
+      }
     }

     # Configuration for converting files into PDFs.

View File

@@ -1,11 +1,14 @@
 package docspell.joex

+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,
@@ -50,4 +53,19 @@ object Config {
   }

   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }

View File

@@ -6,10 +6,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.concurrent.SignallingRef

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
@@ -80,14 +82,16 @@ object JoexAppImpl {
     for {
       httpClient <- BlazeClientBuilder[F](clientEC).resource
       client = JoexClient(httpClient)
       store   <- Store.create(cfg.jdbc, connectEC, blocker)
       queue   <- JobQueue(store)
       pstore  <- PeriodicTaskStore.create(store)
       nodeOps <- ONode(store)
       joex    <- OJoex(client, store)
       upload  <- OUpload(store, queue, cfg.files, joex)
       fts     <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -95,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )

View File

@@ -0,0 +1,99 @@
package docspell.joex.analysis

import java.nio.file.Path

import cats.effect._
import cats.implicits._

import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.store.queries.QCollective

import io.circe.generic.semiauto._
import io.circe.{Decoder, Encoder}

case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {

  def nerFilePath(directory: Path): Path =
    NerFile.nerFilePath(directory, collective)

  def jsonFilePath(directory: Path) =
    NerFile.jsonFilePath(directory, collective)
}

object NerFile {
  implicit val jsonDecoder: Decoder[NerFile] =
    deriveDecoder[NerFile]

  implicit val jsonEncoder: Encoder[NerFile] =
    deriveEncoder[NerFile]

  private def nerFilePath(directory: Path, collective: Ident): Path =
    directory.resolve(s"${collective.id}.txt")

  private def jsonFilePath(directory: Path, collective: Ident): Path =
    directory.resolve(s"${collective.id}.json")

  def find[F[_]: Sync: ContextShift](
      collective: Ident,
      directory: Path,
      blocker: Blocker
  ): F[Option[NerFile]] = {
    val file = jsonFilePath(directory, collective)
    File.existsNonEmpty[F](file).flatMap {
      case true =>
        File
          .readJson[F, NerFile](file, blocker)
          .map(_.some)
      case false =>
        (None: Option[NerFile]).pure[F]
    }
  }

  def mkNerConfig(names: QCollective.Names): String = {
    val orgs = names.org
      .flatMap(Pattern(3))
      .distinct
      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))

    val pers =
      names.pers
        .flatMap(Pattern(2))
        .distinct
        .map(_.toRow("PERSON", "LOCATION,MISC"))

    val equips =
      names.equip
        .flatMap(Pattern(1))
        .distinct
        .map(_.toRow("MISC", "LOCATION"))

    (orgs ++ pers ++ equips).mkString("\n")
  }

  case class Pattern(value: String, weight: Int) {
    def toRow(tag: String, overrideTags: String): String =
      s"$value\t$tag\t$overrideTags\t$weight"
  }

  object Pattern {
    def apply(weight: Int)(str: String): Vector[Pattern] = {
      val delims = " \t\n\r".toSet
      val words =
        TextSplitter
          .split(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
          .toVector
          .map(w => s"(?i)${w}")
      val tokens =
        TextSplitter
          .splitToken(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
          .toVector
          .take(3)
          .map(w => s"(?i)${w}")

      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
}
}
}
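
To make the generated file format concrete, a sketch with invented names: each row is `pattern<TAB>tag<TAB>tags-to-override<TAB>weight`, which is the format Stanford's regexner annotator expects.

```scala
import docspell.joex.analysis.NerFile
import docspell.store.queries.QCollective

val names = QCollective.Names(
  org = Vector("Acme AG"),
  pers = Vector("Max Mustermann"),
  equip = Vector.empty
)

// Yields case-insensitive rows like:
//   (?i)acme (?i)ag<TAB>ORGANIZATION<TAB>LOCATION,PERSON,MISC<TAB>3
println(NerFile.mkNerConfig(names))
```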

View File

@@ -0,0 +1,164 @@
package docspell.joex.analysis

import java.nio.file.Path

import cats.effect._
import cats.effect.concurrent.Semaphore
import cats.implicits._

import docspell.common._
import docspell.common.syntax.all._
import docspell.store.Store
import docspell.store.queries.QCollective
import docspell.store.records.REquipment
import docspell.store.records.ROrganization
import docspell.store.records.RPerson

import io.circe.syntax._
import org.log4s.getLogger

/** Maintains a custom regex-ner file per collective for stanford's
  * regexner annotator.
  */
trait RegexNerFile[F[_]] {

  def makeFile(collective: Ident): F[Option[Path]]

}

object RegexNerFile {
  private[this] val logger = getLogger

  case class Config(enabled: Boolean, directory: Path, minTime: Duration)

  def apply[F[_]: Concurrent: ContextShift](
      cfg: Config,
      blocker: Blocker,
      store: Store[F]
  ): Resource[F, RegexNerFile[F]] =
    for {
      dir    <- File.withTempDir[F](cfg.directory, "regexner-")
      writer <- Resource.liftF(Semaphore(1))
    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)

  final private class Impl[F[_]: Concurrent: ContextShift](
      cfg: Config,
      blocker: Blocker,
      store: Store[F],
      writer: Semaphore[F] //TODO allow parallelism per collective
  ) extends RegexNerFile[F] {

    def makeFile(collective: Ident): F[Option[Path]] =
      if (cfg.enabled) doMakeFile(collective)
      else (None: Option[Path]).pure[F]

    def doMakeFile(collective: Ident): F[Option[Path]] =
      for {
        now      <- Timestamp.current[F]
        existing <- NerFile.find[F](collective, cfg.directory, blocker)
        result <- existing match {
          case Some(nf) =>
            val dur = Duration.between(nf.creation, now)
            if (dur > cfg.minTime)
              logger.fdebug(
                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
              ) *> updateFile(
                collective,
                now,
                Some(nf)
              )
            else nf.nerFilePath(cfg.directory).some.pure[F]
          case None =>
            updateFile(collective, now, None)
        }
      } yield result

    private def updateFile(
        collective: Ident,
        now: Timestamp,
        current: Option[NerFile]
    ): F[Option[Path]] =
      for {
        lastUpdate <- store.transact(Sql.latestUpdate(collective))
        result <- lastUpdate match {
          case None =>
            (None: Option[Path]).pure[F]
          case Some(lup) =>
            current match {
              case Some(cur) =>
                val nerf =
                  if (cur.updated == lup)
                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
                      cur,
                      now
                    ) *> cur.pure[F]
                  else
                    logger.fdebug(
                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
                    ) *> createFile(lup, collective, now)
                nerf.map(_.nerFilePath(cfg.directory).some)
              case None =>
                createFile(lup, collective, now)
                  .map(_.nerFilePath(cfg.directory).some)
            }
        }
      } yield result

    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
      writer.withPermit(for {
        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
        _    <- File.mkDir(file.getParent)
        _    <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
      } yield ())

    private def createFile(
        lastUpdate: Timestamp,
        collective: Ident,
        now: Timestamp
    ): F[NerFile] = {
      def update(nf: NerFile, text: String): F[Unit] =
        writer.withPermit(for {
          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
          _ <- File.mkDir(jsonFile.getParent)
          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
        } yield ())

      for {
        _     <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
        names <- store.transact(QCollective.allNames(collective))
        nerFile = NerFile(collective, lastUpdate, now)
        _ <- update(nerFile, NerFile.mkNerConfig(names))
      } yield nerFile
    }
  }

  object Sql {
    import doobie._
    import doobie.implicits._
    import docspell.store.impl.Implicits._
    import docspell.store.impl.Column

    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))

      val sql =
        List(
          max(
            ROrganization.Columns.updated,
            ROrganization.table,
            ROrganization.Columns.cid
          ),
          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
        )
          .reduce(_ ++ fr"UNION ALL" ++ _)

      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
        .query[Timestamp]
        .option
    }
  }
}

View File

@@ -5,10 +5,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem
@@ -29,11 +31,13 @@ object ItemHandler {
   def newItem[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())

   def itemStateTask[F[_]: Sync, A](
@@ -51,11 +55,13 @@ object ItemHandler {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>
@@ -65,7 +71,8 @@ object ItemHandler {
               .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
+          .flatMap(itemStateTask(ItemState.Created))
     }

   private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =

View File

@@ -2,10 +2,12 @@ package docspell.joex.process

 import cats.effect._

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task

 object ProcessItem {
@@ -13,25 +15,31 @@ object ProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
      .flatMap(SetGivenData[F](itemOps))
       .flatMap(Task.setProgress(99))

   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

   def analysisOnly[F[_]: Sync](
-      cfg: Config
+      cfg: Config,
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg.textAnalysis)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -39,12 +47,14 @@ object ProcessItem {
   private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }

View File

@@ -4,9 +4,11 @@ import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

+import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment
@@ -19,10 +21,12 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())

   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -70,6 +74,8 @@ object ReProcessItem {
   def processFiles[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -91,7 +97,7 @@ object ReProcessItem {
     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -109,11 +115,13 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        processFiles[F](cfg, fts, data).attempt
+        processFiles[F](cfg, fts, analyser, regexNer, data).attempt
           .flatMap({
           case Right(d) =>
             Task.pure(d)
@@ -123,7 +131,7 @@ object ReProcessItem {
           ).andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        processFiles[F](cfg, fts, data)
+        processFiles[F](cfg, fts, analyser, regexNer, data)
     }

   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@@ -1,47 +1,57 @@
 package docspell.joex.process

-import cats.effect.Sync
+import cats.effect._
 import cats.implicits._

-import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
+import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta

 object TextAnalysis {

   def apply[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      TextAnalyser.create[F](cfg).use { analyser =>
-        for {
-          _ <- ctx.logger.info("Starting text analysis")
-          s <- Duration.stopTime[F]
-          t <-
-            item.metas.toList
-              .traverse(
-                annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
-              )
-          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-          _ <- t.traverse(m =>
-            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-          )
-          e <- s
-          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-          v = t.toVector
-        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
-      }
+      for {
+        _ <- ctx.logger.info("Starting text analysis")
+        s <- Duration.stopTime[F]
+        t <-
+          item.metas.toList
+            .traverse(
+              annotateAttachment[F](ctx, analyser, nerFile)
+            )
+        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
+        _ <- t.traverse(m =>
+          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+        )
+        e <- s
+        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+        v = t.toVector
+      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
     }

   def annotateAttachment[F[_]: Sync](
-      lang: Language,
-      logger: Logger[F],
-      analyser: TextAnalyser[F]
-  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
+      ctx: Context[F, ProcessItemArgs],
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
+  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
+    val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
+      labels <- analyser.annotate(
+        ctx.logger,
+        sett,
+        ctx.args.meta.collective,
+        rm.content.getOrElse("")
+      )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
+  }
 }

View File

@@ -341,6 +341,7 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
+        now,
         now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
@@ -353,6 +354,7 @@ trait Conversions {
     def contacts(oid: Ident) =
       v.contacts.traverse(c => newContact(c, oid.some, None))
     for {
+      now  <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = ROrganization(
         v.id,
@@ -363,7 +365,8 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
   }
@@ -398,6 +401,7 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
+        now,
         now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
@@ -410,6 +414,7 @@ trait Conversions {
     def contacts(pid: Ident) =
       v.contacts.traverse(c => newContact(c, None, pid.some))
     for {
+      now  <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = RPerson(
         v.id,
@@ -421,7 +426,8 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
   }
@@ -536,11 +542,11 @@ trait Conversions {
   def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
     timeId.map({
       case (id, now) =>
-        REquipment(id, cid, e.name, now)
+        REquipment(id, cid, e.name, now, now)
     })

-  def changeEquipment(e: Equipment, cid: Ident): REquipment =
-    REquipment(e.id, cid, e.name, e.created)
+  def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
+    Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now))

   // idref

View File

@@ -39,10 +39,10 @@ object EquipmentRoutes {
       case req @ PUT -> Root =>
         for {
           data <- req.as[Equipment]
-          equip = changeEquipment(data, user.account.collective)
+          equip <- changeEquipment(data, user.account.collective)
           res  <- backend.equipment.update(equip)
           resp <- Ok(basicResult(res, "Equipment updated."))
         } yield resp

       case DELETE -> Root / Ident(id) =>

View File

@@ -0,0 +1,29 @@
-- organization
ALTER TABLE `organization`
ADD COLUMN (`updated` timestamp);
UPDATE `organization` SET `updated` = `created`;
ALTER TABLE `organization`
MODIFY `updated` timestamp NOT NULL;
-- person
ALTER TABLE `person`
MODIFY `created` timestamp;
ALTER TABLE `person`
ADD COLUMN (`updated` timestamp);
UPDATE `person` SET `updated` = `created`;
ALTER TABLE `person`
MODIFY `updated` timestamp NOT NULL;
-- equipment
ALTER TABLE `equipment`
ADD COLUMN (`updated` timestamp);
UPDATE `equipment` SET `updated` = `created`;
ALTER TABLE `equipment`
MODIFY `updated` timestamp NOT NULL;

View File

@@ -0,0 +1,29 @@
-- organization
ALTER TABLE "organization"
ADD COLUMN "updated" timestamp;
UPDATE "organization" SET "updated" = "created";
ALTER TABLE "organization"
ALTER COLUMN "updated" SET NOT NULL;
-- person
ALTER TABLE "person" ALTER COLUMN "created"
TYPE timestamp USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp);
ALTER TABLE "person"
ADD COLUMN "updated" timestamp;
UPDATE "person" SET "updated" = "created";
ALTER TABLE "person"
ALTER COLUMN "updated" SET NOT NULL;
-- equipment
ALTER TABLE "equipment"
ADD COLUMN "updated" timestamp;
UPDATE "equipment" SET "updated" = "created";
ALTER TABLE "equipment"
ALTER COLUMN "updated" SET NOT NULL;

View File

@@ -1,5 +1,6 @@
 package docspell.store.queries

+import cats.data.OptionT
 import fs2.Stream

 import docspell.common.ContactKind
@@ -11,6 +12,20 @@ import doobie._
 import doobie.implicits._

 object QCollective {

+  case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
+  object Names {
+    val empty = Names(Vector.empty, Vector.empty, Vector.empty)
+  }
+
+  def allNames(collective: Ident): ConnectionIO[Names] =
+    (for {
+      orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
+      pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
+      equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
+    } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
+      .getOrElse(Names.empty)
+
   case class TagCount(tag: RTag, count: Int)

   case class InsightData(
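
A short sketch of how the new query is meant to be run (via the app's `Store`, as `RegexNerFile` does; not from the commit):

```scala
import docspell.common.Ident
import docspell.store.queries.QCollective
import doobie.ConnectionIO

def namesOf(collective: Ident): ConnectionIO[QCollective.Names] =
  QCollective.allNames(collective)
// e.g. store.transact(namesOf(collective))
```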

View File

@@ -7,7 +7,13 @@ import docspell.store.impl._
 import doobie._
 import doobie.implicits._

-case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {}
+case class REquipment(
+    eid: Ident,
+    cid: Ident,
+    name: String,
+    created: Timestamp,
+    updated: Timestamp
+) {}

 object REquipment {
@@ -18,25 +24,32 @@ object REquipment {
     val cid     = Column("cid")
     val name    = Column("name")
     val created = Column("created")
-    val all = List(eid, cid, name, created)
+    val updated = Column("updated")
+    val all     = List(eid, cid, name, created, updated)
   }

   import Columns._

   def insert(v: REquipment): ConnectionIO[Int] = {
-    val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}")
+    val sql =
+      insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}")
     sql.update.run
   }

   def update(v: REquipment): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(eid.is(v.eid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(eid.is(v.eid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = {

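The same created/updated scheme recurs in the records below. A hedged sketch of how a caller would insert a row under it, setting created = updated = now (matching the migrations above, which backfill updated from created); the idents are placeholders for illustration:

import doobie._
import doobie.implicits._
import docspell.common.{Ident, Timestamp}
import docspell.store.records.REquipment

// On insert both timestamps start out equal; update() later bumps
// only `updated`.
def insertNow(eid: Ident, cid: Ident, name: String): ConnectionIO[Int] =
  for {
    now <- Timestamp.current[ConnectionIO]
    n   <- REquipment.insert(REquipment(eid, cid, name, now, now))
  } yield n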
View File

@@ -19,7 +19,8 @@ case class ROrganization(
   city: String,
   country: String,
   notes: Option[String],
-  created: Timestamp
+  created: Timestamp,
+  updated: Timestamp
 ) {}

 object ROrganization {
@@ -38,7 +39,8 @@ object ROrganization {
     val country = Column("country")
     val notes = Column("notes")
     val created = Column("created")
-    val all = List(oid, cid, name, street, zip, city, country, notes, created)
+    val updated = Column("updated")
+    val all = List(oid, cid, name, street, zip, city, country, notes, created, updated)
   }
   import Columns._
@@ -47,26 +49,31 @@ object ROrganization {
     val sql = insertRow(
       table,
       all,
-      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}"
+      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: ROrganization): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(oid.is(v.oid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(oid.is(v.oid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] =

View File

@@ -20,7 +20,8 @@ case class RPerson(
   country: String,
   notes: Option[String],
   concerning: Boolean,
-  created: Timestamp
+  created: Timestamp,
+  updated: Timestamp
 ) {}

 object RPerson {
@@ -40,7 +41,20 @@ object RPerson {
     val notes = Column("notes")
     val concerning = Column("concerning")
     val created = Column("created")
-    val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created)
+    val updated = Column("updated")
+    val all = List(
+      pid,
+      cid,
+      name,
+      street,
+      zip,
+      city,
+      country,
+      notes,
+      concerning,
+      created,
+      updated
+    )
   }
   import Columns._
@@ -49,27 +63,32 @@ object RPerson {
     val sql = insertRow(
       table,
       all,
-      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}"
+      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: RPerson): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(pid.is(v.pid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        concerning.setTo(v.concerning),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(pid.is(v.pid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          concerning.setTo(v.concerning),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] =

View File

@@ -10,6 +10,7 @@ module Data.Language exposing
 type Language
     = German
     | English
+    | French

 fromString : String -> Maybe Language
@@ -20,6 +21,9 @@ fromString str =
     else if str == "eng" || str == "en" || str == "english" then
         Just English

+    else if str == "fra" || str == "fr" || str == "french" then
+        Just French
+
     else
         Nothing
@@ -33,6 +37,9 @@ toIso3 lang =
         English ->
             "eng"

+        French ->
+            "fra"
+
 toName : Language -> String
 toName lang =
@@ -43,7 +50,10 @@ toName lang =
         English ->
             "English"

+        French ->
+            "French"
+
 all : List Language
 all =
-    [ German, English ]
+    [ German, English, French ]

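The server side keeps a matching Language ADT (docspell.common.Language, not shown in this excerpt). A hedged sketch of what the French addition looks like there, assuming the same iso codes as the Elm code above:

sealed trait Language {
  def iso2: String
  def iso3: String
}
object Language {
  case object German  extends Language { val iso2 = "de"; val iso3 = "deu" }
  case object English extends Language { val iso2 = "en"; val iso3 = "eng" }
  case object French  extends Language { val iso2 = "fr"; val iso3 = "fra" }

  val all: List[Language] = List(German, English, French)

  // Accepts iso2, iso3 or the english name, mirroring Elm's fromString.
  def fromString(str: String): Option[Language] = {
    val s = str.toLowerCase
    all.find(l => l.iso2 == s || l.iso3 == s) orElse
      all.find(_.toString.toLowerCase == s)
  }
}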
View File

@@ -91,6 +91,11 @@ let
   };
   text-analysis = {
     max-length = 10000;
+    regex-ner = {
+      enabled = true;
+      file-cache-time = "1 minute";
+    };
+    working-dir = "/tmp/docspell-analysis";
   };
   processing = {
     max-due-date-years = 10;
@@ -689,7 +694,48 @@ in {
             (a rough guess).
           '';
         };
+        working-dir = mkOption {
+          type = types.str;
+          default = defaults.text-analysis.working-dir;
+          description = ''
+            A working directory for the analyser to store temporary/working
+            files.
+          '';
+        };
+        regex-ner = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.regex-ner.enabled;
+                description = ''
+                  Whether to enable custom NER annotation. This uses the address
+                  book of a collective as input for NER tagging (to automatically
+                  find correspondent and concerned entities). If the address book
+                  is large, this can be quite memory intensive and also makes text
+                  analysis slower. But it greatly improves accuracy. If this is
+                  false, NER tagging uses only statistical models (which also work
+                  quite well).
+                  This setting might be moved to the collective settings in the
+                  future.
+                '';
+              };
+              file-cache-time = mkOption {
+                type = types.str;
+                default = defaults.text-analysis.regex-ner.file-cache-time;
+                description = ''
+                  The NER annotation uses a file of patterns that is derived from
+                  a collective's address book. This setting defines how long that
+                  file is kept before checking for a state change.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.regex-ner;
+          description = "Configuration for the regex-ner annotator.";
+        };
       };
     });
     default = defaults.text-analysis;

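The regex-ner option described above works by turning the address book into a CoreNLP regexner mapping file: tab-separated rows of pattern and NER tag, read by the TokensRegexNERAnnotator. A hedged sketch of how such a file could be produced from QCollective.Names; the tag choices (and tagging equipment as MISC) are assumptions for illustration:

import docspell.store.queries.QCollective

// Builds the contents of a regexner mapping file. Each line maps a
// token pattern to a NER tag.
def mappingFile(names: QCollective.Names): String = {
  def row(pattern: String, tag: String) = s"$pattern\t$tag"
  (names.org.map(row(_, "ORGANIZATION")) ++
    names.pers.map(row(_, "PERSON")) ++
    names.equip.map(row(_, "MISC"))).mkString("\n")
}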
View File

@@ -31,7 +31,7 @@ object Dependencies {
   val PostgresVersion = "42.2.16"
   val PureConfigVersion = "0.13.0"
   val Slf4jVersion = "1.7.30"
-  val StanfordNlpVersion = "3.9.2"
+  val StanfordNlpVersion = "4.0.0"
   val TikaVersion = "1.24.1"
   val YamuscaVersion = "0.6.2"
   val SwaggerUIVersion = "3.32.3"
@@ -135,11 +135,16 @@ object Dependencies {
   )

   val stanfordNlpModels = Seq(
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models"),
     ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
       .classifier("models-german"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
-      "models-english"
-    )
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models-french"),
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier(
+        "models-english"
+      )
   )

   val tika = Seq(

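A hedged note on the wiring, since build.sbt itself is not in this excerpt: a module would pull these classifier artifacts in like any other dependency list:

// in build.sbt (illustrative; the real module names may differ)
libraryDependencies ++= Dependencies.stanfordNlpModels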
View File

@@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
   }

   private val nerModels = List(
-    "german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
-    "english.all.3class.distsim.crf.ser.gz"
+    "german.distsim.crf.ser.gz",
+    "english.conll.4class.distsim.crf.ser.gz",
+    "french-wikiner-4class.crf.ser.gz",
+    "french-mwt-statistical.tsv",
+    "french-mwt.tagger",
+    "french-mwt.tsv",
+    "german-mwt.tsv",
+    "german-ud.tagger",
+    "german-ud.tagger.props",
+    "french-ud.tagger",
+    "french-ud.tagger.props",
+    "english-left3words-distsim.tagger",
+    "english-left3words-distsim.tagger.props"
   )
 }
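What a plugin like this typically does is extract just the listed model files from the large stanford models jars into managed resources. A hedged, plugin-free sketch of that extraction step; names and layout are illustrative, not the plugin's actual API:

import java.io.File
import java.nio.file.{Files, StandardCopyOption}
import java.util.zip.ZipFile
import scala.jdk.CollectionConverters._

// Copies every entry whose name matches one of the wanted model files
// out of the given jars into `out`, flattening the directory layout.
def extractModels(modelJars: Seq[File], out: File, wanted: List[String]): Seq[File] =
  modelJars.flatMap { jar =>
    val zip = new ZipFile(jar)
    try
      zip.entries.asScala.toList.collect {
        case e if wanted.exists(w => e.getName.endsWith(w)) =>
          val target = new File(out, new File(e.getName).getName)
          target.getParentFile.mkdirs()
          Files.copy(zip.getInputStream(e), target.toPath, StandardCopyOption.REPLACE_EXISTING)
          target
      }
    finally zip.close()
  }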