Merge pull request #238 from eikek/stanford-nlp4

Stanford nlp4
mergify[bot] 2020-08-25 19:02:43 +00:00 committed by GitHub
commit 31544240fb
38 changed files with 1040 additions and 219 deletions

View File

@ -1,3 +0,0 @@
updates.ignore = [
{ groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" }
]

View File

@ -10,6 +10,7 @@ cache:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot
- $HOME/.coursier/cache
- $HOME/.cache/coursier
- sysconfcpus
install:

View File

@ -1,6 +1,6 @@
<img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>
[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org)
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
[![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell)

View File

@ -5,12 +5,19 @@ import cats.implicits._
import docspell.analysis.contact.Contact
import docspell.analysis.date.DateFind
import docspell.analysis.nlp.PipelineCache
import docspell.analysis.nlp.StanfordNerClassifier
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
trait TextAnalyser[F[_]] {
def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
def annotate(
logger: Logger[F],
settings: StanfordSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result]
}
object TextAnalyser {
@ -22,43 +29,47 @@ object TextAnalyser {
}
def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
def annotate(
logger: Logger[F],
lang: Language,
text: String
): F[TextAnalyser.Result] =
for {
input <- textLimit(logger, text)
tags0 <- stanfordNer(lang, input)
tags1 <- contactNer(input)
dates <- dateNer(lang, input)
list = tags0 ++ tags1
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)
Resource
.liftF(PipelineCache[F]())
.map(cache =>
new TextAnalyser[F] {
def annotate(
logger: Logger[F],
settings: StanfordSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result] =
for {
input <- textLimit(logger, text)
tags0 <- stanfordNer(cacheKey, settings, input)
tags1 <- contactNer(input)
dates <- dateNer(settings.lang, input)
list = tags0 ++ tags1
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)
private def textLimit(logger: Logger[F], text: String): F[String] =
if (text.length <= cfg.maxLength) text.pure[F]
else
logger.info(
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def textLimit(logger: Logger[F], text: String): F[String] =
if (text.length <= cfg.maxLength) text.pure[F]
else
logger.info(
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
Sync[F].delay {
StanfordNerClassifier.nerAnnotate(lang)(text)
private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
: F[Vector[NerLabel]] =
StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {
Contact.annotate(text)
}
private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
Sync[F].delay {
DateFind.findDates(text, lang).toVector
}
}
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {
Contact.annotate(text)
}
private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
Sync[F].delay {
DateFind.findDates(text, lang).toVector
}
})
)
}

View File

@ -54,6 +54,7 @@ object DateFind {
val p = lang match {
case Language.English => p2.or(p0).or(p1)
case Language.German => p1.or(p0).or(p2)
case Language.French => p1.or(p0).or(p2)
}
p.read(parts).toOption
}

View File

@ -0,0 +1,25 @@
package docspell.analysis.nlp
import docspell.common.{NerLabel, NerTag}
import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}
object LabelConverter {
private def tagFromLabel[A <: CoreAnnotation[String]](
label: CoreLabel,
annot: Class[A]
): Option[NerTag] = {
val tag = label.get(annot)
Option(tag).flatMap(s => NerTag.fromString(s).toOption)
}
def findTag(label: CoreLabel): Option[NerTag] =
tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
.orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))
def toNerLabel(label: CoreLabel): Option[NerLabel] =
findTag(label).map(t =>
NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
)
}
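
A minimal sketch of the conversion above, using a hand-built token (in the pipeline the CoreLabel comes from the ner/regexner annotators; AnswerAnnotation is checked first, NamedEntityTagAnnotation is the fallback). The expected result assumes NerTag.fromString accepts the CoreNLP tag name.

import docspell.analysis.nlp.LabelConverter
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}

// hand-built token, as the ner annotator would produce it
val token = new CoreLabel()
token.setWord("Jeter")
token.setBeginPosition(6)
token.setEndPosition(11)
token.set(classOf[CoreAnnotations.NamedEntityTagAnnotation], "PERSON")

LabelConverter.toNerLabel(token)
// expected: Some(NerLabel("Jeter", NerTag.Person, 6, 11))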

View File

@ -0,0 +1,90 @@
package docspell.analysis.nlp
import cats.Applicative
import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
* involves IO and initializing large objects.
*
* Therefore, the instances are created once and cached; being thread-safe, they can be safely shared.
*
* **This is an internal API**
*/
trait PipelineCache[F[_]] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
}
object PipelineCache {
private[this] val logger = getLogger
def none[F[_]: Applicative]: PipelineCache[F] =
new PipelineCache[F] {
def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
makeClassifier(settings).pure[F]
}
def apply[F[_]: Sync](): F[PipelineCache[F]] =
Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
extends PipelineCache[F] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
for {
id <- makeSettingsId(settings)
nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
} yield nlp
private def getOrCreate(
key: String,
id: String,
cache: Map[String, Entry],
settings: StanfordSettings
): (Map[String, Entry], StanfordCoreNLP) =
cache.get(key) match {
case Some(entry) =>
if (entry.id == id) (cache, entry.value)
else {
logger.info(
s"StanfordNLP settings changed for key $key. Creating new classifier"
)
val nlp = makeClassifier(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
case None =>
val nlp = makeClassifier(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
private def makeSettingsId(settings: StanfordSettings): F[String] = {
val base = settings.copy(regexNer = None).toString
val size: F[Long] =
settings.regexNer match {
case Some(p) =>
File.size(p)
case None =>
0L.pure[F]
}
size.map(len => s"$base-$len")
}
}
private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
new StanfordCoreNLP(Properties.forSettings(settings))
}
private case class Entry(id: String, value: StanfordCoreNLP)
}
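
A minimal usage sketch of the cache, assuming a single shared instance keyed per collective; the key string is illustrative. Repeated calls with the same key and unchanged settings return the memoized pipeline, while changed settings rebuild and replace it.

import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

val program: IO[Unit] =
  for {
    cache   <- PipelineCache[IO]()
    settings = StanfordSettings(Language.German, highRecall = false, regexNer = None)
    nlp1    <- cache.obtain("collective-1", settings) // builds the pipeline (expensive)
    nlp2    <- cache.obtain("collective-1", settings) // same key and settings: cached instance
    _       <- IO(assert(nlp1 eq nlp2))
  } yield ()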

View File

@ -0,0 +1,111 @@
package docspell.analysis.nlp
import java.util.{Properties => JProps}
import docspell.analysis.nlp.Properties.Implicits._
import docspell.common._
object Properties {
def apply(ps: (String, String)*): JProps = {
val p = new JProps()
for ((k, v) <- ps)
p.setProperty(k, v)
p
}
def forSettings(settings: StanfordSettings): JProps = {
val regexNerFile = settings.regexNer
.map(p => p.normalize().toAbsolutePath().toString())
settings.lang match {
case Language.German =>
Properties.nerGerman(regexNerFile, settings.highRecall)
case Language.English =>
Properties.nerEnglish(regexNerFile)
case Language.French =>
Properties.nerFrench(regexNerFile, settings.highRecall)
}
}
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
"ner.useSUTime" -> "false", //only english, unused in docspell
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerEnglish(regexNerMappingFile: Option[String]): JProps =
Properties(
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"tokenize.language" -> "en",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "en",
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile)
def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "fr",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
object Implicits {
implicit final class JPropsOps(val p: JProps) extends AnyVal {
def set(name: String, value: Option[String]): JProps =
value match {
case Some(v) =>
p.setProperty(name, v)
p
case None =>
p
}
def change(name: String, f: String => String): JProps =
Option(p.getProperty(name)) match {
case Some(current) =>
p.setProperty(name, f(current))
p
case None =>
p
}
def withRegexNer(mappingFile: Option[String]): JProps =
set("regexner.mapping", mappingFile)
.change(
"annotators",
v => if (mappingFile.isDefined) v + ",regexner" else v
)
def withHighRecall(flag: Boolean): JProps = {
if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
else p.setProperty("ner.combinationMode", "NORMAL")
p
}
}
}
}
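
A short sketch of what forSettings produces, assuming German settings and a hypothetical regexner mapping file: withRegexNer registers the file and appends the regexner annotator, and withHighRecall switches the combination mode.

import java.nio.file.Paths
import docspell.analysis.nlp.{Properties, StanfordSettings}
import docspell.common.Language

val settings = StanfordSettings(
  Language.German,
  highRecall = true,
  regexNer = Some(Paths.get("/tmp/docspell-analysis/coll.txt")) // hypothetical path
)
val props = Properties.forSettings(settings)

props.getProperty("annotators")          // "tokenize,ssplit,mwt,pos,lemma,ner,regexner"
props.getProperty("regexner.mapping")    // "/tmp/docspell-analysis/coll.txt"
props.getProperty("ner.combinationMode") // "HIGH_RECALL"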

View File

@ -1,65 +1,39 @@
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import scala.jdk.CollectionConverters._
import scala.util.Using
import cats.Applicative
import cats.implicits._
import docspell.common._
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
object StanfordNerClassifier {
private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
/** Runs named entity recognition on the given `text`.
*
* This uses the classifier pipeline from stanford-nlp, see
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these
* classifiers is quite expensive, as it involves loading large model
* files. The classifiers are thread-safe, so they are cached.
* The `cacheKey` defines the "slot" where classifiers are stored
* and retrieved. If the `settings` change for a given `cacheKey`,
* a new classifier is created and replaces the previous one.
*/
def nerAnnotate[F[_]: Applicative](
cacheKey: String,
cache: PipelineCache[F]
)(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
cache
.obtain(cacheKey, settings)
.map(crf => runClassifier(crf, text))
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
val nerClassifier = lang match {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
}
nerClassifier
.classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag
.fromString(Option(tag).getOrElse(""))
.toOption
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
val doc = new CoreDocument(text)
nerClassifier.annotate(doc)
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
val ner = classifierResource(lang)
Using(new GZIPInputStream(ner.openStream())) { in =>
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
}.fold(throw _, identity)
}
private def classifierResource(lang: Language): URL = {
def check(u: URL): URL =
if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
else u
check(lang match {
case Language.German =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
case Language.English =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
)
})
}
}
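
A usage sketch of the new cache-based entry point (the key and texts are made up): both calls share the cache slot "coll-1"; because the settings differ between the calls, the second one replaces the classifier stored under that key.

import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordNerClassifier, StanfordSettings}
import docspell.common.Language

val run: IO[Unit] =
  for {
    cache <- PipelineCache[IO]()
    en     = StanfordSettings(Language.English, highRecall = false, regexNer = None)
    de     = StanfordSettings(Language.German, highRecall = false, regexNer = None)
    l1    <- StanfordNerClassifier.nerAnnotate[IO]("coll-1", cache)(en, "Derek Jeter lives in Treesville.")
    l2    <- StanfordNerClassifier.nerAnnotate[IO]("coll-1", cache)(de, "Max Mustermann wohnt im Lilienweg 21.")
    _     <- IO(println(l1 ++ l2)) // Vector[NerLabel] with tags and character offsets
  } yield ()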

View File

@ -0,0 +1,22 @@
package docspell.analysis.nlp
import java.nio.file.Path
import docspell.common._
/** Settings for configuring the stanford NER pipeline.
*
* The language is mandatory; only the provided languages are supported.
* The `highRecall` flag only applies to non-English languages. For
* those, the English classifier is run as a second classifier, and if
* `highRecall` is true, it is also used to tag tokens left untagged by
* the primary classifier. This may produce many false positives, but
* since English terms are common in other languages too, whether this
* is useful depends on the use case.
*
* The `regexNer` option allows specifying a text file as described at
* https://nlp.stanford.edu/software/regexner.html. It is used as a
* last step to tag remaining untagged tokens using the provided list
* of regexps.
*/
case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
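
Three illustrative configurations following the description above (the mapping file path is hypothetical):

import java.nio.file.Paths
import docspell.analysis.nlp.StanfordSettings
import docspell.common.Language

// English: a single classifier, highRecall has no effect
val english = StanfordSettings(Language.English, highRecall = false, regexNer = None)

// German: the English classifier runs second; with highRecall it also tags
// tokens the primary classifier left untagged
val german = StanfordSettings(Language.German, highRecall = true, regexNer = None)

// additionally apply a collective-specific regexner mapping file
val withCustomNer = german.copy(regexNer = Some(Paths.get("/tmp/docspell-analysis/coll.txt")))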

View File

@ -3,31 +3,44 @@ package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
object TextAnalyserSuite extends SimpleTestSuite {
lazy val germanClassifier =
new StanfordCoreNLP(Properties.nerGerman(None, false))
lazy val englishClassifier =
new StanfordCoreNLP(Properties.nerEnglish(None))
test("find english ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
NerLabel("Elm", NerTag.Misc, 17, 20),
NerLabel("Ave.", NerTag.Misc, 21, 25),
NerLabel("Treesville", NerTag.Misc, 27, 37),
NerLabel("Derek", NerTag.Person, 68, 73),
NerLabel("Jeter", NerTag.Person, 74, 79),
NerLabel("Treesville", NerTag.Location, 95, 105),
NerLabel("Elm", NerTag.Misc, 85, 88),
NerLabel("Ave.", NerTag.Misc, 89, 93),
NerLabel("Treesville", NerTag.Person, 95, 105),
NerLabel("Leaf", NerTag.Organization, 144, 148),
NerLabel("Chief", NerTag.Organization, 150, 155),
NerLabel("of", NerTag.Organization, 156, 158),
NerLabel("Syrup", NerTag.Organization, 159, 164),
NerLabel("Production", NerTag.Organization, 165, 175),
NerLabel("Old", NerTag.Organization, 176, 179),
NerLabel("Sticky", NerTag.Organization, 180, 186),
NerLabel("Pancake", NerTag.Organization, 187, 194),
NerLabel("Company", NerTag.Organization, 195, 202),
NerLabel("Maple", NerTag.Location, 207, 212),
NerLabel("Lane", NerTag.Location, 213, 217),
NerLabel("Forest", NerTag.Location, 219, 225),
NerLabel("Maple", NerTag.Organization, 207, 212),
NerLabel("Lane", NerTag.Organization, 213, 217),
NerLabel("Forest", NerTag.Organization, 219, 225),
NerLabel("Hemptown", NerTag.Location, 239, 247),
NerLabel("Little", NerTag.Organization, 347, 353),
NerLabel("League", NerTag.Organization, 354, 360),
NerLabel("Leaf", NerTag.Person, 276, 280),
NerLabel("Little", NerTag.Misc, 347, 353),
NerLabel("League", NerTag.Misc, 354, 360),
NerLabel("Derek", NerTag.Person, 1117, 1122),
NerLabel("Jeter", NerTag.Person, 1123, 1128)
)
@ -36,11 +49,11 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find german ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),
NerLabel("Lilienweg", NerTag.Location, 16, 25),
NerLabel("Lilienweg", NerTag.Person, 16, 25),
NerLabel("Max", NerTag.Person, 77, 80),
NerLabel("Mustermann", NerTag.Person, 81, 91),
NerLabel("Lilienweg", NerTag.Location, 93, 102),

View File

@ -20,6 +20,12 @@ case class Duration(nanos: Long) {
def hours: Long = minutes / 60
def >(other: Duration): Boolean =
nanos > other.nanos
def <(other: Duration): Boolean =
nanos < other.nanos
def toScala: FiniteDuration =
FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@ -62,6 +68,9 @@ object Duration {
def nanos(n: Long): Duration =
Duration(n)
def between(start: Timestamp, end: Timestamp): Duration =
apply(JDur.between(start.value, end.value))
def stopTime[F[_]: Sync]: F[F[Duration]] =
for {
now <- Timestamp.current[F]
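
A small sketch (illustrative only) of how the new between helper and the existing stopTime combine; the text-analysis task later in this commit uses the same stopTime pattern to report elapsed time.

import cats.effect.IO
import cats.implicits._
import docspell.common.{Duration, Timestamp}

val timed: IO[Duration] =
  for {
    stop    <- Duration.stopTime[IO]  // capture the start timestamp
    _       <- IO(Thread.sleep(50))   // some work
    elapsed <- stop                   // evaluate: Duration elapsed since start
  } yield elapsed

// between measures the span between two explicit timestamps
val span: IO[Duration] =
  (Timestamp.current[IO], Timestamp.current[IO]).mapN(Duration.between)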

View File

@ -1,6 +1,7 @@
package docspell.common
import java.io.IOException
import java.nio.charset.StandardCharsets
import java.nio.file._
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.atomic.AtomicInteger
@ -11,6 +12,10 @@ import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.common.syntax.all._
import io.circe.Decoder
object File {
def mkDir[F[_]: Sync](dir: Path): F[Path] =
@ -55,6 +60,9 @@ object File {
def exists[F[_]: Sync](file: Path): F[Boolean] =
Sync[F].delay(Files.exists(file))
def size[F[_]: Sync](file: Path): F[Long] =
Sync[F].delay(Files.size(file))
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
@ -84,4 +92,13 @@ object File {
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
d: Decoder[A]
): F[A] =
readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
}

View File

@ -27,7 +27,12 @@ object Language {
val iso3 = "eng"
}
val all: List[Language] = List(German, English)
case object French extends Language {
val iso2 = "fr"
val iso3 = "fra"
}
val all: List[Language] = List(German, English, French)
def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase

View File

@ -23,6 +23,7 @@ object Field {
val content = Field("content")
val content_de = Field("content_de")
val content_en = Field("content_en")
val content_fr = Field("content_fr")
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")
val folderId = Field("folder")
@ -33,6 +34,8 @@ object Field {
Field.content_de
case Language.English =>
Field.content_en
case Language.French =>
Field.content_fr
}
implicit val jsonEncoder: Encoder[Field] =

View File

@ -39,6 +39,7 @@ object SolrQuery {
Field.content,
Field.content_de,
Field.content_en,
Field.content_fr,
Field.itemName,
Field.itemNotes,
Field.attachmentName

View File

@ -80,6 +80,8 @@ object SolrSetup {
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
case l @ Language.French =>
addTextField(l.some)(Field.content_fr)
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
@ -105,6 +107,9 @@ object SolrSetup {
case Some(Language.English) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textEN(field)))
case Some(Language.French) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textFR(field)))
}
}
}
@ -138,6 +143,9 @@ object SolrSetup {
def textEN(field: Field): AddField =
AddField(field, "text_en", true, true, false)
def textFR(field: Field): AddField =
AddField(field, "text_fr", true, true, false)
}
case class DeleteField(name: Field)

View File

@ -248,6 +248,29 @@ docspell.joex {
# should suffice. The default is 10000, which is about 2-3 pages
# (just a rough guess, of course).
max-length = 10000
# A working directory for the analyser to store temporary/working
# files.
working-dir = ${java.io.tmpdir}"/docspell-analysis"
regex-ner {
# Whether to enable custom NER annotation. This uses the address
# book of a collective as input for NER tagging (to automatically
# find correspondent and concerned entities). If the address book
# is large, this can be quite memory intensive and also makes text
# analysis slower. But it greatly improves accuracy. If this is
# false, NER tagging uses only statistical models (that also work
# quite well).
#
# This setting might be moved to the collective settings in the
# future.
enabled = true
# The NER annotation uses a file of patterns that is derived from
# a collective's address book. This setting defines how long this
# file is kept before it is checked for a state change.
file-cache-time = "1 minute"
}
}
# Configuration for converting files into PDFs.

View File

@ -1,11 +1,14 @@
package docspell.joex
import java.nio.file.Path
import docspell.analysis.TextAnalysisConfig
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
import docspell.extract.ExtractConfig
import docspell.ftssolr.SolrConfig
import docspell.joex.analysis.RegexNerFile
import docspell.joex.hk.HouseKeepingConfig
import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
import docspell.store.JdbcConfig
@ -20,7 +23,7 @@ case class Config(
userTasks: Config.UserTasks,
houseKeeping: HouseKeepingConfig,
extraction: ExtractConfig,
textAnalysis: TextAnalysisConfig,
textAnalysis: Config.TextAnalysis,
convert: ConvertConfig,
sendMail: MailSendConfig,
files: Files,
@ -50,4 +53,19 @@ object Config {
}
case class Processing(maxDueDateYears: Int)
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
regexNer: RegexNer
) {
def textAnalysisConfig: TextAnalysisConfig =
TextAnalysisConfig(maxLength)
def regexNerFileConfig: RegexNerFile.Config =
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
}
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
}
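
A sketch of the new joex config section in code; the values mirror the defaults from docspell.conf above, and the Duration is built with the nanos constructor shown earlier in this commit.

import java.nio.file.Paths
import docspell.common.Duration
import docspell.joex.Config

val ta = Config.TextAnalysis(
  maxLength = 10000,
  workingDir = Paths.get("/tmp/docspell-analysis"),
  regexNer = Config.RegexNer(enabled = true, fileCacheTime = Duration.nanos(60L * 1000000000L))
)

ta.textAnalysisConfig // TextAnalysisConfig(10000), passed to TextAnalyser.create
ta.regexNerFileConfig // RegexNerFile.Config(true, /tmp/docspell-analysis, 1 minute)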

View File

@ -6,10 +6,12 @@ import cats.effect._
import cats.implicits._
import fs2.concurrent.SignallingRef
import docspell.analysis.TextAnalyser
import docspell.backend.ops._
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.ftssolr.SolrFtsClient
import docspell.joex.analysis.RegexNerFile
import docspell.joex.fts.{MigrationTask, ReIndexTask}
import docspell.joex.hk._
import docspell.joex.notify._
@ -80,14 +82,16 @@ object JoexAppImpl {
for {
httpClient <- BlazeClientBuilder[F](clientEC).resource
client = JoexClient(httpClient)
store <- Store.create(cfg.jdbc, connectEC, blocker)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
fts <- createFtsClient(cfg)(httpClient)
itemOps <- OItem(store, fts, queue, joex)
store <- Store.create(cfg.jdbc, connectEC, blocker)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
fts <- createFtsClient(cfg)(httpClient)
itemOps <- OItem(store, fts, queue, joex)
analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
javaEmil =
JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@ -95,14 +99,14 @@ object JoexAppImpl {
.withTask(
JobTask.json(
ProcessItemArgs.taskName,
ItemHandler.newItem[F](cfg, itemOps, fts),
ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
ItemHandler.onCancel[F]
)
)
.withTask(
JobTask.json(
ReProcessItemArgs.taskName,
ReProcessItem[F](cfg, fts),
ReProcessItem[F](cfg, fts, analyser, regexNer),
ReProcessItem.onCancel[F]
)
)

View File

@ -0,0 +1,99 @@
package docspell.joex.analysis
import java.nio.file.Path
import cats.effect._
import cats.implicits._
import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.store.queries.QCollective
import io.circe.generic.semiauto._
import io.circe.{Decoder, Encoder}
case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
def nerFilePath(directory: Path): Path =
NerFile.nerFilePath(directory, collective)
def jsonFilePath(directory: Path) =
NerFile.jsonFilePath(directory, collective)
}
object NerFile {
implicit val jsonDecoder: Decoder[NerFile] =
deriveDecoder[NerFile]
implicit val jsonEncoder: Encoder[NerFile] =
deriveEncoder[NerFile]
private def nerFilePath(directory: Path, collective: Ident): Path =
directory.resolve(s"${collective.id}.txt")
private def jsonFilePath(directory: Path, collective: Ident): Path =
directory.resolve(s"${collective.id}.json")
def find[F[_]: Sync: ContextShift](
collective: Ident,
directory: Path,
blocker: Blocker
): F[Option[NerFile]] = {
val file = jsonFilePath(directory, collective)
File.existsNonEmpty[F](file).flatMap {
case true =>
File
.readJson[F, NerFile](file, blocker)
.map(_.some)
case false =>
(None: Option[NerFile]).pure[F]
}
}
def mkNerConfig(names: QCollective.Names): String = {
val orgs = names.org
.flatMap(Pattern(3))
.distinct
.map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
val pers =
names.pers
.flatMap(Pattern(2))
.distinct
.map(_.toRow("PERSON", "LOCATION,MISC"))
val equips =
names.equip
.flatMap(Pattern(1))
.distinct
.map(_.toRow("MISC", "LOCATION"))
(orgs ++ pers ++ equips).mkString("\n")
}
case class Pattern(value: String, weight: Int) {
def toRow(tag: String, overrideTags: String): String =
s"$value\t$tag\t$overrideTags\t$weight"
}
object Pattern {
def apply(weight: Int)(str: String): Vector[Pattern] = {
val delims = " \t\n\r".toSet
val words =
TextSplitter
.split(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.map(w => s"(?i)${w}")
val tokens =
TextSplitter
.splitToken(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.take(3)
.map(w => s"(?i)${w}")
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
}
}
}
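
The file produced by mkNerConfig uses the plain regexner format: one tab-separated rule per line with the pattern, the tag to assign, the tags it may override, and a priority weight. A small sketch with a made-up organization name:

import docspell.joex.analysis.NerFile

NerFile.Pattern("(?i)acme (?i)gmbh", 3).toRow("ORGANIZATION", "LOCATION,PERSON,MISC")
// => "(?i)acme (?i)gmbh\tORGANIZATION\tLOCATION,PERSON,MISC\t3"

// Pattern(weight) derives such case-insensitive patterns from an address book
// entry: the whole lower-cased name plus up to three of its leading tokens
val patterns = NerFile.Pattern(3)("Acme GmbH")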

View File

@ -0,0 +1,164 @@
package docspell.joex.analysis
import java.nio.file.Path
import cats.effect._
import cats.effect.concurrent.Semaphore
import cats.implicits._
import docspell.common._
import docspell.common.syntax.all._
import docspell.store.Store
import docspell.store.queries.QCollective
import docspell.store.records.REquipment
import docspell.store.records.ROrganization
import docspell.store.records.RPerson
import io.circe.syntax._
import org.log4s.getLogger
/** Maintains a custom regex-ner file per collective for Stanford's
* regexner annotator.
*/
trait RegexNerFile[F[_]] {
def makeFile(collective: Ident): F[Option[Path]]
}
object RegexNerFile {
private[this] val logger = getLogger
case class Config(enabled: Boolean, directory: Path, minTime: Duration)
def apply[F[_]: Concurrent: ContextShift](
cfg: Config,
blocker: Blocker,
store: Store[F]
): Resource[F, RegexNerFile[F]] =
for {
dir <- File.withTempDir[F](cfg.directory, "regexner-")
writer <- Resource.liftF(Semaphore(1))
} yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
final private class Impl[F[_]: Concurrent: ContextShift](
cfg: Config,
blocker: Blocker,
store: Store[F],
writer: Semaphore[F] //TODO allow parallelism per collective
) extends RegexNerFile[F] {
def makeFile(collective: Ident): F[Option[Path]] =
if (cfg.enabled) doMakeFile(collective)
else (None: Option[Path]).pure[F]
def doMakeFile(collective: Ident): F[Option[Path]] =
for {
now <- Timestamp.current[F]
existing <- NerFile.find[F](collective, cfg.directory, blocker)
result <- existing match {
case Some(nf) =>
val dur = Duration.between(nf.creation, now)
if (dur > cfg.minTime)
logger.fdebug(
s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
) *> updateFile(
collective,
now,
Some(nf)
)
else nf.nerFilePath(cfg.directory).some.pure[F]
case None =>
updateFile(collective, now, None)
}
} yield result
private def updateFile(
collective: Ident,
now: Timestamp,
current: Option[NerFile]
): F[Option[Path]] =
for {
lastUpdate <- store.transact(Sql.latestUpdate(collective))
result <- lastUpdate match {
case None =>
(None: Option[Path]).pure[F]
case Some(lup) =>
current match {
case Some(cur) =>
val nerf =
if (cur.updated == lup)
logger.fdebug(s"No state change detected.") *> updateTimestamp(
cur,
now
) *> cur.pure[F]
else
logger.fdebug(
s"There have been state changes for collective '${collective.id}'. Reload NER file."
) *> createFile(lup, collective, now)
nerf.map(_.nerFilePath(cfg.directory).some)
case None =>
createFile(lup, collective, now)
.map(_.nerFilePath(cfg.directory).some)
}
}
} yield result
private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
writer.withPermit(for {
file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
_ <- File.mkDir(file.getParent)
_ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
} yield ())
private def createFile(
lastUpdate: Timestamp,
collective: Ident,
now: Timestamp
): F[NerFile] = {
def update(nf: NerFile, text: String): F[Unit] =
writer.withPermit(for {
jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
_ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
_ <- File.mkDir(jsonFile.getParent)
_ <- File.writeString(nf.nerFilePath(cfg.directory), text)
_ <- File.writeString(jsonFile, nf.asJson.spaces2)
} yield ())
for {
_ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
names <- store.transact(QCollective.allNames(collective))
nerFile = NerFile(collective, lastUpdate, now)
_ <- update(nerFile, NerFile.mkNerConfig(names))
} yield nerFile
}
}
object Sql {
import doobie._
import doobie.implicits._
import docspell.store.impl.Implicits._
import docspell.store.impl.Column
def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
def max(col: Column, table: Fragment, cidCol: Column): Fragment =
selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
val sql =
List(
max(
ROrganization.Columns.updated,
ROrganization.table,
ROrganization.Columns.cid
),
max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
)
.reduce(_ ++ fr"UNION ALL" ++ _)
selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
.query[Timestamp]
.option
}
}
}
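
A wiring sketch, assuming store and blocker are available as in JoexAppImpl above and using illustrative config values: makeFile returns the path to the collective's current mapping file, or None when the feature is disabled or the collective has no address book entries yet.

import java.nio.file.{Path, Paths}
import cats.effect.{Blocker, Concurrent, ContextShift}
import docspell.common.{Duration, Ident}
import docspell.joex.analysis.RegexNerFile
import docspell.store.Store

def nerFileFor[F[_]: Concurrent: ContextShift](
    store: Store[F],
    blocker: Blocker,
    collective: Ident
): F[Option[Path]] = {
  val cfg = RegexNerFile.Config(
    enabled = true,
    directory = Paths.get("/tmp/docspell-analysis"),
    minTime = Duration.nanos(60L * 1000000000L) // re-check the address book after one minute
  )
  RegexNerFile(cfg, blocker, store).use(_.makeFile(collective))
}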

View File

@ -5,10 +5,12 @@ import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.analysis.TextAnalyser
import docspell.backend.ops.OItem
import docspell.common.{ItemState, ProcessItemArgs}
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Task
import docspell.store.queries.QItem
import docspell.store.records.RItem
@ -29,11 +31,13 @@ object ItemHandler {
def newItem[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
): Task[F, Args, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg, itemOps, fts))
.flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
.map(_ => ())
def itemStateTask[F[_]: Sync, A](
@ -51,11 +55,13 @@ object ItemHandler {
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(data: ItemData): Task[F, Args, ItemData] =
isLastRetry[F].flatMap {
case true =>
ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
case Right(d) =>
Task.pure(d)
case Left(ex) =>
@ -65,7 +71,8 @@ object ItemHandler {
.andThen(_ => Sync[F].raiseError(ex))
})
case false =>
ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
.flatMap(itemStateTask(ItemState.Created))
}
private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =

View File

@ -2,10 +2,12 @@ package docspell.joex.process
import cats.effect._
import docspell.analysis.TextAnalyser
import docspell.backend.ops.OItem
import docspell.common.ProcessItemArgs
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Task
object ProcessItem {
@ -13,25 +15,31 @@ object ProcessItem {
def apply[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ExtractArchive(item)
.flatMap(Task.setProgress(20))
.flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
.flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
.flatMap(LinkProposal[F])
.flatMap(SetGivenData[F](itemOps))
.flatMap(Task.setProgress(99))
def processAttachments[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
processAttachments0[F](cfg, fts, (30, 60, 90))(item)
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
def analysisOnly[F[_]: Sync](
cfg: Config
cfg: Config,
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](cfg.textAnalysis)(item)
TextAnalysis[F](analyser, regexNer)(item)
.flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
@ -39,12 +47,14 @@ object ProcessItem {
private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F],
progress: (Int, Int, Int)
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3))
}

View File

@ -4,9 +4,11 @@ import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment
@ -19,10 +21,12 @@ object ReProcessItem {
def apply[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
): Task[F, Args, Unit] =
loadItem[F]
.flatMap(safeProcess[F](cfg, fts))
.flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
.map(_ => ())
def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@ -70,6 +74,8 @@ object ReProcessItem {
def processFiles[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F],
data: ItemData
): Task[F, Args, ItemData] = {
@ -91,7 +97,7 @@ object ReProcessItem {
getLanguage[F].flatMap { lang =>
ProcessItem
.processAttachments[F](cfg, fts)(data)
.processAttachments[F](cfg, fts, analyser, regexNer)(data)
.contramap[Args](convertArgs(lang))
}
}
@ -109,11 +115,13 @@ object ReProcessItem {
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(data: ItemData): Task[F, Args, ItemData] =
isLastRetry[F].flatMap {
case true =>
processFiles[F](cfg, fts, data).attempt
processFiles[F](cfg, fts, analyser, regexNer, data).attempt
.flatMap({
case Right(d) =>
Task.pure(d)
@ -123,7 +131,7 @@ object ReProcessItem {
).andThen(_ => Sync[F].raiseError(ex))
})
case false =>
processFiles[F](cfg, fts, data)
processFiles[F](cfg, fts, analyser, regexNer, data)
}
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@ -1,47 +1,57 @@
package docspell.joex.process
import cats.effect.Sync
import cats.effect._
import cats.implicits._
import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
import docspell.joex.analysis.RegexNerFile
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
object TextAnalysis {
def apply[F[_]: Sync](
cfg: TextAnalysisConfig
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
TextAnalyser.create[F](cfg).use { analyser =>
for {
_ <- ctx.logger.info("Starting text analysis")
s <- Duration.stopTime[F]
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
)
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
}
for {
_ <- ctx.logger.info("Starting text analysis")
s <- Duration.stopTime[F]
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx, analyser, nerFile)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
)
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
}
def annotateAttachment[F[_]: Sync](
lang: Language,
logger: Logger[F],
analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
ctx: Context[F, ProcessItemArgs],
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(ctx.args.meta.language, false, None)
for {
labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
customNer <- nerFile.makeFile(ctx.args.meta.collective)
sett = settings.copy(regexNer = customNer)
labels <- analyser.annotate(
ctx.logger,
sett,
ctx.args.meta.collective,
rm.content.getOrElse("")
)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
}

View File

@ -341,6 +341,7 @@ trait Conversions {
v.address.city,
v.address.country,
v.notes,
now,
now
)
} yield OOrganization.OrgAndContacts(org, cont)
@ -353,6 +354,7 @@ trait Conversions {
def contacts(oid: Ident) =
v.contacts.traverse(c => newContact(c, oid.some, None))
for {
now <- Timestamp.current[F]
cont <- contacts(v.id)
org = ROrganization(
v.id,
@ -363,7 +365,8 @@ trait Conversions {
v.address.city,
v.address.country,
v.notes,
v.created
v.created,
now
)
} yield OOrganization.OrgAndContacts(org, cont)
}
@ -398,6 +401,7 @@ trait Conversions {
v.address.country,
v.notes,
v.concerning,
now,
now
)
} yield OOrganization.PersonAndContacts(org, cont)
@ -410,6 +414,7 @@ trait Conversions {
def contacts(pid: Ident) =
v.contacts.traverse(c => newContact(c, None, pid.some))
for {
now <- Timestamp.current[F]
cont <- contacts(v.id)
org = RPerson(
v.id,
@ -421,7 +426,8 @@ trait Conversions {
v.address.country,
v.notes,
v.concerning,
v.created
v.created,
now
)
} yield OOrganization.PersonAndContacts(org, cont)
}
@ -536,11 +542,11 @@ trait Conversions {
def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
timeId.map({
case (id, now) =>
REquipment(id, cid, e.name, now)
REquipment(id, cid, e.name, now, now)
})
def changeEquipment(e: Equipment, cid: Ident): REquipment =
REquipment(e.id, cid, e.name, e.created)
def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now))
// idref

View File

@ -39,10 +39,10 @@ object EquipmentRoutes {
case req @ PUT -> Root =>
for {
data <- req.as[Equipment]
equip = changeEquipment(data, user.account.collective)
res <- backend.equipment.update(equip)
resp <- Ok(basicResult(res, "Equipment updated."))
data <- req.as[Equipment]
equip <- changeEquipment(data, user.account.collective)
res <- backend.equipment.update(equip)
resp <- Ok(basicResult(res, "Equipment updated."))
} yield resp
case DELETE -> Root / Ident(id) =>

View File

@ -0,0 +1,29 @@
-- organization
ALTER TABLE `organization`
ADD COLUMN (`updated` timestamp);
UPDATE `organization` SET `updated` = `created`;
ALTER TABLE `organization`
MODIFY `updated` timestamp NOT NULL;
-- person
ALTER TABLE `person`
MODIFY `created` timestamp;
ALTER TABLE `person`
ADD COLUMN (`updated` timestamp);
UPDATE `person` SET `updated` = `created`;
ALTER TABLE `person`
MODIFY `updated` timestamp NOT NULL;
-- equipment
ALTER TABLE `equipment`
ADD COLUMN (`updated` timestamp);
UPDATE `equipment` SET `updated` = `created`;
ALTER TABLE `equipment`
MODIFY `updated` timestamp NOT NULL;

View File

@ -0,0 +1,29 @@
-- organization
ALTER TABLE "organization"
ADD COLUMN "updated" timestamp;
UPDATE "organization" SET "updated" = "created";
ALTER TABLE "organization"
ALTER COLUMN "updated" SET NOT NULL;
-- person
ALTER TABLE "person" ALTER COLUMN "created"
TYPE timestamp USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp);
ALTER TABLE "person"
ADD COLUMN "updated" timestamp;
UPDATE "person" SET "updated" = "created";
ALTER TABLE "person"
ALTER COLUMN "updated" SET NOT NULL;
-- equipment
ALTER TABLE "equipment"
ADD COLUMN "updated" timestamp;
UPDATE "equipment" SET "updated" = "created";
ALTER TABLE "equipment"
ALTER COLUMN "updated" SET NOT NULL;

View File

@ -1,5 +1,6 @@
package docspell.store.queries
import cats.data.OptionT
import fs2.Stream
import docspell.common.ContactKind
@ -11,6 +12,20 @@ import doobie._
import doobie.implicits._
object QCollective {
case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
object Names {
val empty = Names(Vector.empty, Vector.empty, Vector.empty)
}
def allNames(collective: Ident): ConnectionIO[Names] =
(for {
orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
} yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
.getOrElse(Names.empty)
case class TagCount(tag: RTag, count: Int)
case class InsightData(

View File

@ -7,7 +7,13 @@ import docspell.store.impl._
import doobie._
import doobie.implicits._
case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {}
case class REquipment(
eid: Ident,
cid: Ident,
name: String,
created: Timestamp,
updated: Timestamp
) {}
object REquipment {
@ -18,25 +24,32 @@ object REquipment {
val cid = Column("cid")
val name = Column("name")
val created = Column("created")
val all = List(eid, cid, name, created)
val updated = Column("updated")
val all = List(eid, cid, name, created, updated)
}
import Columns._
def insert(v: REquipment): ConnectionIO[Int] = {
val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}")
val sql =
insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}")
sql.update.run
}
def update(v: REquipment): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(eid.is(v.eid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name)
def sql(now: Timestamp) =
updateRow(
table,
and(eid.is(v.eid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = {

View File

@ -19,7 +19,8 @@ case class ROrganization(
city: String,
country: String,
notes: Option[String],
created: Timestamp
created: Timestamp,
updated: Timestamp
) {}
object ROrganization {
@ -38,7 +39,8 @@ object ROrganization {
val country = Column("country")
val notes = Column("notes")
val created = Column("created")
val all = List(oid, cid, name, street, zip, city, country, notes, created)
val updated = Column("updated")
val all = List(oid, cid, name, street, zip, city, country, notes, created, updated)
}
import Columns._
@ -47,26 +49,31 @@ object ROrganization {
val sql = insertRow(
table,
all,
fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}"
fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}"
)
sql.update.run
}
def update(v: ROrganization): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(oid.is(v.oid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
notes.setTo(v.notes)
def sql(now: Timestamp) =
updateRow(
table,
and(oid.is(v.oid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
notes.setTo(v.notes),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] =

View File

@ -20,7 +20,8 @@ case class RPerson(
country: String,
notes: Option[String],
concerning: Boolean,
created: Timestamp
created: Timestamp,
updated: Timestamp
) {}
object RPerson {
@ -40,7 +41,20 @@ object RPerson {
val notes = Column("notes")
val concerning = Column("concerning")
val created = Column("created")
val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created)
val updated = Column("updated")
val all = List(
pid,
cid,
name,
street,
zip,
city,
country,
notes,
concerning,
created,
updated
)
}
import Columns._
@ -49,27 +63,32 @@ object RPerson {
val sql = insertRow(
table,
all,
fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}"
fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}"
)
sql.update.run
}
def update(v: RPerson): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(pid.is(v.pid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
concerning.setTo(v.concerning),
notes.setTo(v.notes)
def sql(now: Timestamp) =
updateRow(
table,
and(pid.is(v.pid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
concerning.setTo(v.concerning),
notes.setTo(v.notes),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] =

View File

@ -10,6 +10,7 @@ module Data.Language exposing
type Language
= German
| English
| French
fromString : String -> Maybe Language
@ -20,6 +21,9 @@ fromString str =
else if str == "eng" || str == "en" || str == "english" then
Just English
else if str == "fra" || str == "fr" || str == "french" then
Just French
else
Nothing
@ -33,6 +37,9 @@ toIso3 lang =
English ->
"eng"
French ->
"fra"
toName : Language -> String
toName lang =
@ -43,7 +50,10 @@ toName lang =
English ->
"English"
French ->
"French"
all : List Language
all =
[ German, English ]
[ German, English, French ]

View File

@ -91,6 +91,11 @@ let
};
text-analysis = {
max-length = 10000;
regex-ner = {
enabled = true;
file-cache-time = "1 minute";
};
working-dir = "/tmp/docspell-analysis";
};
processing = {
max-due-date-years = 10;
@ -689,7 +694,48 @@ in {
(a rough guess).
'';
};
working-dir = mkOption {
type = types.str;
default = defaults.text-analysis.working-dir;
description = ''
A working directory for the analyser to store temporary/working
files.
'';
};
regex-ner = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.text-analysis.regex-ner.enabled;
description = ''
Whether to enable custom NER annotation. This uses the address
book of a collective as input for NER tagging (to automatically
find correspondent and concerned entities). If the address book
is large, this can be quite memory intensive and also makes text
analysis slower. But it greatly improves accuracy. If this is
false, NER tagging uses only statistical models (that also work
quite well).
This setting might be moved to the collective settings in the
future.
'';
};
file-cache-time = mkOption {
type = types.str;
default = defaults.text-analysis.regex-ner.file-cache-time;
description = ''
The NER annotation uses a file of patterns that is derived from
a collective's address book. This setting defines how long this
file is kept before it is checked for a state change.
'';
};
};
});
default = defaults.text-analysis.regex-ner;
description = "";
};
};
});
default = defaults.text-analysis;

View File

@ -31,7 +31,7 @@ object Dependencies {
val PostgresVersion = "42.2.16"
val PureConfigVersion = "0.13.0"
val Slf4jVersion = "1.7.30"
val StanfordNlpVersion = "3.9.2"
val StanfordNlpVersion = "4.0.0"
val TikaVersion = "1.24.1"
val YamuscaVersion = "0.6.2"
val SwaggerUIVersion = "3.32.3"
@ -135,11 +135,16 @@ object Dependencies {
)
val stanfordNlpModels = Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
"models-english"
)
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-french"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier(
"models-english"
)
)
val tika = Seq(

View File

@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
}
private val nerModels = List(
"german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
"english.all.3class.distsim.crf.ser.gz"
"german.distsim.crf.ser.gz",
"english.conll.4class.distsim.crf.ser.gz",
"french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv",
"french-mwt.tagger",
"french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger",
"french-ud.tagger.props",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props"
)
}