From f01646aeb5a08246ace732fa53963a40c32fd182 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Sat, 16 Jan 2021 23:43:24 +0100
Subject: [PATCH] Reorganize nlp pipeline and add nlp-unsupported language
 italian

Improves and reorganizes how nlp pipelines are set up. Now users can
choose from many options, depending on their hardware and usage
scenario.

This is the basis for using more languages without depending on what
stanford-nlp supports. Support for such languages is then limited to
text extraction and simple regex-ner processing.
---
 .travis.yml                                   |   2 +-
 docker/joex-base.dockerfile                   |   1 +
 .../scala/docspell/analysis/NlpSettings.scala |   7 ++
 .../docspell/analysis/TextAnalyser.scala      |  48 ++++----
 .../docspell/analysis/date/DateFind.scala     |  44 +++----
 .../docspell/analysis/date/MonthName.scala    | 101 +++++++++++++++
 .../docspell/analysis/nlp/Annotator.scala     |  98 +++++++++++++++
 .../analysis/nlp/BasicCRFAnnotator.scala      |  26 ++--
 .../docspell/analysis/nlp/PipelineCache.scala |  65 +++-------
 .../docspell/analysis/nlp/Properties.scala    |  32 +++--
 .../analysis/nlp/StanfordNerAnnotator.scala   |  27 ++--
 .../analysis/nlp/StanfordNerSettings.scala    |  58 +++++----
 .../analysis/nlp/BaseCRFAnnotatorSuite.scala  |   3 +-
 .../nlp/StanfordNerAnnotatorSuite.scala       |  36 ++++++
 .../main/scala/docspell/common/Language.scala |  23 +++-
 .../main/scala/docspell/common/NlpMode.scala  |  16 +--
 .../docspell/common/syntax/FileSyntax.scala   |  20 +++
 .../docspell/common/syntax/package.scala      |   7 +-
 .../test/resources/examples/letter-ita.txt    |  13 ++
 .../main/scala/docspell/ftssolr/Field.scala   |   3 +
 .../scala/docspell/ftssolr/SolrQuery.scala    |   1 +
 .../scala/docspell/ftssolr/SolrSetup.scala    |  19 ++-
 .../joex/src/main/resources/reference.conf    |  85 ++++++++-----
 .../src/main/scala/docspell/joex/Config.scala |  15 ++-
 .../docspell/joex/analysis/RegexNerFile.scala |   6 +-
 .../docspell/joex/process/TextAnalysis.scala  |   5 +-
 .../docspell/store/queries/QCollective.scala  |  43 +++++--
 modules/webapp/src/main/elm/Data/Language.elm |  11 +-
 nix/module-joex.nix                           | 116 +++++++++++++-----
 29 files changed, 676 insertions(+), 255 deletions(-)
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala
 create mode 100644 modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala
 create mode 100644 modules/files/src/test/resources/examples/letter-ita.txt

diff --git a/.travis.yml b/.travis.yml
index 4d750d05..d78ff4b0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -24,4 +24,4 @@ before_script:
   - export TZ=Europe/Berlin
 
 script:
-  - sbt ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test"
+  - sbt -J-XX:+UseG1GC ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test"
diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile
index 0baa1973..8ebad224 100644
--- a/docker/joex-base.dockerfile
+++ b/docker/joex-base.dockerfile
@@ -15,6 +15,7 @@ RUN apk add --no-cache openjdk11-jre \
     tesseract-ocr \
     tesseract-ocr-data-deu \
     tesseract-ocr-data-fra \
+    tesseract-ocr-data-ita \
     unpaper \
     wkhtmltopdf \
     libreoffice \
diff --git a/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala
new file mode 100644
index 00000000..a1b426e5
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala
@@ -0,0 +1,7 @@
+package docspell.analysis
+
+import java.nio.file.Path
+
+import docspell.common._
+/** NLP settings for one text: its language, the highRecall flag, an optional regexNer file. */
+case class NlpSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
index a9234027..c2deafce 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -10,13 +10,13 @@ import docspell.analysis.date.DateFind
 import docspell.analysis.nlp._
 import docspell.common._
 
-import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.log4s.getLogger
 
 trait TextAnalyser[F[_]] {
 
   def annotate(
       logger: Logger[F],
-      settings: StanfordNerSettings,
+      settings: NlpSettings,
       cacheKey: Ident,
       text: String
   ): F[TextAnalyser.Result]
@@ -24,6 +24,7 @@ trait TextAnalyser[F[_]] {
   def classifier: TextClassifier[F]
 }
 object TextAnalyser {
+  private[this] val logger = getLogger
 
   case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
 
@@ -41,13 +42,13 @@ object TextAnalyser {
         new TextAnalyser[F] {
           def annotate(
               logger: Logger[F],
-              settings: StanfordNerSettings,
+              settings: NlpSettings,
               cacheKey: Ident,
               text: String
           ): F[TextAnalyser.Result] =
             for {
               input <- textLimit(logger, text)
-              tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
+              tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input))
               tags1 <- contactNer(input)
               dates <- dateNer(settings.lang, input)
               list  = tags0 ++ tags1
@@ -77,31 +78,36 @@ object TextAnalyser {
         }
       )
 
+  /** Provides the nlp pipeline based on the configuration. */
   private object Nlp {
-
     def apply[F[_]: Concurrent: Timer: BracketThrow](
         cfg: TextAnalysisConfig.NlpConfig
-    ): F[Input => F[Vector[NerLabel]]] =
+    ): F[Input[F] => F[Vector[NerLabel]]] =
       cfg.mode match {
-        case NlpMode.Full =>
-          PipelineCache.full(cfg.clearInterval).map(cache => full(cache))
-        case NlpMode.Basic =>
-          PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache))
         case NlpMode.Disabled =>
-          Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
+          Logger.log4s(logger).info("NLP is disabled as defined in config.") *>
+            Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
+        case _ =>
+          PipelineCache(cfg.clearInterval)(
+            Annotator[F](cfg.mode),
+            Annotator.clearCaches[F]
+          )
+            .map(annotate[F])
       }
 
-    final case class Input(key: Ident, settings: StanfordNerSettings, text: String)
+    final case class Input[F[_]](
+        key: Ident,
+        settings: NlpSettings,
+        logger: Logger[F],
+        text: String
+    )
 
-    def full[F[_]: BracketThrow](
-        cache: PipelineCache[F, StanfordCoreNLP]
-    )(input: Input): F[Vector[NerLabel]] =
-      StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
-
-    def basic[F[_]: BracketThrow](
-        cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
-    )(input: Input): F[Vector[NerLabel]] =
-      BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
+    def annotate[F[_]: BracketThrow](
+        cache: PipelineCache[F]
+    )(input: Input[F]): F[Vector[NerLabel]] =
+      cache
+        .obtain(input.key.id, input.settings)
+        .use(ann => ann.nerAnnotate(input.logger)(input.text))
 
   }
 }
diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
index 90fcd8cd..5feb8b57 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
@@ -41,23 +41,30 @@ object DateFind {
   }
 
   object SimpleDate {
-    val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) =>
-      List(SimpleDate(y, m, d))
+    def pattern0(lang: Language) = (readYear >> readMonth(lang) >> readDay).map {
+      case ((y, m), d) =>
+        List(SimpleDate(y, m, d))
     }
-    val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) =>
-      List(SimpleDate(y, m, d))
+    def pattern1(lang: Language) = (readDay >> readMonth(lang) >> readYear).map {
+      case ((d, m), y) =>
+        List(SimpleDate(y, m, d))
     }
-    val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) =>
-      List(SimpleDate(y, m, d))
+    def pattern2(lang: Language) = (readMonth(lang) >> readDay >> readYear).map {
+      case ((m, d), y) =>
+        List(SimpleDate(y, m, d))
     }
 
     // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
     def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
+      val p0 = pattern0(lang)
+      val p1 = pattern1(lang)
+      val p2 = pattern2(lang)
       val p = lang match {
         case Language.English =>
           p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1)
-        case Language.German => p1.or(p0).or(p2)
-        case Language.French => p1.or(p0).or(p2)
+        case Language.German  => p1.or(p0).or(p2)
+        case Language.French  => p1.or(p0).or(p2)
+        case Language.Italian => p1.or(p0).or(p2)
       }
       p.read(parts) match {
         case Result.Success(sds, _) =>
@@ -76,9 +83,11 @@ object DateFind {
         }
       )
 
-    def readMonth: Reader[Int] =
+    def readMonth(lang: Language): Reader[Int] =
       Reader.readFirst(w =>
-        Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1)
+        Some(MonthName.getAll(lang).indexWhere(_.contains(w.value)))
+          .filter(_ >= 0)
+          .map(_ + 1)
       )
 
     def readDay: Reader[Int] =
@@ -150,20 +159,5 @@ object DateFind {
             Failure
         }
     }
-
-    private val months = List(
-      List("jan", "january", "januar", "01"),
-      List("feb", "february", "februar", "02"),
-      List("mar", "march", "märz", "marz", "03"),
-      List("apr", "april", "04"),
-      List("may", "mai", "05"),
-      List("jun", "june", "juni", "06"),
-      List("jul", "july", "juli", "07"),
-      List("aug", "august", "08"),
-      List("sep", "september", "09"),
-      List("oct", "october", "oktober", "10"),
-      List("nov", "november", "11"),
-      List("dec", "december", "dezember", "12")
-    )
   }
 }
diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
new file mode 100644
index 00000000..cf61cd72
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
@@ -0,0 +1,101 @@
+package docspell.analysis.date
+
+import docspell.common.Language
+
+object MonthName {
+  // Row i (0-based) of the result lists every accepted token for month i + 1.
+  def getAll(lang: Language): List[List[String]] =
+    merge(numbers, forLang(lang))
+  // zip stops at the shortest table, so every table must keep exactly 12 rows.
+  private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
+    ns.foldLeft(n0) { (res, el) =>
+      res.zip(el).map({ case (a, b) => a ++ b })
+    }
+
+  private def forLang(lang: Language): List[List[String]] =
+    lang match {
+      case Language.English =>
+        english
+      case Language.German =>
+        german
+      case Language.French =>
+        french
+      case Language.Italian =>
+        italian
+    }
+  // Two-digit month numbers; getAll merges them into the table of every language.
+  private val numbers = List(
+    List("01"),
+    List("02"),
+    List("03"),
+    List("04"),
+    List("05"),
+    List("06"),
+    List("07"),
+    List("08"),
+    List("09"),
+    List("10"),
+    List("11"),
+    List("12")
+  )
+  // DateFind.readMonth compares whole tokens, so each spelling needs its own entry.
+  private val english = List(
+    List("jan", "january"),
+    List("feb", "february"),
+    List("mar", "march"),
+    List("apr", "april"),
+    List("may"),
+    List("jun", "june"),
+    List("jul", "july"),
+    List("aug", "august"),
+    List("sep", "sept", "september"),
+    List("oct", "october"),
+    List("nov", "november"),
+    List("dec", "december")
+  )
+
+  private val german = List(
+    List("jan", "januar"),
+    List("feb", "februar"),
+    List("märz", "marz"),
+    List("apr", "april"),
+    List("mai"),
+    List("jun", "juni"),
+    List("jul", "juli"),
+    List("aug", "august"),
+    List("sep", "sept", "september"),
+    List("okt", "oktober"),
+    List("nov", "november"),
+    List("dez", "dezember")
+  )
+
+  private val french = List(
+    List("janv", "janvier"),
+    List("févr", "fevr", "février", "fevrier"),
+    List("mars"),
+    List("avril"),
+    List("mai"),
+    List("juin"),
+    List("juil", "juillet"),
+    List("aout", "août"),
+    List("sept", "septembre"),
+    List("oct", "octobre"),
+    List("nov", "novembre"),
+    List("dec", "déc", "décembre", "decembre")
+  )
+
+  private val italian = List(
+    List("genn", "gennaio"),
+    List("febbr", "febbraio"),
+    List("mar", "marzo"),
+    List("apr", "aprile"),
+    List("magg", "maggio"),
+    List("giugno"),
+    List("luglio"),
+    List("ag", "agosto"),
+    List("sett", "settembre"),
+    List("ott", "ottobre"),
+    List("nov", "novembre"),
+    List("dic", "dicembre")
+  )
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala
new file mode 100644
index 00000000..d509805a
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala
@@ -0,0 +1,98 @@
+package docspell.analysis.nlp
+
+import cats.effect.Sync
+import cats.implicits._
+import cats.{Applicative, FlatMap}
+
+import docspell.analysis.NlpSettings
+import docspell.common._
+
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+
+/** Analyses a text to mark certain parts with a `NerLabel`. */
+trait Annotator[F[_]] { self =>
+  def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]]
+  // Runs this annotator and then `next` on the same text; duplicate labels are dropped.
+  def ++(next: Annotator[F])(implicit F: FlatMap[F]): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        for {
+          n0 <- self.nerAnnotate(logger)(text)
+          n1 <- next.nerAnnotate(logger)(text)
+        } yield (n0 ++ n1).distinct
+    }
+}
+
+object Annotator {
+
+  /** Creates an annotator according to the given `mode` and `settings`.
+    *
+    * - disabled: a no-op annotator that always gives an empty list
+    * - full: the complete stanford pipeline, which already includes the regexNer-file
+    * - basic: only the ner classifier is used, followed by the regexNer-file if given
+    * - regex-only: only the regexNer-file is used
+    *
+    * If the language is not an `NLPLanguage`, full and basic use the regexNer-file
+    * alone. Whenever a mode has nothing to run (no supported language and no
+    * regexNer-file), the no-op annotator is returned.
+    */
+  def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
+    mode match {
+      case NlpMode.Disabled =>
+        Annotator.none[F]
+      case NlpMode.Full =>
+        StanfordNerSettings.fromNlpSettings(settings) match {
+          case Some(ss) =>
+            Annotator.pipeline(StanfordNerAnnotator.makePipeline(ss))
+          case None =>
+            Annotator.none[F]
+        }
+      case NlpMode.Basic =>
+        StanfordNerSettings.fromNlpSettings(settings) match {
+          case Some(StanfordNerSettings.Full(lang, _, Some(file))) =>
+            Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang)) ++
+              Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case Some(StanfordNerSettings.Full(lang, _, None)) =>
+            Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang))
+          case Some(StanfordNerSettings.RegexOnly(file)) =>
+            Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case None =>
+            Annotator.none[F]
+        }
+      case NlpMode.RegexOnly =>
+        settings.regexNer match {
+          case Some(file) =>
+            Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case None =>
+            Annotator.none[F]
+        }
+    }
+  // No-op annotator: logs a debug message and returns an empty vector.
+  def none[F[_]: Applicative]: Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        logger.debug("Running empty annotator. NLP not supported.") *>
+          Vector.empty[NerLabel].pure[F]
+    }
+  // Wraps the given CRF classifier; the classification call is suspended in F.
+  def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(
+          BasicCRFAnnotator.nerAnnotate(ann)(text)
+        )
+    }
+  // Wraps the given StanfordCoreNLP pipeline; the annotation call is suspended in F.
+  def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(StanfordNerAnnotator.nerAnnotate(cp, text))
+
+    }
+  // Clears the StanfordCoreNLP annotator pool and the BasicCRFAnnotator cache.
+  def clearCaches[F[_]: Sync]: F[Unit] =
+    Sync[F].delay {
+      StanfordCoreNLP.clearAnnotatorPool()
+      BasicCRFAnnotator.Cache.clearCache()
+    }
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala
index a6fb6af0..76ffe7c6 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala
@@ -7,9 +7,7 @@ import java.util.zip.GZIPInputStream
 import scala.jdk.CollectionConverters._
 import scala.util.Using
 
-import cats.Applicative
-import cats.effect.BracketThrow
-
+import docspell.common.Language.NLPLanguage
 import docspell.common._
 
 import edu.stanford.nlp.ie.AbstractSequenceClassifier
@@ -30,14 +28,6 @@ object BasicCRFAnnotator {
 
   type Annotator = AbstractSequenceClassifier[CoreLabel]
 
-  def nerAnnotate[F[_]: BracketThrow](
-      cacheKey: String,
-      cache: PipelineCache[F, Annotator]
-  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
-    cache
-      .obtain(cacheKey, settings)
-      .use(crf => Applicative[F].pure(nerAnnotate(crf)(text)))
-
   def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
     nerClassifier
       .classify(text)
@@ -52,7 +42,7 @@ object BasicCRFAnnotator {
       })
       .toVector
 
-  private def makeClassifier(lang: Language): Annotator = {
+  def makeAnnotator(lang: NLPLanguage): Annotator = {
     logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
     val ner = classifierResource(lang)
     Using(new GZIPInputStream(ner.openStream())) { in =>
@@ -60,7 +50,7 @@ object BasicCRFAnnotator {
     }.fold(throw _, identity)
   }
 
-  private def classifierResource(lang: Language): URL = {
+  private def classifierResource(lang: NLPLanguage): URL = {
     def check(name: String): URL =
       Option(getClass.getResource(name)) match {
         case None =>
@@ -79,11 +69,11 @@ object BasicCRFAnnotator {
   }
 
   final class Cache {
-    private[this] lazy val germanNerClassifier  = makeClassifier(Language.German)
-    private[this] lazy val englishNerClassifier = makeClassifier(Language.English)
-    private[this] lazy val frenchNerClassifier  = makeClassifier(Language.French)
+    private[this] lazy val germanNerClassifier  = makeAnnotator(Language.German)
+    private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
+    private[this] lazy val frenchNerClassifier  = makeAnnotator(Language.French)
 
-    def forLang(language: Language): Annotator =
+    def forLang(language: NLPLanguage): Annotator =
       language match {
         case Language.French  => frenchNerClassifier
         case Language.German  => germanNerClassifier
@@ -95,7 +85,7 @@ object BasicCRFAnnotator {
 
     private[this] val cacheRef = new AtomicReference[Cache](new Cache)
 
-    def getAnnotator(language: Language): Annotator =
+    def getAnnotator(language: NLPLanguage): Annotator =
       cacheRef.get().forLang(language)
 
     def clearCache(): Unit =
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
index 2b567548..3b38da22 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
@@ -3,14 +3,13 @@ package docspell.analysis.nlp
 import scala.concurrent.duration.{Duration => _, _}
 
 import cats.Applicative
-import cats.data.Kleisli
 import cats.effect._
 import cats.effect.concurrent.Ref
 import cats.implicits._
 
+import docspell.analysis.NlpSettings
 import docspell.common._
 
-import edu.stanford.nlp.pipeline.StanfordCoreNLP
 import org.log4s.getLogger
 
 /** Creating the StanfordCoreNLP pipeline is quite expensive as it
@@ -20,58 +19,32 @@ import org.log4s.getLogger
   *
   * **This is an internal API**
   */
-trait PipelineCache[F[_], A] {
+trait PipelineCache[F[_]] {
 
-  def obtain(key: String, settings: StanfordNerSettings): Resource[F, A]
+  def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]]
 
 }
 
 object PipelineCache {
   private[this] val logger = getLogger
 
-  def none[F[_]: Applicative, A](
-      creator: Kleisli[F, StanfordNerSettings, A]
-  ): PipelineCache[F, A] =
-    new PipelineCache[F, A] {
-      def obtain(
-          ignored: String,
-          settings: StanfordNerSettings
-      ): Resource[F, A] =
-        Resource.liftF(creator.run(settings))
-    }
-
-  def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)(
-      creator: StanfordNerSettings => A,
+  def apply[F[_]: Concurrent: Timer](clearInterval: Duration)(
+      creator: NlpSettings => Annotator[F],
       release: F[Unit]
-  ): F[PipelineCache[F, A]] =
+  ): F[PipelineCache[F]] =
     for {
-      data       <- Ref.of(Map.empty[String, Entry[A]])
+      data       <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
       cacheClear <- CacheClearing.create(data, clearInterval, release)
-    } yield new Impl[F, A](data, creator, cacheClear)
+      _          <- Logger.log4s(logger).info("Creating nlp pipeline cache")
+    } yield new Impl[F](data, creator, cacheClear)
 
-  def full[F[_]: Concurrent: Timer](
-      clearInterval: Duration
-  ): F[PipelineCache[F, StanfordCoreNLP]] =
-    apply(clearInterval)(
-      StanfordNerAnnotator.makePipeline,
-      StanfordNerAnnotator.clearPipelineCaches
-    )
-
-  def basic[F[_]: Concurrent: Timer](
-      clearInterval: Duration
-  ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] =
-    apply(clearInterval)(
-      settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang),
-      Sync[F].delay(BasicCRFAnnotator.Cache.clearCache())
-    )
-
-  final private class Impl[F[_]: Sync, A](
-      data: Ref[F, Map[String, Entry[A]]],
-      creator: StanfordNerSettings => A,
+  final private class Impl[F[_]: Sync](
+      data: Ref[F, Map[String, Entry[Annotator[F]]]],
+      creator: NlpSettings => Annotator[F],
       cacheClear: CacheClearing[F]
-  ) extends PipelineCache[F, A] {
+  ) extends PipelineCache[F] {
 
-    def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] =
+    def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] =
       for {
         _  <- cacheClear.withCache
         id <- Resource.liftF(makeSettingsId(settings))
@@ -83,10 +56,10 @@ object PipelineCache {
     private def getOrCreate(
         key: String,
         id: String,
-        cache: Map[String, Entry[A]],
-        settings: StanfordNerSettings,
-        creator: StanfordNerSettings => A
-    ): (Map[String, Entry[A]], A) =
+        cache: Map[String, Entry[Annotator[F]]],
+        settings: NlpSettings,
+        creator: NlpSettings => Annotator[F]
+    ): (Map[String, Entry[Annotator[F]]], Annotator[F]) =
       cache.get(key) match {
         case Some(entry) =>
           if (entry.id == id) (cache, entry.value)
@@ -105,7 +78,7 @@ object PipelineCache {
           (cache.updated(key, e), nlp)
       }
 
-    private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
+    private def makeSettingsId(settings: NlpSettings): F[String] = {
       val base = settings.copy(regexNer = None).toString
       val size: F[Long] =
         settings.regexNer match {
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
index 46a614d1..75fe9d36 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
@@ -1,9 +1,11 @@
 package docspell.analysis.nlp
 
+import java.nio.file.Path
 import java.util.{Properties => JProps}
 
 import docspell.analysis.nlp.Properties.Implicits._
 import docspell.common._
+import docspell.common.syntax.FileSyntax._
 
 object Properties {
 
@@ -17,18 +19,21 @@ object Properties {
     p
   }
 
-  def forSettings(settings: StanfordNerSettings): JProps = {
-    val regexNerFile = settings.regexNer
-      .map(p => p.normalize().toAbsolutePath().toString())
-    settings.lang match {
-      case Language.German =>
-        Properties.nerGerman(regexNerFile, settings.highRecall)
-      case Language.English =>
-        Properties.nerEnglish(regexNerFile)
-      case Language.French =>
-        Properties.nerFrench(regexNerFile, settings.highRecall)
+  def forSettings(settings: StanfordNerSettings): JProps =
+    settings match {
+      case StanfordNerSettings.Full(lang, highRecall, regexNer) =>
+        val regexNerFile = regexNer.map(p => p.absolutePathAsString)
+        lang match {
+          case Language.German =>
+            Properties.nerGerman(regexNerFile, highRecall)
+          case Language.English =>
+            Properties.nerEnglish(regexNerFile)
+          case Language.French =>
+            Properties.nerFrench(regexNerFile, highRecall)
+        }
+      case StanfordNerSettings.RegexOnly(path) =>
+        Properties.regexNerOnly(path)
     }
-  }
 
   def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
     Properties(
@@ -76,6 +81,11 @@ object Properties {
       "ner.model"                   -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
     ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
 
+  def regexNerOnly(regexNerMappingFile: Path): JProps =
+    Properties(
+      "annotators" -> "tokenize,ssplit"
+    ).withRegexNer(Some(regexNerMappingFile.absolutePathAsString))
+
   object Implicits {
     implicit final class JPropsOps(val p: JProps) extends AnyVal {
 
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala
index 37b54b40..2ec4e802 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala
@@ -1,8 +1,9 @@
 package docspell.analysis.nlp
 
+import java.nio.file.Path
+
 import scala.jdk.CollectionConverters._
 
-import cats.Applicative
 import cats.effect._
 
 import docspell.common._
@@ -24,24 +25,24 @@ object StanfordNerAnnotator {
     * a new classifier must be created. It will then replace the
     * previous one.
     */
-  def nerAnnotate[F[_]: BracketThrow](
-      cacheKey: String,
-      cache: PipelineCache[F, StanfordCoreNLP]
-  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
-    cache
-      .obtain(cacheKey, settings)
-      .use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
-
   def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
   }
 
-  def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = {
-    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
-    new StanfordCoreNLP(Properties.forSettings(settings))
-  }
+  def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP =
+    settings match {
+      case s: StanfordNerSettings.Full =>
+        logger.info(s"Creating ${s.lang.name} Stanford NLP NER classifier...")
+        new StanfordCoreNLP(Properties.forSettings(settings))
+      case StanfordNerSettings.RegexOnly(path) =>
+        logger.info(s"Creating regexNer-only Stanford NLP NER classifier...")
+        regexNerPipeline(path)
+    }
+
+  def regexNerPipeline(regexNerFile: Path): StanfordCoreNLP =
+    new StanfordCoreNLP(Properties.regexNerOnly(regexNerFile))
 
   def clearPipelineCaches[F[_]: Sync]: F[Unit] =
     Sync[F].delay {
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
index 06136a18..fd0a7ecd 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
@@ -2,25 +2,41 @@ package docspell.analysis.nlp
 
 import java.nio.file.Path
 
-import docspell.common._
+import docspell.analysis.NlpSettings
+import docspell.common.Language.NLPLanguage
 
-/** Settings for configuring the stanford NER pipeline.
-  *
-  * The language is mandatory, only the provided ones are supported.
-  * The `highRecall` only applies for non-English languages. For
-  * non-English languages the english classifier is run as second
-  * classifier and if `highRecall` is true, then it will be used to
-  * tag untagged tokens. This may lead to a lot of false positives,
-  * but since English is omnipresent in other languages, too it
-  * depends on the use case for whether this is useful or not.
-  *
-  * The `regexNer` allows to specify a text file as described here:
-  * https://nlp.stanford.edu/software/regexner.html. This will be used
-  * as a last step to tag untagged tokens using the provided list of
-  * regexps.
-  */
-case class StanfordNerSettings(
-    lang: Language,
-    highRecall: Boolean,
-    regexNer: Option[Path]
-)
+sealed trait StanfordNerSettings
+
+object StanfordNerSettings {
+
+  /** Settings for configuring the stanford NER pipeline.
+    *
+    * The language is mandatory, only the provided ones are supported.
+    * The `highRecall` only applies for non-English languages. For
+    * non-English languages the english classifier is run as second
+    * classifier and if `highRecall` is true, then it will be used to
+    * tag untagged tokens. This may lead to a lot of false positives,
+    * but since English is omnipresent in other languages, too it
+    * depends on the use case for whether this is useful or not.
+    *
+    * The `regexNer` allows to specify a text file as described here:
+    * https://nlp.stanford.edu/software/regexner.html. This will be used
+    * as a last step to tag untagged tokens using the provided list of
+    * regexps.
+    */
+  case class Full(
+      lang: NLPLanguage,
+      highRecall: Boolean,
+      regexNer: Option[Path]
+  ) extends StanfordNerSettings
+
+  /** For languages without predefined statistical models: only a regexNer file is used.
+    */
+  case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings
+  // Full if `ns.lang` is an NLPLanguage; else RegexOnly if a regexNer file is set; else None.
+  def fromNlpSettings(ns: NlpSettings): Option[StanfordNerSettings] =
+    NLPLanguage.all
+      .find(nl => nl == ns.lang)
+      .map(nl => Full(nl, ns.highRecall, ns.regexNer))
+      .orElse(ns.regexNer.map(nrf => RegexOnly(nrf)))
+}
diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
index 0abab7e9..2f0cab57 100644
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
@@ -1,12 +1,13 @@
 package docspell.analysis.nlp
 
+import docspell.common.Language.NLPLanguage
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
 
 object BaseCRFAnnotatorSuite extends SimpleTestSuite {
 
-  def annotate(language: Language): String => Vector[NerLabel] =
+  def annotate(language: NLPLanguage): String => Vector[NerLabel] =
     BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
 
   test("find english ner labels") {
diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
index 1704ef1b..416cdff7 100644
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
@@ -1,8 +1,12 @@
 package docspell.analysis.nlp
 
+import java.nio.file.Paths
+
+import cats.effect.IO
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import docspell.common.syntax.FileSyntax._
 import edu.stanford.nlp.pipeline.StanfordCoreNLP
 
 object StanfordNerAnnotatorSuite extends SimpleTestSuite {
@@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
     assertEquals(labels, expect)
     StanfordCoreNLP.clearAnnotatorPool()
   }
+
+  test("regexner-only annotator") {
+    val regexNerContent =
+      s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |""".stripMargin
+
+    File
+      .withTempDir[IO](Paths.get("target"), "test-regex-ner")
+      .use { dir =>
+        for {
+          out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
+          ann    = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
+          labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
+          _ <- IO(
+            assertEquals(
+              labels,
+              Vector(
+                NerLabel("Andrea", NerTag.Person, 6, 12),
+                NerLabel("Rossi", NerTag.Person, 13, 18)
+              )
+            )
+          )
+        } yield ()
+      }
+      .unsafeRunSync()
+    StanfordCoreNLP.clearAnnotatorPool()
+  }
 }
diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala
index 92c32f4b..f18d4adf 100644
--- a/modules/common/src/main/scala/docspell/common/Language.scala
+++ b/modules/common/src/main/scala/docspell/common/Language.scala
@@ -1,5 +1,7 @@
 package docspell.common
 
+import cats.data.NonEmptyList
+
 import io.circe.{Decoder, Encoder}
 
 sealed trait Language { self: Product =>
@@ -11,28 +13,41 @@ sealed trait Language { self: Product =>
 
   def iso3: String
 
+  val allowsNLP: Boolean = false
+
   private[common] def allNames =
     Set(name, iso3, iso2)
 }
 
 object Language {
+  sealed trait NLPLanguage extends Language with Product {
+    override val allowsNLP = true
+  }
+  object NLPLanguage {
+    val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
+  }
 
-  case object German extends Language {
+  case object German extends NLPLanguage {
     val iso2 = "de"
     val iso3 = "deu"
   }
 
-  case object English extends Language {
+  case object English extends NLPLanguage {
     val iso2 = "en"
     val iso3 = "eng"
   }
 
-  case object French extends Language {
+  case object French extends NLPLanguage {
     val iso2 = "fr"
     val iso3 = "fra"
   }
 
-  val all: List[Language] = List(German, English, French)
+  case object Italian extends Language {
+    val iso2 = "it"
+    val iso3 = "ita"
+  }
+
+  val all: List[Language] = List(German, English, French, Italian)
 
   def fromString(str: String): Either[String, Language] = {
     val lang = str.toLowerCase
diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala
index 36ebf7db..013b2275 100644
--- a/modules/common/src/main/scala/docspell/common/NlpMode.scala
+++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala
@@ -6,16 +6,18 @@ sealed trait NlpMode { self: Product =>
     self.productPrefix
 }
 object NlpMode {
-  case object Full     extends NlpMode
-  case object Basic    extends NlpMode
-  case object Disabled extends NlpMode
+  case object Full      extends NlpMode
+  case object Basic     extends NlpMode
+  case object RegexOnly extends NlpMode
+  case object Disabled  extends NlpMode
 
   def fromString(name: String): Either[String, NlpMode] =
     name.toLowerCase match {
-      case "full"     => Right(Full)
-      case "basic"    => Right(Basic)
-      case "disabled" => Right(Disabled)
-      case _          => Left(s"Unknown nlp-mode: $name")
+      case "full"      => Right(Full)
+      case "basic"     => Right(Basic)
+      case "regexonly" => Right(RegexOnly)
+      case "disabled"  => Right(Disabled)
+      case _           => Left(s"Unknown nlp-mode: $name")
     }
 
   def unsafeFromString(name: String): NlpMode =
diff --git a/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala
new file mode 100644
index 00000000..6eef143b
--- /dev/null
+++ b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala
@@ -0,0 +1,20 @@
+package docspell.common.syntax
+
+import java.nio.file.Path
+
+trait FileSyntax {
+
+  implicit final class PathOps(p: Path) {
+
+    def absolutePath: Path =
+      p.normalize().toAbsolutePath
+
+    def absolutePathAsString: String =
+      absolutePath.toString
+
+    def /(next: String): Path =
+      p.resolve(next)
+  }
+}
+
+object FileSyntax extends FileSyntax
diff --git a/modules/common/src/main/scala/docspell/common/syntax/package.scala b/modules/common/src/main/scala/docspell/common/syntax/package.scala
index 77e17039..8d512741 100644
--- a/modules/common/src/main/scala/docspell/common/syntax/package.scala
+++ b/modules/common/src/main/scala/docspell/common/syntax/package.scala
@@ -2,6 +2,11 @@ package docspell.common
 
 package object syntax {
 
-  object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
+  object all
+      extends EitherSyntax
+      with StreamSyntax
+      with StringSyntax
+      with LoggerSyntax
+      with FileSyntax
 
 }
diff --git a/modules/files/src/test/resources/examples/letter-ita.txt b/modules/files/src/test/resources/examples/letter-ita.txt
new file mode 100644
index 00000000..cca09122
--- /dev/null
+++ b/modules/files/src/test/resources/examples/letter-ita.txt
@@ -0,0 +1,13 @@
+Pontremoli, 9 aprile 2013
+
+Spettabile Villa Albicocca
+Via Francigena, 9
+55100 Pontetetto (LU)
+
+Oggetto: Prenotazione
+
+Gentile Direttore,
+
+Vorrei prenotare una camera matrimoniale …….
+
+In attesa di una Sua pronta risposta, La saluto cordialmente
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala
index 2306a44d..345f4665 100644
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala
@@ -24,6 +24,7 @@ object Field {
   val content_de     = Field("content_de")
   val content_en     = Field("content_en")
   val content_fr     = Field("content_fr")
+  val content_it     = Field("content_it")
   val itemName       = Field("itemName")
   val itemNotes      = Field("itemNotes")
   val folderId       = Field("folder")
@@ -36,6 +37,8 @@ object Field {
         Field.content_en
       case Language.French =>
         Field.content_fr
+      case Language.Italian =>
+        Field.content_it
     }
 
   implicit val jsonEncoder: Encoder[Field] =
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
index 1e3b09b3..0b7e6e31 100644
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
@@ -40,6 +40,7 @@ object SolrQuery {
             Field.content_de,
             Field.content_en,
             Field.content_fr,
+            Field.content_it,
             Field.itemName,
             Field.itemNotes,
             Field.attachmentName
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
index 3deba577..769919bd 100644
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
@@ -63,6 +63,12 @@ object SolrSetup {
             solrEngine,
             "Index all from database",
             FtsMigration.Result.indexAll.pure[F]
+          ),
+          FtsMigration[F](
+            7,
+            solrEngine,
+            "Add content_it field",
+            addContentItField.map(_ => FtsMigration.Result.reIndexAll)
           )
         )
 
@@ -72,6 +78,9 @@ object SolrSetup {
       def addContentFrField: F[Unit] =
         addTextField(Some(Language.French))(Field.content_fr)
 
+      def addContentItField: F[Unit] =
+        addTextField(Some(Language.Italian))(Field.content_it)
+
       def setupCoreSchema: F[Unit] = {
         val cmds0 =
           List(
@@ -90,13 +99,15 @@ object SolrSetup {
         )
           .traverse(addTextField(None))
 
-        val cntLang = Language.all.traverse {
+        val cntLang = List(Language.German, Language.English, Language.French).traverse {
           case l @ Language.German =>
             addTextField(l.some)(Field.content_de)
           case l @ Language.English =>
             addTextField(l.some)(Field.content_en)
           case l @ Language.French =>
             addTextField(l.some)(Field.content_fr)
+          case _ =>
+            ().pure[F]
         }
 
         cmds0 *> cmds1 *> cntLang *> ().pure[F]
@@ -125,6 +136,9 @@ object SolrSetup {
           case Some(Language.French) =>
             run(DeleteField.command(DeleteField(field))).attempt *>
               run(AddField.command(AddField.textFR(field)))
+          case Some(Language.Italian) =>
+            run(DeleteField.command(DeleteField(field))).attempt *>
+              run(AddField.command(AddField.textIT(field)))
         }
     }
   }
@@ -161,6 +175,9 @@ object SolrSetup {
 
     def textFR(field: Field): AddField =
       AddField(field, "text_fr", true, true, false)
+
+    def textIT(field: Field): AddField =
+      AddField(field, "text_it", true, true, false)
   }
 
   case class DeleteField(name: Field)
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 583b40b1..a495ea5a 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -277,7 +277,39 @@ docspell.joex {
     # files.
     working-dir = ${java.io.tmpdir}"/docspell-analysis"
 
-    nlp-config {
+    nlp {
+      # The mode for configuring NLP models:
+      #
+      # 1. full – builds the complete pipeline
+      # 2. basic - builds only the ner annotator
+      # 3. regexonly - matches each entry in your address book via regexps
+      # 4. disabled - doesn't use any stanford-nlp feature
+      #
+      # The full and basic variants rely on pre-built language models
+      # that are available for only 3 languages at the moment: German,
+      # English and French.
+      #
+      # Memory usage varies greatly among the languages. German has
+      # quite large models, that require about 1G heap. So joex should
+      # run with -Xmx1500M at least when using mode=full.
+      #
+      # The basic variant does a quite good job for German and
+      # English. It might be worse for French, always depending on the
+      # type of text that is analysed. Joex should run with about 600M
+      # heap; here again, the German language uses the most.
+      #
+      # The regexonly variant doesn't depend on a language. It roughly
+      # works by converting all entries in your addressbook into
+      # regexps and matches each one against the text. This can get
+      # memory intensive, too, when the addressbook grows large. This
+      # is included in the full and basic by default, but can be used
+      # independently by setting mode=regexonly.
+      #
+      # When mode=disabled, then the whole nlp pipeline is disabled,
+      # and you won't get any suggestions. Only what the classifier
+      # returns (if enabled).
+      mode = full
+
       # The StanfordCoreNLP library caches language models which
       # requires quite some amount of memory. Setting this interval to a
       # positive duration, the cache is cleared after this amount of
@@ -287,37 +319,28 @@ docspell.joex {
       # This has only any effect, if mode != disabled.
       clear-interval = "15 minutes"
 
-      # The mode for configuring NLP models. Currently 3 are available:
-      #
-      # 1. full – builds the complete pipeline, run with -Xmx1500M or more
-      # 2. basic - builds only the ner annotator, run with -Xmx600M or more
-      # 3. disabled - doesn't use any stanford-nlp feature
-      #
-      # The basic variant does a quite good job for German and
-      # English. It might be worse for French, always depending on the
-      # type of text that is analysed.
-      mode = full
-    }
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the
+        # address book of a collective as input for NER tagging (to
+        # automatically find correspondent and concerned entities). If
+        # the address book is large, this can be quite memory
+        # intensive and also makes text analysis much slower. But it
+        # improves accuracy and can be used independent of the
+        # language. If this is set to 0, it is effectively disabled
+        # and NER tagging uses only statistical models (that also work
+        # quite well, but are restricted to the languages mentioned
+        # above).
+        #
+        # Note, this is only relevant if nlp.mode is not
+        # "disabled".
+        max-entries = 1000
 
-    regex-ner {
-      # Whether to enable custom NER annotation. This uses the address
-      # book of a collective as input for NER tagging (to automatically
-      # find correspondent and concerned entities). If the address book
-      # is large, this can be quite memory intensive and also makes text
-      # analysis slower. But it greatly improves accuracy. If this is
-      # false, NER tagging uses only statistical models (that also work
-      # quite well).
-      #
-      # This setting might be moved to the collective settings in the
-      # future.
-      #
-      # Note, this is only relevant if nlp-config.mode = full.
-      enabled = true
-
-      # The NER annotation uses a file of patterns that is derived from
-      # a collective's address book. This is is the time how long this
-      # file will be kept until a check for a state change is done.
-      file-cache-time = "1 minute"
+        # The NER annotation uses a file of patterns that is derived
+        # from a collective's address book. This is the time how
+        # long this data will be kept until a check for a state change
+        # is done.
+        file-cache-time = "1 minute"
+      }
     }
 
     # Settings for doing document classification.
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 5b2bccc5..4ad72d7c 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -60,15 +60,14 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      nlpConfig: TextAnalysisConfig.NlpConfig,
-      regexNer: RegexNer,
+      nlp: NlpConfig,
       classification: Classification
   ) {
 
     def textAnalysisConfig: TextAnalysisConfig =
       TextAnalysisConfig(
         maxLength,
-        nlpConfig,
+        TextAnalysisConfig.NlpConfig(nlp.clearInterval, nlp.mode),
         TextClassifierConfig(
           workingDir,
           NonEmptyList
@@ -78,10 +77,16 @@ object Config {
       )
 
     def regexNerFileConfig: RegexNerFile.Config =
-      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+      RegexNerFile.Config(
+        nlp.regexNer.maxEntries,
+        workingDir,
+        nlp.regexNer.fileCacheTime
+      )
   }
 
-  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
+  case class NlpConfig(mode: NlpMode, clearInterval: Duration, regexNer: RegexNer)
+
+  case class RegexNer(maxEntries: Int, fileCacheTime: Duration)
 
   case class Classification(
       enabled: Boolean,
diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
index 24e7f6ae..56e48012 100644
--- a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
+++ b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
@@ -29,7 +29,7 @@ trait RegexNerFile[F[_]] {
 object RegexNerFile {
   private[this] val logger = getLogger
 
-  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+  case class Config(maxEntries: Int, directory: Path, minTime: Duration)
 
   def apply[F[_]: Concurrent: ContextShift](
       cfg: Config,
@@ -49,7 +49,7 @@ object RegexNerFile {
   ) extends RegexNerFile[F] {
 
     def makeFile(collective: Ident): F[Option[Path]] =
-      if (cfg.enabled) doMakeFile(collective)
+      if (cfg.maxEntries > 0) doMakeFile(collective)
       else (None: Option[Path]).pure[F]
 
     def doMakeFile(collective: Ident): F[Option[Path]] =
@@ -127,7 +127,7 @@ object RegexNerFile {
 
       for {
         _     <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
-        names <- store.transact(QCollective.allNames(collective))
+        names <- store.transact(QCollective.allNames(collective, cfg.maxEntries))
         nerFile = NerFile(collective, lastUpdate, now)
         _ <- update(nerFile, NerFile.mkNerConfig(names))
       } yield nerFile
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index 1fd2401a..f336132d 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -4,9 +4,8 @@ import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
 
-import docspell.analysis.TextAnalyser
 import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
-import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.{NlpSettings, TextAnalyser}
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
@@ -54,7 +53,7 @@ object TextAnalysis {
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
+    val settings = NlpSettings(ctx.args.meta.language, false, None)
     for {
       customNer <- nerFile.makeFile(ctx.args.meta.collective)
       sett = settings.copy(regexNer = customNer)
diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
index b9fe40c7..84caa840 100644
--- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
@@ -1,10 +1,8 @@
 package docspell.store.queries
 
-import cats.data.OptionT
 import fs2.Stream
 
-import docspell.common.ContactKind
-import docspell.common.{Direction, Ident}
+import docspell.common._
 import docspell.store.qb.DSL._
 import docspell.store.qb._
 import docspell.store.records._
@@ -17,6 +15,7 @@ object QCollective {
   private val t  = RTag.as("t")
   private val ro = ROrganization.as("o")
   private val rp = RPerson.as("p")
+  private val re = REquipment.as("e")
   private val rc = RContact.as("c")
   private val i  = RItem.as("i")
 
@@ -25,13 +24,37 @@ object QCollective {
     val empty = Names(Vector.empty, Vector.empty, Vector.empty)
   }
 
-  def allNames(collective: Ident): ConnectionIO[Names] =
-    (for {
-      orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
-      pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
-      equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
-    } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
-      .getOrElse(Names.empty)
+  def allNames(collective: Ident, maxEntries: Int): ConnectionIO[Names] = {
+    val created = Column[Timestamp]("created", TableDef(""))
+    union(
+      Select(
+        select(ro.name.s, lit(1).as("kind"), ro.created.as(created)),
+        from(ro),
+        ro.cid === collective
+      ),
+      Select(
+        select(rp.name.s, lit(2).as("kind"), rp.created.as(created)),
+        from(rp),
+        rp.cid === collective
+      ),
+      Select(
+        select(re.name.s, lit(3).as("kind"), re.created.as(created)),
+        from(re),
+        re.cid === collective
+      )
+    ).orderBy(created.desc)
+      .limit(Batch.limit(maxEntries))
+      .build
+      .query[(String, Int)]
+      .streamWithChunkSize(maxEntries)
+      .fold(Names.empty) { case (names, (name, kind)) =>
+        if (kind == 1) names.copy(org = names.org :+ name)
+        else if (kind == 2) names.copy(pers = names.pers :+ name)
+        else names.copy(equip = names.equip :+ name)
+      }
+      .compile
+      .lastOrError
+  }
 
   case class InsightData(
       incoming: Int,
diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm
index 40fe5eb2..c7e04b7b 100644
--- a/modules/webapp/src/main/elm/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Data/Language.elm
@@ -11,6 +11,7 @@ type Language
     = German
     | English
     | French
+    | Italian
 
 
 fromString : String -> Maybe Language
@@ -24,6 +25,8 @@ fromString str =
     else if str == "fra" || str == "fr" || str == "french" then
         Just French
 
+    else if str == "ita" || str == "it" || str == "italian" then
+        Just Italian
     else
         Nothing
 
@@ -40,6 +43,9 @@ toIso3 lang =
         French ->
             "fra"
 
+        Italian ->
+            "ita"
+
 
 toName : Language -> String
 toName lang =
@@ -53,7 +59,10 @@ toName lang =
         French ->
             "French"
 
+        Italian ->
+            "Italian"
+
 
 all : List Language
 all =
-    [ German, English, French ]
+    [ German, English, French, Italian ]
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index 373a6aed..aae8d835 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -98,9 +98,13 @@ let
     };
     text-analysis = {
       max-length = 10000;
-      regex-ner = {
-        enabled = true;
-        file-cache-time = "1 minute";
+      nlp = {
+        mode = "full";
+        clear-interval = "15 minutes";
+        regex-ner = {
+          max-entries = 1000;
+          file-cache-time = "1 minute";
+        };
       };
       classification = {
         enabled = true;
@@ -118,7 +122,6 @@ let
         ];
       };
       working-dir = "/tmp/docspell-analysis";
-      clear-stanford-nlp-interval = "15 minutes";
     };
     processing = {
       max-due-date-years = 10;
@@ -772,47 +775,96 @@ in {
                 files.
               '';
             };
-            clear-stanford-nlp-interval = mkOption {
-              type = types.str;
-              default = defaults.text-analysis.clear-stanford-nlp-interval;
-              description = ''
-                Idle time after which the NLP caches are cleared to free
-                memory. If <= 0 clearing the cache is disabled.
-              '';
-            };
 
-            regex-ner = mkOption {
+            nlp = mkOption {
               type = types.submodule({
                 options = {
-                  enabled = mkOption {
-                    type = types.bool;
-                    default = defaults.text-analysis.regex-ner.enabled;
+                  mode = mkOption {
+                    type = types.str;
+                    default = defaults.text-analysis.nlp.mode;
                     description = ''
-                      Whether to enable custom NER annotation. This uses the address
-                      book of a collective as input for NER tagging (to automatically
-                      find correspondent and concerned entities). If the address book
-                      is large, this can be quite memory intensive and also makes text
-                      analysis slower. But it greatly improves accuracy. If this is
-                      false, NER tagging uses only statistical models (that also work
-                      quite well).
+                      The mode for configuring NLP models:
 
-                      This setting might be moved to the collective settings in the
-                      future.
+                      1. full – builds the complete pipeline
+                      2. basic - builds only the ner annotator
+                      3. regexonly - matches each entry in your address book via regexps
+                      4. disabled - doesn't use any stanford-nlp feature
+
+                      The full and basic variants rely on pre-built language models
+                      that are available for only 3 languages at the moment: German,
+                      English and French.
+
+                      Memory usage varies greatly among the languages. German has
+                      quite large models, that require about 1G heap. So joex should
+                      run with -Xmx1500M at least when using mode=full.
+
+                      The basic variant does a quite good job for German and
+                      English. It might be worse for French, always depending on the
+                      type of text that is analysed. Joex should run with about 600M
+                      heap; here again, the German language uses the most.
+
+                      The regexonly variant doesn't depend on a language. It roughly
+                      works by converting all entries in your addressbook into
+                      regexps and matches each one against the text. This can get
+                      memory intensive, too, when the addressbook grows large. This
+                      is included in the full and basic by default, but can be used
+                      independently by setting mode=regexonly.
+
+                      When mode=disabled, then the whole nlp pipeline is disabled,
+                      and you won't get any suggestions. Only what the classifier
+                      returns (if enabled).
                     '';
                   };
-                  file-cache-time = mkOption {
+
+                  clear-interval = mkOption {
                     type = types.str;
-                    default = defaults.text-analysis.ner-file-cache-time;
+                    default = defaults.text-analysis.nlp.clear-interval;
                     description = ''
-                      The NER annotation uses a file of patterns that is derived from
-                      a collective's address book. This is is the time how long this
-                      file will be kept until a check for a state change is done.
+                      Idle time after which the NLP caches are cleared to free
+                      memory. If <= 0 clearing the cache is disabled.
                     '';
                   };
+
+                  regex-ner = mkOption {
+                    type = types.submodule({
+                      options = {
+                        max-entries = mkOption {
+                          type = types.int;
+                          default = defaults.text-analysis.nlp.regex-ner.max-entries;
+                          description = ''
+                            Whether to enable custom NER annotation. This uses the
+                            address book of a collective as input for NER tagging (to
+                            automatically find correspondent and concerned entities). If
+                            the address book is large, this can be quite memory
+                            intensive and also makes text analysis much slower. But it
+                            improves accuracy and can be used independent of the
+                            language. If this is set to 0, it is effectively disabled
+                            and NER tagging uses only statistical models (that also work
+                            quite well, but are restricted to the languages mentioned
+                            above).
+
+                            Note, this is only relevant if nlp.mode is not
+                            "disabled".
+                          '';
+                        };
+                        file-cache-time = mkOption {
+                          type = types.str;
+                          default = defaults.text-analysis.nlp.regex-ner.file-cache-time;
+                          description = ''
+                            The NER annotation uses a file of patterns that is derived from
+                            a collective's address book. This is the time how long this
+                            file will be kept until a check for a state change is done.
+                          '';
+                        };
+                      };
+                    });
+                    default = defaults.text-analysis.nlp.regex-ner;
+                    description = "";
+                  };
                 };
               });
-              default = defaults.text-analysis.regex-ner;
-              description = "";
+              default = defaults.text-analysis.nlp;
+              description = "Configure NLP";
             };
 
             classification = mkOption {