Use collective data with NER annotation

Eike Kettner 2020-08-24 23:25:57 +02:00
parent de5b33c40d
commit 3473cbb773
12 changed files with 413 additions and 76 deletions

View File

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {
   def hours: Long = minutes / 60
 
+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@@ -62,6 +68,9 @@ object Duration {
   def nanos(n: Long): Duration =
     Duration(n)
 
+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]
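Note: the new comparison operators and `Duration.between` exist to support the file-cache check in `RegexNerFile` further below. A minimal sketch of that use, assuming a cached `NerFile` with a `creation` timestamp and the configured `minTime`:

    // Age of the cached NER pattern file; regenerate when it is older than
    // the configured cache time (cf. RegexNerFile.doMakeFile below).
    val age: Duration = Duration.between(nerFile.creation, now)
    val expired: Boolean = age > cfg.minTime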

View File

@@ -12,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream
 
+import docspell.common.syntax.all._
+import io.circe.Decoder
+
 object File {
 
   def mkDir[F[_]: Sync](dir: Path): F[Path] =
@@ -91,4 +95,10 @@ object File {
 
   def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
     Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
 }
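`readJson` builds on the existing `readText` and circe: `parseJsonAs` comes from `docspell.common.syntax.all._` and yields an `Either`, which `rethrow` lifts into the effect. A short usage sketch, mirroring how `NerFile.find` below reads its cached state file:

    // Decode a JSON file into a value, assuming an implicit io.circe.Decoder
    // for the target type is in scope.
    val loaded: F[NerFile] = File.readJson[F, NerFile](file, blocker)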

View File

@@ -248,6 +248,29 @@ docspell.joex {
       # should suffice. Default is 10000, which are about 2-3 pages
       # (just a rough guess, of course).
       max-length = 10000
+
+      # A working directory for the analyser to store temporary/working
+      # files.
+      working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the address
+        # book of a collective as input for NER tagging (to automatically
+        # find correspondent and concerned entities). If the address book
+        # is large, this can be quite memory intensive and also makes text
+        # analysis slower. But it greatly improves accuracy. If this is
+        # false, NER tagging uses only statistical models (which also work
+        # quite well).
+        #
+        # This setting might be moved to the collective settings in the
+        # future.
+        enabled = true
+
+        # The NER annotation uses a file of patterns that is derived from
+        # a collective's address book. This is how long that file is kept
+        # until a check for a state change is done.
+        file-cache-time = "1 minute"
+      }
     }
 
     # Configuration for converting files into PDFs.

View File

@@ -1,11 +1,14 @@
 package docspell.joex
 
+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,
@@ -50,4 +53,19 @@ object Config {
   }
 
   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }
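A rough sketch of the value the `text-analysis` block from the config hunk above produces, hand-built here for illustration (docspell loads it from the config file; a `Duration.minutes` constructor is assumed):

    // Hypothetical equivalent of the defaults shown in joex.conf above.
    val textAnalysis = Config.TextAnalysis(
      maxLength = 10000,
      workingDir = java.nio.file.Paths.get("/tmp/docspell-analysis"),
      regexNer = Config.RegexNer(enabled = true, fileCacheTime = Duration.minutes(1))
    )
    textAnalysis.textAnalysisConfig // consumed by TextAnalyser.create
    textAnalysis.regexNerFileConfig // consumed by RegexNerFile.apply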

View File

@@ -11,6 +11,7 @@ import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
@@ -89,7 +90,8 @@ object JoexAppImpl {
       upload <- OUpload(store, queue, cfg.files, joex)
       fts <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
-      analyser <- TextAnalyser.create[F](cfg.textAnalysis)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -97,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts, analyser),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts, analyser),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )

View File

@@ -0,0 +1,99 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.implicits._
+
+import docspell.analysis.split.TextSplitter
+import docspell.common._
+import docspell.store.queries.QCollective
+
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
+  def nerFilePath(directory: Path): Path =
+    NerFile.nerFilePath(directory, collective)
+
+  def jsonFilePath(directory: Path) =
+    NerFile.jsonFilePath(directory, collective)
+}
+
+object NerFile {
+  implicit val jsonDecoder: Decoder[NerFile] =
+    deriveDecoder[NerFile]
+
+  implicit val jsonEncoder: Encoder[NerFile] =
+    deriveEncoder[NerFile]
+
+  private def nerFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.txt")
+
+  private def jsonFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.json")
+
+  def find[F[_]: Sync: ContextShift](
+      collective: Ident,
+      directory: Path,
+      blocker: Blocker
+  ): F[Option[NerFile]] = {
+    val file = jsonFilePath(directory, collective)
+    File.existsNonEmpty[F](file).flatMap {
+      case true =>
+        File
+          .readJson[F, NerFile](file, blocker)
+          .map(_.some)
+      case false =>
+        (None: Option[NerFile]).pure[F]
+    }
+  }
+
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+
+  case class Pattern(value: String, weight: Int) {
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter
+          .split(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .map(w => s"(?i)${w}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${w}")
+
+      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
+}
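To make the generated mapping format concrete, a sketch with hypothetical address-book names (tabs shown as `<TAB>`; the token patterns depend on `TextSplitter.splitToken` and are abbreviated):

    // Assuming QCollective.Names simply carries the collective's name lists.
    val names = QCollective.Names(
      org = Vector("Acme GmbH"),
      pers = Vector("Jane Doe"),
      equip = Vector("Printer")
    )
    NerFile.mkNerConfig(names)
    // (?i)acme (?i)gmbh<TAB>ORGANIZATION<TAB>LOCATION,PERSON,MISC<TAB>3
    // ...token patterns for "Acme GmbH"...
    // (?i)jane (?i)doe<TAB>PERSON<TAB>LOCATION,MISC<TAB>2
    // ...token patterns for "Jane Doe"...
    // (?i)printer<TAB>MISC<TAB>LOCATION<TAB>1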

View File

@@ -0,0 +1,164 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Semaphore
+import cats.implicits._
+
+import docspell.common._
+import docspell.common.syntax.all._
+import docspell.store.Store
+import docspell.store.queries.QCollective
+import docspell.store.records.REquipment
+import docspell.store.records.ROrganization
+import docspell.store.records.RPerson
+
+import io.circe.syntax._
+import org.log4s.getLogger
+
+/** Maintains a custom regex-ner file per collective for stanford's
+  * regexner annotator.
+  */
+trait RegexNerFile[F[_]] {
+
+  def makeFile(collective: Ident): F[Option[Path]]
+
+}
+
+object RegexNerFile {
+  private[this] val logger = getLogger
+
+  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+
+  def apply[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F]
+  ): Resource[F, RegexNerFile[F]] =
+    for {
+      dir <- File.withTempDir[F](cfg.directory, "regexner-")
+      writer <- Resource.liftF(Semaphore(1))
+    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
+
+  final private class Impl[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F],
+      writer: Semaphore[F] //TODO allow parallelism per collective
+  ) extends RegexNerFile[F] {
+
+    def makeFile(collective: Ident): F[Option[Path]] =
+      if (cfg.enabled) doMakeFile(collective)
+      else (None: Option[Path]).pure[F]
+
+    def doMakeFile(collective: Ident): F[Option[Path]] =
+      for {
+        now <- Timestamp.current[F]
+        existing <- NerFile.find[F](collective, cfg.directory, blocker)
+        result <- existing match {
+          case Some(nf) =>
+            val dur = Duration.between(nf.creation, now)
+            if (dur > cfg.minTime)
+              logger.fdebug(
+                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
+              ) *> updateFile(
+                collective,
+                now,
+                Some(nf)
+              )
+            else nf.nerFilePath(cfg.directory).some.pure[F]
+          case None =>
+            updateFile(collective, now, None)
+        }
+      } yield result
+
+    private def updateFile(
+        collective: Ident,
+        now: Timestamp,
+        current: Option[NerFile]
+    ): F[Option[Path]] =
+      for {
+        lastUpdate <- store.transact(Sql.latestUpdate(collective))
+        result <- lastUpdate match {
+          case None =>
+            (None: Option[Path]).pure[F]
+          case Some(lup) =>
+            current match {
+              case Some(cur) =>
+                val nerf =
+                  if (cur.updated == lup)
+                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
+                      cur,
+                      now
+                    ) *> cur.pure[F]
+                  else
+                    logger.fdebug(
+                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
+                    ) *> createFile(lup, collective, now)
+                nerf.map(_.nerFilePath(cfg.directory).some)
+              case None =>
+                createFile(lup, collective, now)
+                  .map(_.nerFilePath(cfg.directory).some)
+            }
+        }
+      } yield result
+
+    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
+      writer.withPermit(for {
+        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+        _ <- File.mkDir(file.getParent)
+        _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
+      } yield ())
+
+    private def createFile(
+        lastUpdate: Timestamp,
+        collective: Ident,
+        now: Timestamp
+    ): F[NerFile] = {
+      def update(nf: NerFile, text: String): F[Unit] =
+        writer.withPermit(for {
+          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
+          _ <- File.mkDir(jsonFile.getParent)
+          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
+          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
+        } yield ())
+
+      for {
+        _ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
+        names <- store.transact(QCollective.allNames(collective))
+        nerFile = NerFile(collective, lastUpdate, now)
+        _ <- update(nerFile, NerFile.mkNerConfig(names))
+      } yield nerFile
+    }
+  }
+
+  object Sql {
+    import doobie._
+    import doobie.implicits._
+
+    import docspell.store.impl.Implicits._
+    import docspell.store.impl.Column
+
+    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
+      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
+        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
+
+      val sql =
+        List(
+          max(
+            ROrganization.Columns.updated,
+            ROrganization.table,
+            ROrganization.Columns.cid
+          ),
+          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
+          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
+        )
+          .reduce(_ ++ fr"UNION ALL" ++ _)
+
+      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
+        .query[Timestamp]
+        .option
+    }
+  }
+}
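`Sql.latestUpdate` selects the greatest `updated` timestamp across the collective's organizations, persons and equipment; when that maximum differs from the one recorded in the JSON state file, the pattern file is regenerated. A hypothetical caller, mirroring how `TextAnalysis` below consumes the resource (`annotate` stands in for the real pipeline call, `settings` for the `StanfordSettings` it receives):

    // makeFile returns None when the feature is disabled or the collective
    // has no address-book data yet.
    RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store).use { nerFile =>
      nerFile.makeFile(collective).flatMap { customNer =>
        annotate(settings.copy(regexNer = customNer)) // hypothetical helper
      }
    }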

View File

@@ -10,6 +10,7 @@ import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem
@@ -31,11 +32,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())
 
   def itemStateTask[F[_]: Sync, A](
@@ -54,11 +56,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>
@@ -68,7 +71,7 @@ object ItemHandler {
             .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data)
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
           .flatMap(itemStateTask(ItemState.Created))
     }

View File

@@ -7,6 +7,7 @@ import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 
 object ProcessItem {
@@ -15,11 +16,12 @@ object ProcessItem {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
       .flatMap(SetGivenData[F](itemOps))
      .flatMap(Task.setProgress(99))
@@ -27,15 +29,17 @@ object ProcessItem {
   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
 
   def analysisOnly[F[_]: Sync](
       cfg: Config,
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -44,12 +48,13 @@ object ProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg, analyser))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }

View File

@@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment
@@ -21,10 +22,11 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts, analyser))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())
 
   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -73,6 +75,7 @@ object ReProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -94,7 +97,7 @@ object ReProcessItem {
     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts, analyser)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -113,11 +116,12 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        processFiles[F](cfg, fts, analyser, data).attempt
+        processFiles[F](cfg, fts, analyser, regexNer, data).attempt
           .flatMap({
             case Right(d) =>
               Task.pure(d)
@@ -127,7 +131,7 @@ object ReProcessItem {
           ).andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        processFiles[F](cfg, fts, analyser, data)
+        processFiles[F](cfg, fts, analyser, regexNer, data)
     }
 
   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@@ -1,24 +1,22 @@
 package docspell.joex.process
 
-import java.nio.file.Paths
-
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.StanfordSettings
-import docspell.analysis.split.TextSplitter
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
-import docspell.store.queries.QCollective
 import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
 
   def apply[F[_]: Sync](
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       for {
@@ -27,7 +25,7 @@ object TextAnalysis {
         t <-
           item.metas.toList
             .traverse(
-              annotateAttachment[F](ctx, analyser)
+              annotateAttachment[F](ctx, analyser, nerFile)
             )
         _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
         _ <- t.traverse(m =>
@@ -41,63 +39,19 @@ object TextAnalysis {
   def annotateAttachment[F[_]: Sync](
       ctx: Context[F, ProcessItemArgs],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
     val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
-      temp <- File.mkTempFile(Paths.get("."), "textanalysis")
-      _ <- File.writeString(temp, mkNerConfig(names))
-      sett = settings.copy(regexNer = Some(temp))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
       labels <- analyser.annotate(
         ctx.logger,
         sett,
         ctx.args.meta.collective,
         rm.content.getOrElse("")
       )
-      _ <- File.deleteFile(temp)
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
-
-  def mkNerConfig(names: QCollective.Names): String = {
-    val orgs = names.org
-      .flatMap(Pattern(3))
-      .distinct
-      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
-
-    val pers =
-      names.pers
-        .flatMap(Pattern(2))
-        .distinct
-        .map(_.toRow("PERSON", "LOCATION,MISC"))
-
-    val equips =
-      names.equip
-        .flatMap(Pattern(1))
-        .distinct
-        .map(_.toRow("MISC", "LOCATION"))
-
-    (orgs ++ pers ++ equips).mkString("\n")
-  }
-
-  case class Pattern(value: String, weight: Int) {
-    def toRow(tag: String, overrideTags: String): String =
-      s"$value\t$tag\t$overrideTags\t$weight"
-  }
-
-  object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
-      val delims = " \t\n\r".toSet
-      val words =
-        TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}")
-      val tokens =
-        TextSplitter
-          .splitToken(str, delims)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w.toLower.value}")
-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
-    }
-  }
 }

View File

@@ -91,6 +91,11 @@ let
     };
     text-analysis = {
      max-length = 10000;
+      regex-ner = {
+        enabled = true;
+        file-cache-time = "1 minute";
+      };
+      working-dir = "/tmp/docspell-analysis";
     };
     processing = {
       max-due-date-years = 10;
@@ -689,7 +694,48 @@ in {
                 (a rough guess).
               '';
             };
+            working-dir = mkOption {
+              type = types.str;
+              default = defaults.text-analysis.working-dir;
+              description = ''
+                A working directory for the analyser to store temporary/working
+                files.
+              '';
+            };
+            regex-ner = mkOption {
+              type = types.submodule({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.text-analysis.regex-ner.enabled;
+                    description = ''
+                      Whether to enable custom NER annotation. This uses the address
+                      book of a collective as input for NER tagging (to automatically
+                      find correspondent and concerned entities). If the address book
+                      is large, this can be quite memory intensive and also makes text
+                      analysis slower. But it greatly improves accuracy. If this is
+                      false, NER tagging uses only statistical models (which also work
+                      quite well).
+
+                      This setting might be moved to the collective settings in the
+                      future.
+                    '';
+                  };
+                  file-cache-time = mkOption {
+                    type = types.str;
+                    default = defaults.text-analysis.regex-ner.file-cache-time;
+                    description = ''
+                      The NER annotation uses a file of patterns that is derived from
+                      a collective's address book. This is how long that file is kept
+                      until a check for a state change is done.
+                    '';
+                  };
+                };
+              });
+              default = defaults.text-analysis.regex-ner;
+              description = "";
+            };
           };
         });
         default = defaults.text-analysis;