Use collective's addressbook to configure regexner

This commit is contained in:
Eike Kettner 2020-08-24 14:35:56 +02:00
parent 8628a0a8b3
commit 96d2f948f2
4 changed files with 84 additions and 9 deletions

View File

@@ -3,12 +3,17 @@ package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
object TextAnalyserSuite extends SimpleTestSuite {
lazy val germanClassifier =
new StanfordCoreNLP(Properties.nerGerman(None, false))
lazy val englishClassifier =
new StanfordCoreNLP(Properties.nerEnglish(None))
test("find english ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find german ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),

View File

@@ -1,6 +1,7 @@
package docspell.common
import java.io.IOException
import java.nio.charset.StandardCharsets
import java.nio.file._
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.atomic.AtomicInteger
@@ -87,4 +88,7 @@ object File {
/** Reads the whole file as text, assuming UTF-8 encoding.
  *
  * The file is streamed in 8192-byte chunks on the given blocker, each chunk
  * is UTF-8 decoded, and the decoded pieces are concatenated (foldMonoid on
  * String) into a single result.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
  readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
/** Writes `content` to `file` as UTF-8 encoded bytes, suspended in `F`.
  *
  * Delegates to `java.nio.file.Files.write`, which creates or truncates the
  * file; the written path is returned.
  */
def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
  Sync[F].delay {
    val bytes = content.getBytes(StandardCharsets.UTF_8)
    Files.write(file, bytes)
  }
}

View File

@@ -1,13 +1,18 @@
package docspell.joex.process
import java.nio.file.Paths
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queries.QCollective
import docspell.store.records.RAttachmentMeta
object TextAnalysis {
@@ -22,7 +27,7 @@ object TextAnalysis {
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx.args, ctx.logger, analyser)
annotateAttachment[F](ctx, analyser)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
@@ -35,18 +40,64 @@ object TextAnalysis {
}
def annotateAttachment[F[_]: Sync](
args: ProcessItemArgs,
logger: Logger[F],
ctx: Context[F, ProcessItemArgs],
analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(args.meta.language, false, None)
val settings = StanfordSettings(ctx.args.meta.language, false, None)
for {
names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
temp <- File.mkTempFile(Paths.get("."), "textanalysis")
_ <- File.writeString(temp, mkNerConfig(names))
sett = settings.copy(regexNer = Some(temp))
labels <- analyser.annotate(
logger,
settings,
args.meta.collective,
ctx.logger,
sett,
ctx.args.meta.collective,
rm.content.getOrElse("")
)
_ <- File.deleteFile(temp)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
/** Renders a regexner mapping file from the collective's address book.
  *
  * Every name is expanded into patterns (see [[Pattern]]) and each pattern
  * becomes one tab-separated row: organizations get the highest weight (3),
  * persons weight 2 and equipments weight 1. The second argument of `toRow`
  * lists the NER classes a rule is allowed to overwrite.
  */
def mkNerConfig(names: QCollective.Names): String = {
  // One rule section: expand names into patterns, dedupe, render rows.
  def section(values: Vector[String], weight: Int, tag: String, overrides: String) =
    values
      .flatMap(Pattern(weight))
      .distinct
      .map(_.toRow(tag, overrides))

  val rows =
    section(names.org, 3, "ORGANIZATION", "LOCATION,PERSON,MISC") ++
      section(names.pers, 2, "PERSON", "LOCATION,MISC") ++
      section(names.equip, 1, "MISC", "LOCATION")
  rows.mkString("\n")
}
/** A single regexner rule: a token pattern and the rule's priority weight. */
case class Pattern(value: String, weight: Int) {

  /** Renders this rule as one tab-separated regexner mapping row:
    * pattern, NER tag, overridable tags, weight.
    */
  def toRow(tag: String, overrideTags: String): String =
    List(value, tag, overrideTags, weight.toString).mkString("\t")
}
object Pattern {

  /** Derives regexner patterns from a name.
    *
    * The first pattern matches the entire (lower-cased, case-insensitive)
    * word sequence of `str`; it is followed by one pattern per individual
    * token, limited to the first three tokens.
    */
  def apply(weight: Int)(str: String): Vector[Pattern] = {
    val separators = " \t\n\r".toSet
    // Case-insensitive regex matching the full word sequence of the name.
    val phrase =
      TextSplitter
        .split(str, separators)
        .toVector
        .map(word => s"(?i)${word.toLower.value}")
        .mkString(" ")
    // One pattern for each of (at most) the first three tokens.
    val tokenPatterns =
      TextSplitter
        .splitToken(str, separators)
        .toVector
        .take(3)
        .map(token => Pattern(s"(?i)${token.toLower.value}", weight))
    Pattern(phrase, weight) +: tokenPatterns
  }
}
}

View File

@@ -1,5 +1,6 @@
package docspell.store.queries
import cats.data.OptionT
import fs2.Stream
import docspell.common.ContactKind
@@ -11,6 +12,20 @@ import doobie._
import doobie.implicits._
object QCollective {
/** All names of a collective's address book, grouped by entity kind. */
case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])

object Names {
  /** A Names value with no entries of any kind. */
  val empty: Names = Names(Vector.empty, Vector.empty, Vector.empty)
}
/** Loads all organization, person and equipment names of the collective.
  *
  * NOTE: the previous version wrapped every step in OptionT.liftF — which
  * always produces a Some — so the trailing `.getOrElse(Names.empty)` was
  * dead code. A plain for-comprehension over ConnectionIO is equivalent
  * and simpler; the result type is unchanged.
  */
def allNames(collective: Ident): ConnectionIO[Names] =
  for {
    orgs <- ROrganization.findAllRef(collective, None, _.name)
    pers <- RPerson.findAllRef(collective, None, _.name)
    equp <- REquipment.findAll(collective, None, _.name)
  } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))
case class TagCount(tag: RTag, count: Int)
case class InsightData(