From 96d2f948f2af5a0e11859fc1101ced9de36d862b Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Mon, 24 Aug 2020 14:35:56 +0200
Subject: [PATCH] Use collective's addressbook to configure regexner

---
 .../analysis/nlp/TextAnalyserSuite.scala      |  9 ++-
 .../src/main/scala/docspell/common/File.scala |  4 ++
 .../docspell/joex/process/TextAnalysis.scala  | 65 +++++++++++++++++--
 .../docspell/store/queries/QCollective.scala  | 15 +++++
 4 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
index b7c083a1..b22093f1 100644
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
@@ -3,12 +3,17 @@ package docspell.analysis.nlp
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
 
 object TextAnalyserSuite extends SimpleTestSuite {
+  lazy val germanClassifier =
+    new StanfordCoreNLP(Properties.nerGerman(None, false))
+  lazy val englishClassifier =
+    new StanfordCoreNLP(Properties.nerEnglish(None))
 
   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
+      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
+      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala
index 0efc552a..2d5cfb8a 100644
--- a/modules/common/src/main/scala/docspell/common/File.scala
+++ b/modules/common/src/main/scala/docspell/common/File.scala
@@ -1,6 +1,7 @@
 package docspell.common
 
 import java.io.IOException
+import java.nio.charset.StandardCharsets
 import java.nio.file._
 import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.atomic.AtomicInteger
@@ -87,4 +88,13 @@ object File {
 
   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
     readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
+
+  /** Writes `content` encoded as UTF-8 into `file`. The effect yields
+    * the path that `Files.write` returns.
+    */
+  def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
+    Sync[F].delay {
+      val bytes = content.getBytes(StandardCharsets.UTF_8)
+      Files.write(file, bytes)
+    }
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index 625738ef..9ee3850c 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,13 +1,18 @@
 package docspell.joex.process
 
+import java.nio.file.Paths
+
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.split.TextSplitter
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
+import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
+import docspell.store.queries.QCollective
 import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
@@ -22,7 +27,7 @@ object TextAnalysis {
         t <-
           item.metas.toList
             .traverse(
-              annotateAttachment[F](ctx.args, ctx.logger, analyser)
+              annotateAttachment[F](ctx, analyser)
             )
         _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
         _ <- t.traverse(m =>
@@ -35,18 +40,103 @@ object TextAnalysis {
     }
 
+  /** Annotates the text of a single attachment.
+    *
+    * The names of the collective's address book are written as a
+    * regexner mapping to a temporary file which is passed to the
+    * analyser through the settings. The file is deleted afterwards,
+    * also when writing it or analysing the text fails.
+    */
   def annotateAttachment[F[_]: Sync](
-      args: ProcessItemArgs,
-      logger: Logger[F],
+      ctx: Context[F, ProcessItemArgs],
       analyser: TextAnalyser[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(args.meta.language, false, None)
+    val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      labels <- analyser.annotate(
-        logger,
-        settings,
-        args.meta.collective,
-        rm.content.getOrElse("")
-      )
+      names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
+      // bracket guarantees that the temporary file is removed again
+      labels <- Sync[F].bracket(File.mkTempFile(Paths.get("."), "textanalysis")) {
+        temp =>
+          File
+            .writeString[F](temp, mkNerConfig(names))
+            .flatMap(_ =>
+              analyser.annotate(
+                ctx.logger,
+                settings.copy(regexNer = Some(temp)),
+                ctx.args.meta.collective,
+                rm.content.getOrElse("")
+              )
+            )
+      }(temp => Sync[F].void(File.deleteFile(temp)))
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  /** Creates the content of a regexner mapping file from the given
+    * names. Each name results in one or more rows as produced by
+    * `Pattern.toRow`; the rows are separated by newlines.
+    */
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+
+  /** A regex `value` together with its `weight`. */
+  case class Pattern(value: String, weight: Int) {
+    /** Renders a tab separated row: value, tag, overrideTags, weight. */
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    // characters with a special meaning in a java regex
+    private val regexMeta = "\\[](){}.*+?^$|".toSet
+
+    /** Escapes regex meta characters, so that names like `c++` or
+      * `acme (europe)` are matched literally.
+      */
+    private def quote(str: String): String =
+      str.flatMap(c => if (regexMeta.contains(c)) s"\\$c" else c.toString)
+
+    /** Creates the patterns for one name: at first one made of all
+      * words of the name joined by a space, followed by one for each
+      * of its first three tokens. A name without any word yields no
+      * patterns.
+      */
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter
+          .split(str, delims)
+          .toVector
+          .map(w => s"(?i)${quote(w.toLower.value)}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${quote(w.toLower.value)}")
+
+      // without words the first pattern would be the empty string
+      if (words.isEmpty) Vector.empty
+      else
+        tokens
+          .map(t => Pattern(t, weight))
+          .prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
 }
diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
index 2dc94e05..80b40207 100644
--- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala
@@ -1,5 +1,6 @@
 package docspell.store.queries
 
+import cats.data.OptionT
 import fs2.Stream
 
 import docspell.common.ContactKind
@@ -11,6 +12,32 @@ import doobie._
 import doobie.implicits._
 
 object QCollective {
+
+  /** Holds the names of organizations, persons and equipments as
+    * loaded by `allNames`.
+    */
+  case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
+  object Names {
+    val empty: Names = Names(Vector.empty, Vector.empty, Vector.empty)
+  }
+
+  /** Queries the organizations, persons and equipments of the given
+    * collective and collects their names.
+    */
+  def allNames(collective: Ident): ConnectionIO[Names] = {
+    val loaded =
+      for {
+        organizations <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
+        persons       <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
+        equipments    <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
+      } yield Names(
+        organizations.map(_.name),
+        persons.map(_.name),
+        equipments.map(_.name)
+      )
+    loaded.getOrElse(Names.empty)
+  }
+
   case class TagCount(tag: RTag, count: Int)
 
   case class InsightData(