From 1cd34414628eeca481f61b5711aa6aa7c8c0557a Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 19 Jan 2021 22:04:13 +0100
Subject: [PATCH] Run classifier for item entities (concerned, correspondent)

Store the classifier results separately from the NLP results in the attachment metadata.
---
 .../scala/docspell/joex/learn/Classify.scala  |  9 ++-
 .../joex/learn/LearnClassifierTask.scala      |  2 +-
 .../joex/process/AttachmentPageCount.scala    |  1 +
 .../docspell/joex/process/CreateItem.scala    |  6 +-
 .../docspell/joex/process/ItemData.scala      |  8 +-
 .../docspell/joex/process/ReProcessItem.scala |  3 +-
 .../docspell/joex/process/SaveProposals.scala |  5 +-
 .../docspell/joex/process/TextAnalysis.scala  | 81 +++++++++++++++----
 .../h2/V1.19.0__add_classify_meta.sql         |  3 +
 .../mariadb/V1.19.0__add_classify_meta.sql    |  3 +
 .../postgresql/V1.19.0__add_classify_meta.sql |  3 +
 .../store/records/RAttachmentMeta.scala       | 42 +++++++---
 .../store/records/RClassifierSetting.scala    |  2 +-
 13 files changed, 131 insertions(+), 37 deletions(-)
 create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql
 create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql
 create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql

diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
index ae34d18f..4c65556c 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
@@ -1,15 +1,18 @@
 package docspell.joex.learn
 
 import java.nio.file.Path
-import cats.implicits._
-import bitpeace.RangeDef
+
 import cats.data.OptionT
 import cats.effect._
-import docspell.store.Store
+import cats.implicits._
+
 import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
 import docspell.common._
+import docspell.store.Store
 import docspell.store.records.RClassifierModel
 
+import bitpeace.RangeDef
+
 object Classify {
 
   def apply[F[_]: Sync: ContextShift](
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
index 354a8e39..e3aae66f 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@@ -89,7 +89,7 @@ object LearnClassifierTask {
   ): OptionT[F, OCollective.Classifier] =
     if (cfg.classification.enabled)
       OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
-        .filter(_.enabled)
+        .filter(_.autoTagEnabled)
         .map(OCollective.Classifier.fromRecord)
     else
       OptionT.none
diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala
index 0373db8a..15678322 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala
@@ -84,6 +84,7 @@ object AttachmentPageCount {
                 Nil,
                 MetaProposalList.empty,
                 md.pageCount.some,
+                None,
                 None
               )
             )
diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
index fe21203b..8bc9ccc1 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -107,7 +107,8 @@ object CreateItem {
         Vector.empty,
         fm.map(a => a.id -> a.fileId).toMap,
         MetaProposalList.empty,
-        Nil
+        Nil,
+        None
       )
     }
 
@@ -166,7 +167,8 @@ object CreateItem {
           Vector.empty,
           origMap,
           MetaProposalList.empty,
-          Nil
+          Nil,
+          None
         )
       )
     }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
index 0435e37c..a151e8a6 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
   * containng the source or origin file
   * @param givenMeta meta data to this item that was not "guessed"
   * from an attachment but given and thus is always correct
+  * @param classifyProposals these are proposals that were obtained by
+  * a trained classifier. They carry no ner-tags; each proposal only
+  * provides a single label
   */
 case class ItemData(
     item: RItem,
@@ -23,7 +26,10 @@ case class ItemData(
     dateLabels: Vector[AttachmentDates],
     originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
     givenMeta: MetaProposalList,   // given meta data not associated to a specific attachment
-    tags: List[String]             // a list of tags (names or ids) attached to the item if they exist
+    // a list of tags (names or ids) attached to the item if they exist
+    tags: List[String],
+    // proposals obtained from the classifier
+    classifyProposals: Option[MetaProposalList]
 ) {
 
   def findMeta(attachId: Ident): Option[RAttachmentMeta] =
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
index 07fb2901..db41e901 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
@@ -65,7 +65,8 @@ object ReProcessItem {
         Vector.empty,
         asrcMap.view.mapValues(_.fileId).toMap,
         MetaProposalList.empty,
-        Nil
+        Nil,
+        None
       )).getOrElseF(
         Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
       )
diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala
index ee4fd923..9d2f0ae3 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala
@@ -17,7 +17,10 @@ object SaveProposals {
         data.metas
           .traverse(rm =>
             ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
-              ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
+              ctx.store.transact(
+                RAttachmentMeta
+                  .updateProposals(rm.id, rm.proposals, data.classifyProposals)
+              )
           )
           .map(_ => data)
     }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index b2d50f75..a2561e07 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,9 +1,12 @@
 package docspell.joex.process
 
+import cats.Traverse
 import cats.effect._
 import cats.implicits._
+
 import docspell.analysis.classifier.TextClassifier
 import docspell.analysis.{NlpSettings, TextAnalyser}
+import docspell.common.MetaProposal.Candidate
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
@@ -37,12 +40,22 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-        classifierEnabled <- getActive(ctx, cfg)
+        autoTagEnabled <- getActiveAutoTag(ctx, cfg)
         tag <-
-          if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
+          if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
           else List.empty[String].pure[F]
+
+        classProposals <-
+          if (cfg.classification.enabled)
+            predictItemEntities(ctx, cfg, item.metas, analyser.classifier)
+          else MetaProposalList.empty.pure[F]
+
       } yield item
-        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .copy(
+          metas = v.map(_._1),
+          dateLabels = v.map(_._2),
+          classifyProposals = classProposals.some
+        )
         .appendTags(tag)
     }
 
@@ -72,15 +85,8 @@ object TextAnalysis {
   ): F[List[String]] = {
     val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
     val classifyWith: ClassifierName => F[Option[String]] =
-      Classify[F](
-        ctx.blocker,
-        ctx.logger,
-        cfg.workingDir,
-        ctx.store,
-        classifier,
-        ctx.args.meta.collective,
-        text
-      )
+      makeClassify(ctx, cfg, classifier)(text)
+
     for {
       names <- ctx.store.transact(
         ClassifierName.findTagClassifiers(ctx.args.meta.collective)
@@ -90,14 +96,61 @@ object TextAnalysis {
     } yield tags.flatten
   }
 
-  private def getActive[F[_]: Sync](
+  def predictItemEntities[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): F[MetaProposalList] = {
+    val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
+
+    def classifyWith(
+        cname: ClassifierName,
+        mtype: MetaProposalType
+    ): F[Option[MetaProposal]] =
+      for {
+        _     <- ctx.logger.debug(s"Guessing $mtype using classifier")
+        label <- makeClassify(ctx, cfg, classifier)(text).apply(cname)
+      } yield label.map(str =>
+        MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), str), Set.empty))
+      )
+
+    Traverse[List]
+      .sequence(
+        List(
+          classifyWith(ClassifierName.correspondentOrg, MetaProposalType.CorrOrg),
+          classifyWith(ClassifierName.correspondentPerson, MetaProposalType.CorrPerson),
+          classifyWith(ClassifierName.concernedPerson, MetaProposalType.ConcPerson),
+          classifyWith(ClassifierName.concernedEquip, MetaProposalType.ConcEquip)
+        )
+      )
+      .map(_.flatten)
+      .map(MetaProposalList.apply)
+  }
+
+  private def makeClassify[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      classifier: TextClassifier[F]
+  )(text: String): ClassifierName => F[Option[String]] =
+    Classify[F](
+      ctx.blocker,
+      ctx.logger,
+      cfg.workingDir,
+      ctx.store,
+      classifier,
+      ctx.args.meta.collective,
+      text
+    )
+
+  private def getActiveAutoTag[F[_]: Sync](
       ctx: Context[F, Args],
       cfg: Config.TextAnalysis
   ): F[Boolean] =
     if (cfg.classification.enabled)
       ctx.store
         .transact(RClassifierSetting.findById(ctx.args.meta.collective))
-        .map(_.exists(_.enabled))
+        .map(_.exists(_.autoTagEnabled))
         .flatTap(enabled =>
           if (enabled) ().pure[F]
           else ctx.logger.info("Classification is disabled. Check config or settings.")
diff --git a/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql
new file mode 100644
index 00000000..2513dc8d
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql
@@ -0,0 +1,3 @@
+ALTER TABLE "attachmentmeta"
+ADD COLUMN "classify_proposals" text;
+
diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql
new file mode 100644
index 00000000..fdc3c9f0
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql
@@ -0,0 +1,3 @@
+ALTER TABLE `attachmentmeta`
+ADD COLUMN (`classify_proposals` mediumtext);
+
diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql
new file mode 100644
index 00000000..2513dc8d
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql
@@ -0,0 +1,3 @@
+ALTER TABLE "attachmentmeta"
+ADD COLUMN "classify_proposals" text;
+
diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala
index 919a5b17..f201525c 100644
--- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala
@@ -16,7 +16,8 @@ case class RAttachmentMeta(
     nerlabels: List[NerLabel],
     proposals: MetaProposalList,
     pages: Option[Int],
-    language: Option[Language]
+    language: Option[Language],
+    classifyProposals: Option[MetaProposalList]
 ) {
 
   def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
@@ -29,19 +30,28 @@ case class RAttachmentMeta(
 
 object RAttachmentMeta {
   def empty(attachId: Ident, lang: Language) =
-    RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang))
+    RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None)
 
   final case class Table(alias: Option[String]) extends TableDef {
     val tableName = "attachmentmeta"
 
-    val id        = Column[Ident]("attachid", this)
-    val content   = Column[String]("content", this)
-    val nerlabels = Column[List[NerLabel]]("nerlabels", this)
-    val proposals = Column[MetaProposalList]("itemproposals", this)
-    val pages     = Column[Int]("page_count", this)
-    val language  = Column[Language]("language", this)
+    val id                = Column[Ident]("attachid", this)
+    val content           = Column[String]("content", this)
+    val nerlabels         = Column[List[NerLabel]]("nerlabels", this)
+    val proposals         = Column[MetaProposalList]("itemproposals", this)
+    val pages             = Column[Int]("page_count", this)
+    val language          = Column[Language]("language", this)
+    val classifyProposals = Column[MetaProposalList]("classify_proposals", this)
     val all =
-      NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language)
+      NonEmptyList.of[Column[_]](
+        id,
+        content,
+        nerlabels,
+        proposals,
+        pages,
+        language,
+        classifyProposals
+      )
   }
 
   val T = Table(None)
@@ -52,7 +62,7 @@ object RAttachmentMeta {
     DML.insert(
       T,
       T.all,
-      fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}"
+      fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}"
     )
 
   def exists(attachId: Ident): ConnectionIO[Boolean] =
@@ -80,7 +90,8 @@ object RAttachmentMeta {
       DML.set(
         T.content.setTo(v.content),
         T.nerlabels.setTo(v.nerlabels),
-        T.proposals.setTo(v.proposals)
+        T.proposals.setTo(v.proposals),
+        T.classifyProposals.setTo(v.classifyProposals)
       )
     )
 
@@ -93,12 +104,17 @@ object RAttachmentMeta {
       )
     )
 
-  def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] =
+  def updateProposals(
+      mid: Ident,
+      plist: MetaProposalList,
+      clist: Option[MetaProposalList]
+  ): ConnectionIO[Int] =
     DML.update(
       T,
       T.id === mid,
       DML.set(
-        T.proposals.setTo(plist)
+        T.proposals.setTo(plist),
+        T.classifyProposals.setTo(clist)
       )
     )
 
diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
index 9c31a5c2..1d7fd5f6 100644
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@@ -20,7 +20,7 @@ case class RClassifierSetting(
     listType: ListType
 ) {
 
-  def enabled: Boolean =
+  def autoTagEnabled: Boolean =
     listType match {
       case ListType.Blacklist =>
         true