From 5c487ef7a9f4e39d22145c6b272b5aeb51a3a1c9 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 19 Jan 2021 21:30:02 +0100
Subject: [PATCH] Refactor running classifier in text analysis

---
 .../scala/docspell/common/MetaProposal.scala  |  2 +-
 .../docspell/joex/learn/ClassifierName.scala  |  5 ++
 .../scala/docspell/joex/learn/Classify.scala  | 43 ++++++++++++++
 .../joex/process/ExtractArchive.scala         |  2 +-
 .../docspell/joex/process/TextAnalysis.scala  | 58 +++++++------------
 5 files changed, 70 insertions(+), 40 deletions(-)
 create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/Classify.scala

diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala
index a68affff..62a9355f 100644
--- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala
+++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala
@@ -87,7 +87,7 @@ object MetaProposal {
     }
   }
 
-  /** Merges candidates with same `IdRef' values and concatenates their
+  /** Merges candidates with same `IdRef` values and concatenates their
     * respective labels. The candidate order is preserved.
     */
   def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
index 0ed2d97e..c08b96db 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
@@ -31,6 +31,11 @@ object ClassifierName {
   val correspondentPerson: ClassifierName =
     apply("correspondentperson")
 
+  /** Maps the categories from `RClassifierSetting.getActiveCategories` via `tagCategory`. */
+  def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] =
+    RClassifierSetting.getActiveCategories(coll)
+      .map(categories => categories.map(tagCategory))
+
   def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
     for {
       categories <- RClassifierSetting.getActiveCategories(coll)
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
new file mode 100644
index 00000000..ae34d18f
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
@@ -0,0 +1,43 @@
+package docspell.joex.learn
+
+import java.nio.file.Path
+import cats.implicits._
+import bitpeace.RangeDef
+import cats.data.OptionT
+import cats.effect._
+import docspell.store.Store
+import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
+import docspell.common._
+import docspell.store.records.RClassifierModel
+
+object Classify {
+  /** Classifies `text` with the model stored under `cname` for `coll`. Yields `None`
+    * if lookup or classifier yield nothing, or on `LearnClassifierTask.noClass`.
+    */
+  def apply[F[_]: Sync: ContextShift](
+      blocker: Blocker,
+      logger: Logger[F],
+      workingDir: Path,
+      store: Store[F],
+      classifier: TextClassifier[F],
+      coll: Ident,
+      text: String
+  )(cname: ClassifierName): F[Option[String]] = {
+    def runModel(model: RClassifierModel): F[Option[String]] = {
+      val meta  = store.bitpeace.get(model.fileId.id).unNoneTerminate
+      val bytes = meta.through(store.bitpeace.fetchData2(RangeDef.all))
+      File.withTempDir(workingDir, "classify").use { tmp =>
+        val target = tmp.resolve("model.ser.gz")
+        val save   = bytes.through(fs2.io.file.writeAll(target, blocker)).compile.drain
+        save.flatMap(_ => classifier.classify(logger, ClassifierModel(target), text))
+      }
+    }
+    val guessed = for {
+      _     <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …"))
+      model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))
+      cls   <- OptionT(runModel(model)).filter(_ != LearnClassifierTask.noClass)
+      _     <- OptionT.liftF(logger.debug(s"Guessed: ${cls}"))
+    } yield cls
+    guessed.value
+  }
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
index c48952e2..7de6a086 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
@@ -42,7 +42,7 @@ object ExtractArchive {
       archive: Option[RAttachmentArchive]
   ): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
     singlePass(item, archive).flatMap { t =>
-      if (t._1 == None) Task.pure(t)
+      if (t._1.isEmpty) Task.pure(t)
       else multiPass(t._2, t._1)
     }
 
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index fd7c08bc..b2d50f75 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,22 +1,18 @@
 package docspell.joex.process
 
-import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
-
-import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
+import docspell.analysis.classifier.TextClassifier
 import docspell.analysis.{NlpSettings, TextAnalyser}
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
-import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
+import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask}
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
 
-import bitpeace.RangeDef
-
 object TextAnalysis {
   type Args = ProcessItemArgs
 
@@ -73,40 +69,26 @@ object TextAnalysis {
       cfg: Config.TextAnalysis,
       metas: Vector[RAttachmentMeta],
       classifier: TextClassifier[F]
-  ): F[List[String]] =
+  ): F[List[String]] = {
+    val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
+    val classifyWith: ClassifierName => F[Option[String]] =
+      Classify[F](
+        ctx.blocker,
+        ctx.logger,
+        cfg.workingDir,
+        ctx.store,
+        classifier,
+        ctx.args.meta.collective,
+        text
+      )
     for {
-      models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
-      _      <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
-      tags <- models
-        .map(_.fileId.some)
-        .traverse(predictTag(ctx, cfg, metas, classifier))
+      names <- ctx.store.transact(
+        ClassifierName.findTagClassifiers(ctx.args.meta.collective)
+      )
+      _    <- ctx.logger.debug(s"Guessing tags for ${names.size} categories")
+      tags <- names.traverse(classifyWith)
     } yield tags.flatten
-
-  def predictTag[F[_]: Sync: ContextShift](
-      ctx: Context[F, Args],
-      cfg: Config.TextAnalysis,
-      metas: Vector[RAttachmentMeta],
-      classifier: TextClassifier[F]
-  )(modelFileId: Option[Ident]): F[Option[String]] =
-    (for {
-      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
-      model <- OptionT.fromOption[F](modelFileId)
-      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
-      modelData =
-        ctx.store.bitpeace
-          .get(model.id)
-          .unNoneTerminate
-          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
-      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
-        val modelFile = dir.resolve("model.ser.gz")
-        modelData
-          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
-          .compile
-          .drain
-          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
-      }).filter(_ != LearnClassifierTask.noClass)
-      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
-    } yield cls).value
+  }
 
   private def getActive[F[_]: Sync](
       ctx: Context[F, Args],