Refactor running classifier in text analysis

2025-09-15 21:46:53 +00:00 · 2021-01-19 21:30:02 +01:00
parent 99dcaae66b
commit 5c487ef7a9
5 changed files with 70 additions and 40 deletions
--- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala
+++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala
@@ -87,7 +87,7 @@ object MetaProposal {
    }
  }
-  /** Merges candidates with same `IdRef' values and concatenates their
+  /** Merges candidates with same `IdRef` values and concatenates their
    * respective labels. The candidate order is preserved.
    */
  def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
--- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
@@ -31,6 +31,11 @@ object ClassifierName {
  val correspondentPerson: ClassifierName =
    apply("correspondentperson")
  /** Looks up the active categories of the given collective and converts
    * each one into a classifier name via `tagCategory`.
    *
    * @param coll the collective whose active categories are queried
    * @return a database action yielding one classifier name per active category
    */
  def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] =
    RClassifierSetting
      .getActiveCategories(coll)
      .map(active => active.map(tagCategory))
  def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
    for {
      categories <- RClassifierSetting.getActiveCategories(coll)
--- a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala
@@ -0,0 +1,43 @@
 package docspell.joex.learn
 import java.nio.file.Path
 import cats.implicits._
 import bitpeace.RangeDef
 import cats.data.OptionT
 import cats.effect._
 import docspell.store.Store
 import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
 import docspell.common._
 import docspell.store.records.RClassifierModel
 object Classify {

  /** Guesses a label for `text` using the classifier model that is stored
    * under the name of `cname` for the collective `coll`.
    *
    * The model record is looked up by name through the store. When one is
    * found, its binary data is written to `model.ser.gz` inside a temporary
    * directory created below `workingDir`, and that file is handed to
    * `classifier` together with `text`. A result equal to
    * `LearnClassifierTask.noClass` is discarded.
    *
    * @param cname the classifier whose stored model should be applied
    * @return the guessed label, or `None` if no model record exists, the
    *   classifier yields no result, or the result is
    *   `LearnClassifierTask.noClass`
    */
  def apply[F[_]: Sync: ContextShift](
      blocker: Blocker,
      logger: Logger[F],
      workingDir: Path,
      store: Store[F],
      classifier: TextClassifier[F],
      coll: Ident,
      text: String
  )(cname: ClassifierName): F[Option[String]] = {
    val announce = OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …"))

    val guessed = announce.flatMap { _ =>
      OptionT(store.transact(RClassifierModel.findByName(coll, cname.name))).flatMap { model =>
        // Byte stream of the stored model file, read from the binary store.
        val modelData =
          store.bitpeace
            .get(model.fileId.id)
            .unNoneTerminate
            .through(store.bitpeace.fetchData2(RangeDef.all))

        // The classifier reads the model from a file, so the data is written
        // to disk first; the temporary directory only lives for this call.
        val classified = File.withTempDir(workingDir, "classify").use { dir =>
          val modelFile = dir.resolve("model.ser.gz")
          val writeModel =
            modelData
              .through(fs2.io.file.writeAll(modelFile, blocker))
              .compile
              .drain
          writeModel.flatMap(_ => classifier.classify(logger, ClassifierModel(modelFile), text))
        }

        OptionT(classified)
          .filter(_ != LearnClassifierTask.noClass)
          .flatMap(cls => OptionT.liftF(logger.debug(s"Guessed: ${cls}")).map(_ => cls))
      }
    }

    guessed.value
  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
@@ -42,7 +42,7 @@ object ExtractArchive {
      archive: Option[RAttachmentArchive]
  ): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
    singlePass(item, archive).flatMap { t =>
-      if (t._1 == None) Task.pure(t)
+      if (t._1.isEmpty) Task.pure(t)
      else multiPass(t._2, t._1)
    }
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,22 +1,18 @@
 package docspell.joex.process
 import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
-
+import docspell.analysis.classifier.TextClassifier
 import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
 import docspell.analysis.{NlpSettings, TextAnalyser}
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
-import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
+import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask}
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
 import bitpeace.RangeDef
 object TextAnalysis {
  type Args = ProcessItemArgs
@@ -73,40 +69,26 @@ object TextAnalysis {
      cfg: Config.TextAnalysis,
      metas: Vector[RAttachmentMeta],
      classifier: TextClassifier[F]
-  ): F[List[String]] =
+  ): F[List[String]] = {
    val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
    val classifyWith: ClassifierName => F[Option[String]] =
      Classify[F](
        ctx.blocker,
        ctx.logger,
        cfg.workingDir,
        ctx.store,
        classifier,
        ctx.args.meta.collective,
        text
      )
    for {
-      models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
+      names <- ctx.store.transact(
-      _      <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
+        ClassifierName.findTagClassifiers(ctx.args.meta.collective)
-      tags <- models
+      )
-        .map(_.fileId.some)
+      _    <- ctx.logger.debug(s"Guessing tags for ${names.size} categories")
-        .traverse(predictTag(ctx, cfg, metas, classifier))
+      tags <- names.traverse(classifyWith)
    } yield tags.flatten
-
+  }
  /** Guesses a tag for the given attachment texts using the classifier
    * model stored under `modelFileId`.
    *
    * The contents of all `metas` are joined with
    * `LearnClassifierTask.pageSep` to form the text to classify. The model
    * data is written to `model.ser.gz` in a temporary directory below
    * `cfg.workingDir` before `classifier` is invoked. A result equal to
    * `LearnClassifierTask.noClass` is discarded.
    *
    * @param modelFileId id of the stored model file; `None` yields `None`
    * @return the guessed tag, or `None` if there is no model id, the
    *   classifier yields no result, or the result is
    *   `LearnClassifierTask.noClass`
    */
  def predictTag[F[_]: Sync: ContextShift](
      ctx: Context[F, Args],
      cfg: Config.TextAnalysis,
      metas: Vector[RAttachmentMeta],
      classifier: TextClassifier[F]
  )(modelFileId: Option[Ident]): F[Option[String]] = {
    // Materialises the stored model as a file and runs the classifier on
    // the concatenated attachment text.
    def runModel(fileId: Ident) = {
      val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
      val modelData =
        ctx.store.bitpeace
          .get(fileId.id)
          .unNoneTerminate
          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

      File.withTempDir(cfg.workingDir, "classify").use { dir =>
        val modelFile = dir.resolve("model.ser.gz")
        val writeModel =
          modelData
            .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
            .compile
            .drain
        writeModel.flatMap(_ =>
          classifier.classify(ctx.logger, ClassifierModel(modelFile), text)
        )
      }
    }

    val prediction =
      for {
        _      <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
        fileId <- OptionT.fromOption[F](modelFileId)
        cls    <- OptionT(runModel(fileId)).filter(_ != LearnClassifierTask.noClass)
        _      <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
      } yield cls

    prediction.value
  }
  private def getActive[F[_]: Sync](
      ctx: Context[F, Args],