diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala index a68affff..62a9355f 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala @@ -87,7 +87,7 @@ object MetaProposal { } } - /** Merges candidates with same `IdRef' values and concatenates their + /** Merges candidates with same `IdRef` values and concatenates their * respective labels. The candidate order is preserved. */ def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = { diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala index 0ed2d97e..c08b96db 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -31,6 +31,11 @@ object ClassifierName { val correspondentPerson: ClassifierName = apply("correspondentperson") + /** Lists one classifier name per category returned by RClassifierSetting.getActiveCategories(coll), mapping each category through tagCategory. NOTE(review): the type parameter F is not referenced by the signature or the body - confirm whether it is still needed. */ def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] = + for { + categories <- RClassifierSetting.getActiveCategories(coll) + } yield categories.map(tagCategory) + def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = for { categories <- RClassifierSetting.getActiveCategories(coll) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala new file mode 100644 index 00000000..ae34d18f --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala @@ -0,0 +1,43 @@ +package docspell.joex.learn + +import java.nio.file.Path +import cats.implicits._ +import bitpeace.RangeDef +import cats.data.OptionT +import cats.effect._ +import docspell.store.Store +import docspell.analysis.classifier.{ClassifierModel, TextClassifier} +import docspell.common._ +import 
docspell.store.records.RClassifierModel + /** Applies a stored classifier model, selected by name, to a text. */ +object Classify { + /** Guesses a label for text using the model that RClassifierModel.findByName returns for coll and cname.name. The model data is read from store.bitpeace, written to the file model.ser.gz inside the directory provided by File.withTempDir(workingDir, "classify"), and that file is passed to classifier.classify together with text. The result is empty when no model is found, when the classifier yields no value, or when the guessed value equals LearnClassifierTask.noClass. The first parameter list fixes the environment and the text; the second takes the classifier name, so the partially applied function can be reused for several names. */ + def apply[F[_]: Sync: ContextShift]( + blocker: Blocker, + logger: Logger[F], + workingDir: Path, + store: Store[F], + classifier: TextClassifier[F], + coll: Ident, + text: String + )(cname: ClassifierName): F[Option[String]] = + (for { + _ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …")) + model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name))) + modelData = + store.bitpeace + .get(model.fileId.id) + .unNoneTerminate + .through(store.bitpeace.fetchData2(RangeDef.all)) + cls <- OptionT(File.withTempDir(workingDir, "classify").use { dir => + val modelFile = dir.resolve("model.ser.gz") + modelData + .through(fs2.io.file.writeAll(modelFile, blocker)) + .compile + .drain + .flatMap(_ => classifier.classify(logger, ClassifierModel(modelFile), text)) + }).filter(_ != LearnClassifierTask.noClass) + _ <- OptionT.liftF(logger.debug(s"Guessed: ${cls}")) + } yield cls).value + +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index c48952e2..7de6a086 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -42,7 +42,7 @@ object ExtractArchive { archive: Option[RAttachmentArchive] ): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] = singlePass(item, archive).flatMap { t => - if (t._1 == None) Task.pure(t) + /* When the single pass result carries no archive it is returned unchanged; otherwise processing continues with multiPass. */ if (t._1.isEmpty) Task.pure(t) else multiPass(t._2, t._1) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index fd7c08bc..b2d50f75 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,22 +1,18 @@ package 
docspell.joex.process -import cats.data.OptionT import cats.effect._ import cats.implicits._ - -import docspell.analysis.classifier.{ClassifierModel, TextClassifier} +import docspell.analysis.classifier.TextClassifier import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile -import docspell.joex.learn.{ClassifierName, LearnClassifierTask} +import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask} import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.{RAttachmentMeta, RClassifierSetting} -import bitpeace.RangeDef - object TextAnalysis { type Args = ProcessItemArgs @@ -73,40 +69,26 @@ object TextAnalysis { cfg: Config.TextAnalysis, metas: Vector[RAttachmentMeta], classifier: TextClassifier[F] - ): F[List[String]] = + ): F[List[String]] = { + /* Attachment contents joined with LearnClassifierTask.pageSep; built once here and shared by every classifier run below. */ val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) + /* Classify with every argument except the classifier name already applied. */ val classifyWith: ClassifierName => F[Option[String]] = + Classify[F]( + ctx.blocker, + ctx.logger, + cfg.workingDir, + ctx.store, + classifier, + ctx.args.meta.collective, + text + ) for { - models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective)) - _ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories") - tags <- models - .map(_.fileId.some) - .traverse(predictTag(ctx, cfg, metas, classifier)) + names <- ctx.store.transact( + ClassifierName.findTagClassifiers(ctx.args.meta.collective) + ) + _ <- ctx.logger.debug(s"Guessing tags for ${names.size} categories") + tags <- names.traverse(classifyWith) } yield tags.flatten - - def predictTag[F[_]: Sync: ContextShift]( - ctx: Context[F, Args], - cfg: Config.TextAnalysis, - metas: Vector[RAttachmentMeta], - classifier: TextClassifier[F] - )(modelFileId: Option[Ident]): F[Option[String]] = - (for { - _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for 
${modelFileId} …")) - model <- OptionT.fromOption[F](modelFileId) - text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) - modelData = - ctx.store.bitpeace - .get(model.id) - .unNoneTerminate - .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir => - val modelFile = dir.resolve("model.ser.gz") - modelData - .through(fs2.io.file.writeAll(modelFile, ctx.blocker)) - .compile - .drain - .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) - }).filter(_ != LearnClassifierTask.noClass) - _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) - } yield cls).value + } private def getActive[F[_]: Sync]( ctx: Context[F, Args],