diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
index d4f83fc2..af9a3db2 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -38,6 +38,12 @@ case class ItemData(
     copy(metas = next)
   }
 
+  /** Returns a copy with the given tags added to the existing ones,
+    * dropping duplicates.
+    */
+  def appendTags(tags: Seq[String]): ItemData =
+    copy(tags = (this.tags ++ tags).distinct)
+
   def changeMeta(
       attachId: Ident,
       f: RAttachmentMeta => RAttachmentMeta
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index 7b8b6431..fb777b24 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -34,12 +34,12 @@ object ProcessItem {
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
 
-  def analysisOnly[F[_]: Sync](
+  def analysisOnly[F[_]: Sync: ContextShift](
       cfg: Config,
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser, regexNer)(item)
+    TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index 92975a70..039f52e7 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,23 +1,32 @@
 package docspell.joex.process
 
+import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
 import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.RangeDef
 
 object TextAnalysis {
+  type Args = ProcessItemArgs
 
-  def apply[F[_]: Sync](
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
-  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  )(item: ItemData): Task[F, Args, ItemData] =
     Task { ctx =>
       for {
         _ <- ctx.logger.info("Starting text analysis")
@@ -34,11 +43,14 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
+      } yield item
+        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .appendTags(tag.toSeq)
     }
 
   def annotateAttachment[F[_]: Sync](
-      ctx: Context[F, ProcessItemArgs],
+      ctx: Context[F, Args],
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
@@ -54,4 +66,57 @@ object TextAnalysis {
       )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  /** Guesses a tag for the item by classifying the text of its attachments.
+    *
+    * The contents of all `metas` are joined into one text. The model
+    * found by `findActiveModel` is written to `model.ser.gz` inside the
+    * directory provided by `File.withTempDir(cfg.workingDir, "classify")`
+    * and handed to the `classifier` together with that text.
+    *
+    * Yields nothing if no active model is found or if the classifier
+    * returns no result.
+    */
+  def predictTag[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): OptionT[F, String] =
+    for {
+      model <- findActiveModel(ctx, cfg)
+      _ <- OptionT.liftF(ctx.logger.info("Guessing tag …"))
+      text = metas.flatMap(_.content).mkString(" ------ ")
+      modelData =
+        ctx.store.bitpeace
+          .get(model.id)
+          .unNoneTerminate
+          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
+        val modelFile = dir.resolve("model.ser.gz")
+        modelData
+          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
+          .compile
+          .drain
+          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
+      })
+      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
+    } yield cls
+
+  /** Looks up the file id of the classifier model for the collective of
+    * the current job. Yields nothing if classification is disabled in
+    * `cfg`, if the collective has no classifier setting, if that setting
+    * is not enabled or if it has no file id.
+    */
+  private def findActiveModel[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, Ident] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+        .filter(_.enabled)
+        .mapFilter(_.fileId)
+    else
+      OptionT.none
+
 }