Guess a tag on item processing using a trained model if available

This commit is contained in:
Eike Kettner 2020-09-01 21:51:57 +02:00
parent 316b490008
commit 237b960625
3 changed files with 60 additions and 6 deletions

View File

@ -38,6 +38,9 @@ case class ItemData(
copy(metas = next)
}
/** Returns a copy of this item data whose tags are the existing tags
  * followed by the given ones, with duplicates removed.
  */
def appendTags(tags: Seq[String]): ItemData = {
  val combined = this.tags ++ tags.toList
  copy(tags = combined.distinct)
}
def changeMeta(
attachId: Ident,
f: RAttachmentMeta => RAttachmentMeta

View File

@ -34,12 +34,12 @@ object ProcessItem {
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
/** Builds the task that runs only the analysis part of item processing
  * for the given item: text analysis first, followed by `FindProposal`,
  * `EvalProposals` and `SaveProposals`, each fed with the result of the
  * previous step.
  */
def analysisOnly[F[_]: Sync](
def analysisOnly[F[_]: Sync: ContextShift](
cfg: Config,
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](analyser, regexNer)(item)
TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
.flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])

View File

@ -1,23 +1,32 @@
package docspell.joex.process
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.StanfordNerSettings
import docspell.analysis.nlp.TextClassifier
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
import docspell.store.records.RClassifierSetting
import bitpeace.RangeDef
object TextAnalysis {
type Args = ProcessItemArgs
def apply[F[_]: Sync](
def apply[F[_]: Sync: ContextShift](
cfg: Config.TextAnalysis,
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
)(item: ItemData): Task[F, Args, ItemData] =
Task { ctx =>
for {
_ <- ctx.logger.info("Starting text analysis")
@ -34,11 +43,14 @@ object TextAnalysis {
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
} yield item
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
.appendTags(tag.toSeq)
}
def annotateAttachment[F[_]: Sync](
ctx: Context[F, ProcessItemArgs],
ctx: Context[F, Args],
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
@ -54,4 +66,43 @@ object TextAnalysis {
)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
/** Guesses a tag for the item by running the classifier with the
  * collective's stored model over the text of the given attachments.
  *
  * The result is empty when none of the attachments carries non-blank
  * text, when no active model is found (see `findActiveModel`), or when
  * the classifier yields no result.
  *
  * @param ctx        task context supplying logger, store and blocker
  * @param cfg        text analysis configuration; its `workingDir` hosts
  *                   the temporary copy of the model file
  * @param metas      attachment metadata whose `content` is classified
  * @param classifier classifier that is run against the stored model
  * @return the guessed tag, if any
  */
def predictTag[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): OptionT[F, String] = {
  val contents = metas.flatMap(_.content)
  if (!contents.exists(_.trim.nonEmpty))
    // Nothing to classify: skip the model lookup and the temp-file work
    // instead of letting the classifier guess from an empty string.
    OptionT.none[F, String]
  else
    for {
      model <- findActiveModel(ctx, cfg)
      _     <- OptionT.liftF(ctx.logger.info("Guessing tag …"))
      // All attachment texts are joined into one document for the classifier.
      text = contents.mkString(" ------ ")
      modelData =
        ctx.store.bitpeace
          .get(model.id)
          .unNoneTerminate
          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
      // The model is written to a file inside a temp dir that is scoped
      // to the `use` block; classification happens while it exists.
      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
        val modelFile = dir.resolve("model.ser.gz")
        modelData
          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
          .compile
          .drain
          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
      })
      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: $cls"))
    } yield cls
}
/** Looks up the file id of the classifier model to use for the
  * collective of the current task.
  *
  * Yields nothing when classification is disabled in the config, when
  * no classifier setting exists for the collective, when that setting
  * is not enabled, or when it has no file id.
  */
private def findActiveModel[F[_]: Sync](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis
): OptionT[F, Ident] =
  if (!cfg.classification.enabled) OptionT.none[F, Ident]
  else {
    val lookup =
      ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))
    OptionT(lookup).subflatMap { setting =>
      if (setting.enabled) setting.fileId else None
    }
  }
}