mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-02-15 20:33:26 +00:00
Guess a tag on item processing using a trained model if available
This commit is contained in:
parent
316b490008
commit
237b960625
@ -38,6 +38,9 @@ case class ItemData(
|
||||
copy(metas = next)
|
||||
}
|
||||
|
||||
/** Returns a copy of this item data whose tags are the current tags
  * followed by the given ones, with duplicate entries removed.
  */
def appendTags(tags: Seq[String]): ItemData = {
  val combined = this.tags ++ tags.toList
  copy(tags = combined.distinct)
}
|
||||
|
||||
def changeMeta(
|
||||
attachId: Ident,
|
||||
f: RAttachmentMeta => RAttachmentMeta
|
||||
|
@ -34,12 +34,12 @@ object ProcessItem {
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
|
||||
|
||||
def analysisOnly[F[_]: Sync](
|
||||
def analysisOnly[F[_]: Sync: ContextShift](
|
||||
cfg: Config,
|
||||
analyser: TextAnalyser[F],
|
||||
regexNer: RegexNerFile[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
TextAnalysis[F](analyser, regexNer)(item)
|
||||
TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
|
||||
.flatMap(FindProposal[F](cfg.processing))
|
||||
.flatMap(EvalProposals[F])
|
||||
.flatMap(SaveProposals[F])
|
||||
|
@ -1,23 +1,32 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.ClassifierModel
|
||||
import docspell.analysis.nlp.StanfordNerSettings
|
||||
import docspell.analysis.nlp.TextClassifier
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records.RAttachmentMeta
|
||||
import docspell.store.records.RClassifierSetting
|
||||
|
||||
import bitpeace.RangeDef
|
||||
|
||||
object TextAnalysis {
|
||||
type Args = ProcessItemArgs
|
||||
|
||||
def apply[F[_]: Sync](
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
cfg: Config.TextAnalysis,
|
||||
analyser: TextAnalyser[F],
|
||||
nerFile: RegexNerFile[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
)(item: ItemData): Task[F, Args, ItemData] =
|
||||
Task { ctx =>
|
||||
for {
|
||||
_ <- ctx.logger.info("Starting text analysis")
|
||||
@ -34,11 +43,14 @@ object TextAnalysis {
|
||||
e <- s
|
||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||
v = t.toVector
|
||||
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||
tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
|
||||
} yield item
|
||||
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||
.appendTags(tag.toSeq)
|
||||
}
|
||||
|
||||
def annotateAttachment[F[_]: Sync](
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
ctx: Context[F, Args],
|
||||
analyser: TextAnalyser[F],
|
||||
nerFile: RegexNerFile[F]
|
||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
||||
@ -54,4 +66,43 @@ object TextAnalysis {
|
||||
)
|
||||
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
||||
}
|
||||
|
||||
/** Tries to guess a tag for the given attachment texts using the
  * classifier model that is active for the collective of this job.
  *
  * The result is empty when `findActiveModel` yields no model or when
  * the classifier returns no class for the text.
  *
  * @param ctx the task context (logger, store, blocker)
  * @param cfg the text analysis configuration; its `workingDir` is
  *   where the temporary directory for the model file is requested
  * @param metas the attachment metadata whose `content` is classified
  * @param classifier the classifier to run against the stored model
  */
def predictTag[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): OptionT[F, String] = {
  // Streams the binary data of the stored model file.
  def modelBytes(model: Ident) =
    ctx.store.bitpeace
      .get(model.id)
      .unNoneTerminate
      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

  // Writes the model into a temporary directory and runs the classifier
  // against the concatenated content of all attachments.
  def classifyWith(model: Ident) = {
    val text = metas.flatMap(_.content).mkString(" ------ ")
    File.withTempDir(cfg.workingDir, "classify").use { dir =>
      val modelFile = dir.resolve("model.ser.gz")
      modelBytes(model)
        .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
        .compile
        .drain
        .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
    }
  }

  for {
    model <- findActiveModel(ctx, cfg)
    _     <- OptionT.liftF(ctx.logger.info("Guessing tag …"))
    cls   <- OptionT(classifyWith(model))
    _     <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
  } yield cls
}
|
||||
|
||||
/** Looks up the id of the classifier model file for the collective of
  * the current job.
  *
  * Yields nothing when classification is disabled in the config, when
  * no classifier setting exists for the collective, when that setting
  * is not enabled, or when it has no `fileId`.
  */
private def findActiveModel[F[_]: Sync](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis
): OptionT[F, Ident] =
  if (!cfg.classification.enabled) OptionT.none
  else {
    val collective = ctx.args.meta.collective
    val setting    = OptionT(ctx.store.transact(RClassifierSetting.findById(collective)))
    setting
      .filter(_.enabled)
      .mapFilter(_.fileId)
  }
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user