mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 07:05:59 +00:00
Guess a tag on item processing using a trained model if available
This commit is contained in:
parent
316b490008
commit
237b960625
@ -38,6 +38,9 @@ case class ItemData(
|
|||||||
copy(metas = next)
|
copy(metas = next)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns a copy of this item data whose tag list is the current tags
  * followed by the given `tags`, with duplicate entries removed.
  *
  * @param tags the tag names to add
  */
def appendTags(tags: Seq[String]): ItemData = {
  val combined = this.tags ++ tags.toList
  copy(tags = combined.distinct)
}
|
||||||
|
|
||||||
def changeMeta(
|
def changeMeta(
|
||||||
attachId: Ident,
|
attachId: Ident,
|
||||||
f: RAttachmentMeta => RAttachmentMeta
|
f: RAttachmentMeta => RAttachmentMeta
|
||||||
|
@ -34,12 +34,12 @@ object ProcessItem {
|
|||||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
|
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
|
||||||
|
|
||||||
def analysisOnly[F[_]: Sync](
|
def analysisOnly[F[_]: Sync: ContextShift](
|
||||||
cfg: Config,
|
cfg: Config,
|
||||||
analyser: TextAnalyser[F],
|
analyser: TextAnalyser[F],
|
||||||
regexNer: RegexNerFile[F]
|
regexNer: RegexNerFile[F]
|
||||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
TextAnalysis[F](analyser, regexNer)(item)
|
TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
|
||||||
.flatMap(FindProposal[F](cfg.processing))
|
.flatMap(FindProposal[F](cfg.processing))
|
||||||
.flatMap(EvalProposals[F])
|
.flatMap(EvalProposals[F])
|
||||||
.flatMap(SaveProposals[F])
|
.flatMap(SaveProposals[F])
|
||||||
|
@ -1,23 +1,32 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import cats.data.OptionT
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.analysis.TextAnalyser
|
import docspell.analysis.TextAnalyser
|
||||||
|
import docspell.analysis.nlp.ClassifierModel
|
||||||
import docspell.analysis.nlp.StanfordNerSettings
|
import docspell.analysis.nlp.StanfordNerSettings
|
||||||
|
import docspell.analysis.nlp.TextClassifier
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
import docspell.joex.process.ItemData.AttachmentDates
|
import docspell.joex.process.ItemData.AttachmentDates
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.store.records.RAttachmentMeta
|
import docspell.store.records.RAttachmentMeta
|
||||||
|
import docspell.store.records.RClassifierSetting
|
||||||
|
|
||||||
|
import bitpeace.RangeDef
|
||||||
|
|
||||||
object TextAnalysis {
|
object TextAnalysis {
|
||||||
|
type Args = ProcessItemArgs
|
||||||
|
|
||||||
def apply[F[_]: Sync](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
|
cfg: Config.TextAnalysis,
|
||||||
analyser: TextAnalyser[F],
|
analyser: TextAnalyser[F],
|
||||||
nerFile: RegexNerFile[F]
|
nerFile: RegexNerFile[F]
|
||||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(item: ItemData): Task[F, Args, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.info("Starting text analysis")
|
_ <- ctx.logger.info("Starting text analysis")
|
||||||
@ -34,11 +43,14 @@ object TextAnalysis {
|
|||||||
e <- s
|
e <- s
|
||||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||||
v = t.toVector
|
v = t.toVector
|
||||||
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
|
||||||
|
} yield item
|
||||||
|
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||||
|
.appendTags(tag.toSeq)
|
||||||
}
|
}
|
||||||
|
|
||||||
def annotateAttachment[F[_]: Sync](
|
def annotateAttachment[F[_]: Sync](
|
||||||
ctx: Context[F, ProcessItemArgs],
|
ctx: Context[F, Args],
|
||||||
analyser: TextAnalyser[F],
|
analyser: TextAnalyser[F],
|
||||||
nerFile: RegexNerFile[F]
|
nerFile: RegexNerFile[F]
|
||||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
||||||
@ -54,4 +66,43 @@ object TextAnalysis {
|
|||||||
)
|
)
|
||||||
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Tries to guess a tag for the given attachments using the classifier
  * model that is active for the collective of the current job.
  *
  * The result is empty when `findActiveModel` yields no model or when the
  * classifier returns no result.
  *
  * @param ctx        the job context, used for logging, store access and the blocker
  * @param cfg        text analysis settings; provides the working directory
  * @param metas      attachment metadata whose `content` is classified
  * @param classifier the classifier that is given the model file and the text
  */
def predictTag[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): OptionT[F, String] = {

  // Writes the stored model to a file inside a temporary directory and runs
  // the classifier against it.
  // NOTE(review): presumably the directory is removed when `use` completes —
  // confirm against File.withTempDir.
  def classifyWith(modelId: Ident): F[Option[String]] = {
    // all available attachment texts joined into a single document
    val text = metas.flatMap(_.content).mkString(" ------ ")
    val modelBytes =
      ctx.store.bitpeace
        .get(modelId.id)
        .unNoneTerminate
        .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

    File.withTempDir(cfg.workingDir, "classify").use { tmpDir =>
      val target = tmpDir.resolve("model.ser.gz")
      for {
        _ <- modelBytes
          .through(fs2.io.file.writeAll(target, ctx.blocker))
          .compile
          .drain
        guess <- classifier.classify(ctx.logger, ClassifierModel(target), text)
      } yield guess
    }
  }

  for {
    modelId <- findActiveModel(ctx, cfg)
    _       <- OptionT.liftF(ctx.logger.info("Guessing tag …"))
    guess   <- OptionT(classifyWith(modelId))
    _       <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: $guess"))
  } yield guess
}
|
||||||
|
|
||||||
|
/** Finds the file id of the classifier model for the collective of the
  * current job.
  *
  * Yields nothing when classification is switched off in `cfg`, when no
  * classifier setting exists for the collective, when that setting is not
  * enabled, or when it has no `fileId`.
  */
private def findActiveModel[F[_]: Sync](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis
): OptionT[F, Ident] =
  if (!cfg.classification.enabled) OptionT.none[F, Ident]
  else {
    val lookup =
      ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))
    OptionT(lookup)
      .filter(_.enabled)
      .mapFilter(_.fileId)
  }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user