mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
Refactor running classifier in text analysis
This commit is contained in:
parent
99dcaae66b
commit
5c487ef7a9
@ -87,7 +87,7 @@ object MetaProposal {
|
||||
}
|
||||
}
|
||||
|
||||
/** Merges candidates with same `IdRef' values and concatenates their
|
||||
/** Merges candidates with same `IdRef` values and concatenates their
|
||||
* respective labels. The candidate order is preserved.
|
||||
*/
|
||||
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
|
||||
|
@ -31,6 +31,11 @@ object ClassifierName {
|
||||
val correspondentPerson: ClassifierName =
|
||||
apply("correspondentperson")
|
||||
|
||||
/** Returns one classifier name per active tag category of the given
  * collective.
  *
  * The categories come from `RClassifierSetting.getActiveCategories`; each
  * one is turned into a [[ClassifierName]] via `tagCategory`. The order of
  * the returned list follows the order of the categories. The type
  * parameter `F` is not used by the body.
  *
  * @param coll the collective whose categories are looked up
  * @return a database action yielding the classifier names
  */
def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] =
  RClassifierSetting
    .getActiveCategories(coll)
    .map(_.map(tagCategory))
|
||||
|
||||
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
||||
for {
|
||||
categories <- RClassifierSetting.getActiveCategories(coll)
|
||||
|
@ -0,0 +1,43 @@
|
||||
package docspell.joex.learn
|
||||
|
||||
import java.nio.file.Path
|
||||
import cats.implicits._
|
||||
import bitpeace.RangeDef
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import docspell.store.Store
|
||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||
import docspell.common._
|
||||
import docspell.store.records.RClassifierModel
|
||||
|
||||
/** Applies a stored classifier model to a piece of text. */
object Classify {

  /** Guesses a label for `text` using the model stored for `cname`.
    *
    * The model record is looked up by collective and classifier name. Its
    * binary data is written to `model.ser.gz` inside a directory obtained
    * from `File.withTempDir(workingDir, "classify")`, and that file is handed
    * to the classifier together with the text.
    *
    * @param blocker    blocker used for writing the model file
    * @param logger     receives the progress messages
    * @param workingDir parent directory for the temporary model directory
    * @param store      gives access to the model record and its file data
    * @param classifier the classifier that is run against the model file
    * @param coll       the collective owning the model
    * @param text       the text to classify
    * @param cname      name of the classifier whose model should be used
    * @return `None` if no model record exists, if the classifier yields no
    *         result, or if the result equals `LearnClassifierTask.noClass`;
    *         otherwise the guessed label
    */
  def apply[F[_]: Sync: ContextShift](
      blocker: Blocker,
      logger: Logger[F],
      workingDir: Path,
      store: Store[F],
      classifier: TextClassifier[F],
      coll: Ident,
      text: String
  )(cname: ClassifierName): F[Option[String]] = {

    // Materialises the model into a temporary file and classifies the text
    // with it. The sentinel "no class" answer is mapped to an empty result.
    def runModel(model: RClassifierModel): OptionT[F, String] = {
      val bytes =
        store.bitpeace
          .get(model.fileId.id)
          .unNoneTerminate
          .through(store.bitpeace.fetchData2(RangeDef.all))

      val guess =
        File.withTempDir(workingDir, "classify").use { tmpDir =>
          val target = tmpDir.resolve("model.ser.gz")
          val writeModel =
            bytes.through(fs2.io.file.writeAll(target, blocker)).compile.drain

          writeModel.flatMap(_ =>
            classifier.classify(logger, ClassifierModel(target), text)
          )
        }

      OptionT(guess).filter(_ != LearnClassifierTask.noClass)
    }

    OptionT
      .liftF(logger.info(s"Guessing label for ${cname.name} …"))
      .flatMap(_ =>
        OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))
      )
      .flatMap(runModel)
      .flatMap(label =>
        OptionT.liftF(logger.debug(s"Guessed: ${label}")).map(_ => label)
      )
      .value
  }

}
|
@ -42,7 +42,7 @@ object ExtractArchive {
|
||||
archive: Option[RAttachmentArchive]
|
||||
): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
|
||||
singlePass(item, archive).flatMap { t =>
|
||||
if (t._1 == None) Task.pure(t)
|
||||
if (t._1.isEmpty) Task.pure(t)
|
||||
else multiPass(t._2, t._1)
|
||||
}
|
||||
|
||||
|
@ -1,22 +1,18 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||
import docspell.analysis.classifier.TextClassifier
|
||||
import docspell.analysis.{NlpSettings, TextAnalyser}
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
|
||||
import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask}
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
|
||||
|
||||
import bitpeace.RangeDef
|
||||
|
||||
object TextAnalysis {
|
||||
type Args = ProcessItemArgs
|
||||
|
||||
@ -73,40 +69,26 @@ object TextAnalysis {
|
||||
cfg: Config.TextAnalysis,
|
||||
metas: Vector[RAttachmentMeta],
|
||||
classifier: TextClassifier[F]
|
||||
): F[List[String]] =
|
||||
): F[List[String]] = {
|
||||
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||
val classifyWith: ClassifierName => F[Option[String]] =
|
||||
Classify[F](
|
||||
ctx.blocker,
|
||||
ctx.logger,
|
||||
cfg.workingDir,
|
||||
ctx.store,
|
||||
classifier,
|
||||
ctx.args.meta.collective,
|
||||
text
|
||||
)
|
||||
for {
|
||||
models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
|
||||
_ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
|
||||
tags <- models
|
||||
.map(_.fileId.some)
|
||||
.traverse(predictTag(ctx, cfg, metas, classifier))
|
||||
names <- ctx.store.transact(
|
||||
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
|
||||
)
|
||||
_ <- ctx.logger.debug(s"Guessing tags for ${names.size} categories")
|
||||
tags <- names.traverse(classifyWith)
|
||||
} yield tags.flatten
|
||||
|
||||
/** Guesses a tag for the attachments' text using the model stored under the
  * given file id.
  *
  * The contents of all `metas` are joined with
  * `LearnClassifierTask.pageSep`. The model data is written to
  * `model.ser.gz` inside a directory obtained from
  * `File.withTempDir(cfg.workingDir, "classify")` and passed to the
  * classifier together with the joined text.
  *
  * @param ctx         task context providing logger, store and blocker
  * @param cfg         text analysis configuration (supplies the working dir)
  * @param metas       attachment metadata whose content is classified
  * @param classifier  the classifier to run
  * @param modelFileId id of the stored model file, if any
  * @return `None` if no file id is given, if the classifier yields no
  *         result, or if the result equals `LearnClassifierTask.noClass`;
  *         otherwise the guessed tag
  */
def predictTag[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
)(modelFileId: Option[Ident]): F[Option[String]] = {

  // Writes the model to a temporary file and classifies the joined text with
  // it; the sentinel "no class" answer becomes an empty result.
  def classifyWith(fileId: Ident): OptionT[F, String] = {
    val content = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
    val bytes =
      ctx.store.bitpeace
        .get(fileId.id)
        .unNoneTerminate
        .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

    val guess =
      File.withTempDir(cfg.workingDir, "classify").use { tmpDir =>
        val target = tmpDir.resolve("model.ser.gz")
        val writeModel =
          bytes.through(fs2.io.file.writeAll(target, ctx.blocker)).compile.drain

        writeModel.flatMap(_ =>
          classifier.classify(ctx.logger, ClassifierModel(target), content)
        )
      }

    OptionT(guess).filter(_ != LearnClassifierTask.noClass)
  }

  OptionT
    .liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
    .flatMap(_ => OptionT.fromOption[F](modelFileId))
    .flatMap(classifyWith)
    .flatMap(tag =>
      OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${tag}")).map(_ => tag)
    )
    .value
}
|
||||
}
|
||||
|
||||
private def getActive[F[_]: Sync](
|
||||
ctx: Context[F, Args],
|
||||
|
Loading…
x
Reference in New Issue
Block a user