mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 23:25:58 +00:00
Refactor running classifier in text analysis
This commit is contained in:
parent
99dcaae66b
commit
5c487ef7a9
@ -87,7 +87,7 @@ object MetaProposal {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Merges candidates with same `IdRef' values and concatenates their
|
/** Merges candidates with same `IdRef` values and concatenates their
|
||||||
* respective labels. The candidate order is preserved.
|
* respective labels. The candidate order is preserved.
|
||||||
*/
|
*/
|
||||||
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
|
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
|
||||||
|
@ -31,6 +31,11 @@ object ClassifierName {
|
|||||||
val correspondentPerson: ClassifierName =
|
val correspondentPerson: ClassifierName =
|
||||||
apply("correspondentperson")
|
apply("correspondentperson")
|
||||||
|
|
||||||
|
/** Loads the active categories of the collective `coll` via
  * `RClassifierSetting.getActiveCategories` and converts each one into a
  * classifier name using `tagCategory`.
  *
  * The type parameter `F` is not referenced by the body.
  */
def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] =
  RClassifierSetting
    .getActiveCategories(coll)
    .map(active => active.map(tagCategory))
|
||||||
|
|
||||||
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
||||||
for {
|
for {
|
||||||
categories <- RClassifierSetting.getActiveCategories(coll)
|
categories <- RClassifierSetting.getActiveCategories(coll)
|
||||||
|
@ -0,0 +1,43 @@
|
|||||||
|
package docspell.joex.learn
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
import cats.implicits._
|
||||||
|
import bitpeace.RangeDef
|
||||||
|
import cats.data.OptionT
|
||||||
|
import cats.effect._
|
||||||
|
import docspell.store.Store
|
||||||
|
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.store.records.RClassifierModel
|
||||||
|
|
||||||
|
/** Runs a stored classifier model against a piece of text. */
object Classify {

  /** Guesses a label for `text` using the model registered under `cname`
    * for the collective `coll`.
    *
    * The model record is looked up with `RClassifierModel.findByName`; its
    * file data is fetched from `store.bitpeace`, written to `model.ser.gz`
    * inside the directory handed out by `File.withTempDir(workingDir,
    * "classify")`, and then passed to `classifier.classify` together with
    * `text`.
    *
    * The result is empty when the lookup yields no record, when the
    * classifier yields no value, or when the value equals
    * `LearnClassifierTask.noClass`.
    */
  def apply[F[_]: Sync: ContextShift](
      blocker: Blocker,
      logger: Logger[F],
      workingDir: Path,
      store: Store[F],
      classifier: TextClassifier[F],
      coll: Ident,
      text: String
  )(cname: ClassifierName): F[Option[String]] = {
    // Step 1: announce which classifier is about to run.
    def announce =
      OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …"))

    // Step 2: find the model record; an empty result ends the chain here.
    def findModel =
      OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))

    announce
      .flatMap(_ => findModel)
      .flatMap { record =>
        // Byte stream of the stored model file.
        val bytes =
          store.bitpeace
            .get(record.fileId.id)
            .unNoneTerminate
            .through(store.bitpeace.fetchData2(RangeDef.all))

        // Step 3: write the model into the temp dir, then classify the text.
        val guess = File.withTempDir(workingDir, "classify").use { tmp =>
          val target = tmp.resolve("model.ser.gz")
          val writeModel =
            bytes.through(fs2.io.file.writeAll(target, blocker)).compile.drain

          writeModel.flatMap(_ =>
            classifier.classify(logger, ClassifierModel(target), text)
          )
        }

        // The `noClass` marker is treated like "no result".
        OptionT(guess).filter(_ != LearnClassifierTask.noClass)
      }
      .flatMap { label =>
        // Step 4: log the surviving label and hand it back unchanged.
        OptionT.liftF(logger.debug(s"Guessed: ${label}")).map(_ => label)
      }
      .value
  }

}
|
@ -42,7 +42,7 @@ object ExtractArchive {
|
|||||||
archive: Option[RAttachmentArchive]
|
archive: Option[RAttachmentArchive]
|
||||||
): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
|
): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
|
||||||
singlePass(item, archive).flatMap { t =>
|
singlePass(item, archive).flatMap { t =>
|
||||||
if (t._1 == None) Task.pure(t)
|
if (t._1.isEmpty) Task.pure(t)
|
||||||
else multiPass(t._2, t._1)
|
else multiPass(t._2, t._1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,22 +1,18 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
import cats.data.OptionT
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
import docspell.analysis.classifier.TextClassifier
|
||||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
|
||||||
import docspell.analysis.{NlpSettings, TextAnalyser}
|
import docspell.analysis.{NlpSettings, TextAnalyser}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
|
import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask}
|
||||||
import docspell.joex.process.ItemData.AttachmentDates
|
import docspell.joex.process.ItemData.AttachmentDates
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
|
import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
|
||||||
|
|
||||||
import bitpeace.RangeDef
|
|
||||||
|
|
||||||
object TextAnalysis {
|
object TextAnalysis {
|
||||||
type Args = ProcessItemArgs
|
type Args = ProcessItemArgs
|
||||||
|
|
||||||
@ -73,40 +69,26 @@ object TextAnalysis {
|
|||||||
cfg: Config.TextAnalysis,
|
cfg: Config.TextAnalysis,
|
||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
classifier: TextClassifier[F]
|
classifier: TextClassifier[F]
|
||||||
): F[List[String]] =
|
): F[List[String]] = {
|
||||||
|
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||||
|
val classifyWith: ClassifierName => F[Option[String]] =
|
||||||
|
Classify[F](
|
||||||
|
ctx.blocker,
|
||||||
|
ctx.logger,
|
||||||
|
cfg.workingDir,
|
||||||
|
ctx.store,
|
||||||
|
classifier,
|
||||||
|
ctx.args.meta.collective,
|
||||||
|
text
|
||||||
|
)
|
||||||
for {
|
for {
|
||||||
models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
|
names <- ctx.store.transact(
|
||||||
_ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
|
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
|
||||||
tags <- models
|
)
|
||||||
.map(_.fileId.some)
|
_ <- ctx.logger.debug(s"Guessing tags for ${names.size} categories")
|
||||||
.traverse(predictTag(ctx, cfg, metas, classifier))
|
tags <- names.traverse(classifyWith)
|
||||||
} yield tags.flatten
|
} yield tags.flatten
|
||||||
|
}
|
||||||
def predictTag[F[_]: Sync: ContextShift](
|
|
||||||
ctx: Context[F, Args],
|
|
||||||
cfg: Config.TextAnalysis,
|
|
||||||
metas: Vector[RAttachmentMeta],
|
|
||||||
classifier: TextClassifier[F]
|
|
||||||
)(modelFileId: Option[Ident]): F[Option[String]] =
|
|
||||||
(for {
|
|
||||||
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
|
|
||||||
model <- OptionT.fromOption[F](modelFileId)
|
|
||||||
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
|
||||||
modelData =
|
|
||||||
ctx.store.bitpeace
|
|
||||||
.get(model.id)
|
|
||||||
.unNoneTerminate
|
|
||||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
|
||||||
cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
|
|
||||||
val modelFile = dir.resolve("model.ser.gz")
|
|
||||||
modelData
|
|
||||||
.through(fs2.io.file.writeAll(modelFile, ctx.blocker))
|
|
||||||
.compile
|
|
||||||
.drain
|
|
||||||
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
|
||||||
}).filter(_ != LearnClassifierTask.noClass)
|
|
||||||
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
|
||||||
} yield cls).value
|
|
||||||
|
|
||||||
private def getActive[F[_]: Sync](
|
private def getActive[F[_]: Sync](
|
||||||
ctx: Context[F, Args],
|
ctx: Context[F, Args],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user