Refactor running classifier in text analysis

This commit is contained in:
Eike Kettner 2021-01-19 21:30:02 +01:00
parent 99dcaae66b
commit 5c487ef7a9
5 changed files with 70 additions and 40 deletions

View File

@ -87,7 +87,7 @@ object MetaProposal {
}
}
/** Merges candidates with same `IdRef' values and concatenates their
/** Merges candidates with same `IdRef` values and concatenates their
* respective labels. The candidate order is preserved.
*/
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {

View File

@ -31,6 +31,11 @@ object ClassifierName {
/** Classifier name built from the fixed string "correspondentperson". */
val correspondentPerson: ClassifierName =
apply("correspondentperson")
/** Lists the classifier names for the active tag categories of a collective.
  *
  * Fetches the categories via `RClassifierSetting.getActiveCategories` and
  * turns each one into a [[ClassifierName]] using `tagCategory`. The type
  * parameter `F` is not used by the body; it is kept so existing call sites
  * remain valid.
  */
def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] =
  RClassifierSetting
    .getActiveCategories(coll)
    .map(active => active.map(tagCategory))
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
for {
categories <- RClassifierSetting.getActiveCategories(coll)

View File

@ -0,0 +1,43 @@
package docspell.joex.learn
import java.nio.file.Path
import cats.implicits._
import bitpeace.RangeDef
import cats.data.OptionT
import cats.effect._
import docspell.store.Store
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
import docspell.common._
import docspell.store.records.RClassifierModel
object Classify {

  /** Guesses a label for `text` using the stored model named by `cname`.
    *
    * Steps, in order:
    *  1. logs (info) which classifier name is being tried,
    *  2. looks up the model record for `coll` and `cname.name`,
    *  3. writes the model's file data into `model.ser.gz` inside a temporary
    *     directory obtained from `File.withTempDir(workingDir, "classify")`,
    *  4. runs `classifier.classify` on that file and `text`,
    *  5. logs (debug) the guessed label.
    *
    * The result is `None` when the model lookup yields nothing, when the
    * classifier returns no value, or when the returned value equals
    * `LearnClassifierTask.noClass`. The info message of step 1 is emitted
    * before the lookup, so it is logged even if no model is found.
    *
    * NOTE(review): the temporary directory is presumably created below
    * `workingDir` and removed once classification finishes — confirm
    * against `File.withTempDir`.
    */
  def apply[F[_]: Sync: ContextShift](
      blocker: Blocker,
      logger: Logger[F],
      workingDir: Path,
      store: Store[F],
      classifier: TextClassifier[F],
      coll: Ident,
      text: String
  )(cname: ClassifierName): F[Option[String]] = {

    // Materialises the stored model as a local file and runs the classifier on it.
    def runModel(model: RClassifierModel): F[Option[String]] = {
      val bytes =
        store.bitpeace
          .get(model.fileId.id)
          .unNoneTerminate
          .through(store.bitpeace.fetchData2(RangeDef.all))

      File.withTempDir(workingDir, "classify").use { tmp =>
        val target = tmp.resolve("model.ser.gz")
        bytes
          .through(fs2.io.file.writeAll(target, blocker))
          .compile
          .drain
          .flatMap(_ => classifier.classify(logger, ClassifierModel(target), text))
      }
    }

    val guess: OptionT[F, String] =
      for {
        _     <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name}"))
        model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))
        // The "no class" marker is treated the same as an absent result.
        label <- OptionT(runModel(model)).filter(_ != LearnClassifierTask.noClass)
        _     <- OptionT.liftF(logger.debug(s"Guessed: ${label}"))
      } yield label

    guess.value
  }
}

View File

@ -42,7 +42,7 @@ object ExtractArchive {
archive: Option[RAttachmentArchive]
): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] =
singlePass(item, archive).flatMap { t =>
if (t._1 == None) Task.pure(t)
if (t._1.isEmpty) Task.pure(t)
else multiPass(t._2, t._1)
}

View File

@ -1,22 +1,18 @@
package docspell.joex.process
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
import docspell.analysis.classifier.TextClassifier
import docspell.analysis.{NlpSettings, TextAnalyser}
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask}
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
import bitpeace.RangeDef
object TextAnalysis {
type Args = ProcessItemArgs
@ -73,40 +69,26 @@ object TextAnalysis {
cfg: Config.TextAnalysis,
metas: Vector[RAttachmentMeta],
classifier: TextClassifier[F]
): F[List[String]] =
): F[List[String]] = {
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
val classifyWith: ClassifierName => F[Option[String]] =
Classify[F](
ctx.blocker,
ctx.logger,
cfg.workingDir,
ctx.store,
classifier,
ctx.args.meta.collective,
text
)
for {
models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
_ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
tags <- models
.map(_.fileId.some)
.traverse(predictTag(ctx, cfg, metas, classifier))
names <- ctx.store.transact(
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
)
_ <- ctx.logger.debug(s"Guessing tags for ${names.size} categories")
tags <- names.traverse(classifyWith)
} yield tags.flatten
def predictTag[F[_]: Sync: ContextShift](
ctx: Context[F, Args],
cfg: Config.TextAnalysis,
metas: Vector[RAttachmentMeta],
classifier: TextClassifier[F]
)(modelFileId: Option[Ident]): F[Option[String]] =
(for {
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId}"))
model <- OptionT.fromOption[F](modelFileId)
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
modelData =
ctx.store.bitpeace
.get(model.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
val modelFile = dir.resolve("model.ser.gz")
modelData
.through(fs2.io.file.writeAll(modelFile, ctx.blocker))
.compile
.drain
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
}).filter(_ != LearnClassifierTask.noClass)
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
} yield cls).value
}
private def getActive[F[_]: Sync](
ctx: Context[F, Args],