Control what tag categories to use for auto-tagging

This commit is contained in:
Eike Kettner
2021-01-19 01:20:13 +01:00
parent cce8878898
commit a6f29153c4
16 changed files with 436 additions and 125 deletions

View File

@ -4,9 +4,7 @@ import cats.data.NonEmptyList
import cats.implicits._
import docspell.common.Ident
import docspell.store.qb.DSL._
import docspell.store.qb._
import docspell.store.records.{RClassifierModel, RTag}
import docspell.store.records.{RClassifierModel, RClassifierSetting}
import doobie._
@ -16,7 +14,7 @@ object ClassifierName {
/** Wraps the given raw name into a [[ClassifierName]]. */
def apply(name: String): ClassifierName = new ClassifierName(name)
val categoryPrefix = "tagcategory-"
private val categoryPrefix = "tagcategory-"
/** Builds the classifier name for a tag category by prepending
  * `categoryPrefix` to the category name.
  */
def tagCategory(cat: String): ClassifierName =
ClassifierName(categoryPrefix + cat)
@ -35,7 +33,7 @@ object ClassifierName {
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
for {
categories <- RTag.listCategories(coll)
categories <- RClassifierSetting.getActiveCategories(coll)
models <- NonEmptyList.fromList(categories) match {
case Some(nel) =>
RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name))
@ -44,22 +42,20 @@ object ClassifierName {
}
} yield models
/** Selects the classifier models of the given collective whose name is not
  * contained in the set of `categoryPrefix + category` strings derived from
  * the collective's tags.
  *
  * The whole comparison is expressed as one statement using a `NOT IN`
  * sub-select over the tag table.
  */
def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = {
val model = RClassifierModel.as("m")
val tag = RTag.as("t")
val sql =
Select(
select(model.all),
from(model),
// restrict to this collective and exclude every name produced by the sub-select
model.cid === coll && model.name.notIn(
// sub-select: distinct prefixed category names of this collective's tags,
// ignoring tags that have no category
Select(
select(concat(lit(categoryPrefix), tag.category.s)),
from(tag),
tag.cid === coll && tag.category.isNotNull
).distinct
)
).build
sql.query[RClassifierModel].to[List]
}
/** Returns the tag-category classifier models of a collective that do not
  * belong to one of its currently active categories.
  *
  * All models whose name matches `categoryPrefix%` are loaded; if there are
  * active categories, the models found for those categories are removed
  * from that list. Without any active category every such model is returned.
  */
def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = {
  // only a description of the query; it runs when sequenced below
  val allTagModels = RClassifierModel.findAllByQuery(coll, s"${categoryPrefix}%")

  RClassifierSetting.getActiveCategories(coll).flatMap { activeCats =>
    NonEmptyList.fromList(activeCats).fold(allTagModels) { nel =>
      val activeNames = nel.map(tagCategory).map(_.name)
      for {
        all    <- allTagModels
        active <- RClassifierModel.findAllByName(coll, activeNames)
      } yield all.diff(active)
    }
  }
}
}

View File

@ -10,7 +10,7 @@ import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler._
import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag}
import docspell.store.records.{RClassifierModel, RClassifierSetting}
object LearnClassifierTask {
val pageSep = " --n-- "
@ -26,15 +26,23 @@ object LearnClassifierTask {
analyser: TextAnalyser[F]
): Task[F, Args, Unit] =
Task { ctx =>
(for {
sett <- findActiveSettings[F](ctx, cfg)
maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
_ <- OptionT.liftF(
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
)
_ <- OptionT.liftF(clearObsoleteModels(ctx))
} yield ())
.getOrElseF(logInactiveWarning(ctx.logger))
val learnTags =
for {
sett <- findActiveSettings[F](ctx, cfg)
maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
_ <- OptionT.liftF(
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
)
} yield ()
// learn classifier models from active tag categories
learnTags.getOrElseF(logInactiveWarning(ctx.logger)) *>
// delete classifier model files for categories that have been removed
clearObsoleteTagModels(ctx) *>
// when tags are deleted, categories may get removed. fix the json array
ctx.store
.transact(RClassifierSetting.fixCategoryList(ctx.args.collective))
.map(_ => ())
}
def learnTagCategory[F[_]: Sync: ContextShift, A](
@ -64,13 +72,13 @@ object LearnClassifierTask {
): Task[F, A, Unit] =
Task { ctx =>
for {
cats <- ctx.store.transact(RTag.listCategories(collective))
cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective))
task = learnTagCategory[F, A](analyser, collective, maxItems) _
_ <- cats.map(task).traverse(_.run(ctx))
} yield ()
}
private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] =
private def clearObsoleteTagModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] =
for {
list <- ctx.store.transact(
ClassifierName.findOrphanTagModels(ctx.args.collective)
@ -98,6 +106,6 @@ object LearnClassifierTask {
/** Emits a warning through the given logger, pointing the reader to the joex
  * config and the collective settings.
  *
  * NOTE(review): two message literals appear back to back below; this looks
  * like the old and new wording of the same argument shown side by side —
  * confirm which one is current.
  */
private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
logger.warn(
"Classification is disabled. Check joex config and the collective settings."
"Auto-tagging is disabled. Check joex config and the collective settings."
)
}