Exclude tags w/o category from classifying; remove obsolete models

This commit is contained in:
Eike Kettner 2021-01-18 21:48:40 +01:00
parent 3e28ce1254
commit cce8878898
4 changed files with 59 additions and 12 deletions

View File

@ -2,8 +2,12 @@ package docspell.joex.learn
import cats.data.NonEmptyList
import cats.implicits._
import docspell.common.Ident
import docspell.store.qb.DSL._
import docspell.store.qb._
import docspell.store.records.{RClassifierModel, RTag}
import doobie._
/** Name of a classifier model, represented as a value class around the plain string. */
final class ClassifierName(val name: String) extends AnyVal
@ -12,9 +16,6 @@ object ClassifierName {
/** Wraps the given string into a [[ClassifierName]]. */
def apply(name: String): ClassifierName = new ClassifierName(name)
val noCategory: ClassifierName =
apply("__docspell_no_category__")
/** Prefix that is put in front of a tag category to form the name of its classifier model. */
val categoryPrefix = "tagcategory-"
def tagCategory(cat: String): ClassifierName =
@ -34,7 +35,7 @@ object ClassifierName {
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
for {
categories <- RTag.listCategories(coll, noCategory.name)
categories <- RTag.listCategories(coll)
models <- NonEmptyList.fromList(categories) match {
case Some(nel) =>
RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name))
@ -42,4 +43,23 @@ object ClassifierName {
List.empty[RClassifierModel].pure[ConnectionIO]
}
} yield models
/** Finds the classifier models of a collective whose name does not correspond to any
  * tag category currently in use.
  *
  * A model is returned when its name is not among the values
  * `categoryPrefix + category`, computed over all tags of the collective that have a
  * non-null category.
  *
  * NOTE(review): the filter only checks the collective and the `NOT IN` condition; it
  * does not restrict model names to those starting with `categoryPrefix`. Any model of
  * the collective that is named differently would be returned as well — confirm that no
  * other kinds of classifier models are stored in this table.
  *
  * @param coll the collective whose models are inspected
  * @return all models of `coll` without a matching tag category
  */
def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = {
  val m = RClassifierModel.as("m")
  val t = RTag.as("t")

  // Names that models are expected to have: one per distinct, non-null tag category.
  val expectedNames =
    Select(
      select(concat(lit(categoryPrefix), t.category.s)),
      from(t),
      t.cid === coll && t.category.isNotNull
    ).distinct

  Select(
    select(m.all),
    from(m),
    m.cid === coll && m.name.notIn(expectedNames)
  ).build.query[RClassifierModel].to[List]
}
}

View File

@ -4,12 +4,13 @@ import cats.data.Kleisli
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler._
import docspell.store.records.{RClassifierSetting, RTag}
import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag}
object LearnClassifierTask {
val pageSep = " --n-- "
@ -31,6 +32,7 @@ object LearnClassifierTask {
_ <- OptionT.liftF(
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
)
_ <- OptionT.liftF(clearObsoleteModels(ctx))
} yield ())
.getOrElseF(logInactiveWarning(ctx.logger))
}
@ -62,14 +64,27 @@ object LearnClassifierTask {
): Task[F, A, Unit] =
Task { ctx =>
for {
cats <- ctx.store.transact(
RTag.listCategories(collective, ClassifierName.noCategory.name)
)
cats <- ctx.store.transact(RTag.listCategories(collective))
task = learnTagCategory[F, A](analyser, collective, maxItems) _
_ <- cats.map(task).traverse(_.run(ctx))
} yield ()
}
/** Removes classifier models of the job's collective that are reported as orphans by
  * `ClassifierName.findOrphanTagModels`.
  *
  * The database records are deleted first, afterwards the file of each model is deleted
  * through `ctx.store.bitpeace`. The number found is logged at info level before
  * anything is deleted; the result of the record deletion is logged at debug level.
  */
private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = {
  val findOrphans =
    ctx.store.transact(ClassifierName.findOrphanTagModels(ctx.args.collective))

  findOrphans.flatMap { list =>
    // Both values only describe the effects; they run in the order given below.
    val removeRecords =
      ctx.store.transact(RClassifierModel.deleteAll(list.map(_.id)))
    val removeFiles =
      list.traverse(model => ctx.store.bitpeace.delete(model.fileId.id).compile.drain)

    for {
      _ <- ctx.logger.info(
        s"Found ${list.size} obsolete model files that are deleted now."
      )
      n <- removeRecords
      _ <- removeFiles
      // `n` is the result of deleting the database records, not a count of files.
      _ <- ctx.logger.debug(s"Deleted $n model files.")
    } yield ()
  }
}
private def findActiveSettings[F[_]: Sync](
ctx: Context[F, Args],
cfg: Config.TextAnalysis

View File

@ -1,7 +1,7 @@
package docspell.store.records
import cats.effect._
import cats.data.NonEmptyList
import cats.effect._
import cats.implicits._
import docspell.common._
@ -63,6 +63,17 @@ object RClassifierModel {
else 0.pure[ConnectionIO]
} yield n + k
/** Deletes the model record with the given id.
  *
  * @return the result of the delete statement
  */
def deleteById(id: Ident): ConnectionIO[Int] = {
  val matchesId = T.id === id
  DML.delete(T, matchesId)
}
/** Deletes all model records whose id is contained in `ids`.
  *
  * For an empty list no statement is run and `0` is returned.
  *
  * @return the result of the delete statement, or `0` when `ids` is empty
  */
def deleteAll(ids: List[Ident]): ConnectionIO[Int] =
  NonEmptyList
    .fromList(ids)
    .fold(0.pure[ConnectionIO])(nel => DML.delete(T, T.id.in(nel)))
def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] =
Select(select(T.all), from(T), T.cid === cid && T.name === name).build
.query[RClassifierModel]
@ -75,4 +86,5 @@ object RClassifierModel {
Select(select(T.all), from(T), T.cid === cid && T.name.in(names)).build
.query[RClassifierModel]
.to[List]
}

View File

@ -148,11 +148,11 @@ object RTag {
).orderBy(T.name.asc).build.query[RTag].to[List]
}
def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] =
def listCategories(coll: Ident): ConnectionIO[List[String]] =
Select(
coalesce(T.category.s, lit(fallback)).s,
T.category.s,
from(T),
T.cid === coll
T.cid === coll && T.category.isNotNull
).distinct.build.query[String].to[List]
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =