From cce88788987d2a6c22b41dd9fba12df90a29966a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 21:48:40 +0100 Subject: [PATCH] Exclude tags w/o category from classifying; remove obsolete models --- .../docspell/joex/learn/ClassifierName.scala | 28 ++++++++++++++++--- .../joex/learn/LearnClassifierTask.scala | 23 ++++++++++++--- .../store/records/RClassifierModel.scala | 14 +++++++++- .../scala/docspell/store/records/RTag.scala | 6 ++-- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala index 6b128c24..d667ff80 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -2,8 +2,12 @@ package docspell.joex.learn import cats.data.NonEmptyList import cats.implicits._ + import docspell.common.Ident +import docspell.store.qb.DSL._ +import docspell.store.qb._ import docspell.store.records.{RClassifierModel, RTag} + import doobie._ final class ClassifierName(val name: String) extends AnyVal @@ -12,9 +16,6 @@ object ClassifierName { def apply(name: String): ClassifierName = new ClassifierName(name) - val noCategory: ClassifierName = - apply("__docspell_no_category__") - val categoryPrefix = "tagcategory-" def tagCategory(cat: String): ClassifierName = @@ -34,7 +35,7 @@ object ClassifierName { def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = for { - categories <- RTag.listCategories(coll, noCategory.name) + categories <- RTag.listCategories(coll) models <- NonEmptyList.fromList(categories) match { case Some(nel) => RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name)) @@ -42,4 +43,23 @@ object ClassifierName { List.empty[RClassifierModel].pure[ConnectionIO] } } yield models + + def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = { + val model = RClassifierModel.as("m") + val tag = RTag.as("t") + val sql = + Select( + select(model.all), + from(model), + model.cid === coll && model.name.notIn( + Select( + select(concat(lit(categoryPrefix), tag.category.s)), + from(tag), + tag.cid === coll && tag.category.isNotNull + ).distinct + ) + ).build + sql.query[RClassifierModel].to[List] + } + } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 3949a151..52ee70ac 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -4,12 +4,13 @@ import cats.data.Kleisli import cats.data.OptionT import cats.effect._ import cats.implicits._ + import docspell.analysis.TextAnalyser import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ -import docspell.store.records.{RClassifierSetting, RTag} +import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag} object LearnClassifierTask { val pageSep = " --n-- " @@ -31,6 +32,7 @@ object LearnClassifierTask { _ <- OptionT.liftF( learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) ) + _ <- OptionT.liftF(clearObsoleteModels(ctx)) } yield ()) .getOrElseF(logInactiveWarning(ctx.logger)) } @@ -62,14 +64,27 @@ object LearnClassifierTask { ): Task[F, A, Unit] = Task { ctx => for { - cats <- ctx.store.transact( - RTag.listCategories(collective, ClassifierName.noCategory.name) - ) + cats <- ctx.store.transact(RTag.listCategories(collective)) task = learnTagCategory[F, A](analyser, collective, maxItems) _ _ <- cats.map(task).traverse(_.run(ctx)) } yield () } + private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = + for { + list <- ctx.store.transact( + ClassifierName.findOrphanTagModels(ctx.args.collective) + ) + _ <- ctx.logger.info( + s"Found ${list.size} obsolete model files that are deleted now." + ) + n <- ctx.store.transact(RClassifierModel.deleteAll(list.map(_.id))) + _ <- list + .map(_.fileId.id) + .traverse(id => ctx.store.bitpeace.delete(id).compile.drain) + _ <- ctx.logger.debug(s"Deleted $n model files.") + } yield () + private def findActiveSettings[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala index 2d018f81..cca0079c 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala @@ -1,7 +1,7 @@ package docspell.store.records -import cats.effect._ import cats.data.NonEmptyList +import cats.effect._ import cats.implicits._ import docspell.common._ @@ -63,6 +63,17 @@ object RClassifierModel { else 0.pure[ConnectionIO] } yield n + k + def deleteById(id: Ident): ConnectionIO[Int] = + DML.delete(T, T.id === id) + + def deleteAll(ids: List[Ident]): ConnectionIO[Int] = + NonEmptyList.fromList(ids) match { + case Some(nel) => + DML.delete(T, T.id.in(nel)) + case None => + 0.pure[ConnectionIO] + } + def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = Select(select(T.all), from(T), T.cid === cid && T.name === name).build .query[RClassifierModel] @@ -75,4 +86,5 @@ object RClassifierModel { Select(select(T.all), from(T), T.cid === cid && T.name.in(names)).build .query[RClassifierModel] .to[List] + } diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 5bba7d67..51f25912 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -148,11 +148,11 @@ object RTag { ).orderBy(T.name.asc).build.query[RTag].to[List] } - def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] = + def listCategories(coll: Ident): ConnectionIO[List[String]] = Select( - coalesce(T.category.s, lit(fallback)).s, + T.category.s, from(T), - T.cid === coll + T.cid === coll && T.category.isNotNull ).distinct.build.query[String].to[List] def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =