Control what tag categories to use for auto-tagging

2025-06-24 11:28:25 +00:00 · 2021-01-19 01:20:13 +01:00
parent cce8878898
commit a6f29153c4
16 changed files with 436 additions and 125 deletions
--- a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql
@ -14,8 +14,31 @@ select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name
 from "classifier_setting"
 where "file_id" is not null;

+-- replace the single "category" column by a json array of categories
+-- plus a list type that tells how this array is interpreted
+alter table "classifier_setting"
+add column "categories" text;
+
+alter table "classifier_setting"
+add column "category_list_type" varchar(254);
+
+-- all existing rows are marked as 'whitelist'
+update "classifier_setting"
+set "category_list_type" = 'whitelist';
+
+-- wrap an existing category into a one-element json array
+update "classifier_setting"
+set "categories" = concat('["', category, '"]')
+where category is not null;
+
+-- rows without a category get an empty json array
+update "classifier_setting"
+set "categories" = '[]'
+where category is null;
+
 alter table "classifier_setting"
 drop column "category";

 alter table "classifier_setting"
 drop column "file_id";
+
+-- every row has a value by now, so both new columns can be made mandatory
+ALTER TABLE "classifier_setting"
+ALTER COLUMN "categories" SET NOT NULL;
+
+ALTER TABLE "classifier_setting"
+ALTER COLUMN "category_list_type" SET NOT NULL;
--- a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql
@ -14,13 +14,35 @@ select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `f
 from `classifier_setting`
 where `file_id` is not null;

+-- replace the single `category` column by a json array of categories
+-- plus a list type that tells how this array is interpreted
+alter table `classifier_setting`
+add column (`categories` mediumtext);
+
+alter table `classifier_setting`
+add column (`category_list_type` varchar(254));
+
+-- all existing rows are marked as 'whitelist'
+update `classifier_setting`
+set `category_list_type` = 'whitelist';
+
+-- wrap an existing category into a one-element json array; json strings
+-- must be enclosed in double quotes, backticks only quote sql identifiers
+update `classifier_setting`
+set `categories` = concat('["', category, '"]')
+where category is not null;
+
+-- rows without a category get an empty json array
+update `classifier_setting`
+set `categories` = '[]'
+where category is null;
+
 alter table `classifier_setting`
 drop column `category`;

-- mariadb needs special treatment when dropping a column that is part
-- of an index and foreign key
+-- mariadb requires dropping the constraint manually when dropping a column
 alter table `classifier_setting`
 drop constraint `classifier_setting_ibfk_2`;

 alter table `classifier_setting`
 drop column `file_id`;
+
+-- every row has a value by now, so both new columns can be made mandatory
+ALTER TABLE `classifier_setting`
+MODIFY `categories` mediumtext NOT NULL;
+
+ALTER TABLE `classifier_setting`
+MODIFY `category_list_type` varchar(254) NOT NULL;
--- a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql
@ -14,8 +14,31 @@ select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name",
 from "classifier_setting"
 where "file_id" is not null;

+-- replace the single "category" column by a json array of categories
+-- plus a list type that tells how this array is interpreted
+alter table "classifier_setting"
+add column "categories" text;
+
+alter table "classifier_setting"
+add column "category_list_type" varchar(254);
+
+-- all existing rows are marked as 'whitelist'
+update "classifier_setting"
+set "category_list_type" = 'whitelist';
+
+-- wrap an existing category into a one-element json array
+update "classifier_setting"
+set "categories" = concat('["', category, '"]')
+where category is not null;
+
+-- rows without a category get an empty json array
+update "classifier_setting"
+set "categories" = '[]'
+where category is null;
+
 alter table "classifier_setting"
 drop column "category";

 alter table "classifier_setting"
 drop column "file_id";
+
+-- every row has a value by now, so both new columns can be made mandatory
+ALTER TABLE "classifier_setting"
+ALTER COLUMN "categories" SET NOT NULL;
+
+ALTER TABLE "classifier_setting"
+ALTER COLUMN "category_list_type" SET NOT NULL;
--- a/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala
@ -97,6 +97,9 @@ trait DoobieMeta extends EmilDoobieMeta {

  implicit val metaCustomFieldType: Meta[CustomFieldType] =
    Meta[String].timap(CustomFieldType.unsafe)(_.name)
+
+  // Maps a ListType to a string column: written via `name`, read back
+  // via `ListType.unsafeFromString`.
+  implicit val metaListType: Meta[ListType] =
+    Meta[String].timap(str => ListType.unsafeFromString(str))(lt => lt.name)
 }

 object DoobieMeta extends DoobieMeta {
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala
@ -57,7 +57,12 @@ object RClassifierModel {

  def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
    for {
-      n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid)))
+      now <- Timestamp.current[ConnectionIO]
+      n <- DML.update(
+        T,
+        T.cid === coll && T.name === name,
+        DML.set(T.fileId.setTo(fid), T.created.setTo(now))
+      )
      k <-
        if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert)
        else 0.pure[ConnectionIO]
@ -87,4 +92,11 @@ object RClassifierModel {
      .query[RClassifierModel]
      .to[List]

+  /** Loads all classifier models of collective `cid` whose name matches
+    * `nameQuery`. The query string is handed to the `like` condition
+    * unchanged.
+    */
+  def findAllByQuery(
+      cid: Ident,
+      nameQuery: String
+  ): ConnectionIO[List[RClassifierModel]] = {
+    val condition = T.cid === cid && T.name.like(nameQuery)
+    Select(select(T.all), from(T), condition).build
+      .query[RClassifierModel]
+      .to[List]
+  }
 }
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@ -1,6 +1,6 @@
 package docspell.store.records

-import cats.data.NonEmptyList
+import cats.data.{NonEmptyList, OptionT}
 import cats.implicits._

 import docspell.common._
@ -13,23 +13,38 @@ import doobie.implicits._

 case class RClassifierSetting(
    cid: Ident,
-    enabled: Boolean,
    schedule: CalEvent,
    itemCount: Int,
-    created: Timestamp
-) {}
+    created: Timestamp,
+    categoryList: List[String],
+    listType: ListType
+) {
+
+  def enabled: Boolean =
+    listType match {
+      case ListType.Blacklist =>
+        true
+      case ListType.Whitelist =>
+        categoryList.nonEmpty
+    }
+}

 object RClassifierSetting {
+  // the categoryList is stored as a json array
+  implicit val stringListMeta: Meta[List[String]] =
+    jsonMeta[List[String]]
+
  final case class Table(alias: Option[String]) extends TableDef {
    val tableName = "classifier_setting"

-    val cid       = Column[Ident]("cid", this)
-    val enabled   = Column[Boolean]("enabled", this)
-    val schedule  = Column[CalEvent]("schedule", this)
-    val itemCount = Column[Int]("item_count", this)
-    val created   = Column[Timestamp]("created", this)
+    val cid        = Column[Ident]("cid", this)
+    val schedule   = Column[CalEvent]("schedule", this)
+    val itemCount  = Column[Int]("item_count", this)
+    val created    = Column[Timestamp]("created", this)
+    val categories = Column[List[String]]("categories", this)
+    val listType   = Column[ListType]("category_list_type", this)
    val all = NonEmptyList
-      .of[Column[_]](cid, enabled, schedule, itemCount, created)
+      .of[Column[_]](cid, schedule, itemCount, created, categories, listType)
  }

  val T = Table(None)
@ -40,29 +55,19 @@ object RClassifierSetting {
    DML.insert(
      T,
      T.all,
-      fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}"
+      fr"${v.cid},${v.schedule},${v.itemCount},${v.created},${v.categoryList},${v.listType}"
    )

-  def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
-    DML.update(
-      T,
-      T.cid === v.cid,
-      DML.set(
-        T.enabled.setTo(v.enabled),
-        T.schedule.setTo(v.schedule),
-        T.itemCount.setTo(v.itemCount)
-      )
-    )
-
-  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
+  def update(v: RClassifierSetting): ConnectionIO[Int] =
    for {
      n1 <- DML.update(
        T,
        T.cid === v.cid,
        DML.set(
-          T.enabled.setTo(v.enabled),
          T.schedule.setTo(v.schedule),
-          T.itemCount.setTo(v.itemCount)
+          T.itemCount.setTo(v.itemCount),
+          T.categories.setTo(v.categoryList),
+          T.listType.setTo(v.listType)
        )
      )
      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
@ -76,24 +81,62 @@ object RClassifierSetting {
  def delete(coll: Ident): ConnectionIO[Int] =
    DML.delete(T, T.cid === coll)

+  /** Returns the existing tag categories that are active according to
+    * the classifier setting of the collective. With a blacklist, the
+    * listed categories are subtracted from the existing ones; with a
+    * whitelist, only listed categories that still exist are kept.
+    * Without a setting the result is empty.
+    */
+  def getActiveCategories(coll: Ident): ConnectionIO[List[String]] =
+    OptionT(findById(coll))
+      .semiflatMap { setting =>
+        RTag.listCategories(coll).map { existing =>
+          setting.listType match {
+            case ListType.Whitelist =>
+              setting.categoryList.intersect(existing)
+            case ListType.Blacklist =>
+              existing.diff(setting.categoryList)
+          }
+        }
+      }
+      .getOrElse(Nil)
+
+  /** Drops categories from the stored json array that do not exist
+    * anymore. The row is only written when the list actually changes;
+    * yields the update count, or 0 if nothing was written or no
+    * setting exists.
+    */
+  def fixCategoryList(coll: Ident): ConnectionIO[Int] =
+    OptionT(findById(coll))
+      .semiflatMap { setting =>
+        RTag.listCategories(coll).flatMap { existing =>
+          val kept = setting.categoryList.intersect(existing)
+          if (kept != setting.categoryList)
+            DML.update(T, T.cid === coll, DML.set(T.categories.setTo(kept)))
+          else 0.pure[ConnectionIO]
+        }
+      }
+      .getOrElse(0)
+
  case class Classifier(
-      enabled: Boolean,
      schedule: CalEvent,
-      itemCount: Int
+      itemCount: Int,
+      categories: List[String],
+      listType: ListType
  ) {
+    def enabled: Boolean =
+      listType match {
+        case ListType.Blacklist =>
+          true
+        case ListType.Whitelist =>
+          categories.nonEmpty
+      }

    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
      RClassifierSetting(
        coll,
-        enabled,
        schedule,
        itemCount,
-        created
+        created,
+        categories,
+        listType
      )
  }
  object Classifier {
    def fromRecord(r: RClassifierSetting): Classifier =
-      Classifier(r.enabled, r.schedule, r.itemCount)
+      Classifier(r.schedule, r.itemCount, r.categoryList, r.listType)
  }

 }
--- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala
@ -1,6 +1,6 @@
 package docspell.store.records

-import cats.data.NonEmptyList
+import cats.data.{NonEmptyList, OptionT}
 import fs2.Stream

 import docspell.common._
@ -73,13 +73,24 @@ object RCollective {
          .map(now => settings.classifier.map(_.toRecord(cid, now)))
      n2 <- cls match {
        case Some(cr) =>
-          RClassifierSetting.updateSettings(cr)
+          RClassifierSetting.update(cr)
        case None =>
          RClassifierSetting.delete(cid)
      }
    } yield n1 + n2

-  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+  // Hides categories that have been deleted in the meantime; they are finally
+  // removed from the json array once the learn classifier task is run. A missing
+  // classifier stays `None` in the result instead of discarding all settings.
+  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] =
+    (for {
+      sett <- OptionT(getRawSettings(coll))
+      cats <- OptionT.liftF(RTag.listCategories(coll))
+      cls = sett.classifier.map(c => c.copy(categories = c.categories.intersect(cats)))
+    } yield sett.copy(classifier = cls)).value
+
+  private def getRawSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+    import RClassifierSetting.stringListMeta
    val c  = RCollective.as("c")
    val cs = RClassifierSetting.as("cs")

@ -87,9 +98,10 @@ object RCollective {
      select(
        c.language.s,
        c.integration.s,
-        cs.enabled.s,
        cs.schedule.s,
-        cs.itemCount.s
+        cs.itemCount.s,
+        cs.categories.s,
+        cs.listType.s
      ),
      from(c).leftJoin(cs, cs.cid === c.id),
      c.id === coll