Control what tag categories to use for auto-tagging

This commit is contained in:
Eike Kettner
2021-01-19 01:20:13 +01:00
parent cce8878898
commit a6f29153c4
16 changed files with 436 additions and 125 deletions

View File

@ -14,8 +14,31 @@ select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name
from "classifier_setting"
where "file_id" is not null;
-- Replace the single "category" column of "classifier_setting" with a
-- list of categories ("categories", a JSON array stored as text) and a
-- list type ("category_list_type").
alter table "classifier_setting"
add column "categories" text;
alter table "classifier_setting"
add column "category_list_type" varchar(254);
-- every existing row becomes a whitelist
update "classifier_setting"
set "category_list_type" = 'whitelist';
-- wrap the former single category into a one-element JSON array
-- NOTE(review): the value is not JSON-escaped; a category containing
-- '"' or '\' would produce invalid JSON - confirm no such values exist
update "classifier_setting"
set "categories" = concat('["', category, '"]')
where category is not null;
-- rows without a category get an empty JSON array
update "classifier_setting"
set "categories" = '[]'
where category is null;
-- the old columns are no longer needed
alter table "classifier_setting"
drop column "category";
alter table "classifier_setting"
drop column "file_id";
-- all rows have been populated above, so both new columns can be made mandatory
ALTER TABLE "classifier_setting"
ALTER COLUMN "categories" SET NOT NULL;
ALTER TABLE "classifier_setting"
ALTER COLUMN "category_list_type" SET NOT NULL;

View File

@ -14,13 +14,35 @@ select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `f
from `classifier_setting`
where `file_id` is not null;
-- Replace the single `category` column of `classifier_setting` with a
-- list of categories (`categories`, a JSON array stored as text) and a
-- list type (`category_list_type`).
alter table `classifier_setting`
add column (`categories` mediumtext);
alter table `classifier_setting`
add column (`category_list_type` varchar(254));
-- every existing row becomes a whitelist
update `classifier_setting`
set `category_list_type` = 'whitelist';
-- wrap the former single category into a one-element JSON array;
-- JSON strings must be delimited by double quotes (backticks only quote
-- SQL identifiers and would end up verbatim in the stored text)
update `classifier_setting`
set `categories` = concat('["', `category`, '"]')
where `category` is not null;
-- rows without a category get an empty JSON array
update `classifier_setting`
set `categories` = '[]'
where `category` is null;
alter table `classifier_setting`
drop column `category`;
-- mariadb needs special treatment when dropping a column that is part
-- of an index and foreign key: the constraint must be dropped manually
-- before the column can be dropped
alter table `classifier_setting`
drop constraint `classifier_setting_ibfk_2`;
alter table `classifier_setting`
drop column `file_id`;
-- all rows have been populated above, so both new columns can be made mandatory
ALTER TABLE `classifier_setting`
MODIFY `categories` mediumtext NOT NULL;
ALTER TABLE `classifier_setting`
MODIFY `category_list_type` varchar(254) NOT NULL;

View File

@ -14,8 +14,31 @@ select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name",
from "classifier_setting"
where "file_id" is not null;
-- Replace the single "category" column of "classifier_setting" with a
-- list of categories ("categories", a JSON array stored as text) and a
-- list type ("category_list_type").
alter table "classifier_setting"
add column "categories" text;
alter table "classifier_setting"
add column "category_list_type" varchar(254);
-- every existing row becomes a whitelist
update "classifier_setting"
set "category_list_type" = 'whitelist';
-- wrap the former single category into a one-element JSON array
-- NOTE(review): the value is not JSON-escaped; a category containing
-- '"' or '\' would produce invalid JSON - confirm no such values exist
update "classifier_setting"
set "categories" = concat('["', category, '"]')
where category is not null;
-- rows without a category get an empty JSON array
update "classifier_setting"
set "categories" = '[]'
where category is null;
-- the old columns are no longer needed
alter table "classifier_setting"
drop column "category";
alter table "classifier_setting"
drop column "file_id";
-- all rows have been populated above, so both new columns can be made mandatory
ALTER TABLE "classifier_setting"
ALTER COLUMN "categories" SET NOT NULL;
ALTER TABLE "classifier_setting"
ALTER COLUMN "category_list_type" SET NOT NULL;

View File

@ -97,6 +97,9 @@ trait DoobieMeta extends EmilDoobieMeta {
/** Maps a [[CustomFieldType]] to a string column: written as its `name`,
  * read back through `CustomFieldType.unsafe`.
  */
implicit val metaCustomFieldType: Meta[CustomFieldType] =
  Meta[String].timap((str: String) => CustomFieldType.unsafe(str))(fieldType =>
    fieldType.name
  )
/** Maps a [[ListType]] to a string column: written as its `name`,
  * read back through `ListType.unsafeFromString`.
  */
implicit val metaListType: Meta[ListType] =
  Meta[String].timap((str: String) => ListType.unsafeFromString(str))(listType =>
    listType.name
  )
}
object DoobieMeta extends DoobieMeta {

View File

@ -57,7 +57,12 @@ object RClassifierModel {
def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
for {
n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid)))
now <- Timestamp.current[ConnectionIO]
n <- DML.update(
T,
T.cid === coll && T.name === name,
DML.set(T.fileId.setTo(fid), T.created.setTo(now))
)
k <-
if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert)
else 0.pure[ConnectionIO]
@ -87,4 +92,11 @@ object RClassifierModel {
.query[RClassifierModel]
.to[List]
/** Loads all classifier models of collective `cid` whose name satisfies
  * the `like` condition built from `nameQuery`.
  */
def findAllByQuery(
    cid: Ident,
    nameQuery: String
): ConnectionIO[List[RClassifierModel]] = {
  val condition = T.cid === cid && T.name.like(nameQuery)
  val selection = Select(select(T.all), from(T), condition)
  selection.build
    .query[RClassifierModel]
    .to[List]
}
}

View File

@ -1,6 +1,6 @@
package docspell.store.records
import cats.data.NonEmptyList
import cats.data.{NonEmptyList, OptionT}
import cats.implicits._
import docspell.common._
@ -13,23 +13,38 @@ import doobie.implicits._
case class RClassifierSetting(
cid: Ident,
enabled: Boolean,
schedule: CalEvent,
itemCount: Int,
created: Timestamp
) {}
created: Timestamp,
categoryList: List[String],
listType: ListType
) {
/** Whether classification is enabled: a blacklist always enables it, a
  * whitelist only if it names at least one category.
  */
def enabled: Boolean =
  listType match {
    case ListType.Whitelist => categoryList.nonEmpty
    case ListType.Blacklist => true
  }
}
object RClassifierSetting {
// The categoryList is stored as a JSON array in a single column; this
// instance, obtained from `jsonMeta`, lets `List[String]` values be used
// as column values for it.
implicit val stringListMeta: Meta[List[String]] =
jsonMeta[List[String]]
/** Describes the `classifier_setting` table and its columns.
  *
  * `all` collects the columns; the insert statement in this object
  * supplies its values in that same order.
  */
final case class Table(alias: Option[String]) extends TableDef {
val tableName = "classifier_setting"
val cid = Column[Ident]("cid", this)
val enabled = Column[Boolean]("enabled", this)
val schedule = Column[CalEvent]("schedule", this)
val itemCount = Column[Int]("item_count", this)
val created = Column[Timestamp]("created", this)
val cid = Column[Ident]("cid", this)
val schedule = Column[CalEvent]("schedule", this)
val itemCount = Column[Int]("item_count", this)
val created = Column[Timestamp]("created", this)
val categories = Column[List[String]]("categories", this)
val listType = Column[ListType]("category_list_type", this)
val all = NonEmptyList
.of[Column[_]](cid, enabled, schedule, itemCount, created)
.of[Column[_]](cid, schedule, itemCount, created, categories, listType)
}
val T = Table(None)
@ -40,29 +55,19 @@ object RClassifierSetting {
DML.insert(
T,
T.all,
fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}"
fr"${v.cid},${v.schedule},${v.itemCount},${v.created},${v.categoryList},${v.listType}"
)
/** Updates the `enabled`, `schedule` and `itemCount` columns of the row
  * whose `cid` equals `v.cid`. The `created` column is not touched.
  * The resulting `Int` is presumably the number of updated rows (as
  * reported by `DML.update`) - confirm.
  */
def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
DML.update(
T,
T.cid === v.cid,
DML.set(
T.enabled.setTo(v.enabled),
T.schedule.setTo(v.schedule),
T.itemCount.setTo(v.itemCount)
)
)
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
def update(v: RClassifierSetting): ConnectionIO[Int] =
for {
n1 <- DML.update(
T,
T.cid === v.cid,
DML.set(
T.enabled.setTo(v.enabled),
T.schedule.setTo(v.schedule),
T.itemCount.setTo(v.itemCount)
T.itemCount.setTo(v.itemCount),
T.categories.setTo(v.categoryList),
T.listType.setTo(v.listType)
)
)
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
@ -76,24 +81,62 @@ object RClassifierSetting {
/** Deletes the classifier setting whose `cid` equals `coll`. */
def delete(coll: Ident): ConnectionIO[Int] = {
  val byCollective = T.cid === coll
  DML.delete(T, byCollective)
}
/** Returns the existing tag categories that the classifier setting of
  * the given collective selects.
  *
  * For a blacklist, the listed categories are removed from the existing
  * ones; for a whitelist, only the listed categories that still exist
  * are returned. Without a classifier setting the result is empty.
  */
def getActiveCategories(coll: Ident): ConnectionIO[List[String]] =
  OptionT(findById(coll))
    .semiflatMap { setting =>
      // categories are only looked up when a setting exists
      RTag.listCategories(coll).map { existing =>
        setting.listType match {
          case ListType.Whitelist =>
            setting.categoryList.intersect(existing)
          case ListType.Blacklist =>
            existing.diff(setting.categoryList)
        }
      }
    }
    .getOrElse(Nil)
/** Removes categories from the stored json array that no longer exist.
  *
  * The row is only written when the list actually changes; yields 0 if
  * there is no setting for the collective or nothing had to be fixed.
  */
def fixCategoryList(coll: Ident): ConnectionIO[Int] =
  OptionT(findById(coll))
    .semiflatMap { setting =>
      RTag.listCategories(coll).flatMap { existing =>
        val remaining = setting.categoryList.intersect(existing)
        if (remaining != setting.categoryList)
          DML.update(T, T.cid === coll, DML.set(T.categories.setTo(remaining)))
        else
          0.pure[ConnectionIO]
      }
    }
    .getOrElse(0)
case class Classifier(
enabled: Boolean,
schedule: CalEvent,
itemCount: Int
itemCount: Int,
categories: List[String],
listType: ListType
) {
/** Whether classification is enabled: a blacklist always enables it, a
  * whitelist only if it names at least one category.
  */
def enabled: Boolean =
  listType match {
    case ListType.Whitelist => categories.nonEmpty
    case ListType.Blacklist => true
  }
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
RClassifierSetting(
coll,
enabled,
schedule,
itemCount,
created
created,
categories,
listType
)
}
object Classifier {
def fromRecord(r: RClassifierSetting): Classifier =
Classifier(r.enabled, r.schedule, r.itemCount)
Classifier(r.schedule, r.itemCount, r.categoryList, r.listType)
}
}

View File

@ -1,6 +1,6 @@
package docspell.store.records
import cats.data.NonEmptyList
import cats.data.{NonEmptyList, OptionT}
import fs2.Stream
import docspell.common._
@ -73,13 +73,24 @@ object RCollective {
.map(now => settings.classifier.map(_.toRecord(cid, now)))
n2 <- cls match {
case Some(cr) =>
RClassifierSetting.updateSettings(cr)
RClassifierSetting.update(cr)
case None =>
RClassifierSetting.delete(cid)
}
} yield n1 + n2
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
/** Loads the settings of the given collective.
  *
  * Categories in the classifier setting that have been deleted in the
  * meantime are hidden here; they are finally removed from the json
  * array once the learn classifier task is run.
  *
  * A collective without a classifier setting still yields its other
  * settings, with `classifier` left as `None`. The result is `None`
  * only if `getRawSettings` finds nothing for `coll`.
  */
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] =
  (for {
    sett <- OptionT(getRawSettings(coll))
    cats <- OptionT.liftF(RTag.listCategories(coll))
    // map over the optional classifier instead of short-circuiting on it,
    // so a missing classifier does not discard the whole settings value
    next = sett.classifier.map(prev =>
      prev.copy(categories = prev.categories.intersect(cats))
    )
  } yield sett.copy(classifier = next)).value
private def getRawSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
import RClassifierSetting.stringListMeta
val c = RCollective.as("c")
val cs = RClassifierSetting.as("cs")
@ -87,9 +98,10 @@ object RCollective {
select(
c.language.s,
c.integration.s,
cs.enabled.s,
cs.schedule.s,
cs.itemCount.s
cs.itemCount.s,
cs.categories.s,
cs.listType.s
),
from(c).leftJoin(cs, cs.cid === c.id),
c.id === coll