mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 07:05:59 +00:00
Control what tag categories to use for auto-tagging
This commit is contained in:
parent
cce8878898
commit
a6f29153c4
@ -131,7 +131,8 @@ val openapiScalaSettings = Seq(
|
|||||||
case "ident" =>
|
case "ident" =>
|
||||||
field => field.copy(typeDef = TypeDef("Ident", Imports("docspell.common.Ident")))
|
field => field.copy(typeDef = TypeDef("Ident", Imports("docspell.common.Ident")))
|
||||||
case "accountid" =>
|
case "accountid" =>
|
||||||
field => field.copy(typeDef = TypeDef("AccountId", Imports("docspell.common.AccountId")))
|
field =>
|
||||||
|
field.copy(typeDef = TypeDef("AccountId", Imports("docspell.common.AccountId")))
|
||||||
case "collectivestate" =>
|
case "collectivestate" =>
|
||||||
field =>
|
field =>
|
||||||
field.copy(typeDef =
|
field.copy(typeDef =
|
||||||
@ -190,6 +191,9 @@ val openapiScalaSettings = Seq(
|
|||||||
field.copy(typeDef =
|
field.copy(typeDef =
|
||||||
TypeDef("CustomFieldType", Imports("docspell.common.CustomFieldType"))
|
TypeDef("CustomFieldType", Imports("docspell.common.CustomFieldType"))
|
||||||
)
|
)
|
||||||
|
case "listtype" =>
|
||||||
|
field =>
|
||||||
|
field.copy(typeDef = TypeDef("ListType", Imports("docspell.common.ListType")))
|
||||||
}))
|
}))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
33
modules/common/src/main/scala/docspell/common/ListType.scala
Normal file
33
modules/common/src/main/scala/docspell/common/ListType.scala
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
|
||||||
|
import io.circe.{Decoder, Encoder}
|
||||||
|
|
||||||
|
sealed trait ListType { self: Product =>
|
||||||
|
def name: String =
|
||||||
|
productPrefix.toLowerCase
|
||||||
|
}
|
||||||
|
|
||||||
|
object ListType {
|
||||||
|
|
||||||
|
case object Whitelist extends ListType
|
||||||
|
val whitelist: ListType = Whitelist
|
||||||
|
|
||||||
|
case object Blacklist extends ListType
|
||||||
|
val blacklist: ListType = Blacklist
|
||||||
|
|
||||||
|
val all: NonEmptyList[ListType] = NonEmptyList.of(Whitelist, Blacklist)
|
||||||
|
|
||||||
|
def fromString(name: String): Either[String, ListType] =
|
||||||
|
all.find(_.name.equalsIgnoreCase(name)).toRight(s"Unknown list type: $name")
|
||||||
|
|
||||||
|
def unsafeFromString(name: String): ListType =
|
||||||
|
fromString(name).fold(sys.error, identity)
|
||||||
|
|
||||||
|
implicit val jsonEncoder: Encoder[ListType] =
|
||||||
|
Encoder.encodeString.contramap(_.name)
|
||||||
|
|
||||||
|
implicit val jsonDecoder: Decoder[ListType] =
|
||||||
|
Decoder.decodeString.emap(fromString)
|
||||||
|
}
|
@ -4,9 +4,7 @@ import cats.data.NonEmptyList
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common.Ident
|
import docspell.common.Ident
|
||||||
import docspell.store.qb.DSL._
|
import docspell.store.records.{RClassifierModel, RClassifierSetting}
|
||||||
import docspell.store.qb._
|
|
||||||
import docspell.store.records.{RClassifierModel, RTag}
|
|
||||||
|
|
||||||
import doobie._
|
import doobie._
|
||||||
|
|
||||||
@ -16,7 +14,7 @@ object ClassifierName {
|
|||||||
def apply(name: String): ClassifierName =
|
def apply(name: String): ClassifierName =
|
||||||
new ClassifierName(name)
|
new ClassifierName(name)
|
||||||
|
|
||||||
val categoryPrefix = "tagcategory-"
|
private val categoryPrefix = "tagcategory-"
|
||||||
|
|
||||||
def tagCategory(cat: String): ClassifierName =
|
def tagCategory(cat: String): ClassifierName =
|
||||||
apply(s"${categoryPrefix}${cat}")
|
apply(s"${categoryPrefix}${cat}")
|
||||||
@ -35,7 +33,7 @@ object ClassifierName {
|
|||||||
|
|
||||||
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
||||||
for {
|
for {
|
||||||
categories <- RTag.listCategories(coll)
|
categories <- RClassifierSetting.getActiveCategories(coll)
|
||||||
models <- NonEmptyList.fromList(categories) match {
|
models <- NonEmptyList.fromList(categories) match {
|
||||||
case Some(nel) =>
|
case Some(nel) =>
|
||||||
RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name))
|
RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name))
|
||||||
@ -44,22 +42,20 @@ object ClassifierName {
|
|||||||
}
|
}
|
||||||
} yield models
|
} yield models
|
||||||
|
|
||||||
def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = {
|
def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
|
||||||
val model = RClassifierModel.as("m")
|
for {
|
||||||
val tag = RTag.as("t")
|
cats <- RClassifierSetting.getActiveCategories(coll)
|
||||||
val sql =
|
allModels = RClassifierModel.findAllByQuery(coll, s"${categoryPrefix}%")
|
||||||
Select(
|
result <- NonEmptyList.fromList(cats) match {
|
||||||
select(model.all),
|
case Some(nel) =>
|
||||||
from(model),
|
allModels.flatMap(all =>
|
||||||
model.cid === coll && model.name.notIn(
|
RClassifierModel
|
||||||
Select(
|
.findAllByName(coll, nel.map(tagCategory).map(_.name))
|
||||||
select(concat(lit(categoryPrefix), tag.category.s)),
|
.map(active => all.diff(active))
|
||||||
from(tag),
|
)
|
||||||
tag.cid === coll && tag.category.isNotNull
|
case None =>
|
||||||
).distinct
|
allModels
|
||||||
)
|
}
|
||||||
).build
|
} yield result
|
||||||
sql.query[RClassifierModel].to[List]
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@ import docspell.backend.ops.OCollective
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.scheduler._
|
import docspell.joex.scheduler._
|
||||||
import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag}
|
import docspell.store.records.{RClassifierModel, RClassifierSetting}
|
||||||
|
|
||||||
object LearnClassifierTask {
|
object LearnClassifierTask {
|
||||||
val pageSep = " --n-- "
|
val pageSep = " --n-- "
|
||||||
@ -26,15 +26,23 @@ object LearnClassifierTask {
|
|||||||
analyser: TextAnalyser[F]
|
analyser: TextAnalyser[F]
|
||||||
): Task[F, Args, Unit] =
|
): Task[F, Args, Unit] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
(for {
|
val learnTags =
|
||||||
sett <- findActiveSettings[F](ctx, cfg)
|
for {
|
||||||
maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
|
sett <- findActiveSettings[F](ctx, cfg)
|
||||||
_ <- OptionT.liftF(
|
maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
|
||||||
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
|
_ <- OptionT.liftF(
|
||||||
)
|
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
|
||||||
_ <- OptionT.liftF(clearObsoleteModels(ctx))
|
)
|
||||||
} yield ())
|
} yield ()
|
||||||
.getOrElseF(logInactiveWarning(ctx.logger))
|
|
||||||
|
// learn classifier models from active tag categories
|
||||||
|
learnTags.getOrElseF(logInactiveWarning(ctx.logger)) *>
|
||||||
|
// delete classifier model files for categories that have been removed
|
||||||
|
clearObsoleteTagModels(ctx) *>
|
||||||
|
// when tags are deleted, categories may get removed. fix the json array
|
||||||
|
ctx.store
|
||||||
|
.transact(RClassifierSetting.fixCategoryList(ctx.args.collective))
|
||||||
|
.map(_ => ())
|
||||||
}
|
}
|
||||||
|
|
||||||
def learnTagCategory[F[_]: Sync: ContextShift, A](
|
def learnTagCategory[F[_]: Sync: ContextShift, A](
|
||||||
@ -64,13 +72,13 @@ object LearnClassifierTask {
|
|||||||
): Task[F, A, Unit] =
|
): Task[F, A, Unit] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
for {
|
for {
|
||||||
cats <- ctx.store.transact(RTag.listCategories(collective))
|
cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective))
|
||||||
task = learnTagCategory[F, A](analyser, collective, maxItems) _
|
task = learnTagCategory[F, A](analyser, collective, maxItems) _
|
||||||
_ <- cats.map(task).traverse(_.run(ctx))
|
_ <- cats.map(task).traverse(_.run(ctx))
|
||||||
} yield ()
|
} yield ()
|
||||||
}
|
}
|
||||||
|
|
||||||
private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] =
|
private def clearObsoleteTagModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] =
|
||||||
for {
|
for {
|
||||||
list <- ctx.store.transact(
|
list <- ctx.store.transact(
|
||||||
ClassifierName.findOrphanTagModels(ctx.args.collective)
|
ClassifierName.findOrphanTagModels(ctx.args.collective)
|
||||||
@ -98,6 +106,6 @@ object LearnClassifierTask {
|
|||||||
|
|
||||||
private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
|
private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
|
||||||
logger.warn(
|
logger.warn(
|
||||||
"Classification is disabled. Check joex config and the collective settings."
|
"Auto-tagging is disabled. Check joex config and the collective settings."
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -4850,12 +4850,11 @@ components:
|
|||||||
description: |
|
description: |
|
||||||
Settings for learning a document classifier.
|
Settings for learning a document classifier.
|
||||||
required:
|
required:
|
||||||
- enabled
|
|
||||||
- schedule
|
- schedule
|
||||||
- itemCount
|
- itemCount
|
||||||
|
- categoryList
|
||||||
|
- listType
|
||||||
properties:
|
properties:
|
||||||
enabled:
|
|
||||||
type: boolean
|
|
||||||
itemCount:
|
itemCount:
|
||||||
type: integer
|
type: integer
|
||||||
format: int32
|
format: int32
|
||||||
@ -4865,6 +4864,16 @@ components:
|
|||||||
schedule:
|
schedule:
|
||||||
type: string
|
type: string
|
||||||
format: calevent
|
format: calevent
|
||||||
|
categoryList:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
listType:
|
||||||
|
type: string
|
||||||
|
format: listtype
|
||||||
|
enum:
|
||||||
|
- blacklist
|
||||||
|
- whitelist
|
||||||
|
|
||||||
SourceList:
|
SourceList:
|
||||||
description: |
|
description: |
|
||||||
|
@ -6,7 +6,7 @@ import cats.implicits._
|
|||||||
import docspell.backend.BackendApp
|
import docspell.backend.BackendApp
|
||||||
import docspell.backend.auth.AuthToken
|
import docspell.backend.auth.AuthToken
|
||||||
import docspell.backend.ops.OCollective
|
import docspell.backend.ops.OCollective
|
||||||
import docspell.common.MakePreviewArgs
|
import docspell.common.{ListType, MakePreviewArgs}
|
||||||
import docspell.restapi.model._
|
import docspell.restapi.model._
|
||||||
import docspell.restserver.conv.Conversions
|
import docspell.restserver.conv.Conversions
|
||||||
import docspell.restserver.http4s._
|
import docspell.restserver.http4s._
|
||||||
@ -44,9 +44,10 @@ object CollectiveRoutes {
|
|||||||
settings.integrationEnabled,
|
settings.integrationEnabled,
|
||||||
Some(
|
Some(
|
||||||
OCollective.Classifier(
|
OCollective.Classifier(
|
||||||
settings.classifier.enabled,
|
|
||||||
settings.classifier.schedule,
|
settings.classifier.schedule,
|
||||||
settings.classifier.itemCount
|
settings.classifier.itemCount,
|
||||||
|
settings.classifier.categoryList,
|
||||||
|
settings.classifier.listType
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -64,11 +65,12 @@ object CollectiveRoutes {
|
|||||||
c.language,
|
c.language,
|
||||||
c.integrationEnabled,
|
c.integrationEnabled,
|
||||||
ClassifierSetting(
|
ClassifierSetting(
|
||||||
c.classifier.exists(_.enabled),
|
|
||||||
c.classifier.map(_.itemCount).getOrElse(0),
|
c.classifier.map(_.itemCount).getOrElse(0),
|
||||||
c.classifier
|
c.classifier
|
||||||
.map(_.schedule)
|
.map(_.schedule)
|
||||||
.getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
|
.getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00")),
|
||||||
|
c.classifier.map(_.categories).getOrElse(Nil),
|
||||||
|
c.classifier.map(_.listType).getOrElse(ListType.whitelist)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -14,8 +14,31 @@ select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name
|
|||||||
from "classifier_setting"
|
from "classifier_setting"
|
||||||
where "file_id" is not null;
|
where "file_id" is not null;
|
||||||
|
|
||||||
|
alter table "classifier_setting"
|
||||||
|
add column "categories" text;
|
||||||
|
|
||||||
|
alter table "classifier_setting"
|
||||||
|
add column "category_list_type" varchar(254);
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "category_list_type" = 'whitelist';
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "categories" = concat('["', category, '"]')
|
||||||
|
where category is not null;
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "categories" = '[]'
|
||||||
|
where category is null;
|
||||||
|
|
||||||
alter table "classifier_setting"
|
alter table "classifier_setting"
|
||||||
drop column "category";
|
drop column "category";
|
||||||
|
|
||||||
alter table "classifier_setting"
|
alter table "classifier_setting"
|
||||||
drop column "file_id";
|
drop column "file_id";
|
||||||
|
|
||||||
|
ALTER TABLE "classifier_setting"
|
||||||
|
ALTER COLUMN "categories" SET NOT NULL;
|
||||||
|
|
||||||
|
ALTER TABLE "classifier_setting"
|
||||||
|
ALTER COLUMN "category_list_type" SET NOT NULL;
|
||||||
|
@ -14,13 +14,35 @@ select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `f
|
|||||||
from `classifier_setting`
|
from `classifier_setting`
|
||||||
where `file_id` is not null;
|
where `file_id` is not null;
|
||||||
|
|
||||||
|
alter table `classifier_setting`
|
||||||
|
add column (`categories` mediumtext);
|
||||||
|
|
||||||
|
alter table `classifier_setting`
|
||||||
|
add column (`category_list_type` varchar(254));
|
||||||
|
|
||||||
|
update `classifier_setting`
|
||||||
|
set `category_list_type` = 'whitelist';
|
||||||
|
|
||||||
|
update `classifier_setting`
|
||||||
|
set `categories` = concat('[`', category, '`]')
|
||||||
|
where category is not null;
|
||||||
|
|
||||||
|
update `classifier_setting`
|
||||||
|
set `categories` = '[]'
|
||||||
|
where category is null;
|
||||||
|
|
||||||
alter table `classifier_setting`
|
alter table `classifier_setting`
|
||||||
drop column `category`;
|
drop column `category`;
|
||||||
|
|
||||||
-- mariadb needs special treatment when dropping a column that is part
|
-- mariadb requires to drop constraint manually when dropping a column
|
||||||
-- of an index and foreign key
|
|
||||||
alter table `classifier_setting`
|
alter table `classifier_setting`
|
||||||
drop constraint `classifier_setting_ibfk_2`;
|
drop constraint `classifier_setting_ibfk_2`;
|
||||||
|
|
||||||
alter table `classifier_setting`
|
alter table `classifier_setting`
|
||||||
drop column `file_id`;
|
drop column `file_id`;
|
||||||
|
|
||||||
|
ALTER TABLE `classifier_setting`
|
||||||
|
MODIFY `categories` mediumtext NOT NULL;
|
||||||
|
|
||||||
|
ALTER TABLE `classifier_setting`
|
||||||
|
MODIFY `category_list_type` varchar(254) NOT NULL;
|
||||||
|
@ -14,8 +14,31 @@ select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name",
|
|||||||
from "classifier_setting"
|
from "classifier_setting"
|
||||||
where "file_id" is not null;
|
where "file_id" is not null;
|
||||||
|
|
||||||
|
alter table "classifier_setting"
|
||||||
|
add column "categories" text;
|
||||||
|
|
||||||
|
alter table "classifier_setting"
|
||||||
|
add column "category_list_type" varchar(254);
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "category_list_type" = 'whitelist';
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "categories" = concat('["', category, '"]')
|
||||||
|
where category is not null;
|
||||||
|
|
||||||
|
update "classifier_setting"
|
||||||
|
set "categories" = '[]'
|
||||||
|
where category is null;
|
||||||
|
|
||||||
alter table "classifier_setting"
|
alter table "classifier_setting"
|
||||||
drop column "category";
|
drop column "category";
|
||||||
|
|
||||||
alter table "classifier_setting"
|
alter table "classifier_setting"
|
||||||
drop column "file_id";
|
drop column "file_id";
|
||||||
|
|
||||||
|
ALTER TABLE "classifier_setting"
|
||||||
|
ALTER COLUMN "categories" SET NOT NULL;
|
||||||
|
|
||||||
|
ALTER TABLE "classifier_setting"
|
||||||
|
ALTER COLUMN "category_list_type" SET NOT NULL;
|
||||||
|
@ -97,6 +97,9 @@ trait DoobieMeta extends EmilDoobieMeta {
|
|||||||
|
|
||||||
implicit val metaCustomFieldType: Meta[CustomFieldType] =
|
implicit val metaCustomFieldType: Meta[CustomFieldType] =
|
||||||
Meta[String].timap(CustomFieldType.unsafe)(_.name)
|
Meta[String].timap(CustomFieldType.unsafe)(_.name)
|
||||||
|
|
||||||
|
implicit val metaListType: Meta[ListType] =
|
||||||
|
Meta[String].timap(ListType.unsafeFromString)(_.name)
|
||||||
}
|
}
|
||||||
|
|
||||||
object DoobieMeta extends DoobieMeta {
|
object DoobieMeta extends DoobieMeta {
|
||||||
|
@ -57,7 +57,12 @@ object RClassifierModel {
|
|||||||
|
|
||||||
def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
|
def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
|
||||||
for {
|
for {
|
||||||
n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid)))
|
now <- Timestamp.current[ConnectionIO]
|
||||||
|
n <- DML.update(
|
||||||
|
T,
|
||||||
|
T.cid === coll && T.name === name,
|
||||||
|
DML.set(T.fileId.setTo(fid), T.created.setTo(now))
|
||||||
|
)
|
||||||
k <-
|
k <-
|
||||||
if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert)
|
if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert)
|
||||||
else 0.pure[ConnectionIO]
|
else 0.pure[ConnectionIO]
|
||||||
@ -87,4 +92,11 @@ object RClassifierModel {
|
|||||||
.query[RClassifierModel]
|
.query[RClassifierModel]
|
||||||
.to[List]
|
.to[List]
|
||||||
|
|
||||||
|
def findAllByQuery(
|
||||||
|
cid: Ident,
|
||||||
|
nameQuery: String
|
||||||
|
): ConnectionIO[List[RClassifierModel]] =
|
||||||
|
Select(select(T.all), from(T), T.cid === cid && T.name.like(nameQuery)).build
|
||||||
|
.query[RClassifierModel]
|
||||||
|
.to[List]
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
import cats.data.NonEmptyList
|
import cats.data.{NonEmptyList, OptionT}
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -13,23 +13,38 @@ import doobie.implicits._
|
|||||||
|
|
||||||
case class RClassifierSetting(
|
case class RClassifierSetting(
|
||||||
cid: Ident,
|
cid: Ident,
|
||||||
enabled: Boolean,
|
|
||||||
schedule: CalEvent,
|
schedule: CalEvent,
|
||||||
itemCount: Int,
|
itemCount: Int,
|
||||||
created: Timestamp
|
created: Timestamp,
|
||||||
) {}
|
categoryList: List[String],
|
||||||
|
listType: ListType
|
||||||
|
) {
|
||||||
|
|
||||||
|
def enabled: Boolean =
|
||||||
|
listType match {
|
||||||
|
case ListType.Blacklist =>
|
||||||
|
true
|
||||||
|
case ListType.Whitelist =>
|
||||||
|
categoryList.nonEmpty
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
object RClassifierSetting {
|
object RClassifierSetting {
|
||||||
|
// the categoryList is stored as a json array
|
||||||
|
implicit val stringListMeta: Meta[List[String]] =
|
||||||
|
jsonMeta[List[String]]
|
||||||
|
|
||||||
final case class Table(alias: Option[String]) extends TableDef {
|
final case class Table(alias: Option[String]) extends TableDef {
|
||||||
val tableName = "classifier_setting"
|
val tableName = "classifier_setting"
|
||||||
|
|
||||||
val cid = Column[Ident]("cid", this)
|
val cid = Column[Ident]("cid", this)
|
||||||
val enabled = Column[Boolean]("enabled", this)
|
val schedule = Column[CalEvent]("schedule", this)
|
||||||
val schedule = Column[CalEvent]("schedule", this)
|
val itemCount = Column[Int]("item_count", this)
|
||||||
val itemCount = Column[Int]("item_count", this)
|
val created = Column[Timestamp]("created", this)
|
||||||
val created = Column[Timestamp]("created", this)
|
val categories = Column[List[String]]("categories", this)
|
||||||
|
val listType = Column[ListType]("category_list_type", this)
|
||||||
val all = NonEmptyList
|
val all = NonEmptyList
|
||||||
.of[Column[_]](cid, enabled, schedule, itemCount, created)
|
.of[Column[_]](cid, schedule, itemCount, created, categories, listType)
|
||||||
}
|
}
|
||||||
|
|
||||||
val T = Table(None)
|
val T = Table(None)
|
||||||
@ -40,29 +55,19 @@ object RClassifierSetting {
|
|||||||
DML.insert(
|
DML.insert(
|
||||||
T,
|
T,
|
||||||
T.all,
|
T.all,
|
||||||
fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}"
|
fr"${v.cid},${v.schedule},${v.itemCount},${v.created},${v.categoryList},${v.listType}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
|
def update(v: RClassifierSetting): ConnectionIO[Int] =
|
||||||
DML.update(
|
|
||||||
T,
|
|
||||||
T.cid === v.cid,
|
|
||||||
DML.set(
|
|
||||||
T.enabled.setTo(v.enabled),
|
|
||||||
T.schedule.setTo(v.schedule),
|
|
||||||
T.itemCount.setTo(v.itemCount)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
|
|
||||||
for {
|
for {
|
||||||
n1 <- DML.update(
|
n1 <- DML.update(
|
||||||
T,
|
T,
|
||||||
T.cid === v.cid,
|
T.cid === v.cid,
|
||||||
DML.set(
|
DML.set(
|
||||||
T.enabled.setTo(v.enabled),
|
|
||||||
T.schedule.setTo(v.schedule),
|
T.schedule.setTo(v.schedule),
|
||||||
T.itemCount.setTo(v.itemCount)
|
T.itemCount.setTo(v.itemCount),
|
||||||
|
T.categories.setTo(v.categoryList),
|
||||||
|
T.listType.setTo(v.listType)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
|
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
|
||||||
@ -76,24 +81,62 @@ object RClassifierSetting {
|
|||||||
def delete(coll: Ident): ConnectionIO[Int] =
|
def delete(coll: Ident): ConnectionIO[Int] =
|
||||||
DML.delete(T, T.cid === coll)
|
DML.delete(T, T.cid === coll)
|
||||||
|
|
||||||
|
/** Finds tag categories that exist and match the classifier setting.
|
||||||
|
* If the setting contains a black list, they are removed from the
|
||||||
|
* existing categories. If it is a whitelist, the intersection is
|
||||||
|
* returned.
|
||||||
|
*/
|
||||||
|
def getActiveCategories(coll: Ident): ConnectionIO[List[String]] =
|
||||||
|
(for {
|
||||||
|
sett <- OptionT(findById(coll))
|
||||||
|
cats <- OptionT.liftF(RTag.listCategories(coll))
|
||||||
|
res = sett.listType match {
|
||||||
|
case ListType.Blacklist =>
|
||||||
|
cats.diff(sett.categoryList)
|
||||||
|
case ListType.Whitelist =>
|
||||||
|
sett.categoryList.intersect(cats)
|
||||||
|
}
|
||||||
|
} yield res).getOrElse(Nil)
|
||||||
|
|
||||||
|
/** Checks the json array of tag categories and removes those that are not present anymore. */
|
||||||
|
def fixCategoryList(coll: Ident): ConnectionIO[Int] =
|
||||||
|
(for {
|
||||||
|
sett <- OptionT(findById(coll))
|
||||||
|
cats <- OptionT.liftF(RTag.listCategories(coll))
|
||||||
|
fixed = sett.categoryList.intersect(cats)
|
||||||
|
n <- OptionT.liftF(
|
||||||
|
if (fixed == sett.categoryList) 0.pure[ConnectionIO]
|
||||||
|
else DML.update(T, T.cid === coll, DML.set(T.categories.setTo(fixed)))
|
||||||
|
)
|
||||||
|
} yield n).getOrElse(0)
|
||||||
|
|
||||||
case class Classifier(
|
case class Classifier(
|
||||||
enabled: Boolean,
|
|
||||||
schedule: CalEvent,
|
schedule: CalEvent,
|
||||||
itemCount: Int
|
itemCount: Int,
|
||||||
|
categories: List[String],
|
||||||
|
listType: ListType
|
||||||
) {
|
) {
|
||||||
|
def enabled: Boolean =
|
||||||
|
listType match {
|
||||||
|
case ListType.Blacklist =>
|
||||||
|
true
|
||||||
|
case ListType.Whitelist =>
|
||||||
|
categories.nonEmpty
|
||||||
|
}
|
||||||
|
|
||||||
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
|
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
|
||||||
RClassifierSetting(
|
RClassifierSetting(
|
||||||
coll,
|
coll,
|
||||||
enabled,
|
|
||||||
schedule,
|
schedule,
|
||||||
itemCount,
|
itemCount,
|
||||||
created
|
created,
|
||||||
|
categories,
|
||||||
|
listType
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
object Classifier {
|
object Classifier {
|
||||||
def fromRecord(r: RClassifierSetting): Classifier =
|
def fromRecord(r: RClassifierSetting): Classifier =
|
||||||
Classifier(r.enabled, r.schedule, r.itemCount)
|
Classifier(r.schedule, r.itemCount, r.categoryList, r.listType)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
import cats.data.NonEmptyList
|
import cats.data.{NonEmptyList, OptionT}
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -73,13 +73,24 @@ object RCollective {
|
|||||||
.map(now => settings.classifier.map(_.toRecord(cid, now)))
|
.map(now => settings.classifier.map(_.toRecord(cid, now)))
|
||||||
n2 <- cls match {
|
n2 <- cls match {
|
||||||
case Some(cr) =>
|
case Some(cr) =>
|
||||||
RClassifierSetting.updateSettings(cr)
|
RClassifierSetting.update(cr)
|
||||||
case None =>
|
case None =>
|
||||||
RClassifierSetting.delete(cid)
|
RClassifierSetting.delete(cid)
|
||||||
}
|
}
|
||||||
} yield n1 + n2
|
} yield n1 + n2
|
||||||
|
|
||||||
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
|
// this hides categories that have been deleted in the meantime
|
||||||
|
// they are finally removed from the json array once the learn classifier task is run
|
||||||
|
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] =
|
||||||
|
(for {
|
||||||
|
sett <- OptionT(getRawSettings(coll))
|
||||||
|
prev <- OptionT.fromOption[ConnectionIO](sett.classifier)
|
||||||
|
cats <- OptionT.liftF(RTag.listCategories(coll))
|
||||||
|
next = prev.copy(categories = prev.categories.intersect(cats))
|
||||||
|
} yield sett.copy(classifier = Some(next))).value
|
||||||
|
|
||||||
|
private def getRawSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
|
||||||
|
import RClassifierSetting.stringListMeta
|
||||||
val c = RCollective.as("c")
|
val c = RCollective.as("c")
|
||||||
val cs = RClassifierSetting.as("cs")
|
val cs = RClassifierSetting.as("cs")
|
||||||
|
|
||||||
@ -87,9 +98,10 @@ object RCollective {
|
|||||||
select(
|
select(
|
||||||
c.language.s,
|
c.language.s,
|
||||||
c.integration.s,
|
c.integration.s,
|
||||||
cs.enabled.s,
|
|
||||||
cs.schedule.s,
|
cs.schedule.s,
|
||||||
cs.itemCount.s
|
cs.itemCount.s,
|
||||||
|
cs.categories.s,
|
||||||
|
cs.listType.s
|
||||||
),
|
),
|
||||||
from(c).leftJoin(cs, cs.cid === c.id),
|
from(c).leftJoin(cs, cs.cid === c.id),
|
||||||
c.id === coll
|
c.id === coll
|
||||||
|
@ -11,31 +11,38 @@ import Api
|
|||||||
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
|
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
|
||||||
import Api.Model.TagList exposing (TagList)
|
import Api.Model.TagList exposing (TagList)
|
||||||
import Comp.CalEventInput
|
import Comp.CalEventInput
|
||||||
|
import Comp.Dropdown
|
||||||
import Comp.FixedDropdown
|
import Comp.FixedDropdown
|
||||||
import Comp.IntField
|
import Comp.IntField
|
||||||
import Data.CalEvent exposing (CalEvent)
|
import Data.CalEvent exposing (CalEvent)
|
||||||
import Data.Flags exposing (Flags)
|
import Data.Flags exposing (Flags)
|
||||||
|
import Data.ListType exposing (ListType)
|
||||||
|
import Data.UiSettings exposing (UiSettings)
|
||||||
import Data.Validated exposing (Validated(..))
|
import Data.Validated exposing (Validated(..))
|
||||||
import Html exposing (..)
|
import Html exposing (..)
|
||||||
import Html.Attributes exposing (..)
|
import Html.Attributes exposing (..)
|
||||||
import Html.Events exposing (onCheck)
|
|
||||||
import Http
|
import Http
|
||||||
|
import Markdown
|
||||||
import Util.Tag
|
import Util.Tag
|
||||||
|
|
||||||
|
|
||||||
type alias Model =
|
type alias Model =
|
||||||
{ enabled : Bool
|
{ scheduleModel : Comp.CalEventInput.Model
|
||||||
, scheduleModel : Comp.CalEventInput.Model
|
|
||||||
, schedule : Validated CalEvent
|
, schedule : Validated CalEvent
|
||||||
, itemCountModel : Comp.IntField.Model
|
, itemCountModel : Comp.IntField.Model
|
||||||
, itemCount : Maybe Int
|
, itemCount : Maybe Int
|
||||||
|
, categoryListModel : Comp.Dropdown.Model String
|
||||||
|
, categoryListType : ListType
|
||||||
|
, categoryListTypeModel : Comp.FixedDropdown.Model ListType
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
type Msg
|
type Msg
|
||||||
= ScheduleMsg Comp.CalEventInput.Msg
|
= ScheduleMsg Comp.CalEventInput.Msg
|
||||||
| ToggleEnabled
|
|
||||||
| ItemCountMsg Comp.IntField.Msg
|
| ItemCountMsg Comp.IntField.Msg
|
||||||
|
| GetTagsResp (Result Http.Error TagList)
|
||||||
|
| CategoryListMsg (Comp.Dropdown.Msg String)
|
||||||
|
| CategoryListTypeMsg (Comp.FixedDropdown.Msg ListType)
|
||||||
|
|
||||||
|
|
||||||
init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
|
init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
|
||||||
@ -48,13 +55,41 @@ init flags sett =
|
|||||||
( cem, cec ) =
|
( cem, cec ) =
|
||||||
Comp.CalEventInput.init flags newSchedule
|
Comp.CalEventInput.init flags newSchedule
|
||||||
in
|
in
|
||||||
( { enabled = sett.enabled
|
( { scheduleModel = cem
|
||||||
, scheduleModel = cem
|
|
||||||
, schedule = Data.Validated.Unknown newSchedule
|
, schedule = Data.Validated.Unknown newSchedule
|
||||||
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
|
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
|
||||||
, itemCount = Just sett.itemCount
|
, itemCount = Just sett.itemCount
|
||||||
|
, categoryListModel =
|
||||||
|
let
|
||||||
|
mkOption s =
|
||||||
|
{ value = s, text = s, additional = "" }
|
||||||
|
|
||||||
|
minit =
|
||||||
|
Comp.Dropdown.makeModel
|
||||||
|
{ multiple = True
|
||||||
|
, searchable = \n -> n > 0
|
||||||
|
, makeOption = mkOption
|
||||||
|
, labelColor = \_ -> \_ -> "grey "
|
||||||
|
, placeholder = "Choose categories …"
|
||||||
|
}
|
||||||
|
|
||||||
|
lm =
|
||||||
|
Comp.Dropdown.SetSelection sett.categoryList
|
||||||
|
|
||||||
|
( m_, _ ) =
|
||||||
|
Comp.Dropdown.update lm minit
|
||||||
|
in
|
||||||
|
m_
|
||||||
|
, categoryListType =
|
||||||
|
Data.ListType.fromString sett.listType
|
||||||
|
|> Maybe.withDefault Data.ListType.Whitelist
|
||||||
|
, categoryListTypeModel =
|
||||||
|
Comp.FixedDropdown.initMap Data.ListType.label Data.ListType.all
|
||||||
}
|
}
|
||||||
, Cmd.map ScheduleMsg cec
|
, Cmd.batch
|
||||||
|
[ Api.getTags flags "" GetTagsResp
|
||||||
|
, Cmd.map ScheduleMsg cec
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -62,10 +97,11 @@ getSettings : Model -> Validated ClassifierSetting
|
|||||||
getSettings model =
|
getSettings model =
|
||||||
Data.Validated.map
|
Data.Validated.map
|
||||||
(\sch ->
|
(\sch ->
|
||||||
{ enabled = model.enabled
|
{ schedule =
|
||||||
, schedule =
|
|
||||||
Data.CalEvent.makeEvent sch
|
Data.CalEvent.makeEvent sch
|
||||||
, itemCount = Maybe.withDefault 0 model.itemCount
|
, itemCount = Maybe.withDefault 0 model.itemCount
|
||||||
|
, listType = Data.ListType.toString model.categoryListType
|
||||||
|
, categoryList = Comp.Dropdown.getSelected model.categoryListModel
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
model.schedule
|
model.schedule
|
||||||
@ -74,6 +110,20 @@ getSettings model =
|
|||||||
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
|
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
|
||||||
update flags msg model =
|
update flags msg model =
|
||||||
case msg of
|
case msg of
|
||||||
|
GetTagsResp (Ok tl) ->
|
||||||
|
let
|
||||||
|
categories =
|
||||||
|
Util.Tag.getCategories tl.items
|
||||||
|
|> List.sort
|
||||||
|
|
||||||
|
lm =
|
||||||
|
Comp.Dropdown.SetOptions categories
|
||||||
|
in
|
||||||
|
update flags (CategoryListMsg lm) model
|
||||||
|
|
||||||
|
GetTagsResp (Err _) ->
|
||||||
|
( model, Cmd.none )
|
||||||
|
|
||||||
ScheduleMsg lmsg ->
|
ScheduleMsg lmsg ->
|
||||||
let
|
let
|
||||||
( cm, cc, ce ) =
|
( cm, cc, ce ) =
|
||||||
@ -90,11 +140,6 @@ update flags msg model =
|
|||||||
, Cmd.map ScheduleMsg cc
|
, Cmd.map ScheduleMsg cc
|
||||||
)
|
)
|
||||||
|
|
||||||
ToggleEnabled ->
|
|
||||||
( { model | enabled = not model.enabled }
|
|
||||||
, Cmd.none
|
|
||||||
)
|
|
||||||
|
|
||||||
ItemCountMsg lmsg ->
|
ItemCountMsg lmsg ->
|
||||||
let
|
let
|
||||||
( im, iv ) =
|
( im, iv ) =
|
||||||
@ -107,32 +152,61 @@ update flags msg model =
|
|||||||
, Cmd.none
|
, Cmd.none
|
||||||
)
|
)
|
||||||
|
|
||||||
|
CategoryListMsg lm ->
|
||||||
|
let
|
||||||
|
( m_, cmd_ ) =
|
||||||
|
Comp.Dropdown.update lm model.categoryListModel
|
||||||
|
in
|
||||||
|
( { model | categoryListModel = m_ }
|
||||||
|
, Cmd.map CategoryListMsg cmd_
|
||||||
|
)
|
||||||
|
|
||||||
view : Model -> Html Msg
|
CategoryListTypeMsg lm ->
|
||||||
view model =
|
let
|
||||||
|
( m_, sel ) =
|
||||||
|
Comp.FixedDropdown.update lm model.categoryListTypeModel
|
||||||
|
|
||||||
|
newListType =
|
||||||
|
Maybe.withDefault model.categoryListType sel
|
||||||
|
in
|
||||||
|
( { model
|
||||||
|
| categoryListTypeModel = m_
|
||||||
|
, categoryListType = newListType
|
||||||
|
}
|
||||||
|
, Cmd.none
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
view : UiSettings -> Model -> Html Msg
|
||||||
|
view settings model =
|
||||||
|
let
|
||||||
|
catListTypeItem =
|
||||||
|
Comp.FixedDropdown.Item
|
||||||
|
model.categoryListType
|
||||||
|
(Data.ListType.label model.categoryListType)
|
||||||
|
in
|
||||||
div []
|
div []
|
||||||
[ div
|
[ Markdown.toHtml [ class "ui basic segment" ]
|
||||||
[ class "field"
|
"""
|
||||||
|
|
||||||
|
Auto-tagging works by learning from existing documents. The more
|
||||||
|
documents you have correctly tagged, the better. Learning is done
|
||||||
|
periodically based on a schedule. You can specify tag-groups that
|
||||||
|
should either be used (whitelist) or not used (blacklist) for
|
||||||
|
learning.
|
||||||
|
|
||||||
|
Use an empty whitelist to disable auto tagging.
|
||||||
|
|
||||||
|
"""
|
||||||
|
, div [ class "field" ]
|
||||||
|
[ label [] [ text "Is the following a blacklist or whitelist?" ]
|
||||||
|
, Html.map CategoryListTypeMsg
|
||||||
|
(Comp.FixedDropdown.view (Just catListTypeItem) model.categoryListTypeModel)
|
||||||
]
|
]
|
||||||
[ div [ class "ui checkbox" ]
|
, div [ class "field" ]
|
||||||
[ input
|
[ label [] [ text "Choose tag categories for learning" ]
|
||||||
[ type_ "checkbox"
|
, Html.map CategoryListMsg
|
||||||
, onCheck (\_ -> ToggleEnabled)
|
(Comp.Dropdown.view settings model.categoryListModel)
|
||||||
, checked model.enabled
|
|
||||||
]
|
|
||||||
[]
|
|
||||||
, label [] [ text "Enable classification" ]
|
|
||||||
, span [ class "small-info" ]
|
|
||||||
[ text "Disable document classification if not needed."
|
|
||||||
]
|
|
||||||
]
|
|
||||||
]
|
|
||||||
, div [ class "ui basic segment" ]
|
|
||||||
[ text "Document classification tries to predict a tag for new incoming documents. This "
|
|
||||||
, text "works by learning from existing documents in order to find common patterns within "
|
|
||||||
, text "the text. The more documents you have correctly tagged, the better. Learning is done "
|
|
||||||
, text "periodically based on a schedule and you need to specify a tag-group that should "
|
|
||||||
, text "be used for learning."
|
|
||||||
]
|
]
|
||||||
, Html.map ItemCountMsg
|
, Html.map ItemCountMsg
|
||||||
(Comp.IntField.viewWithInfo
|
(Comp.IntField.viewWithInfo
|
||||||
|
@ -280,7 +280,7 @@ view flags settings model =
|
|||||||
, ( "invisible hidden", not flags.config.showClassificationSettings )
|
, ( "invisible hidden", not flags.config.showClassificationSettings )
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
[ text "Document Classifier"
|
[ text "Auto-Tagging"
|
||||||
]
|
]
|
||||||
, div
|
, div
|
||||||
[ classList
|
[ classList
|
||||||
@ -289,13 +289,10 @@ view flags settings model =
|
|||||||
]
|
]
|
||||||
]
|
]
|
||||||
[ Html.map ClassifierSettingMsg
|
[ Html.map ClassifierSettingMsg
|
||||||
(Comp.ClassifierSettingsForm.view model.classifierModel)
|
(Comp.ClassifierSettingsForm.view settings model.classifierModel)
|
||||||
, div [ class "ui vertical segment" ]
|
, div [ class "ui vertical segment" ]
|
||||||
[ button
|
[ button
|
||||||
[ classList
|
[ class "ui small secondary basic button"
|
||||||
[ ( "ui small secondary basic button", True )
|
|
||||||
, ( "disabled", not model.classifierModel.enabled )
|
|
||||||
]
|
|
||||||
, title "Starts a task to train a classifier"
|
, title "Starts a task to train a classifier"
|
||||||
, onClick StartClassifierTask
|
, onClick StartClassifierTask
|
||||||
]
|
]
|
||||||
|
50
modules/webapp/src/main/elm/Data/ListType.elm
Normal file
50
modules/webapp/src/main/elm/Data/ListType.elm
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
module Data.ListType exposing
|
||||||
|
( ListType(..)
|
||||||
|
, all
|
||||||
|
, fromString
|
||||||
|
, label
|
||||||
|
, toString
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
type ListType
|
||||||
|
= Blacklist
|
||||||
|
| Whitelist
|
||||||
|
|
||||||
|
|
||||||
|
all : List ListType
|
||||||
|
all =
|
||||||
|
[ Blacklist, Whitelist ]
|
||||||
|
|
||||||
|
|
||||||
|
toString : ListType -> String
|
||||||
|
toString lt =
|
||||||
|
case lt of
|
||||||
|
Blacklist ->
|
||||||
|
"blacklist"
|
||||||
|
|
||||||
|
Whitelist ->
|
||||||
|
"whitelist"
|
||||||
|
|
||||||
|
|
||||||
|
label : ListType -> String
|
||||||
|
label lt =
|
||||||
|
case lt of
|
||||||
|
Blacklist ->
|
||||||
|
"Blacklist"
|
||||||
|
|
||||||
|
Whitelist ->
|
||||||
|
"Whitelist"
|
||||||
|
|
||||||
|
|
||||||
|
fromString : String -> Maybe ListType
|
||||||
|
fromString str =
|
||||||
|
case String.toLower str of
|
||||||
|
"blacklist" ->
|
||||||
|
Just Blacklist
|
||||||
|
|
||||||
|
"whitelist" ->
|
||||||
|
Just Whitelist
|
||||||
|
|
||||||
|
_ ->
|
||||||
|
Nothing
|
Loading…
x
Reference in New Issue
Block a user