Extend guessing tags to all tag categories

2025-08-05 02:24:52 +00:00 · 2021-01-18 13:35:53 +01:00
parent c5778880d9
commit 249f9e6e2a
18 changed files with 384 additions and 168 deletions
--- a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala
@ -11,6 +11,7 @@ import docspell.analysis.classifier
 import docspell.analysis.classifier.TextClassifier._
 import docspell.analysis.nlp.Properties
 import docspell.common._
 import docspell.common.syntax.FileSyntax._
 import edu.stanford.nlp.classify.ColumnDataClassifier
@ -28,7 +29,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
      .use { dir =>
        for {
          rawData   <- writeDataFile(blocker, dir, data)
-          _         <- logger.info(s"Learning from ${rawData.count} items.")
+          _         <- logger.debug(s"Learning from ${rawData.count} items.")
          trainData <- splitData(logger, rawData)
          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
          sorted = scores.sortBy(-_.score)
@ -138,9 +139,9 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
      props: Map[String, String]
  ): Map[String, String] =
    prepend("2.", props) ++ Map(
-      "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
+      "trainFile"   -> trainData.train.absolutePathAsString,
-      "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
+      "testFile"    -> trainData.test.absolutePathAsString,
-      "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
+      "serializeTo" -> trainData.modelFile.absolutePathAsString
    ).toList
  case class RawData(count: Long, file: Path)
--- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
+++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
@ -169,7 +169,7 @@ object JoexAppImpl {
        .withTask(
          JobTask.json(
            LearnClassifierArgs.taskName,
-            LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
+            LearnClassifierTask[F](cfg.textAnalysis, analyser),
            LearnClassifierTask.onCancel[F]
          )
        )
--- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala
@ -0,0 +1,45 @@
 package docspell.joex.learn
 import cats.data.NonEmptyList
 import cats.implicits._
 import docspell.common.Ident
 import docspell.store.records.{RClassifierModel, RTag}
 import doobie._
 /** Name under which a trained classifier model is stored; a value class
   * around the plain string.
   */
 final class ClassifierName(val name: String) extends AnyVal
 object ClassifierName {
  /** Wraps the given string as is; no validation is applied. */
  def apply(name: String): ClassifierName =
    new ClassifierName(name)
  /** Its `name` is handed to `RTag.listCategories` as the fallback value, i.e.
    * it stands in for a missing tag category.
    */
  val noCategory: ClassifierName =
    new ClassifierName("__docspell_no_category__")
  /** Prefix put in front of a tag category to form the model name. */
  val categoryPrefix = "tagcategory-"
  /** Model name for the given tag category: `categoryPrefix` followed by `cat`. */
  def tagCategory(cat: String): ClassifierName =
    new ClassifierName(categoryPrefix + cat)
  // Fixed names below; presumably reserved for classifiers of the respective
  // item properties - no use of them is visible in this file.
  val concernedPerson: ClassifierName =
    new ClassifierName("concernedperson")
  val concernedEquip: ClassifierName =
    new ClassifierName("concernedequip")
  val correspondentOrg: ClassifierName =
    new ClassifierName("correspondentorg")
  val correspondentPerson: ClassifierName =
    new ClassifierName("correspondentperson")
  /** Loads the stored models whose names correspond to the tag categories of
    * the given collective.
    *
    * The categories are listed first (using `noCategory` as fallback), mapped
    * to model names via `tagCategory` and then looked up in one query. When
    * no category is listed, no lookup is issued and an empty list is returned.
    */
  def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
    RTag.listCategories(coll, noCategory.name).flatMap { found =>
      NonEmptyList
        .fromList(found)
        .fold(List.empty[RClassifierModel].pure[ConnectionIO]) { names =>
          RClassifierModel.findAllByName(coll, names.map(c => tagCategory(c).name))
        }
    }
 }
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@ -4,23 +4,16 @@ import cats.data.Kleisli
 import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
 import fs2.{Pipe, Stream}
 import docspell.analysis.TextAnalyser
 import docspell.analysis.classifier.ClassifierModel
 import docspell.analysis.classifier.TextClassifier.Data
 import docspell.backend.ops.OCollective
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.scheduler._
-import docspell.store.queries.QItem
+import docspell.store.records.{RClassifierSetting, RTag}
 import docspell.store.records.RClassifierSetting
 import bitpeace.MimetypeHint
 object LearnClassifierTask {
  val noClass = "__NONE__"
  val pageSep = " --n-- "
  val noClass = "__NONE__"
  type Args = LearnClassifierArgs
@ -29,67 +22,53 @@ object LearnClassifierTask {
  def apply[F[_]: Sync: ContextShift](
      cfg: Config.TextAnalysis,
      blocker: Blocker,
      analyser: TextAnalyser[F]
  ): Task[F, Args, Unit] =
    Task { ctx =>
      (for {
        sett <- findActiveSettings[F](ctx, cfg)
-        data = selectItems(
+        maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
          ctx,
          math.min(cfg.classification.itemCount, sett.itemCount).toLong,
          sett.category.getOrElse("")
        )
        _ <- OptionT.liftF(
-          analyser.classifier
+          learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
        )
      } yield ())
        .getOrElseF(logInactiveWarning(ctx.logger))
    }
-  private def handleModel[F[_]: Sync: ContextShift](
+  def learnTagCategory[F[_]: Sync: ContextShift, A](
-      ctx: Context[F, Args],
+      analyser: TextAnalyser[F],
-      blocker: Blocker
+      collective: Ident,
-  )(trainedModel: ClassifierModel): F[Unit] =
+      maxItems: Int
-    for {
+  )(
      oldFile <- ctx.store.transact(
        RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
      )
      _ <- ctx.logger.info("Storing new trained model")
      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
      newFile <-
        ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
      _ <- ctx.store.transact(
        RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
      )
      _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
      _ <- oldFile match {
        case Some(fid) =>
          ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
            ctx.store.bitpeace.delete(fid.id).compile.drain
        case None => ().pure[F]
      }
    } yield ()
  private def selectItems[F[_]](
      ctx: Context[F, Args],
      max: Long,
      category: String
-  ): Stream[F, Data] = {
+  ): Task[F, A, Unit] =
-    val connStream =
+    Task { ctx =>
-      for {
+      val data = SelectItems.forCategory(ctx, collective)(maxItems, category)
-        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
+      ctx.logger.info(s"Learn classifier for tag category: $category") *>
-        tt <- Stream.eval(
+        analyser.classifier.trainClassifier(ctx.logger, data)(
-          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
+          Kleisli(
            StoreClassifierModel.handleModel(
              ctx,
              collective,
              ClassifierName.tagCategory(category)
            )
          )
        )
-      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
+    }
    ctx.store.transact(connStream.filter(_.text.nonEmpty))
  }
-  private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
+  def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])(
-    if (max <= 0) identity
+      collective: Ident,
-    else _.take(max)
+      maxItems: Int
  ): Task[F, A, Unit] =
    Task { ctx =>
      for {
        cats <- ctx.store.transact(
          RTag.listCategories(collective, ClassifierName.noCategory.name)
        )
        task = learnTagCategory[F, A](analyser, collective, maxItems) _
        _ <- cats.map(task).traverse(_.run(ctx))
      } yield ()
    }
  private def findActiveSettings[F[_]: Sync](
      ctx: Context[F, Args],
@ -98,7 +77,6 @@ object LearnClassifierTask {
    if (cfg.classification.enabled)
      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
        .filter(_.enabled)
        .filter(_.category.nonEmpty)
        .map(OCollective.Classifier.fromRecord)
    else
      OptionT.none
--- a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala
@ -0,0 +1,39 @@
 package docspell.joex.learn
 import fs2.Stream
 import docspell.analysis.classifier.TextClassifier.Data
 import docspell.common._
 import docspell.joex.scheduler.Context
 import docspell.store.Store
 import docspell.store.qb.Batch
 import docspell.store.queries.QItem
 /** Produces the training data stream used for learning a classifier for a
   * single tag category.
   */
 object SelectItems {
  // Both values are taken from LearnClassifierTask so the two objects agree.
  val pageSep = LearnClassifierTask.pageSep
  val noClass = LearnClassifierTask.noClass
  /** Same as the `Store`-based variant, using the store of the given job context. */
  def forCategory[F[_]](ctx: Context[F, _], collective: Ident)(
      max: Int,
      category: String
  ): Stream[F, Data] =
    forCategory(ctx.store, collective, max, category)
  /** Streams one `Data` entry per item of the collective, in the order given by
    * `QItem.findAllNewesFirst`.
    *
    * @param max      maximum number of items to query; a value `<= 0` lifts the limit
    * @param category the tag category passed to `QItem.resolveTextAndTag`
    * @return entries labelled with the resolved tag name, or `noClass` if no tag
    *         was resolved; entries whose trimmed text is empty are dropped
    */
  def forCategory[F[_]](
      store: Store[F],
      collective: Ident,
      max: Int,
      category: String
  ): Stream[F, Data] = {
    val batch =
      if (max > 0) Batch.limit(max)
      else Batch.all
    val rows = QItem.findAllNewesFirst(collective, 10, batch).flatMap { itemId =>
      Stream
        .eval(QItem.resolveTextAndTag(collective, itemId, category, pageSep))
        .map { resolved =>
          val label = resolved.tag.map(_.name).getOrElse(noClass)
          Data(label, itemId.id, resolved.text.trim)
        }
    }
    store.transact(rows.filter(entry => entry.text.nonEmpty))
  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala
@ -0,0 +1,53 @@
 package docspell.joex.learn
 import cats.effect._
 import cats.implicits._
 import docspell.analysis.classifier.ClassifierModel
 import docspell.common._
 import docspell.joex.scheduler._
 import docspell.store.Store
 import docspell.store.records.RClassifierModel
 import bitpeace.MimetypeHint
 /** Persists a freshly trained classifier model and replaces the one stored
   * before under the same name.
   */
 object StoreClassifierModel {
  /** Convenience variant taking store, blocker and logger from the job context.
    * Curried so that it can be partially applied and used as the handler for a
    * trained model.
    */
  def handleModel[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
      collective: Ident,
      modelName: ClassifierName
  )(
      trainedModel: ClassifierModel
  ): F[Unit] =
    handleModel(ctx.store, ctx.blocker, ctx.logger)(collective, modelName, trainedModel)
  /** Stores the model file and points the `classifier_model` entry of
    * (`collective`, `modelName`) to it.
    *
    * Steps, in this order: remember the file id currently registered for the
    * name (if any), save the new model file, update (or create) the record,
    * and finally delete the previously registered file.
    *
    * @param trainedModel the model whose `model` path is read from disk
    */
  def handleModel[F[_]: Sync: ContextShift](
      store: Store[F],
      blocker: Blocker,
      logger: Logger[F]
  )(
      collective: Ident,
      modelName: ClassifierName,
      trainedModel: ClassifierModel
  ): F[Unit] =
    for {
      // Looked up before the update so the old file can be removed afterwards.
      oldFile <- store.transact(
        RClassifierModel.findByName(collective, modelName.name).map(_.map(_.fileId))
      )
      _ <- logger.debug(s"Storing new trained model for: ${modelName.name}")
      // Read the model file in chunks of 4096 bytes on the blocker.
      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
      newFile <-
        store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
      _ <- store.transact(
        RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id))
      )
      _ <- logger.debug(s"New model stored at file ${newFile.id}")
      // The old file is only deleted once the record references the new one.
      // NOTE(review): if the record update fails, the newly saved file is not
      // cleaned up here - confirm whether that is handled elsewhere.
      _ <- oldFile match {
        case Some(fid) =>
          logger.debug(s"Deleting old model file ${fid.id}") *>
            store.bitpeace.delete(fid.id).compile.drain
        case None => ().pure[F]
      }
    } yield ()
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@ -9,12 +9,11 @@ import docspell.analysis.{NlpSettings, TextAnalyser}
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
-import docspell.joex.learn.LearnClassifierTask
+import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
-import docspell.store.records.RAttachmentMeta
+import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
 import docspell.store.records.RClassifierSetting
 import bitpeace.RangeDef
@ -42,10 +41,13 @@ object TextAnalysis {
        e <- s
        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
        v = t.toVector
-        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier).value
+        classifierEnabled <- getActive(ctx, cfg)
        tag <-
          if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
          else List.empty[String].pure[F]
      } yield item
        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
-        .appendTags(tag.toSeq)
+        .appendTags(tag)
    }
  def annotateAttachment[F[_]: Sync](
@ -66,15 +68,29 @@ object TextAnalysis {
    } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
  }
  def predictTags[F[_]: Sync: ContextShift](
      ctx: Context[F, Args],
      cfg: Config.TextAnalysis,
      metas: Vector[RAttachmentMeta],
      classifier: TextClassifier[F]
  ): F[List[String]] =
    for {
      models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective))
      _      <- ctx.logger.debug(s"Guessing tags for ${models.size} categories")
      tags <- models
        .map(_.fileId.some)
        .traverse(predictTag(ctx, cfg, metas, classifier))
    } yield tags.flatten
  def predictTag[F[_]: Sync: ContextShift](
      ctx: Context[F, Args],
      cfg: Config.TextAnalysis,
      metas: Vector[RAttachmentMeta],
      classifier: TextClassifier[F]
-  ): OptionT[F, String] =
+  )(modelFileId: Option[Ident]): F[Option[String]] =
-    for {
+    (for {
-      model <- findActiveModel(ctx, cfg)
+      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
-      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
+      model <- OptionT.fromOption[F](modelFileId)
      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
      modelData =
        ctx.store.bitpeace
@ -90,20 +106,21 @@ object TextAnalysis {
          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
      }).filter(_ != LearnClassifierTask.noClass)
      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
-    } yield cls
+    } yield cls).value
-  private def findActiveModel[F[_]: Sync](
+  private def getActive[F[_]: Sync](
      ctx: Context[F, Args],
      cfg: Config.TextAnalysis
-  ): OptionT[F, Ident] =
+  ): F[Boolean] =
-    (if (cfg.classification.enabled)
+    if (cfg.classification.enabled)
-       OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+      ctx.store
-         .filter(_.enabled)
+        .transact(RClassifierSetting.findById(ctx.args.meta.collective))
-         .mapFilter(_.fileId)
+        .map(_.exists(_.enabled))
-     else
+        .flatTap(enabled =>
-       OptionT.none[F, Ident]).orElse(
+          if (enabled) ().pure[F]
-      OptionT.liftF(ctx.logger.info("Classification is disabled.")) *> OptionT
+          else ctx.logger.info("Classification is disabled. Check config or settings.")
-        .none[F, Ident]
+        )
-    )
+    else
      ctx.logger.info("Classification is disabled.") *> false.pure[F]
 }
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@ -4856,8 +4856,6 @@ components:
      properties:
        enabled:
          type: boolean
        category:
          type: string
        itemCount:
          type: integer
          format: int32
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
@ -46,8 +46,7 @@ object CollectiveRoutes {
              OCollective.Classifier(
                settings.classifier.enabled,
                settings.classifier.schedule,
-                settings.classifier.itemCount,
+                settings.classifier.itemCount
                settings.classifier.category
              )
            )
          )
@ -65,8 +64,7 @@ object CollectiveRoutes {
              c.language,
              c.integrationEnabled,
              ClassifierSetting(
-                c.classifier.map(_.enabled).getOrElse(false),
+                c.classifier.exists(_.enabled),
                c.classifier.flatMap(_.category),
                c.classifier.map(_.itemCount).getOrElse(0),
                c.classifier
                  .map(_.schedule)
--- a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql
@ -0,0 +1,21 @@
 -- Holds the trained classifier models. The unique constraint allows one
 -- model per name within a collective.
 CREATE TABLE "classifier_model"(
  "id" varchar(254) not null primary key,
  "cid" varchar(254) not null,
  "name" varchar(254) not null,
  "file_id" varchar(254) not null,
  "created" timestamp not null,
  foreign key ("cid") references "collective"("cid"),
  foreign key ("file_id") references "filemeta"("id"),
  unique ("cid", "name")
 );
 -- Copy every model file referenced from "classifier_setting" into the new
 -- table, naming it 'tagcategory-' followed by the configured category.
 insert into "classifier_model"
 select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name", "file_id", "created"
 from "classifier_setting"
 where "file_id" is not null;
 -- Both columns were copied above and are removed from the settings table.
 alter table "classifier_setting"
 drop column "category";
 alter table "classifier_setting"
 drop column "file_id";
--- a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql
@ -0,0 +1,26 @@
 -- Holds the trained classifier models. The unique constraint allows one
 -- model per name within a collective.
 CREATE TABLE `classifier_model`(
  `id` varchar(254) not null primary key,
  `cid` varchar(254) not null,
  `name` varchar(254) not null,
  `file_id` varchar(254) not null,
  `created` timestamp not null,
  foreign key (`cid`) references `collective`(`cid`),
  foreign key (`file_id`) references `filemeta`(`id`),
  unique (`cid`, `name`)
 );
 -- Copy every model file referenced from `classifier_setting` into the new
 -- table, naming it 'tagcategory-' followed by the configured category. The
 -- id is the md5 of a random number.
 insert into `classifier_model`
 select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `file_id`, `created`
 from `classifier_setting`
 where `file_id` is not null;
 -- The category was copied above and is removed from the settings table.
 alter table `classifier_setting`
 drop column `category`;
 -- mariadb needs special treatment when dropping a column that is part
 -- of an index and foreign key
 -- NOTE(review): `classifier_setting_ibfk_2` looks like a generated
 -- constraint name - confirm it matches existing installations.
 alter table `classifier_setting`
 drop constraint `classifier_setting_ibfk_2`;
 alter table `classifier_setting`
 drop column `file_id`;
--- a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql
@ -0,0 +1,21 @@
 -- Holds the trained classifier models. The unique constraint allows one
 -- model per name within a collective.
 CREATE TABLE "classifier_model"(
  "id" varchar(254) not null primary key,
  "cid" varchar(254) not null,
  "name" varchar(254) not null,
  "file_id" varchar(254) not null,
  "created" timestamp not null,
  foreign key ("cid") references "collective"("cid"),
  foreign key ("file_id") references "filemeta"("id"),
  unique ("cid", "name")
 );
 -- Copy every model file referenced from "classifier_setting" into the new
 -- table, naming it 'tagcategory-' followed by the configured category. The
 -- id is the md5 of a random number rendered as text.
 insert into "classifier_model"
 select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name", "file_id", "created"
 from "classifier_setting"
 where "file_id" is not null;
 -- Both columns were copied above and are removed from the settings table.
 alter table "classifier_setting"
 drop column "category";
 alter table "classifier_setting"
 drop column "file_id";
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@ -543,11 +543,14 @@ object QItem {
  def findAllNewesFirst(
      collective: Ident,
-      chunkSize: Int
+      chunkSize: Int,
      limit: Batch
  ): Stream[ConnectionIO, Ident] = {
    val i = RItem.as("i")
    Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed)
      .orderBy(i.created.desc)
      .limit(limit)
      .build
      .query[Ident]
      .streamWithChunkSize(chunkSize)
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala
@ -0,0 +1,78 @@
 package docspell.store.records
 import cats.effect._
 import cats.data.NonEmptyList
 import cats.implicits._
 import docspell.common._
 import docspell.store.qb.DSL._
 import docspell.store.qb._
 import doobie._
 import doobie.implicits._
 /** A row of table `classifier_model`: the model file `fileId` registered
   * under `name` for the collective `cid`.
   */
 final case class RClassifierModel(
    id: Ident,
    cid: Ident,
    name: String,
    fileId: Ident,
    created: Timestamp
 )
 object RClassifierModel {
  /** Builds a record with a fresh random id and the current timestamp. Nothing
    * is written to the database by this method.
    */
  def createNew[F[_]: Sync](
      cid: Ident,
      name: String,
      fileId: Ident
  ): F[RClassifierModel] =
    Ident.randomId[F].flatMap { newId =>
      Timestamp.current[F].map(ts => RClassifierModel(newId, cid, name, fileId, ts))
    }
  /** Column definitions of table `classifier_model`, optionally aliased. */
  final case class Table(alias: Option[String]) extends TableDef {
    val tableName = "classifier_model"
    val id      = Column[Ident]("id", this)
    val cid     = Column[Ident]("cid", this)
    val name    = Column[String]("name", this)
    val fileId  = Column[Ident]("file_id", this)
    val created = Column[Timestamp]("created", this)
    // Column order here must match the value order used in `insert`.
    val all = NonEmptyList.of[Column[_]](id, cid, name, fileId, created)
  }
  /** The table definition using the given alias. */
  def as(alias: String): Table =
    Table(alias.some)
  /** The table definition without an alias. */
  val T = Table(None)
  /** Inserts the given record as a new row. */
  def insert(v: RClassifierModel): ConnectionIO[Int] = {
    val values = fr"${v.id},${v.cid},${v.name},${v.fileId},${v.created}"
    DML.insert(T, T.all, values)
  }
  /** Points the model `name` of collective `coll` to file `fid`.
    *
    * An update is tried first; only when it reports 0 affected rows a new
    * record is created and inserted. The result is the sum of both counts.
    */
  def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
    DML
      .update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid)))
      .flatMap { updated =>
        val inserted: ConnectionIO[Int] =
          if (updated != 0) 0.pure[ConnectionIO]
          else createNew[ConnectionIO](coll, name, fid).flatMap(insert)
        inserted.map(count => updated + count)
      }
  /** Finds the model of the given collective stored under exactly this name. */
  def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = {
    val matching = T.cid === cid && T.name === name
    Select(select(T.all), from(T), matching).build
      .query[RClassifierModel]
      .option
  }
  /** Finds all models of the given collective whose name is one of `names`. */
  def findAllByName(
      cid: Ident,
      names: NonEmptyList[String]
  ): ConnectionIO[List[RClassifierModel]] = {
    val matching = T.cid === cid && T.name.in(names)
    Select(select(T.all), from(T), matching).build
      .query[RClassifierModel]
      .to[List]
  }
 }
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@ -15,9 +15,7 @@ case class RClassifierSetting(
    cid: Ident,
    enabled: Boolean,
    schedule: CalEvent,
    category: String,
    itemCount: Int,
    fileId: Option[Ident],
    created: Timestamp
 ) {}
@ -28,12 +26,10 @@ object RClassifierSetting {
    val cid       = Column[Ident]("cid", this)
    val enabled   = Column[Boolean]("enabled", this)
    val schedule  = Column[CalEvent]("schedule", this)
    val category  = Column[String]("category", this)
    val itemCount = Column[Int]("item_count", this)
    val fileId    = Column[Ident]("file_id", this)
    val created   = Column[Timestamp]("created", this)
    val all = NonEmptyList
-      .of[Column[_]](cid, enabled, schedule, category, itemCount, fileId, created)
+      .of[Column[_]](cid, enabled, schedule, itemCount, created)
  }
  val T = Table(None)
@ -44,7 +40,7 @@ object RClassifierSetting {
    DML.insert(
      T,
      T.all,
-      fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
+      fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}"
    )
  def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
@ -54,15 +50,10 @@ object RClassifierSetting {
      DML.set(
        T.enabled.setTo(v.enabled),
        T.schedule.setTo(v.schedule),
-        T.category.setTo(v.category),
+        T.itemCount.setTo(v.itemCount)
        T.itemCount.setTo(v.itemCount),
        T.fileId.setTo(v.fileId)
      )
    )
  def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
    DML.update(T, T.cid === coll, DML.set(T.fileId.setTo(fid)))
  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
    for {
      n1 <- DML.update(
@ -71,8 +62,7 @@ object RClassifierSetting {
        DML.set(
          T.enabled.setTo(v.enabled),
          T.schedule.setTo(v.schedule),
-          T.itemCount.setTo(v.itemCount),
+          T.itemCount.setTo(v.itemCount)
          T.category.setTo(v.category)
        )
      )
      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
@ -89,8 +79,7 @@ object RClassifierSetting {
  case class Classifier(
      enabled: Boolean,
      schedule: CalEvent,
-      itemCount: Int,
+      itemCount: Int
      category: Option[String]
  ) {
    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
@ -98,15 +87,13 @@ object RClassifierSetting {
        coll,
        enabled,
        schedule,
        category.getOrElse(""),
        itemCount,
        None,
        created
      )
  }
  object Classifier {
    def fromRecord(r: RClassifierSetting): Classifier =
-      Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
+      Classifier(r.enabled, r.schedule, r.itemCount)
  }
 }
--- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala
@ -89,8 +89,7 @@ object RCollective {
        c.integration.s,
        cs.enabled.s,
        cs.schedule.s,
-        cs.itemCount.s,
+        cs.itemCount.s
        cs.category.s
      ),
      from(c).leftJoin(cs, cs.cid === c.id),
      c.id === coll
--- a/modules/store/src/main/scala/docspell/store/records/RTag.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala
@ -148,6 +148,13 @@ object RTag {
    ).orderBy(T.name.asc).build.query[RTag].to[List]
  }
  /** Lists the distinct tag categories of the given collective.
    *
    * The category column is wrapped in `coalesce` with `fallback` as second
    * argument, so a missing category is presumably reported as `fallback`
    * (assuming `coalesce` maps to SQL COALESCE - confirm in the query DSL).
    * No ordering is applied.
    */
  def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] =
    Select(
      coalesce(T.category.s, lit(fallback)).s,
      from(T),
      T.cid === coll
    ).distinct.build.query[String].to[List]
  /** Deletes the tag with id `tagId`, restricted to rows of collective `coll`.
    * Yields the `Int` result of `DML.delete` (presumably the number of deleted
    * rows).
    */
  def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
    DML.delete(T, T.tid === tagId && T.cid === coll)
 }
--- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
+++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
@ -25,8 +25,6 @@ import Util.Tag
 type alias Model =
    { enabled : Bool
    , categoryModel : Comp.FixedDropdown.Model String
    , category : Maybe String
    , scheduleModel : Comp.CalEventInput.Model
    , schedule : Validated CalEvent
    , itemCountModel : Comp.IntField.Model
@ -35,10 +33,8 @@ type alias Model =
 type Msg
-    = GetTagsResp (Result Http.Error TagList)
+    = ScheduleMsg Comp.CalEventInput.Msg
    | ScheduleMsg Comp.CalEventInput.Msg
    | ToggleEnabled
    | CategoryMsg (Comp.FixedDropdown.Msg String)
    | ItemCountMsg Comp.IntField.Msg
@ -53,17 +49,12 @@ init flags sett =
            Comp.CalEventInput.init flags newSchedule
    in
    ( { enabled = sett.enabled
      , categoryModel = Comp.FixedDropdown.initString []
      , category = sett.category
      , scheduleModel = cem
      , schedule = Data.Validated.Unknown newSchedule
      , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
      , itemCount = Just sett.itemCount
      }
-    , Cmd.batch
+    , Cmd.map ScheduleMsg cec
        [ Api.getTags flags "" GetTagsResp
        , Cmd.map ScheduleMsg cec
        ]
    )
@ -72,7 +63,6 @@ getSettings model =
    Data.Validated.map
        (\sch ->
            { enabled = model.enabled
            , category = model.category
            , schedule =
                Data.CalEvent.makeEvent sch
            , itemCount = Maybe.withDefault 0 model.itemCount
@ -84,27 +74,6 @@ getSettings model =
 update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
 update flags msg model =
    case msg of
        GetTagsResp (Ok tl) ->
            let
                categories =
                    Util.Tag.getCategories tl.items
                        |> List.sort
            in
            ( { model
                | categoryModel = Comp.FixedDropdown.initString categories
                , category =
                    if model.category == Nothing then
                        List.head categories
                    else
                        model.category
              }
            , Cmd.none
            )
        GetTagsResp (Err _) ->
            ( model, Cmd.none )
        ScheduleMsg lmsg ->
            let
                ( cm, cc, ce ) =
@ -126,23 +95,6 @@ update flags msg model =
            , Cmd.none
            )
        CategoryMsg lmsg ->
            let
                ( mm, ma ) =
                    Comp.FixedDropdown.update lmsg model.categoryModel
            in
            ( { model
                | categoryModel = mm
                , category =
                    if ma == Nothing then
                        model.category
                    else
                        ma
              }
            , Cmd.none
            )
        ItemCountMsg lmsg ->
            let
                ( im, iv ) =
@ -182,13 +134,6 @@ view model =
            , text "periodically based on a schedule and you need to specify a tag-group that should "
            , text "be used for learning."
            ]
        , div [ class "field" ]
            [ label [] [ text "Category" ]
            , Html.map CategoryMsg
                (Comp.FixedDropdown.viewString model.category
                    model.categoryModel
                )
            ]
        , Html.map ItemCountMsg
            (Comp.IntField.viewWithInfo
                "The maximum number of items to learn from, order by date newest first. Use 0 to mean all."