From 249f9e6e2a22cc1250a8b968e0193c8a96abed5a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 13:35:53 +0100 Subject: [PATCH] Extend guessing tags to all tag categories --- .../classifier/StanfordTextClassifier.scala | 9 +- .../scala/docspell/joex/JoexAppImpl.scala | 2 +- .../docspell/joex/learn/ClassifierName.scala | 45 +++++++++ .../joex/learn/LearnClassifierTask.scala | 92 +++++++------------ .../docspell/joex/learn/SelectItems.scala | 39 ++++++++ .../joex/learn/StoreClassifierModel.scala | 53 +++++++++++ .../docspell/joex/process/TextAnalysis.scala | 59 +++++++----- .../src/main/resources/docspell-openapi.yml | 2 - .../restserver/routes/CollectiveRoutes.scala | 6 +- .../h2/V1.17.1__classifier_model.sql | 21 +++++ .../mariadb/V1.17.1__classifier_model.sql | 26 ++++++ .../postgresql/V1.17.1__classifier_model.sql | 21 +++++ .../scala/docspell/store/queries/QItem.scala | 5 +- .../store/records/RClassifierModel.scala | 78 ++++++++++++++++ .../store/records/RClassifierSetting.scala | 25 ++--- .../docspell/store/records/RCollective.scala | 3 +- .../scala/docspell/store/records/RTag.scala | 7 ++ .../main/elm/Comp/ClassifierSettingsForm.elm | 59 +----------- 18 files changed, 384 insertions(+), 168 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala index edd1c7da..dc567695 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala @@ -11,6 +11,7 @@ import docspell.analysis.classifier import docspell.analysis.classifier.TextClassifier._ import docspell.analysis.nlp.Properties import docspell.common._ +import docspell.common.syntax.FileSyntax._ import edu.stanford.nlp.classify.ColumnDataClassifier @@ -28,7 +29,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( .use { dir => for { rawData <- writeDataFile(blocker, dir, data) - _ <- logger.info(s"Learning from ${rawData.count} items.") + _ <- logger.debug(s"Learning from ${rawData.count} items.") trainData <- splitData(logger, rawData) scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m)) sorted = scores.sortBy(-_.score) @@ -138,9 +139,9 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( props: Map[String, String] ): Map[String, String] = prepend("2.", props) ++ Map( - "trainFile" -> trainData.train.normalize().toAbsolutePath().toString(), - "testFile" -> trainData.test.normalize().toAbsolutePath().toString(), - "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString() + "trainFile" -> trainData.train.absolutePathAsString, + "testFile" -> trainData.test.absolutePathAsString, + "serializeTo" -> trainData.modelFile.absolutePathAsString ).toList case class RawData(count: Long, file: Path) diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index cdbb5a50..c221f187 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -169,7 +169,7 @@ object JoexAppImpl { .withTask( JobTask.json( LearnClassifierArgs.taskName, - LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser), + LearnClassifierTask[F](cfg.textAnalysis, analyser), LearnClassifierTask.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala new file mode 100644 index 00000000..6b128c24 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -0,0 +1,45 @@ +package docspell.joex.learn + +import cats.data.NonEmptyList +import cats.implicits._ +import docspell.common.Ident +import docspell.store.records.{RClassifierModel, RTag} +import doobie._ + +final class ClassifierName(val name: String) extends AnyVal + +object ClassifierName { + def apply(name: String): ClassifierName = + new ClassifierName(name) + + val noCategory: ClassifierName = + apply("__docspell_no_category__") + + val categoryPrefix = "tagcategory-" + + def tagCategory(cat: String): ClassifierName = + apply(s"${categoryPrefix}${cat}") + + val concernedPerson: ClassifierName = + apply("concernedperson") + + val concernedEquip: ClassifierName = + apply("concernedequip") + + val correspondentOrg: ClassifierName = + apply("correspondentorg") + + val correspondentPerson: ClassifierName = + apply("correspondentperson") + + def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = + for { + categories <- RTag.listCategories(coll, noCategory.name) + models <- NonEmptyList.fromList(categories) match { + case Some(nel) => + RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name)) + case None => + List.empty[RClassifierModel].pure[ConnectionIO] + } + } yield models +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index d5c632c3..3949a151 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -4,23 +4,16 @@ import cats.data.Kleisli import cats.data.OptionT import cats.effect._ import cats.implicits._ -import fs2.{Pipe, Stream} - import docspell.analysis.TextAnalyser -import docspell.analysis.classifier.ClassifierModel -import docspell.analysis.classifier.TextClassifier.Data import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ -import docspell.store.queries.QItem -import docspell.store.records.RClassifierSetting - -import bitpeace.MimetypeHint +import docspell.store.records.{RClassifierSetting, RTag} object LearnClassifierTask { - val noClass = "__NONE__" val pageSep = " --n-- " + val noClass = "__NONE__" type Args = LearnClassifierArgs @@ -29,67 +22,53 @@ object LearnClassifierTask { def apply[F[_]: Sync: ContextShift]( cfg: Config.TextAnalysis, - blocker: Blocker, analyser: TextAnalyser[F] ): Task[F, Args, Unit] = Task { ctx => (for { sett <- findActiveSettings[F](ctx, cfg) - data = selectItems( - ctx, - math.min(cfg.classification.itemCount, sett.itemCount).toLong, - sett.category.getOrElse("") - ) + maxItems = math.min(cfg.classification.itemCount, sett.itemCount) _ <- OptionT.liftF( - analyser.classifier - .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker))) + learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) ) } yield ()) .getOrElseF(logInactiveWarning(ctx.logger)) } - private def handleModel[F[_]: Sync: ContextShift]( - ctx: Context[F, Args], - blocker: Blocker - )(trainedModel: ClassifierModel): F[Unit] = - for { - oldFile <- ctx.store.transact( - RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId)) - ) - _ <- ctx.logger.info("Storing new trained model") - fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) - newFile <- - ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError - _ <- ctx.store.transact( - RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id)) - ) - _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}") - _ <- oldFile match { - case Some(fid) => - ctx.logger.debug(s"Deleting old model file ${fid.id}") *> - ctx.store.bitpeace.delete(fid.id).compile.drain - case None => ().pure[F] - } - } yield () - - private def selectItems[F[_]]( - ctx: Context[F, Args], - max: Long, + def learnTagCategory[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + )( category: String - ): Stream[F, Data] = { - val connStream = - for { - item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max)) - tt <- Stream.eval( - QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep) + ): Task[F, A, Unit] = + Task { ctx => + val data = SelectItems.forCategory(ctx, collective)(maxItems, category) + ctx.logger.info(s"Learn classifier for tag category: $category") *> + analyser.classifier.trainClassifier(ctx.logger, data)( + Kleisli( + StoreClassifierModel.handleModel( + ctx, + collective, + ClassifierName.tagCategory(category) + ) + ) ) - } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) - ctx.store.transact(connStream.filter(_.text.nonEmpty)) - } + } - private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] = - if (max <= 0) identity - else _.take(max) + def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])( + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + Task { ctx => + for { + cats <- ctx.store.transact( + RTag.listCategories(collective, ClassifierName.noCategory.name) + ) + task = learnTagCategory[F, A](analyser, collective, maxItems) _ + _ <- cats.map(task).traverse(_.run(ctx)) + } yield () + } private def findActiveSettings[F[_]: Sync]( ctx: Context[F, Args], @@ -98,7 +77,6 @@ object LearnClassifierTask { if (cfg.classification.enabled) OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) .filter(_.enabled) - .filter(_.category.nonEmpty) .map(OCollective.Classifier.fromRecord) else OptionT.none diff --git a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala new file mode 100644 index 00000000..e7c31d7b --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala @@ -0,0 +1,39 @@ +package docspell.joex.learn + +import fs2.Stream + +import docspell.analysis.classifier.TextClassifier.Data +import docspell.common._ +import docspell.joex.scheduler.Context +import docspell.store.Store +import docspell.store.qb.Batch +import docspell.store.queries.QItem + +object SelectItems { + val pageSep = LearnClassifierTask.pageSep + val noClass = LearnClassifierTask.noClass + + def forCategory[F[_]](ctx: Context[F, _], collective: Ident)( + max: Int, + category: String + ): Stream[F, Data] = + forCategory(ctx.store, collective, max, category) + + def forCategory[F[_]]( + store: Store[F], + collective: Ident, + max: Int, + category: String + ): Stream[F, Data] = { + val limit = if (max <= 0) Batch.all else Batch.limit(max) + val connStream = + for { + item <- QItem.findAllNewesFirst(collective, 10, limit) + tt <- Stream.eval( + QItem.resolveTextAndTag(collective, item, category, pageSep) + ) + } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) + store.transact(connStream.filter(_.text.nonEmpty)) + } + +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala b/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala new file mode 100644 index 00000000..03d027a1 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala @@ -0,0 +1,53 @@ +package docspell.joex.learn + +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.classifier.ClassifierModel +import docspell.common._ +import docspell.joex.scheduler._ +import docspell.store.Store +import docspell.store.records.RClassifierModel + +import bitpeace.MimetypeHint + +object StoreClassifierModel { + + def handleModel[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + collective: Ident, + modelName: ClassifierName + )( + trainedModel: ClassifierModel + ): F[Unit] = + handleModel(ctx.store, ctx.blocker, ctx.logger)(collective, modelName, trainedModel) + + def handleModel[F[_]: Sync: ContextShift]( + store: Store[F], + blocker: Blocker, + logger: Logger[F] + )( + collective: Ident, + modelName: ClassifierName, + trainedModel: ClassifierModel + ): F[Unit] = + for { + oldFile <- store.transact( + RClassifierModel.findByName(collective, modelName.name).map(_.map(_.fileId)) + ) + _ <- logger.debug(s"Storing new trained model for: ${modelName.name}") + fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) + newFile <- + store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError + _ <- store.transact( + RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id)) + ) + _ <- logger.debug(s"New model stored at file ${newFile.id}") + _ <- oldFile match { + case Some(fid) => + logger.debug(s"Deleting old model file ${fid.id}") *> + store.bitpeace.delete(fid.id).compile.drain + case None => ().pure[F] + } + } yield () +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index f336132d..fd7c08bc 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -9,12 +9,11 @@ import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile -import docspell.joex.learn.LearnClassifierTask +import docspell.joex.learn.{ClassifierName, LearnClassifierTask} import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task -import docspell.store.records.RAttachmentMeta -import docspell.store.records.RClassifierSetting +import docspell.store.records.{RAttachmentMeta, RClassifierSetting} import bitpeace.RangeDef @@ -42,10 +41,13 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - tag <- predictTag(ctx, cfg, item.metas, analyser.classifier).value + classifierEnabled <- getActive(ctx, cfg) + tag <- + if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) + else List.empty[String].pure[F] } yield item .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) - .appendTags(tag.toSeq) + .appendTags(tag) } def annotateAttachment[F[_]: Sync]( @@ -66,15 +68,29 @@ object TextAnalysis { } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + def predictTags[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): F[List[String]] = + for { + models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective)) + _ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories") + tags <- models + .map(_.fileId.some) + .traverse(predictTag(ctx, cfg, metas, classifier)) + } yield tags.flatten + def predictTag[F[_]: Sync: ContextShift]( ctx: Context[F, Args], cfg: Config.TextAnalysis, metas: Vector[RAttachmentMeta], classifier: TextClassifier[F] - ): OptionT[F, String] = - for { - model <- findActiveModel(ctx, cfg) - _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …")) + )(modelFileId: Option[Ident]): F[Option[String]] = + (for { + _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …")) + model <- OptionT.fromOption[F](modelFileId) text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) modelData = ctx.store.bitpeace @@ -90,20 +106,21 @@ object TextAnalysis { .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) }).filter(_ != LearnClassifierTask.noClass) _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) - } yield cls + } yield cls).value - private def findActiveModel[F[_]: Sync]( + private def getActive[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis - ): OptionT[F, Ident] = - (if (cfg.classification.enabled) - OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))) - .filter(_.enabled) - .mapFilter(_.fileId) - else - OptionT.none[F, Ident]).orElse( - OptionT.liftF(ctx.logger.info("Classification is disabled.")) *> OptionT - .none[F, Ident] - ) + ): F[Boolean] = + if (cfg.classification.enabled) + ctx.store + .transact(RClassifierSetting.findById(ctx.args.meta.collective)) + .map(_.exists(_.enabled)) + .flatTap(enabled => + if (enabled) ().pure[F] + else ctx.logger.info("Classification is disabled. Check config or settings.") + ) + else + ctx.logger.info("Classification is disabled.") *> false.pure[F] } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 20ac6449..d32d2352 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -4856,8 +4856,6 @@ components: properties: enabled: type: boolean - category: - type: string itemCount: type: integer format: int32 diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index 7ecd1e90..ee868254 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -46,8 +46,7 @@ object CollectiveRoutes { OCollective.Classifier( settings.classifier.enabled, settings.classifier.schedule, - settings.classifier.itemCount, - settings.classifier.category + settings.classifier.itemCount ) ) ) @@ -65,8 +64,7 @@ object CollectiveRoutes { c.language, c.integrationEnabled, ClassifierSetting( - c.classifier.map(_.enabled).getOrElse(false), - c.classifier.flatMap(_.category), + c.classifier.exists(_.enabled), c.classifier.map(_.itemCount).getOrElse(0), c.classifier .map(_.schedule) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..11be9909 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql @@ -0,0 +1,21 @@ +CREATE TABLE "classifier_model"( + "id" varchar(254) not null primary key, + "cid" varchar(254) not null, + "name" varchar(254) not null, + "file_id" varchar(254) not null, + "created" timestamp not null, + foreign key ("cid") references "collective"("cid"), + foreign key ("file_id") references "filemeta"("id"), + unique ("cid", "name") +); + +insert into "classifier_model" +select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name", "file_id", "created" +from "classifier_setting" +where "file_id" is not null; + +alter table "classifier_setting" +drop column "category"; + +alter table "classifier_setting" +drop column "file_id"; diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..d6f9da6e --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql @@ -0,0 +1,26 @@ +CREATE TABLE `classifier_model`( + `id` varchar(254) not null primary key, + `cid` varchar(254) not null, + `name` varchar(254) not null, + `file_id` varchar(254) not null, + `created` timestamp not null, + foreign key (`cid`) references `collective`(`cid`), + foreign key (`file_id`) references `filemeta`(`id`), + unique (`cid`, `name`) +); + +insert into `classifier_model` +select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `file_id`, `created` +from `classifier_setting` +where `file_id` is not null; + +alter table `classifier_setting` +drop column `category`; + +-- mariadb needs special treatment when dropping a column that is part +-- of an index and foreign key +alter table `classifier_setting` +drop constraint `classifier_setting_ibfk_2`; + +alter table `classifier_setting` +drop column `file_id`; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..81e327ff --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql @@ -0,0 +1,21 @@ +CREATE TABLE "classifier_model"( + "id" varchar(254) not null primary key, + "cid" varchar(254) not null, + "name" varchar(254) not null, + "file_id" varchar(254) not null, + "created" timestamp not null, + foreign key ("cid") references "collective"("cid"), + foreign key ("file_id") references "filemeta"("id"), + unique ("cid", "name") +); + +insert into "classifier_model" +select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name", "file_id", "created" +from "classifier_setting" +where "file_id" is not null; + +alter table "classifier_setting" +drop column "category"; + +alter table "classifier_setting" +drop column "file_id"; diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 3ce1af55..b68afb22 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -543,11 +543,14 @@ object QItem { def findAllNewesFirst( collective: Ident, - chunkSize: Int + chunkSize: Int, + limit: Batch ): Stream[ConnectionIO, Ident] = { + val i = RItem.as("i") Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed) .orderBy(i.created.desc) + .limit(limit) .build .query[Ident] .streamWithChunkSize(chunkSize) diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala new file mode 100644 index 00000000..2d018f81 --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala @@ -0,0 +1,78 @@ +package docspell.store.records + +import cats.effect._ +import cats.data.NonEmptyList +import cats.implicits._ + +import docspell.common._ +import docspell.store.qb.DSL._ +import docspell.store.qb._ + +import doobie._ +import doobie.implicits._ + +final case class RClassifierModel( + id: Ident, + cid: Ident, + name: String, + fileId: Ident, + created: Timestamp +) {} + +object RClassifierModel { + + def createNew[F[_]: Sync]( + cid: Ident, + name: String, + fileId: Ident + ): F[RClassifierModel] = + for { + id <- Ident.randomId[F] + now <- Timestamp.current[F] + } yield RClassifierModel(id, cid, name, fileId, now) + + final case class Table(alias: Option[String]) extends TableDef { + val tableName = "classifier_model" + + val id = Column[Ident]("id", this) + val cid = Column[Ident]("cid", this) + val name = Column[String]("name", this) + val fileId = Column[Ident]("file_id", this) + val created = Column[Timestamp]("created", this) + + val all = NonEmptyList.of[Column[_]](id, cid, name, fileId, created) + } + + def as(alias: String): Table = + Table(Some(alias)) + + val T = Table(None) + + def insert(v: RClassifierModel): ConnectionIO[Int] = + DML.insert( + T, + T.all, + fr"${v.id},${v.cid},${v.name},${v.fileId},${v.created}" + ) + + def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] = + for { + n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid))) + k <- + if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert) + else 0.pure[ConnectionIO] + } yield n + k + + def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = + Select(select(T.all), from(T), T.cid === cid && T.name === name).build + .query[RClassifierModel] + .option + + def findAllByName( + cid: Ident, + names: NonEmptyList[String] + ): ConnectionIO[List[RClassifierModel]] = + Select(select(T.all), from(T), T.cid === cid && T.name.in(names)).build + .query[RClassifierModel] + .to[List] +} diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index 749435d1..fe634161 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -15,9 +15,7 @@ case class RClassifierSetting( cid: Ident, enabled: Boolean, schedule: CalEvent, - category: String, itemCount: Int, - fileId: Option[Ident], created: Timestamp ) {} @@ -28,12 +26,10 @@ object RClassifierSetting { val cid = Column[Ident]("cid", this) val enabled = Column[Boolean]("enabled", this) val schedule = Column[CalEvent]("schedule", this) - val category = Column[String]("category", this) val itemCount = Column[Int]("item_count", this) - val fileId = Column[Ident]("file_id", this) val created = Column[Timestamp]("created", this) val all = NonEmptyList - .of[Column[_]](cid, enabled, schedule, category, itemCount, fileId, created) + .of[Column[_]](cid, enabled, schedule, itemCount, created) } val T = Table(None) @@ -44,7 +40,7 @@ object RClassifierSetting { DML.insert( T, T.all, - fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}" + fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}" ) def updateAll(v: RClassifierSetting): ConnectionIO[Int] = @@ -54,15 +50,10 @@ object RClassifierSetting { DML.set( T.enabled.setTo(v.enabled), T.schedule.setTo(v.schedule), - T.category.setTo(v.category), - T.itemCount.setTo(v.itemCount), - T.fileId.setTo(v.fileId) + T.itemCount.setTo(v.itemCount) ) ) - def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] = - DML.update(T, T.cid === coll, DML.set(T.fileId.setTo(fid))) - def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = for { n1 <- DML.update( @@ -71,8 +62,7 @@ object RClassifierSetting { DML.set( T.enabled.setTo(v.enabled), T.schedule.setTo(v.schedule), - T.itemCount.setTo(v.itemCount), - T.category.setTo(v.category) + T.itemCount.setTo(v.itemCount) ) ) n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO] @@ -89,8 +79,7 @@ object RClassifierSetting { case class Classifier( enabled: Boolean, schedule: CalEvent, - itemCount: Int, - category: Option[String] + itemCount: Int ) { def toRecord(coll: Ident, created: Timestamp): RClassifierSetting = @@ -98,15 +87,13 @@ object RClassifierSetting { coll, enabled, schedule, - category.getOrElse(""), itemCount, - None, created ) } object Classifier { def fromRecord(r: RClassifierSetting): Classifier = - Classifier(r.enabled, r.schedule, r.itemCount, r.category.some) + Classifier(r.enabled, r.schedule, r.itemCount) } } diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala index ca3b2666..f6114a38 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala @@ -89,8 +89,7 @@ object RCollective { c.integration.s, cs.enabled.s, cs.schedule.s, - cs.itemCount.s, - cs.category.s + cs.itemCount.s ), from(c).leftJoin(cs, cs.cid === c.id), c.id === coll diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 27a30031..5bba7d67 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -148,6 +148,13 @@ object RTag { ).orderBy(T.name.asc).build.query[RTag].to[List] } + def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] = + Select( + coalesce(T.category.s, lit(fallback)).s, + from(T), + T.cid === coll + ).distinct.build.query[String].to[List] + def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = DML.delete(T, T.tid === tagId && T.cid === coll) } diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm index 23e440cd..1181e239 100644 --- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -25,8 +25,6 @@ import Util.Tag type alias Model = { enabled : Bool - , categoryModel : Comp.FixedDropdown.Model String - , category : Maybe String , scheduleModel : Comp.CalEventInput.Model , schedule : Validated CalEvent , itemCountModel : Comp.IntField.Model @@ -35,10 +33,8 @@ type alias Model = type Msg - = GetTagsResp (Result Http.Error TagList) - | ScheduleMsg Comp.CalEventInput.Msg + = ScheduleMsg Comp.CalEventInput.Msg | ToggleEnabled - | CategoryMsg (Comp.FixedDropdown.Msg String) | ItemCountMsg Comp.IntField.Msg @@ -53,17 +49,12 @@ init flags sett = Comp.CalEventInput.init flags newSchedule in ( { enabled = sett.enabled - , categoryModel = Comp.FixedDropdown.initString [] - , category = sett.category , scheduleModel = cem , schedule = Data.Validated.Unknown newSchedule , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" , itemCount = Just sett.itemCount } - , Cmd.batch - [ Api.getTags flags "" GetTagsResp - , Cmd.map ScheduleMsg cec - ] + , Cmd.map ScheduleMsg cec ) @@ -72,7 +63,6 @@ getSettings model = Data.Validated.map (\sch -> { enabled = model.enabled - , category = model.category , schedule = Data.CalEvent.makeEvent sch , itemCount = Maybe.withDefault 0 model.itemCount @@ -84,27 +74,6 @@ getSettings model = update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) update flags msg model = case msg of - GetTagsResp (Ok tl) -> - let - categories = - Util.Tag.getCategories tl.items - |> List.sort - in - ( { model - | categoryModel = Comp.FixedDropdown.initString categories - , category = - if model.category == Nothing then - List.head categories - - else - model.category - } - , Cmd.none - ) - - GetTagsResp (Err _) -> - ( model, Cmd.none ) - ScheduleMsg lmsg -> let ( cm, cc, ce ) = @@ -126,23 +95,6 @@ update flags msg model = , Cmd.none ) - CategoryMsg lmsg -> - let - ( mm, ma ) = - Comp.FixedDropdown.update lmsg model.categoryModel - in - ( { model - | categoryModel = mm - , category = - if ma == Nothing then - model.category - - else - ma - } - , Cmd.none - ) - ItemCountMsg lmsg -> let ( im, iv ) = @@ -182,13 +134,6 @@ view model = , text "periodically based on a schedule and you need to specify a tag-group that should " , text "be used for learning." ] - , div [ class "field" ] - [ label [] [ text "Category" ] - , Html.map CategoryMsg - (Comp.FixedDropdown.viewString model.category - model.categoryModel - ) - ] , Html.map ItemCountMsg (Comp.IntField.viewWithInfo "The maximum number of items to learn from, order by date newest first. Use 0 to mean all."