mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 15:15:58 +00:00
Extend guessing tags to all tag categories
This commit is contained in:
parent
c5778880d9
commit
249f9e6e2a
@ -11,6 +11,7 @@ import docspell.analysis.classifier
|
|||||||
import docspell.analysis.classifier.TextClassifier._
|
import docspell.analysis.classifier.TextClassifier._
|
||||||
import docspell.analysis.nlp.Properties
|
import docspell.analysis.nlp.Properties
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.common.syntax.FileSyntax._
|
||||||
|
|
||||||
import edu.stanford.nlp.classify.ColumnDataClassifier
|
import edu.stanford.nlp.classify.ColumnDataClassifier
|
||||||
|
|
||||||
@ -28,7 +29,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
|
|||||||
.use { dir =>
|
.use { dir =>
|
||||||
for {
|
for {
|
||||||
rawData <- writeDataFile(blocker, dir, data)
|
rawData <- writeDataFile(blocker, dir, data)
|
||||||
_ <- logger.info(s"Learning from ${rawData.count} items.")
|
_ <- logger.debug(s"Learning from ${rawData.count} items.")
|
||||||
trainData <- splitData(logger, rawData)
|
trainData <- splitData(logger, rawData)
|
||||||
scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
|
scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
|
||||||
sorted = scores.sortBy(-_.score)
|
sorted = scores.sortBy(-_.score)
|
||||||
@ -138,9 +139,9 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
|
|||||||
props: Map[String, String]
|
props: Map[String, String]
|
||||||
): Map[String, String] =
|
): Map[String, String] =
|
||||||
prepend("2.", props) ++ Map(
|
prepend("2.", props) ++ Map(
|
||||||
"trainFile" -> trainData.train.normalize().toAbsolutePath().toString(),
|
"trainFile" -> trainData.train.absolutePathAsString,
|
||||||
"testFile" -> trainData.test.normalize().toAbsolutePath().toString(),
|
"testFile" -> trainData.test.absolutePathAsString,
|
||||||
"serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
|
"serializeTo" -> trainData.modelFile.absolutePathAsString
|
||||||
).toList
|
).toList
|
||||||
|
|
||||||
case class RawData(count: Long, file: Path)
|
case class RawData(count: Long, file: Path)
|
||||||
|
@ -169,7 +169,7 @@ object JoexAppImpl {
|
|||||||
.withTask(
|
.withTask(
|
||||||
JobTask.json(
|
JobTask.json(
|
||||||
LearnClassifierArgs.taskName,
|
LearnClassifierArgs.taskName,
|
||||||
LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
|
LearnClassifierTask[F](cfg.textAnalysis, analyser),
|
||||||
LearnClassifierTask.onCancel[F]
|
LearnClassifierTask.onCancel[F]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -0,0 +1,45 @@
|
|||||||
|
package docspell.joex.learn
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import cats.implicits._
|
||||||
|
import docspell.common.Ident
|
||||||
|
import docspell.store.records.{RClassifierModel, RTag}
|
||||||
|
import doobie._
|
||||||
|
|
||||||
|
/** Typed wrapper around the name under which a classifier model is stored. */
final class ClassifierName(val name: String) extends AnyVal

object ClassifierName {

  /** Wraps the given string as is; no validation is applied. */
  def apply(name: String): ClassifierName =
    new ClassifierName(name)

  /** Name handed to `RTag.listCategories` as the fallback value, i.e. the
    * category reported for tags whose category column is empty.
    */
  val noCategory: ClassifierName =
    ClassifierName("__docspell_no_category__")

  /** Prefix shared by every name created through [[tagCategory]]. */
  val categoryPrefix = "tagcategory-"

  /** Builds the model name for a tag category: the prefix followed by `cat`. */
  def tagCategory(cat: String): ClassifierName =
    ClassifierName(categoryPrefix + cat)

  // Fixed, non-prefixed model names.

  val concernedPerson: ClassifierName =
    ClassifierName("concernedperson")

  val concernedEquip: ClassifierName =
    ClassifierName("concernedequip")

  val correspondentOrg: ClassifierName =
    ClassifierName("correspondentorg")

  val correspondentPerson: ClassifierName =
    ClassifierName("correspondentperson")

  /** Loads the stored models belonging to the tag categories of a collective.
    *
    * The categories are listed first (with [[noCategory]] as fallback), each
    * one is mapped to its [[tagCategory]] name and the matching models are
    * fetched in a single query. When no category exists, the result is an
    * empty list and the model table is not queried.
    *
    * The type parameter `F` is not used; it is retained so that existing
    * call sites keep compiling.
    */
  def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] =
    RTag.listCategories(coll, noCategory.name).flatMap { categories =>
      NonEmptyList
        .fromList(categories)
        .fold(List.empty[RClassifierModel].pure[ConnectionIO]) { names =>
          RClassifierModel.findAllByName(coll, names.map(c => tagCategory(c).name))
        }
    }
}
|
@ -4,23 +4,16 @@ import cats.data.Kleisli
|
|||||||
import cats.data.OptionT
|
import cats.data.OptionT
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import fs2.{Pipe, Stream}
|
|
||||||
|
|
||||||
import docspell.analysis.TextAnalyser
|
import docspell.analysis.TextAnalyser
|
||||||
import docspell.analysis.classifier.ClassifierModel
|
|
||||||
import docspell.analysis.classifier.TextClassifier.Data
|
|
||||||
import docspell.backend.ops.OCollective
|
import docspell.backend.ops.OCollective
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.scheduler._
|
import docspell.joex.scheduler._
|
||||||
import docspell.store.queries.QItem
|
import docspell.store.records.{RClassifierSetting, RTag}
|
||||||
import docspell.store.records.RClassifierSetting
|
|
||||||
|
|
||||||
import bitpeace.MimetypeHint
|
|
||||||
|
|
||||||
object LearnClassifierTask {
|
object LearnClassifierTask {
|
||||||
val noClass = "__NONE__"
|
|
||||||
val pageSep = " --n-- "
|
val pageSep = " --n-- "
|
||||||
|
val noClass = "__NONE__"
|
||||||
|
|
||||||
type Args = LearnClassifierArgs
|
type Args = LearnClassifierArgs
|
||||||
|
|
||||||
@ -29,67 +22,53 @@ object LearnClassifierTask {
|
|||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
cfg: Config.TextAnalysis,
|
cfg: Config.TextAnalysis,
|
||||||
blocker: Blocker,
|
|
||||||
analyser: TextAnalyser[F]
|
analyser: TextAnalyser[F]
|
||||||
): Task[F, Args, Unit] =
|
): Task[F, Args, Unit] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
(for {
|
(for {
|
||||||
sett <- findActiveSettings[F](ctx, cfg)
|
sett <- findActiveSettings[F](ctx, cfg)
|
||||||
data = selectItems(
|
maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
|
||||||
ctx,
|
|
||||||
math.min(cfg.classification.itemCount, sett.itemCount).toLong,
|
|
||||||
sett.category.getOrElse("")
|
|
||||||
)
|
|
||||||
_ <- OptionT.liftF(
|
_ <- OptionT.liftF(
|
||||||
analyser.classifier
|
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx)
|
||||||
.trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
|
|
||||||
)
|
)
|
||||||
} yield ())
|
} yield ())
|
||||||
.getOrElseF(logInactiveWarning(ctx.logger))
|
.getOrElseF(logInactiveWarning(ctx.logger))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def handleModel[F[_]: Sync: ContextShift](
|
def learnTagCategory[F[_]: Sync: ContextShift, A](
|
||||||
ctx: Context[F, Args],
|
analyser: TextAnalyser[F],
|
||||||
blocker: Blocker
|
collective: Ident,
|
||||||
)(trainedModel: ClassifierModel): F[Unit] =
|
maxItems: Int
|
||||||
for {
|
)(
|
||||||
oldFile <- ctx.store.transact(
|
|
||||||
RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
|
|
||||||
)
|
|
||||||
_ <- ctx.logger.info("Storing new trained model")
|
|
||||||
fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
|
|
||||||
newFile <-
|
|
||||||
ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
|
|
||||||
_ <- ctx.store.transact(
|
|
||||||
RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
|
|
||||||
)
|
|
||||||
_ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
|
|
||||||
_ <- oldFile match {
|
|
||||||
case Some(fid) =>
|
|
||||||
ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
|
|
||||||
ctx.store.bitpeace.delete(fid.id).compile.drain
|
|
||||||
case None => ().pure[F]
|
|
||||||
}
|
|
||||||
} yield ()
|
|
||||||
|
|
||||||
private def selectItems[F[_]](
|
|
||||||
ctx: Context[F, Args],
|
|
||||||
max: Long,
|
|
||||||
category: String
|
category: String
|
||||||
): Stream[F, Data] = {
|
): Task[F, A, Unit] =
|
||||||
val connStream =
|
Task { ctx =>
|
||||||
for {
|
val data = SelectItems.forCategory(ctx, collective)(maxItems, category)
|
||||||
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
|
ctx.logger.info(s"Learn classifier for tag category: $category") *>
|
||||||
tt <- Stream.eval(
|
analyser.classifier.trainClassifier(ctx.logger, data)(
|
||||||
QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
|
Kleisli(
|
||||||
|
StoreClassifierModel.handleModel(
|
||||||
|
ctx,
|
||||||
|
collective,
|
||||||
|
ClassifierName.tagCategory(category)
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
|
}
|
||||||
ctx.store.transact(connStream.filter(_.text.nonEmpty))
|
|
||||||
}
|
|
||||||
|
|
||||||
private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
|
def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])(
|
||||||
if (max <= 0) identity
|
collective: Ident,
|
||||||
else _.take(max)
|
maxItems: Int
|
||||||
|
): Task[F, A, Unit] =
|
||||||
|
Task { ctx =>
|
||||||
|
for {
|
||||||
|
cats <- ctx.store.transact(
|
||||||
|
RTag.listCategories(collective, ClassifierName.noCategory.name)
|
||||||
|
)
|
||||||
|
task = learnTagCategory[F, A](analyser, collective, maxItems) _
|
||||||
|
_ <- cats.map(task).traverse(_.run(ctx))
|
||||||
|
} yield ()
|
||||||
|
}
|
||||||
|
|
||||||
private def findActiveSettings[F[_]: Sync](
|
private def findActiveSettings[F[_]: Sync](
|
||||||
ctx: Context[F, Args],
|
ctx: Context[F, Args],
|
||||||
@ -98,7 +77,6 @@ object LearnClassifierTask {
|
|||||||
if (cfg.classification.enabled)
|
if (cfg.classification.enabled)
|
||||||
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
|
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
|
||||||
.filter(_.enabled)
|
.filter(_.enabled)
|
||||||
.filter(_.category.nonEmpty)
|
|
||||||
.map(OCollective.Classifier.fromRecord)
|
.map(OCollective.Classifier.fromRecord)
|
||||||
else
|
else
|
||||||
OptionT.none
|
OptionT.none
|
||||||
|
@ -0,0 +1,39 @@
|
|||||||
|
package docspell.joex.learn
|
||||||
|
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.analysis.classifier.TextClassifier.Data
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.joex.scheduler.Context
|
||||||
|
import docspell.store.Store
|
||||||
|
import docspell.store.qb.Batch
|
||||||
|
import docspell.store.queries.QItem
|
||||||
|
|
||||||
|
/** Selects the training data for learning a tag-category classifier. */
object SelectItems {
  val pageSep = LearnClassifierTask.pageSep
  val noClass = LearnClassifierTask.noClass

  /** Same as the `Store` based variant, taking the store from the job context. */
  def forCategory[F[_]](ctx: Context[F, _], collective: Ident)(
      max: Int,
      category: String
  ): Stream[F, Data] =
    forCategory(ctx.store, collective, max, category)

  /** Streams one `Data` entry per item of the collective, as returned by
    * `QItem.findAllNewesFirst` (read in chunks of 10).
    *
    * @param max      upper bound on the number of items queried; a value
    *                 `<= 0` removes the bound
    * @param category the tag category whose tag is resolved for each item
    * @return entries labelled with the resolved tag name, or [[noClass]] when
    *         no tag was resolved; entries whose trimmed text is empty are
    *         dropped
    */
  def forCategory[F[_]](
      store: Store[F],
      collective: Ident,
      max: Int,
      category: String
  ): Stream[F, Data] = {
    val batch = if (max > 0) Batch.limit(max) else Batch.all

    val rows =
      QItem
        .findAllNewesFirst(collective, 10, batch)
        .flatMap { itemId =>
          Stream
            .eval(QItem.resolveTextAndTag(collective, itemId, category, pageSep))
            .map(tt => Data(tt.tag.map(_.name).getOrElse(noClass), itemId.id, tt.text.trim))
        }
        .filter(_.text.nonEmpty)

    store.transact(rows)
  }

}
|
@ -0,0 +1,53 @@
|
|||||||
|
package docspell.joex.learn
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
|
|
||||||
|
import docspell.analysis.classifier.ClassifierModel
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.joex.scheduler._
|
||||||
|
import docspell.store.Store
|
||||||
|
import docspell.store.records.RClassifierModel
|
||||||
|
|
||||||
|
import bitpeace.MimetypeHint
|
||||||
|
|
||||||
|
/** Persists a freshly trained classifier model and replaces the previous one. */
object StoreClassifierModel {

  /** Convenience variant that takes store, blocker and logger from the job context. */
  def handleModel[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
      collective: Ident,
      modelName: ClassifierName
  )(
      trainedModel: ClassifierModel
  ): F[Unit] =
    handleModel(ctx.store, ctx.blocker, ctx.logger)(collective, modelName, trainedModel)

  /** Saves the model file and points the named model of the collective to it.
    *
    * Steps, in order: remember the file id currently registered under
    * `modelName` (if any), save the trained model as a new file, update (or
    * create) the model record with the new file id and finally delete the
    * previously registered file.
    */
  def handleModel[F[_]: Sync: ContextShift](
      store: Store[F],
      blocker: Blocker,
      logger: Logger[F]
  )(
      collective: Ident,
      modelName: ClassifierName,
      trainedModel: ClassifierModel
  ): F[Unit] = {
    // Only a description of the read; nothing is opened until it is compiled below.
    val fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)

    for {
      previous <- store.transact(
        RClassifierModel.findByName(collective, modelName.name).map(_.map(_.fileId))
      )
      _ <- logger.debug(s"Storing new trained model for: ${modelName.name}")
      newFile <-
        store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
      _ <- store.transact(
        RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id))
      )
      _ <- logger.debug(s"New model stored at file ${newFile.id}")
      // The old file is removed only after the record points to the new one.
      _ <- previous.traverse_ { fid =>
        logger.debug(s"Deleting old model file ${fid.id}") *>
          store.bitpeace.delete(fid.id).compile.drain
      }
    } yield ()
  }
}
|
@ -9,12 +9,11 @@ import docspell.analysis.{NlpSettings, TextAnalyser}
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
import docspell.joex.learn.LearnClassifierTask
|
import docspell.joex.learn.{ClassifierName, LearnClassifierTask}
|
||||||
import docspell.joex.process.ItemData.AttachmentDates
|
import docspell.joex.process.ItemData.AttachmentDates
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.store.records.RAttachmentMeta
|
import docspell.store.records.{RAttachmentMeta, RClassifierSetting}
|
||||||
import docspell.store.records.RClassifierSetting
|
|
||||||
|
|
||||||
import bitpeace.RangeDef
|
import bitpeace.RangeDef
|
||||||
|
|
||||||
@ -42,10 +41,13 @@ object TextAnalysis {
|
|||||||
e <- s
|
e <- s
|
||||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||||
v = t.toVector
|
v = t.toVector
|
||||||
tag <- predictTag(ctx, cfg, item.metas, analyser.classifier).value
|
classifierEnabled <- getActive(ctx, cfg)
|
||||||
|
tag <-
|
||||||
|
if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
|
||||||
|
else List.empty[String].pure[F]
|
||||||
} yield item
|
} yield item
|
||||||
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||||
.appendTags(tag.toSeq)
|
.appendTags(tag)
|
||||||
}
|
}
|
||||||
|
|
||||||
def annotateAttachment[F[_]: Sync](
|
def annotateAttachment[F[_]: Sync](
|
||||||
@ -66,15 +68,29 @@ object TextAnalysis {
|
|||||||
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Guesses one tag per stored tag-category model of the item's collective.
  *
  * All models found by `ClassifierName.findTagModels` are run through
  * `predictTag` one after another; models that yield no tag are left out of
  * the result.
  */
def predictTags[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): F[List[String]] =
  ctx.store
    .transact(ClassifierName.findTagModels(ctx.args.meta.collective))
    .flatTap(models => ctx.logger.debug(s"Guessing tags for ${models.size} categories"))
    .flatMap { models =>
      models.traverse(model => predictTag(ctx, cfg, metas, classifier)(model.fileId.some))
    }
    .map(_.flatten)
|
||||||
|
|
||||||
def predictTag[F[_]: Sync: ContextShift](
|
def predictTag[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, Args],
|
ctx: Context[F, Args],
|
||||||
cfg: Config.TextAnalysis,
|
cfg: Config.TextAnalysis,
|
||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
classifier: TextClassifier[F]
|
classifier: TextClassifier[F]
|
||||||
): OptionT[F, String] =
|
)(modelFileId: Option[Ident]): F[Option[String]] =
|
||||||
for {
|
(for {
|
||||||
model <- findActiveModel(ctx, cfg)
|
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …"))
|
||||||
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
|
model <- OptionT.fromOption[F](modelFileId)
|
||||||
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||||
modelData =
|
modelData =
|
||||||
ctx.store.bitpeace
|
ctx.store.bitpeace
|
||||||
@ -90,20 +106,21 @@ object TextAnalysis {
|
|||||||
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
||||||
}).filter(_ != LearnClassifierTask.noClass)
|
}).filter(_ != LearnClassifierTask.noClass)
|
||||||
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
||||||
} yield cls
|
} yield cls).value
|
||||||
|
|
||||||
private def findActiveModel[F[_]: Sync](
|
private def getActive[F[_]: Sync](
|
||||||
ctx: Context[F, Args],
|
ctx: Context[F, Args],
|
||||||
cfg: Config.TextAnalysis
|
cfg: Config.TextAnalysis
|
||||||
): OptionT[F, Ident] =
|
): F[Boolean] =
|
||||||
(if (cfg.classification.enabled)
|
if (cfg.classification.enabled)
|
||||||
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
|
ctx.store
|
||||||
.filter(_.enabled)
|
.transact(RClassifierSetting.findById(ctx.args.meta.collective))
|
||||||
.mapFilter(_.fileId)
|
.map(_.exists(_.enabled))
|
||||||
else
|
.flatTap(enabled =>
|
||||||
OptionT.none[F, Ident]).orElse(
|
if (enabled) ().pure[F]
|
||||||
OptionT.liftF(ctx.logger.info("Classification is disabled.")) *> OptionT
|
else ctx.logger.info("Classification is disabled. Check config or settings.")
|
||||||
.none[F, Ident]
|
)
|
||||||
)
|
else
|
||||||
|
ctx.logger.info("Classification is disabled.") *> false.pure[F]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -4856,8 +4856,6 @@ components:
|
|||||||
properties:
|
properties:
|
||||||
enabled:
|
enabled:
|
||||||
type: boolean
|
type: boolean
|
||||||
category:
|
|
||||||
type: string
|
|
||||||
itemCount:
|
itemCount:
|
||||||
type: integer
|
type: integer
|
||||||
format: int32
|
format: int32
|
||||||
|
@ -46,8 +46,7 @@ object CollectiveRoutes {
|
|||||||
OCollective.Classifier(
|
OCollective.Classifier(
|
||||||
settings.classifier.enabled,
|
settings.classifier.enabled,
|
||||||
settings.classifier.schedule,
|
settings.classifier.schedule,
|
||||||
settings.classifier.itemCount,
|
settings.classifier.itemCount
|
||||||
settings.classifier.category
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@ -65,8 +64,7 @@ object CollectiveRoutes {
|
|||||||
c.language,
|
c.language,
|
||||||
c.integrationEnabled,
|
c.integrationEnabled,
|
||||||
ClassifierSetting(
|
ClassifierSetting(
|
||||||
c.classifier.map(_.enabled).getOrElse(false),
|
c.classifier.exists(_.enabled),
|
||||||
c.classifier.flatMap(_.category),
|
|
||||||
c.classifier.map(_.itemCount).getOrElse(0),
|
c.classifier.map(_.itemCount).getOrElse(0),
|
||||||
c.classifier
|
c.classifier
|
||||||
.map(_.schedule)
|
.map(_.schedule)
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
-- Classifier models get their own table: one row per collective and model
-- name, each pointing to the stored model file.
CREATE TABLE "classifier_model" (
  "id"      VARCHAR(254) NOT NULL PRIMARY KEY,
  "cid"     VARCHAR(254) NOT NULL,
  "name"    VARCHAR(254) NOT NULL,
  "file_id" VARCHAR(254) NOT NULL,
  "created" TIMESTAMP NOT NULL,
  FOREIGN KEY ("cid") REFERENCES "collective"("cid"),
  FOREIGN KEY ("file_id") REFERENCES "filemeta"("id"),
  UNIQUE ("cid", "name")
);

-- Carry over existing model files; the model name is the former category
-- prefixed with 'tagcategory-'.
INSERT INTO "classifier_model"
SELECT random_uuid() AS "id", "cid", concat('tagcategory-', "category") AS "name", "file_id", "created"
FROM "classifier_setting"
WHERE "file_id" IS NOT NULL;

-- Both columns are superseded by "classifier_model".
ALTER TABLE "classifier_setting"
  DROP COLUMN "category";

ALTER TABLE "classifier_setting"
  DROP COLUMN "file_id";
|
@ -0,0 +1,26 @@
|
|||||||
|
-- Classifier models get their own table: one row per collective and model
-- name, each pointing to the stored model file.
CREATE TABLE `classifier_model` (
  `id`      VARCHAR(254) NOT NULL PRIMARY KEY,
  `cid`     VARCHAR(254) NOT NULL,
  `name`    VARCHAR(254) NOT NULL,
  `file_id` VARCHAR(254) NOT NULL,
  `created` TIMESTAMP NOT NULL,
  FOREIGN KEY (`cid`) REFERENCES `collective`(`cid`),
  FOREIGN KEY (`file_id`) REFERENCES `filemeta`(`id`),
  UNIQUE (`cid`, `name`)
);

-- Carry over existing model files; the model name is the former category
-- prefixed with 'tagcategory-'.
INSERT INTO `classifier_model`
SELECT md5(rand()) AS id, `cid`, concat('tagcategory-', `category`) AS `name`, `file_id`, `created`
FROM `classifier_setting`
WHERE `file_id` IS NOT NULL;

ALTER TABLE `classifier_setting`
  DROP COLUMN `category`;

-- mariadb needs special treatment when dropping a column that is part
-- of an index and foreign key
ALTER TABLE `classifier_setting`
  DROP CONSTRAINT `classifier_setting_ibfk_2`;

ALTER TABLE `classifier_setting`
  DROP COLUMN `file_id`;
|
@ -0,0 +1,21 @@
|
|||||||
|
-- Classifier models get their own table: one row per collective and model
-- name, each pointing to the stored model file.
CREATE TABLE "classifier_model" (
  "id"      VARCHAR(254) NOT NULL PRIMARY KEY,
  "cid"     VARCHAR(254) NOT NULL,
  "name"    VARCHAR(254) NOT NULL,
  "file_id" VARCHAR(254) NOT NULL,
  "created" TIMESTAMP NOT NULL,
  FOREIGN KEY ("cid") REFERENCES "collective"("cid"),
  FOREIGN KEY ("file_id") REFERENCES "filemeta"("id"),
  UNIQUE ("cid", "name")
);

-- Carry over existing model files; the model name is the former category
-- prefixed with 'tagcategory-'.
INSERT INTO "classifier_model"
SELECT md5(random()::text) AS id, "cid", 'tagcategory-' || "category" AS "name", "file_id", "created"
FROM "classifier_setting"
WHERE "file_id" IS NOT NULL;

-- Both columns are superseded by "classifier_model".
ALTER TABLE "classifier_setting"
  DROP COLUMN "category";

ALTER TABLE "classifier_setting"
  DROP COLUMN "file_id";
|
@ -543,11 +543,14 @@ object QItem {
|
|||||||
|
|
||||||
def findAllNewesFirst(
|
def findAllNewesFirst(
|
||||||
collective: Ident,
|
collective: Ident,
|
||||||
chunkSize: Int
|
chunkSize: Int,
|
||||||
|
limit: Batch
|
||||||
): Stream[ConnectionIO, Ident] = {
|
): Stream[ConnectionIO, Ident] = {
|
||||||
|
|
||||||
val i = RItem.as("i")
|
val i = RItem.as("i")
|
||||||
Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed)
|
Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed)
|
||||||
.orderBy(i.created.desc)
|
.orderBy(i.created.desc)
|
||||||
|
.limit(limit)
|
||||||
.build
|
.build
|
||||||
.query[Ident]
|
.query[Ident]
|
||||||
.streamWithChunkSize(chunkSize)
|
.streamWithChunkSize(chunkSize)
|
||||||
|
@ -0,0 +1,78 @@
|
|||||||
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import cats.implicits._
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.store.qb.DSL._
|
||||||
|
import docspell.store.qb._
|
||||||
|
|
||||||
|
import doobie._
|
||||||
|
import doobie.implicits._
|
||||||
|
|
||||||
|
/** A row of the `classifier_model` table: a named classifier model of a
  * collective together with the id of the file holding the model.
  */
final case class RClassifierModel(
    id: Ident,
    cid: Ident,
    name: String,
    fileId: Ident,
    created: Timestamp
)

object RClassifierModel {

  /** Creates a record with a random id and the current time as `created`.
    * Nothing is written to the database.
    */
  def createNew[F[_]: Sync](
      cid: Ident,
      name: String,
      fileId: Ident
  ): F[RClassifierModel] =
    Ident.randomId[F].flatMap { id =>
      Timestamp.current[F].map(now => RClassifierModel(id, cid, name, fileId, now))
    }

  /** Table and column definitions for `classifier_model`. */
  final case class Table(alias: Option[String]) extends TableDef {
    val tableName = "classifier_model"

    val id      = Column[Ident]("id", this)
    val cid     = Column[Ident]("cid", this)
    val name    = Column[String]("name", this)
    val fileId  = Column[Ident]("file_id", this)
    val created = Column[Timestamp]("created", this)

    // Order must match the value fragment used in `insert`.
    val all = NonEmptyList.of[Column[_]](id, cid, name, fileId, created)
  }

  def as(alias: String): Table =
    Table(alias.some)

  val T = Table(None)

  /** Inserts the record; yields the number of affected rows. */
  def insert(v: RClassifierModel): ConnectionIO[Int] = {
    val values = fr"${v.id},${v.cid},${v.name},${v.fileId},${v.created}"
    DML.insert(T, T.all, values)
  }

  /** Sets the file id of the model `name` of the collective. If no row was
    * updated, a new record is created and inserted instead.
    *
    * @return the sum of updated and inserted row counts
    */
  def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] =
    DML
      .update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid)))
      .flatMap { updated =>
        val inserted =
          if (updated == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert)
          else 0.pure[ConnectionIO]
        inserted.map(updated + _)
      }

  /** Looks up the model with the given name within the collective. */
  def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = {
    val query = Select(select(T.all), from(T), T.cid === cid && T.name === name)
    query.build.query[RClassifierModel].option
  }

  /** Returns all models of the collective whose name is one of `names`. */
  def findAllByName(
      cid: Ident,
      names: NonEmptyList[String]
  ): ConnectionIO[List[RClassifierModel]] = {
    val query = Select(select(T.all), from(T), T.cid === cid && T.name.in(names))
    query.build.query[RClassifierModel].to[List]
  }
}
|
@ -15,9 +15,7 @@ case class RClassifierSetting(
|
|||||||
cid: Ident,
|
cid: Ident,
|
||||||
enabled: Boolean,
|
enabled: Boolean,
|
||||||
schedule: CalEvent,
|
schedule: CalEvent,
|
||||||
category: String,
|
|
||||||
itemCount: Int,
|
itemCount: Int,
|
||||||
fileId: Option[Ident],
|
|
||||||
created: Timestamp
|
created: Timestamp
|
||||||
) {}
|
) {}
|
||||||
|
|
||||||
@ -28,12 +26,10 @@ object RClassifierSetting {
|
|||||||
val cid = Column[Ident]("cid", this)
|
val cid = Column[Ident]("cid", this)
|
||||||
val enabled = Column[Boolean]("enabled", this)
|
val enabled = Column[Boolean]("enabled", this)
|
||||||
val schedule = Column[CalEvent]("schedule", this)
|
val schedule = Column[CalEvent]("schedule", this)
|
||||||
val category = Column[String]("category", this)
|
|
||||||
val itemCount = Column[Int]("item_count", this)
|
val itemCount = Column[Int]("item_count", this)
|
||||||
val fileId = Column[Ident]("file_id", this)
|
|
||||||
val created = Column[Timestamp]("created", this)
|
val created = Column[Timestamp]("created", this)
|
||||||
val all = NonEmptyList
|
val all = NonEmptyList
|
||||||
.of[Column[_]](cid, enabled, schedule, category, itemCount, fileId, created)
|
.of[Column[_]](cid, enabled, schedule, itemCount, created)
|
||||||
}
|
}
|
||||||
|
|
||||||
val T = Table(None)
|
val T = Table(None)
|
||||||
@ -44,7 +40,7 @@ object RClassifierSetting {
|
|||||||
DML.insert(
|
DML.insert(
|
||||||
T,
|
T,
|
||||||
T.all,
|
T.all,
|
||||||
fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
|
fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
|
def updateAll(v: RClassifierSetting): ConnectionIO[Int] =
|
||||||
@ -54,15 +50,10 @@ object RClassifierSetting {
|
|||||||
DML.set(
|
DML.set(
|
||||||
T.enabled.setTo(v.enabled),
|
T.enabled.setTo(v.enabled),
|
||||||
T.schedule.setTo(v.schedule),
|
T.schedule.setTo(v.schedule),
|
||||||
T.category.setTo(v.category),
|
T.itemCount.setTo(v.itemCount)
|
||||||
T.itemCount.setTo(v.itemCount),
|
|
||||||
T.fileId.setTo(v.fileId)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
|
|
||||||
DML.update(T, T.cid === coll, DML.set(T.fileId.setTo(fid)))
|
|
||||||
|
|
||||||
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
|
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
|
||||||
for {
|
for {
|
||||||
n1 <- DML.update(
|
n1 <- DML.update(
|
||||||
@ -71,8 +62,7 @@ object RClassifierSetting {
|
|||||||
DML.set(
|
DML.set(
|
||||||
T.enabled.setTo(v.enabled),
|
T.enabled.setTo(v.enabled),
|
||||||
T.schedule.setTo(v.schedule),
|
T.schedule.setTo(v.schedule),
|
||||||
T.itemCount.setTo(v.itemCount),
|
T.itemCount.setTo(v.itemCount)
|
||||||
T.category.setTo(v.category)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
|
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
|
||||||
@ -89,8 +79,7 @@ object RClassifierSetting {
|
|||||||
case class Classifier(
|
case class Classifier(
|
||||||
enabled: Boolean,
|
enabled: Boolean,
|
||||||
schedule: CalEvent,
|
schedule: CalEvent,
|
||||||
itemCount: Int,
|
itemCount: Int
|
||||||
category: Option[String]
|
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
|
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
|
||||||
@ -98,15 +87,13 @@ object RClassifierSetting {
|
|||||||
coll,
|
coll,
|
||||||
enabled,
|
enabled,
|
||||||
schedule,
|
schedule,
|
||||||
category.getOrElse(""),
|
|
||||||
itemCount,
|
itemCount,
|
||||||
None,
|
|
||||||
created
|
created
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
object Classifier {
|
object Classifier {
|
||||||
def fromRecord(r: RClassifierSetting): Classifier =
|
def fromRecord(r: RClassifierSetting): Classifier =
|
||||||
Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
|
Classifier(r.enabled, r.schedule, r.itemCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -89,8 +89,7 @@ object RCollective {
|
|||||||
c.integration.s,
|
c.integration.s,
|
||||||
cs.enabled.s,
|
cs.enabled.s,
|
||||||
cs.schedule.s,
|
cs.schedule.s,
|
||||||
cs.itemCount.s,
|
cs.itemCount.s
|
||||||
cs.category.s
|
|
||||||
),
|
),
|
||||||
from(c).leftJoin(cs, cs.cid === c.id),
|
from(c).leftJoin(cs, cs.cid === c.id),
|
||||||
c.id === coll
|
c.id === coll
|
||||||
|
@ -148,6 +148,13 @@ object RTag {
|
|||||||
).orderBy(T.name.asc).build.query[RTag].to[List]
|
).orderBy(T.name.asc).build.query[RTag].to[List]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lists the distinct tag categories of a collective.
  *
  * The category column is selected through `coalesce`, so tags without a
  * category are reported as `fallback`.
  */
def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] = {
  val query =
    Select(
      coalesce(T.category.s, lit(fallback)).s,
      from(T),
      T.cid === coll
    ).distinct

  query.build.query[String].to[List]
}
|
||||||
|
|
||||||
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
||||||
DML.delete(T, T.tid === tagId && T.cid === coll)
|
DML.delete(T, T.tid === tagId && T.cid === coll)
|
||||||
}
|
}
|
||||||
|
@ -25,8 +25,6 @@ import Util.Tag
|
|||||||
|
|
||||||
type alias Model =
|
type alias Model =
|
||||||
{ enabled : Bool
|
{ enabled : Bool
|
||||||
, categoryModel : Comp.FixedDropdown.Model String
|
|
||||||
, category : Maybe String
|
|
||||||
, scheduleModel : Comp.CalEventInput.Model
|
, scheduleModel : Comp.CalEventInput.Model
|
||||||
, schedule : Validated CalEvent
|
, schedule : Validated CalEvent
|
||||||
, itemCountModel : Comp.IntField.Model
|
, itemCountModel : Comp.IntField.Model
|
||||||
@ -35,10 +33,8 @@ type alias Model =
|
|||||||
|
|
||||||
|
|
||||||
type Msg
|
type Msg
|
||||||
= GetTagsResp (Result Http.Error TagList)
|
= ScheduleMsg Comp.CalEventInput.Msg
|
||||||
| ScheduleMsg Comp.CalEventInput.Msg
|
|
||||||
| ToggleEnabled
|
| ToggleEnabled
|
||||||
| CategoryMsg (Comp.FixedDropdown.Msg String)
|
|
||||||
| ItemCountMsg Comp.IntField.Msg
|
| ItemCountMsg Comp.IntField.Msg
|
||||||
|
|
||||||
|
|
||||||
@ -53,17 +49,12 @@ init flags sett =
|
|||||||
Comp.CalEventInput.init flags newSchedule
|
Comp.CalEventInput.init flags newSchedule
|
||||||
in
|
in
|
||||||
( { enabled = sett.enabled
|
( { enabled = sett.enabled
|
||||||
, categoryModel = Comp.FixedDropdown.initString []
|
|
||||||
, category = sett.category
|
|
||||||
, scheduleModel = cem
|
, scheduleModel = cem
|
||||||
, schedule = Data.Validated.Unknown newSchedule
|
, schedule = Data.Validated.Unknown newSchedule
|
||||||
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
|
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
|
||||||
, itemCount = Just sett.itemCount
|
, itemCount = Just sett.itemCount
|
||||||
}
|
}
|
||||||
, Cmd.batch
|
, Cmd.map ScheduleMsg cec
|
||||||
[ Api.getTags flags "" GetTagsResp
|
|
||||||
, Cmd.map ScheduleMsg cec
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -72,7 +63,6 @@ getSettings model =
|
|||||||
Data.Validated.map
|
Data.Validated.map
|
||||||
(\sch ->
|
(\sch ->
|
||||||
{ enabled = model.enabled
|
{ enabled = model.enabled
|
||||||
, category = model.category
|
|
||||||
, schedule =
|
, schedule =
|
||||||
Data.CalEvent.makeEvent sch
|
Data.CalEvent.makeEvent sch
|
||||||
, itemCount = Maybe.withDefault 0 model.itemCount
|
, itemCount = Maybe.withDefault 0 model.itemCount
|
||||||
@ -84,27 +74,6 @@ getSettings model =
|
|||||||
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
|
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
|
||||||
update flags msg model =
|
update flags msg model =
|
||||||
case msg of
|
case msg of
|
||||||
GetTagsResp (Ok tl) ->
|
|
||||||
let
|
|
||||||
categories =
|
|
||||||
Util.Tag.getCategories tl.items
|
|
||||||
|> List.sort
|
|
||||||
in
|
|
||||||
( { model
|
|
||||||
| categoryModel = Comp.FixedDropdown.initString categories
|
|
||||||
, category =
|
|
||||||
if model.category == Nothing then
|
|
||||||
List.head categories
|
|
||||||
|
|
||||||
else
|
|
||||||
model.category
|
|
||||||
}
|
|
||||||
, Cmd.none
|
|
||||||
)
|
|
||||||
|
|
||||||
GetTagsResp (Err _) ->
|
|
||||||
( model, Cmd.none )
|
|
||||||
|
|
||||||
ScheduleMsg lmsg ->
|
ScheduleMsg lmsg ->
|
||||||
let
|
let
|
||||||
( cm, cc, ce ) =
|
( cm, cc, ce ) =
|
||||||
@ -126,23 +95,6 @@ update flags msg model =
|
|||||||
, Cmd.none
|
, Cmd.none
|
||||||
)
|
)
|
||||||
|
|
||||||
CategoryMsg lmsg ->
|
|
||||||
let
|
|
||||||
( mm, ma ) =
|
|
||||||
Comp.FixedDropdown.update lmsg model.categoryModel
|
|
||||||
in
|
|
||||||
( { model
|
|
||||||
| categoryModel = mm
|
|
||||||
, category =
|
|
||||||
if ma == Nothing then
|
|
||||||
model.category
|
|
||||||
|
|
||||||
else
|
|
||||||
ma
|
|
||||||
}
|
|
||||||
, Cmd.none
|
|
||||||
)
|
|
||||||
|
|
||||||
ItemCountMsg lmsg ->
|
ItemCountMsg lmsg ->
|
||||||
let
|
let
|
||||||
( im, iv ) =
|
( im, iv ) =
|
||||||
@ -182,13 +134,6 @@ view model =
|
|||||||
, text "periodically based on a schedule and you need to specify a tag-group that should "
|
, text "periodically based on a schedule and you need to specify a tag-group that should "
|
||||||
, text "be used for learning."
|
, text "be used for learning."
|
||||||
]
|
]
|
||||||
, div [ class "field" ]
|
|
||||||
[ label [] [ text "Category" ]
|
|
||||||
, Html.map CategoryMsg
|
|
||||||
(Comp.FixedDropdown.viewString model.category
|
|
||||||
model.categoryModel
|
|
||||||
)
|
|
||||||
]
|
|
||||||
, Html.map ItemCountMsg
|
, Html.map ItemCountMsg
|
||||||
(Comp.IntField.viewWithInfo
|
(Comp.IntField.viewWithInfo
|
||||||
"The maximum number of items to learn from, order by date newest first. Use 0 to mean all."
|
"The maximum number of items to learn from, order by date newest first. Use 0 to mean all."
|
||||||
|
Loading…
x
Reference in New Issue
Block a user