diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 75d07eef..44f7203b 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind import docspell.analysis.nlp.PipelineCache import docspell.analysis.nlp.StanfordNerClassifier -import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.nlp.StanfordTextClassifier +import docspell.analysis.nlp.TextClassifier import docspell.common._ trait TextAnalyser[F[_]] { def annotate( logger: Logger[F], - settings: StanfordSettings, + settings: StanfordNerSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] + def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F] } object TextAnalyser { @@ -35,7 +38,7 @@ object TextAnalyser { new TextAnalyser[F] { def annotate( logger: Logger[F], - settings: StanfordSettings, + settings: StanfordNerSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] = @@ -48,6 +51,11 @@ object TextAnalyser { spans = NerLabelSpan.build(list) } yield Result(spans ++ list, dates) + def classifier(blocker: Blocker)(implicit + CS: ContextShift[F] + ): TextClassifier[F] = + new StanfordTextClassifier[F](cfg.classifier, blocker) + private def textLimit(logger: Logger[F], text: String): F[String] = if (text.length <= cfg.maxLength) text.pure[F] else @@ -56,7 +64,7 @@ object TextAnalyser { s" Analysing only first ${cfg.maxLength} characters." ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(key: Ident, settings: StanfordSettings, text: String) + private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) : F[Vector[NerLabel]] = StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index 577f6753..596a6247 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,5 +1,8 @@ package docspell.analysis +import docspell.analysis.nlp.TextClassifierConfig + case class TextAnalysisConfig( - maxLength: Int + maxLength: Int, + classifier: TextClassifierConfig ) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala new file mode 100644 index 00000000..82f9f9cc --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala @@ -0,0 +1,5 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +case class ClassifierModel(model: Path) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 9787563f..88e13ee3 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -19,7 +19,7 @@ import org.log4s.getLogger */ trait PipelineCache[F[_]] { - def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] + def obtain(key: String, settings: 
StanfordNerSettings): F[StanfordCoreNLP] } @@ -28,7 +28,7 @@ object PipelineCache { def none[F[_]: Applicative]: PipelineCache[F] = new PipelineCache[F] { - def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] = + def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] = makeClassifier(settings).pure[F] } @@ -38,7 +38,7 @@ object PipelineCache { final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]]) extends PipelineCache[F] { - def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] = + def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] = for { id <- makeSettingsId(settings) nlp <- data.modify(cache => getOrCreate(key, id, cache, settings)) @@ -48,7 +48,7 @@ object PipelineCache { key: String, id: String, cache: Map[String, Entry], - settings: StanfordSettings + settings: StanfordNerSettings ): (Map[String, Entry], StanfordCoreNLP) = cache.get(key) match { case Some(entry) => @@ -68,7 +68,7 @@ object PipelineCache { (cache.updated(key, e), nlp) } - private def makeSettingsId(settings: StanfordSettings): F[String] = { + private def makeSettingsId(settings: StanfordNerSettings): F[String] = { val base = settings.copy(regexNer = None).toString val size: F[Long] = settings.regexNer match { @@ -81,7 +81,7 @@ object PipelineCache { } } - private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = { + private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = { logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") new StanfordCoreNLP(Properties.forSettings(settings)) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 314f04fb..46a614d1 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -7,6 +7,9 @@ import docspell.common._ object Properties { + def fromMap(m: Map[String, String]): JProps = + apply(m.toSeq: _*) + def apply(ps: (String, String)*): JProps = { val p = new JProps() for ((k, v) <- ps) @@ -14,7 +17,7 @@ object Properties { p } - def forSettings(settings: StanfordSettings): JProps = { + def forSettings(settings: StanfordNerSettings): JProps = { val regexNerFile = settings.regexNer .map(p => p.normalize().toAbsolutePath().toString()) settings.lang match { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 424396e5..383a07ea 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -25,7 +25,7 @@ object StanfordNerClassifier { def nerAnnotate[F[_]: Applicative]( cacheKey: String, cache: PipelineCache[F] - )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] = + )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = cache .obtain(cacheKey, settings) .map(crf => runClassifier(crf, text)) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala similarity index 88% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala rename to 
modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala index c2f6f98c..06136a18 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala @@ -19,4 +19,8 @@ import docspell.common._ * as a last step to tag untagged tokens using the provided list of * regexps. */ -case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) +case class StanfordNerSettings( + lang: Language, + highRecall: Boolean, + regexNer: Option[Path] +) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala new file mode 100644 index 00000000..d8846fc4 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala @@ -0,0 +1,153 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import cats.effect._ +import cats.effect.concurrent.Ref +import cats.implicits._ +import fs2.Stream + +import docspell.analysis.nlp.TextClassifier._ +import docspell.common._ + +import edu.stanford.nlp.classify.ColumnDataClassifier + +final class StanfordTextClassifier[F[_]: Sync: ContextShift]( + cfg: TextClassifierConfig, + blocker: Blocker +) extends TextClassifier[F] { + + def trainClassifier[A]( + logger: Logger[F], + data: Stream[F, Data] + )(handler: TextClassifier.Handler[F, A]): F[A] = + File + .withTempDir(cfg.workingDir, "trainclassifier") + .use { dir => + for { + rawData <- writeDataFile(blocker, dir, data) + _ <- logger.info(s"Learning from ${rawData.count} items.") + trainData <- splitData(logger, rawData) + scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m)) + sorted = scores.sortBy(-_.score) + res <- handler(sorted.head.model) + } yield res + } + + def classify( + logger: Logger[F], + model: ClassifierModel, + text: String + ): F[Option[String]] = + Sync[F].delay { + val cls = ColumnDataClassifier.getClassifier( + model.model.normalize().toAbsolutePath().toString() + ) + val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text))) + Option(cat) + } + + // --- helpers + + def train( + logger: Logger[F], + in: TrainData, + props: Map[String, String] + ): F[TrainResult] = + for { + _ <- logger.debug(s"Training classifier from $props") + res <- Sync[F].delay { + val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props))) + cdc.trainClassifier(in.train.toString()) + val score = cdc.testClassifier(in.test.toString()) + TrainResult(score.first(), ClassifierModel(in.modelFile)) + } + _ <- logger.debug(s"Trained with result $res") + } yield res + + def splitData(logger: Logger[F], in: RawData): F[TrainData] = { + val nTest = (in.count * 0.15).toLong + + val td = + TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt")) + + val fileLines = + fs2.io.file + .readAll(in.file, blocker, 4096) + .through(fs2.text.utf8Decode) + .through(fs2.text.lines) + + for { + _ <- logger.debug( + s"Splitting raw data into test/train data. 
Testing with $nTest entries" + ) + _ <- + fileLines + .take(nTest) + .intersperse("\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(td.test, blocker)) + .compile + .drain + _ <- + fileLines + .drop(nTest) + .intersperse("\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(td.train, blocker)) + .compile + .drain + } yield td + } + + def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = { + val target = dir.resolve("rawdata") + for { + counter <- Ref.of[F, Long](0L) + _ <- + data + .filter(_.text.nonEmpty) + .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}") + .evalTap(_ => counter.update(_ + 1)) + .intersperse("\r\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(target, blocker)) + .compile + .drain + lines <- counter.get + } yield RawData(lines, target) + + } + + def normalisedText(text: String): String = + text.replaceAll("[\n\r\t]+", " ") + + def fixRef(str: String): String = + str.replace('\t', '_') + + def amendProps( + trainData: TrainData, + props: Map[String, String] + ): Map[String, String] = + prepend("2.", props) ++ Map( + "trainFile" -> trainData.train.normalize().toAbsolutePath().toString(), + "testFile" -> trainData.test.normalize().toAbsolutePath().toString(), + "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString() + ).toList + + case class RawData(count: Long, file: Path) + case class TrainData(train: Path, test: Path) { + val modelFile = train.resolveSibling("model.ser.gz") + } + + case class TrainResult(score: Double, model: ClassifierModel) + + def prepend(pre: String, data: Map[String, String]): Map[String, String] = + data.toList + .map({ + case (k, v) => + if (k.startsWith(pre)) (k, v) + else (pre + k, v) + }) + .toMap +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala new file mode 100644 index 00000000..f2927d0c --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala @@ -0,0 +1,25 @@ +package docspell.analysis.nlp + +import cats.data.Kleisli +import fs2.Stream + +import docspell.analysis.nlp.TextClassifier.Data +import docspell.common._ + +trait TextClassifier[F[_]] { + + def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])( + handler: TextClassifier.Handler[F, A] + ): F[A] + + def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]] + +} + +object TextClassifier { + + type Handler[F[_], A] = Kleisli[F, ClassifierModel, A] + + case class Data(cls: String, ref: String, text: String) + +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala new file mode 100644 index 00000000..e3baac46 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala @@ -0,0 +1,10 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import cats.data.NonEmptyList + +case class TextClassifierConfig( + workingDir: Path, + classifierConfigs: NonEmptyList[Map[String, String]] +) diff --git a/modules/analysis/src/test/resources/test.ser.gz b/modules/analysis/src/test/resources/test.ser.gz new file mode 100644 index 00000000..b6d0956b Binary files /dev/null and b/modules/analysis/src/test/resources/test.ser.gz differ diff --git 
a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala new file mode 100644 index 00000000..b9596923 --- /dev/null +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala @@ -0,0 +1,76 @@ +package docspell.analysis.nlp + +import minitest._ +import cats.effect._ +import scala.concurrent.ExecutionContext +import java.nio.file.Paths +import cats.data.NonEmptyList +import docspell.common._ +import fs2.Stream +import cats.data.Kleisli +import TextClassifier.Data + +object StanfordTextClassifierSuite extends SimpleTestSuite { + val logger = Logger.log4s[IO](org.log4s.getLogger) + + implicit val CS = IO.contextShift(ExecutionContext.global) + + test("learn from data") { + val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map())) + + val data = + Stream + .emit(Data("invoice", "n", "this is your invoice total $421")) + .repeat + .take(10) + .zip( + Stream + .emit(Data("receipt", "n", "shopping receipt cheese cake bar")) + .repeat + .take(10) + ) + .flatMap({ + case (a, b) => + Stream.emits(Seq(a, b)) + }) + .covary[IO] + + val modelExists = + Blocker[IO].use { blocker => + val classifier = new StanfordTextClassifier[IO](cfg, blocker) + classifier.trainClassifier[Boolean](logger, data)( + Kleisli(result => File.existsNonEmpty[IO](result.model)) + ) + } + assertEquals(modelExists.unsafeRunSync(), true) + } + + test("run classifier") { + val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map())) + val things = for { + dir <- File.withTempDir[IO](Paths.get("target"), "testcls") + blocker <- Blocker[IO] + } yield (dir, blocker) + + things + .use { + case (dir, blocker) => + val classifier = new StanfordTextClassifier[IO](cfg, blocker) + + val modelFile = dir.resolve("test.ser.gz") + for { + _ <- + LenientUri + .fromJava(getClass.getResource("/test.ser.gz")) + .readURL[IO](4096, blocker) + .through(fs2.io.file.writeAll(modelFile, blocker)) + .compile + .drain + model = ClassifierModel(modelFile) + cat <- classifier.classify(logger, model, "there is receipt always") + _ = assertEquals(cat, Some("receipt")) + } yield () + } + .unsafeRunSync() + } +} diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala index 6ff3c73e..be76d45b 100644 --- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala +++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala @@ -52,12 +52,12 @@ object BackendApp { queue <- JobQueue(store) loginImpl <- Login[F](store) signupImpl <- OSignup[F](store) - collImpl <- OCollective[F](store) + joexImpl <- OJoex(JoexClient(httpClient), store) + collImpl <- OCollective[F](store, utStore, queue, joexImpl) sourceImpl <- OSource[F](store) tagImpl <- OTag[F](store) equipImpl <- OEquipment[F](store) orgImpl <- OOrganization(store) - joexImpl <- OJoex(JoexClient(httpClient), store) uploadImpl <- OUpload(store, queue, cfg.files, joexImpl) nodeImpl <- ONode(store) jobImpl <- OJob(store, joexImpl) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala index e3835448..5e9b5aaf 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala @@ -8,14 +8,21 @@ import docspell.backend.PasswordCrypt import 
docspell.backend.ops.OCollective._ import docspell.common._ import docspell.store.queries.QCollective +import docspell.store.queue.JobQueue import docspell.store.records._ +import docspell.store.usertask.UserTask +import docspell.store.usertask.UserTaskStore import docspell.store.{AddResult, Store} +import com.github.eikek.calev._ + trait OCollective[F[_]] { def find(name: Ident): F[Option[RCollective]] - def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult] + def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult] + + def findSettings(collective: Ident): F[Option[OCollective.Settings]] def listUser(collective: Ident): F[Vector[RUser]] @@ -43,6 +50,7 @@ trait OCollective[F[_]] { def findEnabledSource(sourceId: Ident): F[Option[RSource]] + def startLearnClassifier(collective: Ident): F[Unit] } object OCollective { @@ -55,6 +63,8 @@ object OCollective { type Settings = RCollective.Settings val Settings = RCollective.Settings + type Classifier = RClassifierSetting.Classifier + val Classifier = RClassifierSetting.Classifier sealed trait PassChangeResult object PassChangeResult { @@ -91,7 +101,12 @@ object OCollective { } } - def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] = + def apply[F[_]: Effect]( + store: Store[F], + uts: UserTaskStore[F], + queue: JobQueue[F], + joex: OJoex[F] + ): Resource[F, OCollective[F]] = Resource.pure[F, OCollective[F]](new OCollective[F] { def find(name: Ident): F[Option[RCollective]] = store.transact(RCollective.findById(name)) @@ -101,6 +116,41 @@ object OCollective { .transact(RCollective.updateSettings(collective, sett)) .attempt .map(AddResult.fromUpdate) + .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F]) + + def updateLearnClassifierTask(coll: Ident, sett: Settings) = + for { + id <- Ident.randomId[F] + on = sett.classifier.map(_.enabled).getOrElse(false) + timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe("")) + ut = UserTask( + id, + LearnClassifierArgs.taskName, + on, + timer, + LearnClassifierArgs(coll) + ) + _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut) + _ <- joex.notifyAllNodes + } yield () + + def startLearnClassifier(collective: Ident): F[Unit] = + for { + id <- Ident.randomId[F] + ut <- UserTask( + id, + LearnClassifierArgs.taskName, + true, + CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All), + LearnClassifierArgs(collective) + ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName)) + job <- ut.toJob + _ <- queue.insert(job) + _ <- joex.notifyAllNodes + } yield () + + def findSettings(collective: Ident): F[Option[OCollective.Settings]] = + store.transact(RCollective.getSettings(collective)) def listUser(collective: Ident): F[Vector[RUser]] = store.transact(RUser.findAll(collective, _.login)) diff --git a/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala new file mode 100644 index 00000000..9cfa9395 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala @@ -0,0 +1,35 @@ +package docspell.common + +import docspell.common.syntax.all._ + +import io.circe._ +import io.circe.generic.semiauto._ + +/** Arguments to the classify-item task. + * + * This task is run periodically and learns from existing documents + * to create a model for predicting tags of new documents. The user + * must give a tag category; the tags of this category are used as + * the classes to learn.
+ */ +case class LearnClassifierArgs( + collective: Ident +) { + + def makeSubject: String = + "Learn tags" + +} + +object LearnClassifierArgs { + + val taskName = Ident.unsafe("learn-classifier") + + implicit val jsonEncoder: Encoder[LearnClassifierArgs] = + deriveEncoder[LearnClassifierArgs] + implicit val jsonDecoder: Decoder[LearnClassifierArgs] = + deriveDecoder[LearnClassifierArgs] + + def parse(str: String): Either[Throwable, LearnClassifierArgs] = + str.parseJsonAs[LearnClassifierArgs] + +} diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 115d2893..23ec5b47 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -271,6 +271,50 @@ docspell.joex { # file will be kept until a check for a state change is done. file-cache-time = "1 minute" } + + # Settings for doing document classification. + # + # This works by learning from existing documents. A collective can + # specify a tag category and the system will try to predict a tag + # from this category for new incoming documents. + # + # This requires a statistical model that is computed from all + # existing documents. This process is run periodically as + # configured by the collective. It may require a lot of memory, + # depending on the amount of data. + # + # It utilises this NLP library: https://nlp.stanford.edu/. + classification { + # Whether to enable classification globally. Each collective can + # decide to disable it. If it is disabled here, no collective + # can use classification. + enabled = true + + # If concerned with memory consumption, this restricts the + # number of items to consider. More are better for training. A + # negative value or zero means to train on all items. + item-count = 0 + + # These settings are used to configure the classifier. If + # multiple are given, they are all tried and the "best" is + # chosen at the end. See + # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html + # for more info about these settings. The settings here yielded + # good results with *my* dataset. + # + # Enclose regexps in triple quotes. + classifiers = [ + { "useSplitWords" = "true" + "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.""" + "splitWordsIgnoreRegexp" = """\s+""" + "useSplitPrefixSuffixNGrams" = "true" + "maxNGramLeng" = "4" + "minNGramLeng" = "1" + "splitWordShape" = "chris4" + "intern" = "true" # makes it slower but saves memory + } + ] + } } # Configuration for converting files into PDFs. 
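
Aside: each map in the classifiers list above ends up as plain Java properties for Stanford's ColumnDataClassifier. amendProps in StanfordTextClassifier.scala prefixes the feature settings with "2." (column 0 is the class, column 1 the item reference, column 2 the text) and adds the trainFile/testFile/serializeTo paths. A minimal standalone sketch of that training step, assuming tab-separated train/test files in the three-column format produced by writeDataFile; the object name, the paths and the tiny feature set are illustrative only:

import java.util.{Properties => JProps}

import edu.stanford.nlp.classify.ColumnDataClassifier

object TrainClassifierSketch {
  def main(args: Array[String]): Unit = {
    val props = new JProps()
    // Feature settings target column 2, the text column, hence the "2." prefix.
    Map(
      "2.useSplitWords"          -> "true",
      "2.splitWordsIgnoreRegexp" -> "\\s+",
      "trainFile"                -> "target/train.txt", // illustrative paths
      "testFile"                 -> "target/test.txt",
      "serializeTo"              -> "target/model.ser.gz"
    ).foreach { case (k, v) => props.setProperty(k, v) }

    val cdc = new ColumnDataClassifier(props)
    cdc.trainClassifier("target/train.txt")
    // testClassifier returns a pair whose first component is the accuracy;
    // StanfordTextClassifier uses that value to rank candidate configurations.
    val score = cdc.testClassifier("target/test.txt")
    println(s"accuracy = ${score.first()}")
  }
}

The model serialized to the serializeTo path is what classify later loads again via ColumnDataClassifier.getClassifier.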
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index cb6bb9f3..cbbb4a33 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -2,7 +2,10 @@ package docspell.joex import java.nio.file.Path +import cats.data.NonEmptyList + import docspell.analysis.TextAnalysisConfig +import docspell.analysis.nlp.TextClassifierConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -57,15 +60,30 @@ object Config { case class TextAnalysis( maxLength: Int, workingDir: Path, - regexNer: RegexNer + regexNer: RegexNer, + classification: Classification ) { def textAnalysisConfig: TextAnalysisConfig = - TextAnalysisConfig(maxLength) + TextAnalysisConfig( + maxLength, + TextClassifierConfig( + workingDir, + NonEmptyList + .fromList(classification.classifiers) + .getOrElse(NonEmptyList.of(Map.empty)) + ) + ) def regexNerFileConfig: RegexNerFile.Config = RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) } case class RegexNer(enabled: Boolean, fileCacheTime: Duration) + + case class Classification( + enabled: Boolean, + itemCount: Int, + classifiers: List[Map[String, String]] + ) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 2fa94c25..7c3f57fc 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient import docspell.joex.analysis.RegexNerFile import docspell.joex.fts.{MigrationTask, ReIndexTask} import docspell.joex.hk._ +import docspell.joex.learn.LearnClassifierTask import docspell.joex.notify._ import docspell.joex.pdfconv.ConvertAllPdfTask import docspell.joex.pdfconv.PdfConvTask @@ -159,6 +160,13 @@ object JoexAppImpl { ConvertAllPdfTask.onCancel[F] ) ) + .withTask( + JobTask.json( + LearnClassifierArgs.taskName, + LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser), + LearnClassifierTask.onCancel[F] + ) + ) .resource psch <- PeriodicScheduler.create( cfg.periodicScheduler, diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala new file mode 100644 index 00000000..c3d6e3f9 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -0,0 +1,111 @@ +package docspell.joex.learn + +import cats.data.Kleisli +import cats.data.OptionT +import cats.effect._ +import cats.implicits._ +import fs2.{Pipe, Stream} + +import docspell.analysis.TextAnalyser +import docspell.analysis.nlp.ClassifierModel +import docspell.analysis.nlp.TextClassifier.Data +import docspell.backend.ops.OCollective +import docspell.common._ +import docspell.joex.Config +import docspell.joex.scheduler._ +import docspell.store.queries.QItem +import docspell.store.records.RClassifierSetting + +import bitpeace.MimetypeHint + +object LearnClassifierTask { + val noClass = "__NONE__" + val pageSep = " --n-- " + + type Args = LearnClassifierArgs + + def onCancel[F[_]: Sync]: Task[F, Args, Unit] = + Task.log(_.warn("Cancelling learn-classifier task")) + + def apply[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, + blocker: Blocker, + analyser: TextAnalyser[F] + ): Task[F, Args, Unit] = + Task { ctx => + (for { + sett <- findActiveSettings[F](ctx, 
cfg) + data = selectItems( + ctx, + math.min(cfg.classification.itemCount, sett.itemCount).toLong, + sett.category.getOrElse("") + ) + _ <- OptionT.liftF( + analyser + .classifier(blocker) + .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker))) + ) + } yield ()) + .getOrElseF(logInactiveWarning(ctx.logger)) + } + + private def handleModel[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + blocker: Blocker + )(trainedModel: ClassifierModel): F[Unit] = + for { + oldFile <- ctx.store.transact( + RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId)) + ) + _ <- ctx.logger.info("Storing new trained model") + fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) + newFile <- + ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError + _ <- ctx.store.transact( + RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id)) + ) + _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}") + _ <- oldFile match { + case Some(fid) => + ctx.logger.debug(s"Deleting old model file ${fid.id}") *> + ctx.store.bitpeace.delete(fid.id).compile.drain + case None => ().pure[F] + } + } yield () + + private def selectItems[F[_]]( + ctx: Context[F, Args], + max: Long, + category: String + ): Stream[F, Data] = { + val connStream = + for { + item <- QItem.findAllNewestFirst(ctx.args.collective, 10).through(restrictTo(max)) + tt <- Stream.eval( + QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep) + ) + } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) + ctx.store.transact(connStream.filter(_.text.nonEmpty)) + } + + private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] = + if (max <= 0) identity + else _.take(max) + + private def findActiveSettings[F[_]: Sync]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis + ): OptionT[F, OCollective.Classifier] = + if (cfg.classification.enabled) + OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) + .filter(_.enabled) + .filter(_.category.nonEmpty) + .map(OCollective.Classifier.fromRecord) + else + OptionT.none + + private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] = + logger.warn( + "Classification is disabled. Check joex config and the collective settings." 
+ ) +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index d4f83fc2..af9a3db2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -38,6 +38,9 @@ case class ItemData( copy(metas = next) } + def appendTags(tags: Seq[String]): ItemData = + copy(tags = (this.tags ++ tags.toList).distinct) + def changeMeta( attachId: Ident, f: RAttachmentMeta => RAttachmentMeta diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 7b8b6431..fb777b24 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -34,12 +34,12 @@ object ProcessItem { )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item) - def analysisOnly[F[_]: Sync]( + def analysisOnly[F[_]: Sync: ContextShift]( cfg: Config, analyser: TextAnalyser[F], regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](analyser, regexNer)(item) + TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index abbb6870..ebb0894a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,23 +1,33 @@ package docspell.joex.process +import cats.data.OptionT import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.nlp.ClassifierModel +import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.nlp.TextClassifier import docspell.common._ +import docspell.joex.Config import docspell.joex.analysis.RegexNerFile +import docspell.joex.learn.LearnClassifierTask import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.RAttachmentMeta +import docspell.store.records.RClassifierSetting + +import bitpeace.RangeDef object TextAnalysis { + type Args = ProcessItemArgs - def apply[F[_]: Sync]( + def apply[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, analyser: TextAnalyser[F], nerFile: RegexNerFile[F] - )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = + )(item: ItemData): Task[F, Args, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Starting text analysis") @@ -34,15 +44,18 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value + } yield item + .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + .appendTags(tag.toSeq) } def annotateAttachment[F[_]: Sync]( - ctx: Context[F, ProcessItemArgs], + ctx: Context[F, Args], analyser: TextAnalyser[F], nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = 
StanfordSettings(ctx.args.meta.language, false, None) + val settings = StanfordNerSettings(ctx.args.meta.language, false, None) for { customNer <- nerFile.makeFile(ctx.args.meta.collective) sett = settings.copy(regexNer = customNer) @@ -54,4 +67,42 @@ object TextAnalysis { ) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + + def predictTag[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): OptionT[F, String] = + for { + model <- findActiveModel(ctx, cfg) + _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …")) + text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) + modelData = + ctx.store.bitpeace + .get(model.id) + .unNoneTerminate + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir => + val modelFile = dir.resolve("model.ser.gz") + modelData + .through(fs2.io.file.writeAll(modelFile, ctx.blocker)) + .compile + .drain + .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) + }).filter(_ != LearnClassifierTask.noClass) + _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) + } yield cls + + private def findActiveModel[F[_]: Sync]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis + ): OptionT[F, Ident] = + if (cfg.classification.enabled) + OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))) + .filter(_.enabled) + .mapFilter(_.fileId) + else + OptionT.none + } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 1a48eece..a03a0e2e 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -1047,6 +1047,28 @@ paths: application/json: schema: $ref: "#/components/schemas/ContactList" + + /sec/collective/classifier/startonce: + post: + tags: [ Collective ] + summary: Starts the learn-classifier task + description: | + If the collective has classification enabled, this will submit + the task for learning a classifier from existing data. This + task is usually run periodically, as determined by the + collective settings. + + The request body is empty; the settings stored for the collective are used. + security: + - authTokenHeader: [] + responses: + 200: + description: Ok + content: + application/json: + schema: + $ref: "#/components/schemas/BasicResult" + /sec/user: get: tags: [ Collective ] @@ -3643,12 +3665,14 @@ components: description: DateTime type: integer format: date-time + CollectiveSettings: description: | Settings for a collective. required: - language - integrationEnabled + - classifier properties: language: type: string @@ -3658,6 +3682,31 @@ components: description: | Whether the collective has the integration endpoint enabled. + classifier: + $ref: "#/components/schemas/ClassifierSetting" + + ClassifierSetting: + description: | + Settings for learning a document classifier. + required: + - enabled + - schedule + - itemCount + properties: + enabled: + type: boolean + category: + type: string + itemCount: + type: integer + format: int32 + description: | + The max. number of items to learn from. The newest items + are considered. + schedule: + type: string + format: calevent + SourceList: description: | A list of sources. 
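
For orientation, a CollectiveSettings body matching the extended schema might look like this (the category name and item count are made-up examples; the schedule string uses the calevent format, here the same default that CollectiveRoutes falls back to below):

{
  "language": "deu",
  "integrationEnabled": true,
  "classifier": {
    "enabled": true,
    "category": "doctype",
    "itemCount": 600,
    "schedule": "*-1/3-01 01:00:00"
  }
}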
diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index 8a84fa77..bf7eaddd 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -10,6 +10,7 @@ import docspell.restapi.model._ import docspell.restserver.conv.Conversions import docspell.restserver.http4s._ +import com.github.eikek.calev.CalEvent import org.http4s.HttpRoutes import org.http4s.circe.CirceEntityDecoder._ import org.http4s.circe.CirceEntityEncoder._ @@ -37,7 +38,18 @@ object CollectiveRoutes { case req @ POST -> Root / "settings" => for { settings <- req.as[CollectiveSettings] - sett = OCollective.Settings(settings.language, settings.integrationEnabled) + sett = OCollective.Settings( + settings.language, + settings.integrationEnabled, + Some( + OCollective.Classifier( + settings.classifier.enabled, + settings.classifier.schedule, + settings.classifier.itemCount, + settings.classifier.category + ) + ) + ) res <- backend.collective .updateSettings(user.account.collective, sett) @@ -46,8 +58,21 @@ object CollectiveRoutes { case GET -> Root / "settings" => for { - collDb <- backend.collective.find(user.account.collective) - sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled)) + settDb <- backend.collective.findSettings(user.account.collective) + sett = settDb.map(c => + CollectiveSettings( + c.language, + c.integrationEnabled, + ClassifierSetting( + c.classifier.map(_.enabled).getOrElse(false), + c.classifier.flatMap(_.category), + c.classifier.map(_.itemCount).getOrElse(0), + c.classifier + .map(_.schedule) + .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00")) + ) + ) + ) resp <- sett.toResponse() } yield resp @@ -63,6 +88,12 @@ object CollectiveRoutes { resp <- Ok(ContactList(res.map(Conversions.mkContact))) } yield resp + case POST -> Root / "classifier" / "startonce" => + for { + _ <- backend.collective.startLearnClassifier(user.account.collective) + resp <- Ok(BasicResult(true, "Task submitted")) + } yield resp + case GET -> Root => for { collDb <- backend.collective.find(user.account.collective) diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql new file mode 100644 index 00000000..fb1e85cd --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql @@ -0,0 +1,10 @@ +CREATE TABLE `classifier_setting` ( + `cid` varchar(254) not null primary key, + `enabled` boolean not null, + `schedule` varchar(254) not null, + `category` varchar(254) not null, + `item_count` int not null, + `file_id` varchar(254), + `created` timestamp not null, + foreign key (`cid`) references `collective`(`cid`) +); diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql new file mode 100644 index 00000000..5e81feea --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql @@ -0,0 +1,11 @@ +CREATE TABLE "classifier_setting" ( + "cid" varchar(254) not null primary key, + "enabled" boolean not null, + "schedule" varchar(254) not null, + "category" varchar(254) not null, + "item_count" int not null, + "file_id" varchar(254), + "created" timestamp not null, + foreign key ("cid") references 
"collective"("cid"), + foreign key ("file_id") references "filemeta"("id") +); diff --git a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala index e4a67538..3a992b71 100644 --- a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala +++ b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala @@ -67,8 +67,8 @@ trait DoobieSyntax { Fragment.const(" FROM ") ++ table ++ this.where(where) def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment = - Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++ - Fragment.const(") FROM ") ++ table ++ this.where(where) + Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++ + Fragment.const(" FROM ") ++ table ++ this.where(where) def selectCount(col: Column, table: Fragment, where: Fragment): Fragment = Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 1240d4a7..d3d2653e 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref import cats.implicits._ import fs2.Stream +import docspell.common.syntax.all._ import docspell.common.{IdRef, _} import docspell.store.Store import docspell.store.impl.Implicits._ @@ -615,4 +616,75 @@ object QItem { .query[NameAndNotes] .streamWithChunkSize(chunkSize) } + + def findAllNewesFirst( + collective: Ident, + chunkSize: Int + ): Stream[ConnectionIO, Ident] = { + val cols = Seq(RItem.Columns.id) + (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++ + orderBy(RItem.Columns.created.desc)) + .query[Ident] + .streamWithChunkSize(chunkSize) + } + + case class TagName(id: Ident, name: String) + case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName]) + + def resolveTextAndTag( + collective: Ident, + itemId: Ident, + tagCategory: String, + pageSep: String + ): ConnectionIO[TextAndTag] = { + val aId = RAttachment.Columns.id.prefix("a") + val aItem = RAttachment.Columns.itemId.prefix("a") + val mId = RAttachmentMeta.Columns.id.prefix("m") + val mText = RAttachmentMeta.Columns.content.prefix("m") + val tiItem = RTagItem.Columns.itemId.prefix("ti") + val tiTag = RTagItem.Columns.tagId.prefix("ti") + val tId = RTag.Columns.tid.prefix("t") + val tName = RTag.Columns.name.prefix("t") + val tCat = RTag.Columns.category.prefix("t") + val iId = RItem.Columns.id.prefix("i") + val iColl = RItem.Columns.cid.prefix("i") + + val cte = withCTE( + "tags" -> selectSimple( + Seq(tiItem, tId, tName), + RTagItem.table ++ fr"ti INNER JOIN" ++ + RTag.table ++ fr"t ON" ++ tId.is(tiTag), + and(tiItem.is(itemId), tCat.is(tagCategory)) + ) + ) + + val cols = Seq(mText, tId, tName) + + val from = RItem.table ++ fr"i INNER JOIN" ++ + RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++ + RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++ + fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId) + + val where = + and( + iId.is(itemId), + iColl.is(collective), + mText.isNotNull, + mText.isNot("") + ) + + val q = cte ++ selectDistinct(cols, from, where) + for { + _ <- logger.ftrace[ConnectionIO]( + s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})" + ) + texts <- q.query[(String, 
Option[TagName])].to[List] + _ <- logger.ftrace[ConnectionIO]( + s"Got ${texts.size} text and tag entries for item ${itemId.id}" + ) + tag = texts.headOption.flatMap(_._2) + txt = texts.map(_._1).mkString(pageSep) + } yield TextAndTag(itemId, txt, tag) + } + } diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala new file mode 100644 index 00000000..680741a0 --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -0,0 +1,113 @@ +package docspell.store.records + +import cats.implicits._ + +import docspell.common._ +import docspell.store.impl.Implicits._ +import docspell.store.impl._ + +import com.github.eikek.calev._ +import doobie._ +import doobie.implicits._ + +case class RClassifierSetting( + cid: Ident, + enabled: Boolean, + schedule: CalEvent, + category: String, + itemCount: Int, + fileId: Option[Ident], + created: Timestamp +) {} + +object RClassifierSetting { + + val table = fr"classifier_setting" + + object Columns { + val cid = Column("cid") + val enabled = Column("enabled") + val schedule = Column("schedule") + val category = Column("category") + val itemCount = Column("item_count") + val fileId = Column("file_id") + val created = Column("created") + val all = List(cid, enabled, schedule, category, itemCount, fileId, created) + } + import Columns._ + + def insert(v: RClassifierSetting): ConnectionIO[Int] = { + val sql = + insertRow( + table, + all, + fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}" + ) + sql.update.run + } + + def updateAll(v: RClassifierSetting): ConnectionIO[Int] = { + val sql = updateRow( + table, + cid.is(v.cid), + commas( + enabled.setTo(v.enabled), + schedule.setTo(v.schedule), + category.setTo(v.category), + itemCount.setTo(v.itemCount), + fileId.setTo(v.fileId) + ) + ) + sql.update.run + } + + def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] = + updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run + + def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = + for { + n1 <- updateRow( + table, + cid.is(v.cid), + commas( + enabled.setTo(v.enabled), + schedule.setTo(v.schedule), + itemCount.setTo(v.itemCount), + category.setTo(v.category) + ) + ).update.run + n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO] + } yield n1 + n2 + + def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = { + val sql = selectSimple(all, table, cid.is(id)) + sql.query[RClassifierSetting].option + } + + def delete(coll: Ident): ConnectionIO[Int] = + deleteFrom(table, cid.is(coll)).update.run + + case class Classifier( + enabled: Boolean, + schedule: CalEvent, + itemCount: Int, + category: Option[String] + ) { + + def toRecord(coll: Ident, created: Timestamp): RClassifierSetting = + RClassifierSetting( + coll, + enabled, + schedule, + category.getOrElse(""), + itemCount, + None, + created + ) + } + object Classifier { + def fromRecord(r: RClassifierSetting): Classifier = + Classifier(r.enabled, r.schedule, r.itemCount, r.category.some) + } + +} diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala index fa40e374..2487ed22 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala @@ -61,14 +61,47 @@ object RCollective { updateRow(table, 
id.is(cid), language.setTo(lang)).update.run def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] = - updateRow( - table, - id.is(cid), - commas( - language.setTo(settings.language), - integration.setTo(settings.integrationEnabled) - ) - ).update.run + for { + n1 <- updateRow( + table, + id.is(cid), + commas( + language.setTo(settings.language), + integration.setTo(settings.integrationEnabled) + ) + ).update.run + cls <- + Timestamp + .current[ConnectionIO] + .map(now => settings.classifier.map(_.toRecord(cid, now))) + n2 <- cls match { + case Some(cr) => + RClassifierSetting.updateSettings(cr) + case None => + RClassifierSetting.delete(cid) + } + } yield n1 + n2 + + def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = { + val cId = id.prefix("c") + val CS = RClassifierSetting.Columns + val csCid = CS.cid.prefix("cs") + + val cols = Seq( + language.prefix("c"), + integration.prefix("c"), + CS.enabled.prefix("cs"), + CS.schedule.prefix("cs"), + CS.itemCount.prefix("cs"), + CS.category.prefix("cs") + ) + val from = table ++ fr"c LEFT JOIN" ++ + RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId) + + selectSimple(cols, from, cId.is(coll)) + .query[Settings] + .option + } def findById(cid: Ident): ConnectionIO[Option[RCollective]] = { val sql = selectSimple(all, table, id.is(cid)) @@ -112,5 +145,10 @@ object RCollective { selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option } - case class Settings(language: Language, integrationEnabled: Boolean) + case class Settings( + language: Language, + integrationEnabled: Boolean, + classifier: Option[RClassifierSetting.Classifier] + ) + } diff --git a/modules/webapp/src/main/elm/Api.elm b/modules/webapp/src/main/elm/Api.elm index 10bcf7ff..ccba8570 100644 --- a/modules/webapp/src/main/elm/Api.elm +++ b/modules/webapp/src/main/elm/Api.elm @@ -88,6 +88,7 @@ module Api exposing , setItemNotes , setTags , setUnconfirmed + , startClassifier , startOnceNotifyDueItems , startOnceScanMailbox , startReIndex @@ -795,6 +796,19 @@ versionInfo flags receive = --- Collective +startClassifier : + Flags + -> (Result Http.Error BasicResult -> msg) + -> Cmd msg +startClassifier flags receive = + Http2.authPost + { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce" + , account = getAccount flags + , body = Http.emptyBody + , expect = Http.expectJson receive Api.Model.BasicResult.decoder + } + + getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg getTagCloud flags receive = Http2.authGet diff --git a/modules/webapp/src/main/elm/App/View.elm b/modules/webapp/src/main/elm/App/View.elm index 6906fd2f..346983e6 100644 --- a/modules/webapp/src/main/elm/App/View.elm +++ b/modules/webapp/src/main/elm/App/View.elm @@ -218,12 +218,12 @@ loginInfo model = , menuEntry model CollectiveSettingPage [ i [ class "users circle icon" ] [] - , text "Collective Settings" + , text "Collective Profile" ] , menuEntry model UserSettingPage [ i [ class "user circle icon" ] [] - , text "User Settings" + , text "User Profile" ] , div [ class "divider" ] [] , menuEntry model diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm new file mode 100644 index 00000000..23e440cd --- /dev/null +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -0,0 +1,204 @@ +module Comp.ClassifierSettingsForm exposing + ( Model + , Msg + , getSettings + , init + , update + , view + ) + +import Api +import 
Api.Model.ClassifierSetting exposing (ClassifierSetting) +import Api.Model.TagList exposing (TagList) +import Comp.CalEventInput +import Comp.FixedDropdown +import Comp.IntField +import Data.CalEvent exposing (CalEvent) +import Data.Flags exposing (Flags) +import Data.Validated exposing (Validated(..)) +import Html exposing (..) +import Html.Attributes exposing (..) +import Html.Events exposing (onCheck) +import Http +import Util.Tag + + +type alias Model = + { enabled : Bool + , categoryModel : Comp.FixedDropdown.Model String + , category : Maybe String + , scheduleModel : Comp.CalEventInput.Model + , schedule : Validated CalEvent + , itemCountModel : Comp.IntField.Model + , itemCount : Maybe Int + } + + +type Msg + = GetTagsResp (Result Http.Error TagList) + | ScheduleMsg Comp.CalEventInput.Msg + | ToggleEnabled + | CategoryMsg (Comp.FixedDropdown.Msg String) + | ItemCountMsg Comp.IntField.Msg + + +init : Flags -> ClassifierSetting -> ( Model, Cmd Msg ) +init flags sett = + let + newSchedule = + Data.CalEvent.fromEvent sett.schedule + |> Maybe.withDefault Data.CalEvent.everyMonth + + ( cem, cec ) = + Comp.CalEventInput.init flags newSchedule + in + ( { enabled = sett.enabled + , categoryModel = Comp.FixedDropdown.initString [] + , category = sett.category + , scheduleModel = cem + , schedule = Data.Validated.Unknown newSchedule + , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" + , itemCount = Just sett.itemCount + } + , Cmd.batch + [ Api.getTags flags "" GetTagsResp + , Cmd.map ScheduleMsg cec + ] + ) + + +getSettings : Model -> Validated ClassifierSetting +getSettings model = + Data.Validated.map + (\sch -> + { enabled = model.enabled + , category = model.category + , schedule = + Data.CalEvent.makeEvent sch + , itemCount = Maybe.withDefault 0 model.itemCount + } + ) + model.schedule + + +update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) +update flags msg model = + case msg of + GetTagsResp (Ok tl) -> + let + categories = + Util.Tag.getCategories tl.items + |> List.sort + in + ( { model + | categoryModel = Comp.FixedDropdown.initString categories + , category = + if model.category == Nothing then + List.head categories + + else + model.category + } + , Cmd.none + ) + + GetTagsResp (Err _) -> + ( model, Cmd.none ) + + ScheduleMsg lmsg -> + let + ( cm, cc, ce ) = + Comp.CalEventInput.update + flags + (Data.Validated.value model.schedule) + lmsg + model.scheduleModel + in + ( { model + | scheduleModel = cm + , schedule = ce + } + , Cmd.map ScheduleMsg cc + ) + + ToggleEnabled -> + ( { model | enabled = not model.enabled } + , Cmd.none + ) + + CategoryMsg lmsg -> + let + ( mm, ma ) = + Comp.FixedDropdown.update lmsg model.categoryModel + in + ( { model + | categoryModel = mm + , category = + if ma == Nothing then + model.category + + else + ma + } + , Cmd.none + ) + + ItemCountMsg lmsg -> + let + ( im, iv ) = + Comp.IntField.update lmsg model.itemCountModel + in + ( { model + | itemCountModel = im + , itemCount = iv + } + , Cmd.none + ) + + +view : Model -> Html Msg +view model = + div [] + [ div + [ class "field" + ] + [ div [ class "ui checkbox" ] + [ input + [ type_ "checkbox" + , onCheck (\_ -> ToggleEnabled) + , checked model.enabled + ] + [] + , label [] [ text "Enable classification" ] + , span [ class "small-info" ] + [ text "Disable document classification if not needed." + ] + ] + ] + , div [ class "ui basic segment" ] + [ text "Document classification tries to predict a tag for new incoming documents. 
This " + , text "works by learning from existing documents in order to find common patterns within " + , text "the text. The more documents you have correctly tagged, the better. Learning is done " + , text "periodically based on a schedule and you need to specify a tag category that should " + , text "be used for learning." + ] + , div [ class "field" ] + [ label [] [ text "Category" ] + , Html.map CategoryMsg + (Comp.FixedDropdown.viewString model.category + model.categoryModel + ) + ] + , Html.map ItemCountMsg + (Comp.IntField.viewWithInfo + "The maximum number of items to learn from, ordered by date, newest first. Use 0 to mean all." + model.itemCount + "field" + model.itemCountModel + ) + , div [ class "field" ] + [ label [] [ text "Schedule" ] + , Html.map ScheduleMsg + (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel) + ] + ] diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm index 342473c1..1efef12d 100644 --- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm @@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing import Api import Api.Model.BasicResult exposing (BasicResult) import Api.Model.CollectiveSettings exposing (CollectiveSettings) +import Comp.ClassifierSettingsForm import Comp.Dropdown import Data.Flags exposing (Flags) import Data.Language exposing (Language) import Data.UiSettings exposing (UiSettings) +import Data.Validated exposing (Validated) import Html exposing (..) import Html.Attributes exposing (..) import Html.Events exposing (onCheck, onClick, onInput) @@ -27,44 +29,60 @@ type alias Model = , initSettings : CollectiveSettings , fullTextConfirmText : String , fullTextReIndexResult : Maybe BasicResult + , classifierModel : Comp.ClassifierSettingsForm.Model + , startClassifierResult : Maybe BasicResult } -init : CollectiveSettings -> Model -init settings = +init : Flags -> CollectiveSettings -> ( Model, Cmd Msg ) +init flags settings = let lang = Data.Language.fromString settings.language |> Maybe.withDefault Data.Language.German + + ( cm, cc ) = + Comp.ClassifierSettingsForm.init flags settings.classifier in - { langModel = - Comp.Dropdown.makeSingleList - { makeOption = - \l -> - { value = Data.Language.toIso3 l - , text = Data.Language.toName l - , additional = "" - } - , placeholder = "" - , options = Data.Language.all - , selected = Just lang - } - , intEnabled = settings.integrationEnabled - , initSettings = settings - , fullTextConfirmText = "" - , fullTextReIndexResult = Nothing - } + ( { langModel = + Comp.Dropdown.makeSingleList + { makeOption = + \l -> + { value = Data.Language.toIso3 l + , text = Data.Language.toName l + , additional = "" + } + , placeholder = "" + , options = Data.Language.all + , selected = Just lang + } + , intEnabled = settings.integrationEnabled + , initSettings = settings + , fullTextConfirmText = "" + , fullTextReIndexResult = Nothing + , classifierModel = cm + , startClassifierResult = Nothing + } + , Cmd.map ClassifierSettingMsg cc + ) -getSettings : Model -> CollectiveSettings +getSettings : Model -> Validated CollectiveSettings getSettings model = - CollectiveSettings - (Comp.Dropdown.getSelected model.langModel - |> List.head - |> Maybe.map Data.Language.toIso3 - |> Maybe.withDefault model.initSettings.language + Data.Validated.map + (\cls -> + { language = + Comp.Dropdown.getSelected model.langModel + |> List.head + |> 
diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
index 342473c1..1efef12d 100644
--- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
+++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
@@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
 import Api
 import Api.Model.BasicResult exposing (BasicResult)
 import Api.Model.CollectiveSettings exposing (CollectiveSettings)
+import Comp.ClassifierSettingsForm
 import Comp.Dropdown
 import Data.Flags exposing (Flags)
 import Data.Language exposing (Language)
 import Data.UiSettings exposing (UiSettings)
+import Data.Validated exposing (Validated)
 import Html exposing (..)
 import Html.Attributes exposing (..)
 import Html.Events exposing (onCheck, onClick, onInput)
@@ -27,44 +29,60 @@ type alias Model =
     , initSettings : CollectiveSettings
     , fullTextConfirmText : String
     , fullTextReIndexResult : Maybe BasicResult
+    , classifierModel : Comp.ClassifierSettingsForm.Model
+    , startClassifierResult : Maybe BasicResult
     }
 
 
-init : CollectiveSettings -> Model
-init settings =
+init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
+init flags settings =
     let
         lang =
             Data.Language.fromString settings.language
                 |> Maybe.withDefault Data.Language.German
+
+        ( cm, cc ) =
+            Comp.ClassifierSettingsForm.init flags settings.classifier
     in
-    { langModel =
-        Comp.Dropdown.makeSingleList
-            { makeOption =
-                \l ->
-                    { value = Data.Language.toIso3 l
-                    , text = Data.Language.toName l
-                    , additional = ""
-                    }
-            , placeholder = ""
-            , options = Data.Language.all
-            , selected = Just lang
-            }
-    , intEnabled = settings.integrationEnabled
-    , initSettings = settings
-    , fullTextConfirmText = ""
-    , fullTextReIndexResult = Nothing
-    }
+    ( { langModel =
+            Comp.Dropdown.makeSingleList
+                { makeOption =
+                    \l ->
+                        { value = Data.Language.toIso3 l
+                        , text = Data.Language.toName l
+                        , additional = ""
+                        }
+                , placeholder = ""
+                , options = Data.Language.all
+                , selected = Just lang
+                }
+      , intEnabled = settings.integrationEnabled
+      , initSettings = settings
+      , fullTextConfirmText = ""
+      , fullTextReIndexResult = Nothing
+      , classifierModel = cm
+      , startClassifierResult = Nothing
+      }
+    , Cmd.map ClassifierSettingMsg cc
+    )
 
 
-getSettings : Model -> CollectiveSettings
+getSettings : Model -> Validated CollectiveSettings
 getSettings model =
-    CollectiveSettings
-        (Comp.Dropdown.getSelected model.langModel
-            |> List.head
-            |> Maybe.map Data.Language.toIso3
-            |> Maybe.withDefault model.initSettings.language
+    Data.Validated.map
+        (\cls ->
+            { language =
+                Comp.Dropdown.getSelected model.langModel
+                    |> List.head
+                    |> Maybe.map Data.Language.toIso3
+                    |> Maybe.withDefault model.initSettings.language
+            , integrationEnabled = model.intEnabled
+            , classifier = cls
+            }
+        )
+        (Comp.ClassifierSettingsForm.getSettings
+            model.classifierModel
         )
-        model.intEnabled
 
 
 type Msg
@@ -73,6 +91,10 @@ type Msg
     | SetFullTextConfirm String
     | TriggerReIndex
     | TriggerReIndexResult (Result Http.Error BasicResult)
+    | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
+    | SaveSettings
+    | StartClassifierTask
+    | StartClassifierResp (Result Http.Error BasicResult)
 
 
 update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@@ -85,22 +107,15 @@ update flags msg model =
                 nextModel =
                     { model | langModel = m2 }
-
-                nextSettings =
-                    if Comp.Dropdown.isDropdownChangeMsg m then
-                        Just (getSettings nextModel)
-
-                    else
-                        Nothing
             in
-            ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
+            ( nextModel, Cmd.map LangDropdownMsg c2, Nothing )
 
         ToggleIntegrationEndpoint ->
             let
                 nextModel =
                     { model | intEnabled = not model.intEnabled }
             in
-            ( nextModel, Cmd.none, Just (getSettings nextModel) )
+            ( nextModel, Cmd.none, Nothing )
 
         SetFullTextConfirm str ->
             ( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@@ -138,12 +153,50 @@ update flags msg model =
             , Nothing
             )
 
+        ClassifierSettingMsg lmsg ->
+            let
+                ( cm, cc ) =
+                    Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
+            in
+            ( { model
+                | classifierModel = cm
+              }
+            , Cmd.map ClassifierSettingMsg cc
+            , Nothing
+            )
+
+        SaveSettings ->
+            case getSettings model of
+                Data.Validated.Valid s ->
+                    ( model, Cmd.none, Just s )
+
+                _ ->
+                    ( model, Cmd.none, Nothing )
+
+        StartClassifierTask ->
+            ( model, Api.startClassifier flags StartClassifierResp, Nothing )
+
+        StartClassifierResp (Ok br) ->
+            ( { model | startClassifierResult = Just br }
+            , Cmd.none
+            , Nothing
+            )
+
+        StartClassifierResp (Err err) ->
+            ( { model
+                | startClassifierResult =
+                    Just (BasicResult False (Util.Http.errorToString err))
+              }
+            , Cmd.none
+            , Nothing
+            )
+
 
 view : Flags -> UiSettings -> Model -> Html Msg
 view flags settings model =
     div
         [ classList
-            [ ( "ui form", True )
+            [ ( "ui form error success", True )
             , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
             , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
             ]
         ]
@@ -219,17 +272,62 @@ view flags settings model =
                 [ text "This starts a task that clears the full-text index and re-indexes all your data again."
                 , text "You must type OK before clicking the button to avoid accidental re-indexing."
                 ]
-            , div
-                [ classList
-                    [ ( "ui message", True )
-                    , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
-                    , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
-                    , ( "hidden invisible", model.fullTextReIndexResult == Nothing )
-                    ]
-                ]
-                [ Maybe.map .message model.fullTextReIndexResult
-                    |> Maybe.withDefault ""
-                    |> text
+            , renderResultMessage model.fullTextReIndexResult
+            ]
+        , h3
+            [ classList
+                [ ( "ui dividing header", True )
+                , ( "invisible hidden", False )
                 ]
             ]
+            [ text "Document Classifier"
+            ]
+        , div
+            [ classList
+                [ ( "field", True )
+                , ( "invisible hidden", False )
+                ]
+            ]
+            [ Html.map ClassifierSettingMsg
+                (Comp.ClassifierSettingsForm.view model.classifierModel)
+            , div [ class "ui vertical segment" ]
+                [ button
+                    [ classList
+                        [ ( "ui small secondary basic button", True )
+                        , ( "disabled", not model.classifierModel.enabled )
+                        ]
+                    , title "Starts a task to train a classifier"
+                    , onClick StartClassifierTask
+                    ]
+                    [ text "Start now"
+                    ]
+                , renderResultMessage model.startClassifierResult
+                ]
+            ]
+        , div [ class "ui divider" ] []
+        , button
+            [ classList
+                [ ( "ui primary button", True )
+                , ( "disabled", getSettings model |> Data.Validated.isInvalid )
+                ]
+            , onClick SaveSettings
+            ]
+            [ text "Save"
+            ]
+        ]
+
+
+renderResultMessage : Maybe BasicResult -> Html msg
+renderResultMessage result =
+    div
+        [ classList
+            [ ( "ui message", True )
+            , ( "error", Maybe.map .success result == Just False )
+            , ( "success", Maybe.map .success result == Just True )
+            , ( "hidden invisible", result == Nothing )
+            ]
+        ]
+        [ Maybe.map .message result
+            |> Maybe.withDefault ""
+            |> text
         ]
diff --git a/modules/webapp/src/main/elm/Data/Validated.elm b/modules/webapp/src/main/elm/Data/Validated.elm
index c56f98c6..40e0f97e 100644
--- a/modules/webapp/src/main/elm/Data/Validated.elm
+++ b/modules/webapp/src/main/elm/Data/Validated.elm
@@ -1,5 +1,6 @@
 module Data.Validated exposing
     ( Validated(..)
+    , isInvalid
     , map
     , map2
     , map3
@@ -14,6 +15,19 @@ type Validated a
     | Unknown a
 
 
+isInvalid : Validated a -> Bool
+isInvalid v =
+    case v of
+        Valid _ ->
+            False
+
+        Invalid _ _ ->
+            True
+
+        Unknown _ ->
+            False
+
+
 value : Validated a -> a
 value va =
     case va of
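A minimal usage sketch for the new `isInvalid` helper. It assumes `Invalid` carries a list of error messages together with the value, consistent with the two-argument pattern above; `validateCount` is a hypothetical validator, not part of this patch:

```elm
module ValidatedSketch exposing (validateCount)

import Data.Validated exposing (Validated(..), isInvalid)


-- Hypothetical validator, assuming `Invalid` takes a list of error
-- messages plus the value (matching the pattern `Invalid _ _` above).
validateCount : Int -> Validated Int
validateCount n =
    if n < 0 then
        Invalid [ "count must not be negative" ] n

    else
        Valid n


-- isInvalid (validateCount -1) == True
-- isInvalid (validateCount 3)  == False
-- isInvalid (Unknown 0)        == False (not yet validated is not invalid)
```

Note that `Unknown` does not count as invalid, so the Save button above is only disabled once `getSettings` produces a definite `Invalid`.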
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
index 1b1bd53b..b8dd6a2b 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
@@ -30,15 +30,21 @@ init flags =
     let
         ( sm, sc ) =
             Comp.SourceManage.init flags
+
+        ( cm, cc ) =
+            Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
     in
     ( { currentTab = Just InsightsTab
       , sourceModel = sm
       , userModel = Comp.UserManage.emptyModel
-      , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
+      , settingsModel = cm
      , insights = Api.Model.ItemInsights.empty
      , submitResult = Nothing
      }
-    , Cmd.map SourceMsg sc
+    , Cmd.batch
+        [ Cmd.map SourceMsg sc
+        , Cmd.map SettingsFormMsg cc
+        ]
    )
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
index fa9ab433..7ad68e16 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
@@ -77,7 +77,13 @@ update flags msg model =
             ( model, Cmd.none )
 
         CollectiveSettingsResp (Ok data) ->
-            ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
+            let
+                ( cm, cc ) =
+                    Comp.CollectiveSettingsForm.init flags data
+            in
+            ( { model | settingsModel = cm }
+            , Cmd.map SettingsFormMsg cc
+            )
 
         CollectiveSettingsResp (Err _) ->
             ( model, Cmd.none )
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
index 513e2719..c46aacfb 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
@@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
 viewSettings flags settings model =
     [ h2 [ class "ui header" ]
         [ i [ class "cog icon" ] []
-        , text "Settings"
+        , text "Collective Settings"
         ]
     , div [ class "ui segment" ]
-        [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
+        [ Html.map SettingsFormMsg
+            (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
         ]
     , div
         [ classList
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index d550c2d3..7619711f 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -95,6 +95,21 @@ let
         enabled = true;
         file-cache-time = "1 minute";
       };
+      classification = {
+        enabled = true;
+        item-count = 0;
+        classifiers = [
+          { "useSplitWords" = "true";
+            "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
+            "splitWordsIgnoreRegexp" = ''\s+'';
+            "useSplitPrefixSuffixNGrams" = "true";
+            "maxNGramLeng" = "4";
+            "minNGramLeng" = "1";
+            "splitWordShape" = "chris4";
+            "intern" = "true";
+          }
+        ];
+      };
       working-dir = "/tmp/docspell-analysis";
     };
     processing = {
@@ -736,6 +751,59 @@ in {
           default = defaults.text-analysis.regex-ner;
           description = "";
         };
+
+        classification = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.classification.enabled;
+                description = ''
+                  Whether to enable classification globally. Each collective can
+                  decide to disable it. If it is disabled here, no collective
+                  can use classification.
+                '';
+              };
+              item-count = mkOption {
+                type = types.int;
+                default = defaults.text-analysis.classification.item-count;
+                description = ''
+                  If you are concerned about memory consumption, this restricts
+                  the number of items to consider. More items are better for
+                  training. A negative value or zero means to train on all
+                  items.
+                '';
+              };
+              classifiers = mkOption {
+                type = types.listOf types.attrs;
+                default = defaults.text-analysis.classification.classifiers;
+                description = ''
+                  These settings are used to configure the classifier. If
+                  multiple are given, they are all tried and the "best" is
+                  chosen at the end. See
+                  https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+                  for more info about these settings. The settings here yielded
+                  good results with *my* dataset.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.classification;
+          description = ''
+            Settings for doing document classification.
+
+            This works by learning from existing documents. A collective can
+            specify a tag category and the system will try to predict a tag
+            from this category for new incoming documents.
+
+            This requires a statistical model that is computed from all
+            existing documents. This process is run periodically as
+            configured by the collective. It may require a lot of memory,
+            depending on the amount of data.
+
+            It utilises this NLP library: https://nlp.stanford.edu/.
+          '';
+        };
       };
     });
     default = defaults.text-analysis;
diff --git a/website/elm/Feature.elm b/website/elm/Feature.elm
index 246aa7ad..4d2fb734 100644
--- a/website/elm/Feature.elm
+++ b/website/elm/Feature.elm
@@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
     , { image = "img/analyze-feature.png"
       , header = "Text Analysis"
       , description = """
-The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
+The extracted text is analyzed using ML techniques to find properties that can be added to your documents automatically.
 """
       }
     , { image = "img/filetype-feature.svg"
diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md
index 36e5d57c..0f5e23b2 100644
--- a/website/site/content/docs/webapp/metadata.md
+++ b/website/site/content/docs/webapp/metadata.md
@@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
 "assignment" semantics. Docspell doesn't propose any workflow, but it
 can help to implement some.
 
-The tags are *not* taken into account when creating suggestions from
-analyzed text yet. However, PDF files may contain metadata itself and
-if there is a metadata *keywords* list, these keywords are matched
-against the tags in the database. If they match, the item is tagged
-automatically.
+Docspell can try to predict a tag for new incoming documents
+automatically, based on your existing data. This requires training an
+algorithm. There are some caveats: the more data you have correctly
+tagged, the better the results. So it won't work well for, say, the
+first 100 documents. The tags must also somehow relate to a pattern
+in the document text. Tags like *todo* or *waiting* probably won't
+work, obviously. But the typical "document type" tag, like *invoice*
+or *receipt*, is a good fit! That is why you need to provide a tag
+category, so that only sensible tags are learned. The algorithm goes
+through all your items and learns patterns in the text that relate to
+the given tags. This training step can be run periodically, as
+specified in your collective settings, so that docspell keeps
+learning from your already tagged data. More information about the
+algorithm can be found in the config, where it is possible to
+fine-tune this process.
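To make the learning step concrete, here is a toy sketch in Elm that "predicts" a tag by word overlap with already tagged examples. It is purely illustrative: the `TagGuess` module and its names are invented for this note, and the actual classifier is Stanford NLP's ColumnDataClassifier running in the job executor, not code in the browser.

```elm
module TagGuess exposing (Example, predict)

-- Toy illustration of learning from tagged documents; the real
-- classifier is Stanford NLP's ColumnDataClassifier on the server.


type alias Example =
    { text : String
    , tag : String
    }


tokens : String -> List String
tokens str =
    String.words (String.toLower str)


-- Score an example by how many of the new document's words it shares.
score : List String -> Example -> Int
score docWords ex =
    List.length (List.filter (\w -> List.member w (tokens ex.text)) docWords)


-- Propose the tag of the best-matching tagged example, if any.
predict : List Example -> String -> Maybe String
predict examples newText =
    examples
        |> List.map (\ex -> ( score (tokens newText) ex, ex.tag ))
        |> List.sortBy Tuple.first
        |> List.reverse
        |> List.head
        |> Maybe.map Tuple.second
```

Where this toy version merely counts shared words, the real classifier derives features such as n-grams and word shapes (see the `classifiers` settings in the Nix module above) and weighs them statistically. The overall flow is the same, though: items tagged within the chosen category form the training data, and the best scoring tag is proposed for a new document.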
+
+Another way to have items tagged automatically is when an input PDF
+file contains a list of keywords in its metadata section (this
+applies to PDF files only). These keywords are then matched against
+the tags in the database. If they match, the item is tagged with
+them.
 
 ## Organization and Person