From f02f15e5bd1e3ebac255b6c2b1a10f1b69c4199c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 21:26:44 +0100 Subject: [PATCH 01/38] Move blocker into constructor of text analyser --- .../main/scala/docspell/analysis/TextAnalyser.scala | 11 +++++------ .../src/main/scala/docspell/joex/JoexAppImpl.scala | 2 +- .../docspell/joex/learn/LearnClassifierTask.scala | 3 +-- .../scala/docspell/joex/process/TextAnalysis.scala | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 8ec4854e..c319b784 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -21,7 +21,7 @@ trait TextAnalyser[F[_]] { text: String ): F[TextAnalyser.Result] - def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F] + def classifier: TextClassifier[F] } object TextAnalyser { @@ -31,8 +31,9 @@ object TextAnalyser { labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString)) } - def create[F[_]: Concurrent: Timer]( - cfg: TextAnalysisConfig + def create[F[_]: Concurrent: Timer: ContextShift]( + cfg: TextAnalysisConfig, + blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource .liftF(PipelineCache[F](cfg.clearStanfordPipelineInterval)) @@ -53,9 +54,7 @@ object TextAnalyser { spans = NerLabelSpan.build(list) } yield Result(spans ++ list, dates) - def classifier(blocker: Blocker)(implicit - CS: ContextShift[F] - ): TextClassifier[F] = + def classifier: TextClassifier[F] = new StanfordTextClassifier[F](cfg.classifier, blocker) private def textLimit(logger: Logger[F], text: String): F[String] = diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 51fed2bc..cdbb5a50 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -97,7 +97,7 @@ object JoexAppImpl { upload <- OUpload(store, queue, cfg.files, joex) fts <- createFtsClient(cfg)(httpClient) itemOps <- OItem(store, fts, queue, joex) - analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig) + analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig, blocker) regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index c3d6e3f9..535b7f0d 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -41,8 +41,7 @@ object LearnClassifierTask { sett.category.getOrElse("") ) _ <- OptionT.liftF( - analyser - .classifier(blocker) + analyser.classifier .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker))) ) } yield ()) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 6864b390..4a868d47 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -44,7 +44,7 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value + tag <- predictTag(ctx, cfg, item.metas, analyser.classifier).value } yield item .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) .appendTags(tag.toSeq) From a699e87304ac17be62a72d9a09e6d97493ec8cc9 Mon Sep 17 00:00:00 2001 
From: Eike Kettner Date: Wed, 13 Jan 2021 21:41:51 +0100 Subject: [PATCH 02/38] Separate ner from classification --- .../main/scala/docspell/analysis/TextAnalyser.scala | 9 +++------ .../scala/docspell/analysis/TextAnalysisConfig.scala | 2 +- .../analysis/{nlp => classifier}/ClassifierModel.scala | 2 +- .../{nlp => classifier}/StanfordTextClassifier.scala | 10 ++++++---- .../analysis/{nlp => classifier}/TextClassifier.scala | 4 ++-- .../{nlp => classifier}/TextClassifierConfig.scala | 2 +- ...dNerClassifier.scala => StanfordNerAnnotator.scala} | 6 +++--- .../StanfordTextClassifierSuite.scala | 2 +- .../docspell/analysis/nlp/TextAnalyserSuite.scala | 4 ++-- modules/joex/src/main/scala/docspell/joex/Config.scala | 5 ++--- .../docspell/joex/learn/LearnClassifierTask.scala | 4 ++-- .../scala/docspell/joex/process/TextAnalysis.scala | 3 +-- 12 files changed, 25 insertions(+), 28 deletions(-) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/ClassifierModel.scala (64%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/StanfordTextClassifier.scala (93%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/TextClassifier.scala (83%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/TextClassifierConfig.scala (82%) rename modules/analysis/src/main/scala/docspell/analysis/nlp/{StanfordNerClassifier.scala => StanfordNerAnnotator.scala} (86%) rename modules/analysis/src/test/scala/docspell/analysis/{nlp => classifier}/StanfordTextClassifierSuite.scala (98%) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index c319b784..b67347ae 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -3,13 +3,10 @@ package docspell.analysis import cats.effect._ import cats.implicits._ 
+import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier} import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind -import docspell.analysis.nlp.PipelineCache -import docspell.analysis.nlp.StanfordNerClassifier -import docspell.analysis.nlp.StanfordNerSettings -import docspell.analysis.nlp.StanfordTextClassifier -import docspell.analysis.nlp.TextClassifier +import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings} import docspell.common._ trait TextAnalyser[F[_]] { @@ -67,7 +64,7 @@ object TextAnalyser { private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) : F[Vector[NerLabel]] = - StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) + StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text) private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay { diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index cb6e1d39..2dbfbfc4 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,6 +1,6 @@ package docspell.analysis -import docspell.analysis.nlp.TextClassifierConfig +import docspell.analysis.classifier.TextClassifierConfig import docspell.common._ case class TextAnalysisConfig( diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala similarity index 64% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala index 82f9f9cc..071a8e29 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala +++ 
b/modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala similarity index 93% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala index 091d9e16..edd1c7da 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path @@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref import cats.implicits._ import fs2.Stream -import docspell.analysis.nlp.TextClassifier._ +import docspell.analysis.classifier +import docspell.analysis.classifier.TextClassifier._ +import docspell.analysis.nlp.Properties import docspell.common._ import edu.stanford.nlp.classify.ColumnDataClassifier @@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( case Some(text) => Sync[F].delay { val cls = ColumnDataClassifier.getClassifier( - model.model.normalize().toAbsolutePath().toString() + model.model.normalize().toAbsolutePath.toString ) val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text))) Option(cat) @@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props))) cdc.trainClassifier(in.train.toString()) val score = cdc.testClassifier(in.test.toString()) - TrainResult(score.first(), ClassifierModel(in.modelFile)) + TrainResult(score.first(), 
classifier.ClassifierModel(in.modelFile)) } _ <- logger.debug(s"Trained with result $res") } yield res diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala similarity index 83% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala index f2927d0c..3569f499 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala @@ -1,9 +1,9 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import cats.data.Kleisli import fs2.Stream -import docspell.analysis.nlp.TextClassifier.Data +import docspell.analysis.classifier.TextClassifier.Data import docspell.common._ trait TextClassifier[F[_]] { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala similarity index 82% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala index e3baac46..bb628ebf 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala similarity index 86% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala rename to 
modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala index 3f196b8e..df9fa431 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala @@ -9,7 +9,7 @@ import docspell.common._ import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} -object StanfordNerClassifier { +object StanfordNerAnnotator { /** Runs named entity recognition on the given `text`. * @@ -28,9 +28,9 @@ object StanfordNerClassifier { )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = cache .obtain(cacheKey, settings) - .use(crf => Applicative[F].pure(runClassifier(crf, text))) + .use(crf => Applicative[F].pure(nerAnnotate(crf, text))) - def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { + def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { val doc = new CoreDocument(text) nerClassifier.annotate(doc) doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala similarity index 98% rename from modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala rename to modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala index e38ba703..0229585c 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import minitest._ import cats.effect._ diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala 
b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index b22093f1..e0dfc4a0 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find english ner labels") { val labels = - StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText) + StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find german ner labels") { val labels = - StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText) + StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 601d0049..8fba3582 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -4,8 +4,7 @@ import java.nio.file.Path import cats.data.NonEmptyList -import docspell.analysis.TextAnalysisConfig -import docspell.analysis.nlp.TextClassifierConfig +import docspell.analysis.{TextAnalysisConfig, classifier} import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -69,7 +68,7 @@ object Config { TextAnalysisConfig( maxLength, clearStanfordNlpInterval, - TextClassifierConfig( + classifier.TextClassifierConfig( workingDir, NonEmptyList .fromList(classification.classifiers) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala 
b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 535b7f0d..d5c632c3 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -7,8 +7,8 @@ import cats.implicits._ import fs2.{Pipe, Stream} import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.ClassifierModel -import docspell.analysis.nlp.TextClassifier.Data +import docspell.analysis.classifier.ClassifierModel +import docspell.analysis.classifier.TextClassifier.Data import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 4a868d47..1fd2401a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -5,9 +5,8 @@ import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.ClassifierModel +import docspell.analysis.classifier.{ClassifierModel, TextClassifier} import docspell.analysis.nlp.StanfordNerSettings -import docspell.analysis.nlp.TextClassifier import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile From 4462ebae0fb1abafdfc6ec5f7dca56da81cc4014 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 22:29:53 +0100 Subject: [PATCH 03/38] Resurrect the basic ner classifier --- .../analysis/nlp/BasicCRFAnnotator.scala | 75 +++++++++++++++++++ .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 63 ++++++++++++++++ ....scala => StanfordNerAnnotatorSuite.scala} | 4 +- 3 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala create mode 100644 
modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala rename modules/analysis/src/test/scala/docspell/analysis/nlp/{TextAnalyserSuite.scala => StanfordNerAnnotatorSuite.scala} (95%) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala new file mode 100644 index 00000000..5823fba2 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -0,0 +1,75 @@ +package docspell.analysis.nlp + +import docspell.common._ +import edu.stanford.nlp.ie.AbstractSequenceClassifier +import edu.stanford.nlp.ie.crf.CRFClassifier +import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger + +import java.net.URL +import java.util.zip.GZIPInputStream + +import scala.jdk.CollectionConverters._ +import scala.util.Using + +/** This is only using the CRFClassifier without building an analysis + * pipeline. The ner-classifier cannot use results from POS-tagging + * etc. and is therefore not as good as the [[StanfordNerAnnotator]]. + * But it uses less memory, while still being not bad. 
+ */ +object BasicCRFAnnotator { + private[this] val logger = getLogger + + lazy val germanNerClassifier = makeClassifier(Language.German) + lazy val englishNerClassifier = makeClassifier(Language.English) + lazy val frenchNerClassifier = makeClassifier(Language.French) + + def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { + val nerClassifier = lang match { + case Language.English => englishNerClassifier + case Language.German => germanNerClassifier + case Language.French => frenchNerClassifier + } + nerClassifier + .classify(text) + .asScala + .flatMap(a => a.asScala) + .collect(Function.unlift { label => + val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) + NerTag + .fromString(Option(tag).getOrElse("")) + .toOption + .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) + }) + .toVector + } + + private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") + val ner = classifierResource(lang) + Using(new GZIPInputStream(ner.openStream())) { in => + CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] + }.fold(throw _, identity) + } + + private def classifierResource(lang: Language): URL = { + def check(u: URL): URL = + if (u == null) sys.error(s"NER model url not found for language ${lang.name}") + else u + + check(lang match { + case Language.French => + getClass.getResource( + "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz" + ) + case Language.German => + getClass.getResource( + "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" + ) + case Language.English => + getClass.getResource( + "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ) + }) + } +} diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala new file 
mode 100644 index 00000000..bffc6744 --- /dev/null +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -0,0 +1,63 @@ +package docspell.analysis.nlp + +import minitest.SimpleTestSuite +import docspell.files.TestFiles +import docspell.common._ + +object BaseCRFAnnotatorSuite extends SimpleTestSuite { + test("find english ner labels") { + val labels = + BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText) + val expect = Vector( + NerLabel("Derek", NerTag.Person, 0, 5), + NerLabel("Jeter", NerTag.Person, 6, 11), + NerLabel("Elm", NerTag.Misc, 17, 20), + NerLabel("Ave.", NerTag.Misc, 21, 25), + NerLabel("Treesville", NerTag.Misc, 27, 37), + NerLabel("Derek", NerTag.Person, 68, 73), + NerLabel("Jeter", NerTag.Person, 74, 79), + NerLabel("Elm", NerTag.Misc, 85, 88), + NerLabel("Ave.", NerTag.Misc, 89, 93), + NerLabel("Treesville", NerTag.Person, 95, 105), + NerLabel("Leaf", NerTag.Organization, 144, 148), + NerLabel("Chief", NerTag.Organization, 150, 155), + NerLabel("of", NerTag.Organization, 156, 158), + NerLabel("Syrup", NerTag.Organization, 159, 164), + NerLabel("Production", NerTag.Organization, 165, 175), + NerLabel("Old", NerTag.Organization, 176, 179), + NerLabel("Sticky", NerTag.Organization, 180, 186), + NerLabel("Pancake", NerTag.Organization, 187, 194), + NerLabel("Company", NerTag.Organization, 195, 202), + NerLabel("Maple", NerTag.Organization, 207, 212), + NerLabel("Lane", NerTag.Organization, 213, 217), + NerLabel("Forest", NerTag.Organization, 219, 225), + NerLabel("Hemptown", NerTag.Location, 239, 247), + NerLabel("Leaf", NerTag.Person, 276, 280), + NerLabel("Little", NerTag.Misc, 347, 353), + NerLabel("League", NerTag.Misc, 354, 360), + NerLabel("Derek", NerTag.Person, 1117, 1122), + NerLabel("Jeter", NerTag.Person, 1123, 1128) + ) + assertEquals(labels, expect) + } + + test("find german ner labels") { + val labels = + BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText) + val 
expect = Vector( + NerLabel("Max", NerTag.Person, 0, 3), + NerLabel("Mustermann", NerTag.Person, 4, 14), + NerLabel("Lilienweg", NerTag.Person, 16, 25), + NerLabel("Max", NerTag.Person, 77, 80), + NerLabel("Mustermann", NerTag.Person, 81, 91), + NerLabel("Lilienweg", NerTag.Location, 93, 102), + NerLabel("EasyCare", NerTag.Organization, 124, 132), + NerLabel("AG", NerTag.Organization, 133, 135), + NerLabel("Ackerweg", NerTag.Location, 158, 166), + NerLabel("Nebendorf", NerTag.Location, 184, 193), + NerLabel("Max", NerTag.Person, 505, 508), + NerLabel("Mustermann", NerTag.Person, 509, 519) + ) + assertEquals(labels, expect) + } +} diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala similarity index 95% rename from modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala rename to modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala index e0dfc4a0..1704ef1b 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala @@ -5,7 +5,7 @@ import docspell.files.TestFiles import docspell.common._ import edu.stanford.nlp.pipeline.StanfordCoreNLP -object TextAnalyserSuite extends SimpleTestSuite { +object StanfordNerAnnotatorSuite extends SimpleTestSuite { lazy val germanClassifier = new StanfordCoreNLP(Properties.nerGerman(None, false)) lazy val englishClassifier = @@ -45,6 +45,7 @@ object TextAnalyserSuite extends SimpleTestSuite { NerLabel("Jeter", NerTag.Person, 1123, 1128) ) assertEquals(labels, expect) + StanfordCoreNLP.clearAnnotatorPool() } test("find german ner labels") { @@ -65,5 +66,6 @@ object TextAnalyserSuite extends SimpleTestSuite { NerLabel("Mustermann", NerTag.Person, 509, 519) ) assertEquals(labels, expect) + StanfordCoreNLP.clearAnnotatorPool() } } From 
a77f67d73ab4ef5c850d388404e14212c545afe9 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 22:59:02 +0100 Subject: [PATCH 04/38] Make pipeline cache generic to be used with BasicCRFAnnotator --- .../docspell/analysis/TextAnalyser.scala | 7 +- .../docspell/analysis/nlp/PipelineCache.scala | 91 ++++++++++--------- .../analysis/nlp/StanfordNerAnnotator.scala | 14 ++- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index b67347ae..6c8e6cff 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -33,7 +33,12 @@ object TextAnalyser { blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource - .liftF(PipelineCache[F](cfg.clearStanfordPipelineInterval)) + .liftF( + PipelineCache(cfg.clearStanfordPipelineInterval)( + StanfordNerAnnotator.makePipeline, + StanfordNerAnnotator.clearPipelineCaches[F] + ) + ) .map(cache => new TextAnalyser[F] { def annotate( diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 9cc3f2d7..61598f9a 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -1,15 +1,12 @@ package docspell.analysis.nlp import scala.concurrent.duration.{Duration => _, _} - import cats.Applicative +import cats.data.Kleisli import cats.effect._ import cats.effect.concurrent.Ref import cats.implicits._ - import docspell.common._ - -import edu.stanford.nlp.pipeline.StanfordCoreNLP import org.log4s.getLogger /** Creating the StanfordCoreNLP pipeline is quite expensive as it @@ -19,48 +16,57 @@ import org.log4s.getLogger * * **This is an internal API** */ -trait 
PipelineCache[F[_]] { +trait PipelineCache[F[_], A] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, StanfordCoreNLP] + def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] } object PipelineCache { private[this] val logger = getLogger - def none[F[_]: Applicative]: PipelineCache[F] = - new PipelineCache[F] { + def none[F[_]: Applicative, A]( + creator: Kleisli[F, StanfordNerSettings, A] + ): PipelineCache[F, A] = + new PipelineCache[F, A] { def obtain( ignored: String, settings: StanfordNerSettings - ): Resource[F, StanfordCoreNLP] = - Resource.liftF(makeClassifier(settings).pure[F]) + ): Resource[F, A] = + Resource.liftF(creator.run(settings)) } - def apply[F[_]: Concurrent: Timer](clearInterval: Duration): F[PipelineCache[F]] = + def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)( + creator: StanfordNerSettings => A, + release: F[Unit] + ): F[PipelineCache[F, A]] = for { - data <- Ref.of(Map.empty[String, Entry]) - cacheClear <- CacheClearing.create(data, clearInterval) - } yield new Impl[F](data, cacheClear) + data <- Ref.of(Map.empty[String, Entry[A]]) + cacheClear <- CacheClearing.create(data, clearInterval, release) + } yield new Impl[F, A](data, creator, cacheClear) - final private class Impl[F[_]: Sync]( - data: Ref[F, Map[String, Entry]], + final private class Impl[F[_]: Sync, A]( + data: Ref[F, Map[String, Entry[A]]], + creator: StanfordNerSettings => A, cacheClear: CacheClearing[F] - ) extends PipelineCache[F] { + ) extends PipelineCache[F, A] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, StanfordCoreNLP] = + def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] = for { - _ <- cacheClear.withCache - id <- Resource.liftF(makeSettingsId(settings)) - nlp <- Resource.liftF(data.modify(cache => getOrCreate(key, id, cache, settings))) + _ <- cacheClear.withCache + id <- Resource.liftF(makeSettingsId(settings)) + nlp <- Resource.liftF( + data.modify(cache => 
getOrCreate(key, id, cache, settings, creator)) + ) } yield nlp private def getOrCreate( key: String, id: String, - cache: Map[String, Entry], - settings: StanfordNerSettings - ): (Map[String, Entry], StanfordCoreNLP) = + cache: Map[String, Entry[A]], + settings: StanfordNerSettings, + creator: StanfordNerSettings => A + ): (Map[String, Entry[A]], A) = cache.get(key) match { case Some(entry) => if (entry.id == id) (cache, entry.value) @@ -68,13 +74,13 @@ object PipelineCache { logger.info( s"StanfordNLP settings changed for key $key. Creating new classifier" ) - val nlp = makeClassifier(settings) + val nlp = creator(settings) val e = Entry(id, nlp) (cache.updated(key, e), nlp) } case None => - val nlp = makeClassifier(settings) + val nlp = creator(settings) val e = Entry(id, nlp) (cache.updated(key, e), nlp) } @@ -104,9 +110,10 @@ object PipelineCache { Resource.pure[F, Unit](()) } - def create[F[_]: Concurrent: Timer]( - data: Ref[F, Map[String, Entry]], - interval: Duration + def create[F[_]: Concurrent: Timer, A]( + data: Ref[F, Map[String, Entry[A]]], + interval: Duration, + release: F[Unit] ): F[CacheClearing[F]] = for { counter <- Ref.of(0L) @@ -121,16 +128,23 @@ object PipelineCache { log .info(s"Clearing StanfordNLP cache after $interval idle time") .map(_ => - new CacheClearingImpl[F](data, counter, cleaning, interval.toScala) + new CacheClearingImpl[F, A]( + data, + counter, + cleaning, + interval.toScala, + release + ) ) } yield result } - final private class CacheClearingImpl[F[_]]( - data: Ref[F, Map[String, Entry]], + final private class CacheClearingImpl[F[_], A]( + data: Ref[F, Map[String, Entry[A]]], counter: Ref[F, Long], cleaningFiber: Ref[F, Option[Fiber[F, Unit]]], - clearInterval: FiniteDuration + clearInterval: FiniteDuration, + release: F[Unit] )(implicit T: Timer[F], F: Concurrent[F]) extends CacheClearing[F] { private[this] val log = Logger.log4s[F](logger) @@ -158,17 +172,10 @@ object PipelineCache { def clearAll: F[Unit] = 
log.info("Clearing stanford nlp cache now!") *> - data.set(Map.empty) *> Sync[F].delay { - // turns out that everything is cached in a static map - StanfordCoreNLP.clearAnnotatorPool() + data.set(Map.empty) *> release *> Sync[F].delay { System.gc(); } } - private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = { - logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") - new StanfordCoreNLP(Properties.forSettings(settings)) - } - - private case class Entry(id: String, value: StanfordCoreNLP) + private case class Entry[A](id: String, value: A) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala index df9fa431..37b54b40 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala @@ -8,8 +8,10 @@ import cats.effect._ import docspell.common._ import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} +import org.log4s.getLogger object StanfordNerAnnotator { + private[this] val logger = getLogger /** Runs named entity recognition on the given `text`. 
* @@ -24,7 +26,7 @@ object StanfordNerAnnotator { */ def nerAnnotate[F[_]: BracketThrow]( cacheKey: String, - cache: PipelineCache[F] + cache: PipelineCache[F, StanfordCoreNLP] )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = cache .obtain(cacheKey, settings) @@ -36,4 +38,14 @@ object StanfordNerAnnotator { doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } + def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = { + logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") + new StanfordCoreNLP(Properties.forSettings(settings)) + } + + def clearPipelineCaches[F[_]: Sync]: F[Unit] = + Sync[F].delay { + // turns out that everything is cached in a static map + StanfordCoreNLP.clearAnnotatorPool() + } } From 54a09861c45b5f6b580e70e3bbfd811d7f52e472 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 23:59:28 +0100 Subject: [PATCH 05/38] Use model cache with basic annotator --- .../docspell/analysis/TextAnalyser.scala | 7 +- .../analysis/nlp/BasicCRFAnnotator.scala | 89 ++++++++++++------- .../docspell/analysis/nlp/PipelineCache.scala | 20 +++++ .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 12 ++- 4 files changed, 88 insertions(+), 40 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 6c8e6cff..38491c3a 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -33,12 +33,7 @@ object TextAnalyser { blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource - .liftF( - PipelineCache(cfg.clearStanfordPipelineInterval)( - StanfordNerAnnotator.makePipeline, - StanfordNerAnnotator.clearPipelineCaches[F] - ) - ) + .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval)) .map(cache => new TextAnalyser[F] { def annotate( diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index 5823fba2..a6fb6af0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -1,17 +1,22 @@ package docspell.analysis.nlp -import docspell.common._ -import edu.stanford.nlp.ie.AbstractSequenceClassifier -import edu.stanford.nlp.ie.crf.CRFClassifier -import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} -import org.log4s.getLogger - import java.net.URL +import java.util.concurrent.atomic.AtomicReference import java.util.zip.GZIPInputStream import scala.jdk.CollectionConverters._ import scala.util.Using +import cats.Applicative +import cats.effect.BracketThrow + +import docspell.common._ + +import edu.stanford.nlp.ie.AbstractSequenceClassifier +import edu.stanford.nlp.ie.crf.CRFClassifier +import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger + /** This is only using the CRFClassifier without building an analysis * pipeline. The ner-classifier cannot use results from POS-tagging * etc. and is therefore not as good as the [[StanfordNerAnnotator]]. 
@@ -20,16 +25,20 @@ import scala.util.Using object BasicCRFAnnotator { private[this] val logger = getLogger - lazy val germanNerClassifier = makeClassifier(Language.German) - lazy val englishNerClassifier = makeClassifier(Language.English) - lazy val frenchNerClassifier = makeClassifier(Language.French) + // assert correct resource names + List(Language.French, Language.German, Language.English).foreach(classifierResource) - def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { - val nerClassifier = lang match { - case Language.English => englishNerClassifier - case Language.German => germanNerClassifier - case Language.French => frenchNerClassifier - } + type Annotator = AbstractSequenceClassifier[CoreLabel] + + def nerAnnotate[F[_]: BracketThrow]( + cacheKey: String, + cache: PipelineCache[F, Annotator] + )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = + cache + .obtain(cacheKey, settings) + .use(crf => Applicative[F].pure(nerAnnotate(crf)(text))) + + def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] = nerClassifier .classify(text) .asScala @@ -42,34 +51,54 @@ object BasicCRFAnnotator { .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) }) .toVector - } - private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + private def makeClassifier(lang: Language): Annotator = { logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") val ner = classifierResource(lang) Using(new GZIPInputStream(ner.openStream())) { in => - CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] + CRFClassifier.getClassifier(in).asInstanceOf[Annotator] }.fold(throw _, identity) } private def classifierResource(lang: Language): URL = { - def check(u: URL): URL = - if (u == null) sys.error(s"NER model url not found for language ${lang.name}") - else u + def check(name: String): URL = + Option(getClass.getResource(name)) match { + 
case None => + sys.error(s"NER model resource '$name' not found for language ${lang.name}") + case Some(url) => url + } check(lang match { case Language.French => - getClass.getResource( - "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz" case Language.German => - getClass.getResource( - "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" case Language.English => - getClass.getResource( - "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" - ) + "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" }) } + + final class Cache { + private[this] lazy val germanNerClassifier = makeClassifier(Language.German) + private[this] lazy val englishNerClassifier = makeClassifier(Language.English) + private[this] lazy val frenchNerClassifier = makeClassifier(Language.French) + + def forLang(language: Language): Annotator = + language match { + case Language.French => frenchNerClassifier + case Language.German => germanNerClassifier + case Language.English => englishNerClassifier + } + } + + object Cache { + + private[this] val cacheRef = new AtomicReference[Cache](new Cache) + + def getAnnotator(language: Language): Annotator = + cacheRef.get().forLang(language) + + def clearCache(): Unit = + cacheRef.set(new Cache) + } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 61598f9a..2b567548 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -1,12 +1,16 @@ package docspell.analysis.nlp import scala.concurrent.duration.{Duration => _, _} + import cats.Applicative import cats.data.Kleisli import cats.effect._ import cats.effect.concurrent.Ref import 
cats.implicits._ + import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP import org.log4s.getLogger /** Creating the StanfordCoreNLP pipeline is quite expensive as it @@ -45,6 +49,22 @@ object PipelineCache { cacheClear <- CacheClearing.create(data, clearInterval, release) } yield new Impl[F, A](data, creator, cacheClear) + def full[F[_]: Concurrent: Timer]( + clearInterval: Duration + ): F[PipelineCache[F, StanfordCoreNLP]] = + apply(clearInterval)( + StanfordNerAnnotator.makePipeline, + StanfordNerAnnotator.clearPipelineCaches + ) + + def basic[F[_]: Concurrent: Timer]( + clearInterval: Duration + ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] = + apply(clearInterval)( + settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang), + Sync[F].delay(BasicCRFAnnotator.Cache.clearCache()) + ) + final private class Impl[F[_]: Sync, A]( data: Ref[F, Map[String, Entry[A]]], creator: StanfordNerSettings => A, diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala index bffc6744..0abab7e9 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -5,9 +5,12 @@ import docspell.files.TestFiles import docspell.common._ object BaseCRFAnnotatorSuite extends SimpleTestSuite { + + def annotate(language: Language): String => Vector[NerLabel] = + BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language)) + test("find english ner labels") { - val labels = - BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText) + val labels = annotate(Language.English)(TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -39,11 +42,11 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { NerLabel("Jeter", 
NerTag.Person, 1123, 1128) ) assertEquals(labels, expect) + BasicCRFAnnotator.Cache.clearCache() } test("find german ner labels") { - val labels = - BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText) + val labels = annotate(Language.German)(TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), @@ -59,5 +62,6 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { NerLabel("Mustermann", NerTag.Person, 509, 519) ) assertEquals(labels, expect) + BasicCRFAnnotator.Cache.clearCache() } } From aa937797bed2411d8bea6a6f8fa80fa0e30a866b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 14 Jan 2021 00:55:19 +0100 Subject: [PATCH 06/38] Choose nlp mode in config file --- .../docspell/analysis/TextAnalyser.scala | 42 +++++++++++++++---- .../analysis/TextAnalysisConfig.scala | 8 +++- .../main/scala/docspell/common/NlpMode.scala | 23 ++++++++++ .../docspell/common/config/Implicits.scala | 3 ++ .../joex/src/main/resources/reference.conf | 29 ++++++++++--- .../src/main/scala/docspell/joex/Config.scala | 9 ++-- 6 files changed, 95 insertions(+), 19 deletions(-) create mode 100644 modules/common/src/main/scala/docspell/common/NlpMode.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 38491c3a..a9234027 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -1,14 +1,17 @@ package docspell.analysis +import cats.Applicative import cats.effect._ import cats.implicits._ import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier} import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind -import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings} +import docspell.analysis.nlp._ import docspell.common._ 
+import edu.stanford.nlp.pipeline.StanfordCoreNLP + trait TextAnalyser[F[_]] { def annotate( @@ -33,8 +36,8 @@ object TextAnalyser { blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource - .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval)) - .map(cache => + .liftF(Nlp(cfg.nlpConfig)) + .map(stanfordNer => new TextAnalyser[F] { def annotate( logger: Logger[F], @@ -44,7 +47,7 @@ object TextAnalyser { ): F[TextAnalyser.Result] = for { input <- textLimit(logger, text) - tags0 <- stanfordNer(cacheKey, settings, input) + tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input)) tags1 <- contactNer(input) dates <- dateNer(settings.lang, input) list = tags0 ++ tags1 @@ -62,10 +65,6 @@ object TextAnalyser { s" Analysing only first ${cfg.maxLength} characters." ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) - : F[Vector[NerLabel]] = - StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text) - private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay { Contact.annotate(text) @@ -78,4 +77,31 @@ object TextAnalyser { } ) + private object Nlp { + + def apply[F[_]: Concurrent: Timer: BracketThrow]( + cfg: TextAnalysisConfig.NlpConfig + ): F[Input => F[Vector[NerLabel]]] = + cfg.mode match { + case NlpMode.Full => + PipelineCache.full(cfg.clearInterval).map(cache => full(cache)) + case NlpMode.Basic => + PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache)) + case NlpMode.Disabled => + Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + } + + final case class Input(key: Ident, settings: StanfordNerSettings, text: String) + + def full[F[_]: BracketThrow]( + cache: PipelineCache[F, StanfordCoreNLP] + )(input: Input): F[Vector[NerLabel]] = + StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + + def basic[F[_]: BracketThrow]( + cache: PipelineCache[F, BasicCRFAnnotator.Annotator] + )(input: Input): F[Vector[NerLabel]] 
= + BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + + } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index 2dbfbfc4..abc92043 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,10 +1,16 @@ package docspell.analysis +import docspell.analysis.TextAnalysisConfig.NlpConfig import docspell.analysis.classifier.TextClassifierConfig import docspell.common._ case class TextAnalysisConfig( maxLength: Int, - clearStanfordPipelineInterval: Duration, + nlpConfig: NlpConfig, classifier: TextClassifierConfig ) + +object TextAnalysisConfig { + + case class NlpConfig(clearInterval: Duration, mode: NlpMode) +} diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala new file mode 100644 index 00000000..36ebf7db --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala @@ -0,0 +1,23 @@ +package docspell.common + +sealed trait NlpMode { self: Product => + + def name: String = + self.productPrefix +} +object NlpMode { + case object Full extends NlpMode + case object Basic extends NlpMode + case object Disabled extends NlpMode + + def fromString(name: String): Either[String, NlpMode] = + name.toLowerCase match { + case "full" => Right(Full) + case "basic" => Right(Basic) + case "disabled" => Right(Disabled) + case _ => Left(s"Unknown nlp-mode: $name") + } + + def unsafeFromString(name: String): NlpMode = + fromString(name).fold(sys.error, identity) +} diff --git a/modules/common/src/main/scala/docspell/common/config/Implicits.scala b/modules/common/src/main/scala/docspell/common/config/Implicits.scala index c99c430a..9dab40dc 100644 --- a/modules/common/src/main/scala/docspell/common/config/Implicits.scala +++ 
b/modules/common/src/main/scala/docspell/common/config/Implicits.scala @@ -44,6 +44,9 @@ object Implicits { implicit val priorityReader: ConfigReader[Priority] = ConfigReader[String].emap(reason(Priority.fromString)) + implicit val nlpModeReader: ConfigReader[NlpMode] = + ConfigReader[String].emap(reason(NlpMode.fromString)) + def reason[A: ClassTag]( f: String => Either[String, A] ): String => Either[FailureReason, A] = diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 4aeb5a1b..583b40b1 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -277,12 +277,27 @@ docspell.joex { # files. working-dir = ${java.io.tmpdir}"/docspell-analysis" - # The StanfordCoreNLP library caches language models which - # requires quite some amount of memory. Setting this interval to a - # positive duration, the cache is cleared after this amount of - # idle time. Set it to 0 to disable it if you have enough memory, - # processing will be faster. - clear-stanford-nlp-interval = "15 minutes" + nlp-config { + # The StanfordCoreNLP library caches language models which + # requires quite some amount of memory. Setting this interval to a + # positive duration, the cache is cleared after this amount of + # idle time. Set it to 0 to disable it if you have enough memory, + # processing will be faster. + # + # This has only any effect, if mode != disabled. + clear-interval = "15 minutes" + + # The mode for configuring NLP models. Currently 3 are available: + # + # 1. full – builds the complete pipeline, run with -Xmx1500M or more + # 2. basic - builds only the ner annotator, run with -Xmx600M or more + # 3. disabled - doesn't use any stanford-nlp feature + # + # The basic variant does a quite good job for German and + # English. It might be worse for French, always depending on the + # type of text that is analysed. 
+ mode = full + } regex-ner { # Whether to enable custom NER annotation. This uses the address @@ -295,6 +310,8 @@ docspell.joex { # # This setting might be moved to the collective settings in the # future. + # + # Note, this is only relevant if nlp-config.mode = full. enabled = true # The NER annotation uses a file of patterns that is derived from diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 8fba3582..5b2bccc5 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -4,7 +4,8 @@ import java.nio.file.Path import cats.data.NonEmptyList -import docspell.analysis.{TextAnalysisConfig, classifier} +import docspell.analysis.TextAnalysisConfig +import docspell.analysis.classifier.TextClassifierConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -59,7 +60,7 @@ object Config { case class TextAnalysis( maxLength: Int, workingDir: Path, - clearStanfordNlpInterval: Duration, + nlpConfig: TextAnalysisConfig.NlpConfig, regexNer: RegexNer, classification: Classification ) { @@ -67,8 +68,8 @@ object Config { def textAnalysisConfig: TextAnalysisConfig = TextAnalysisConfig( maxLength, - clearStanfordNlpInterval, - classifier.TextClassifierConfig( + nlpConfig, + TextClassifierConfig( workingDir, NonEmptyList .fromList(classification.classifiers) From 6cf3f9be5a9fe74306c9c3a040c5cfcd77255da8 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 15 Jan 2021 18:04:02 +0100 Subject: [PATCH 07/38] Fix joex version endpoint in spec --- modules/joexapi/src/main/resources/joex-openapi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/joexapi/src/main/resources/joex-openapi.yml b/modules/joexapi/src/main/resources/joex-openapi.yml index 4f4e327d..2447d0ce 100644 --- a/modules/joexapi/src/main/resources/joex-openapi.yml +++ 
b/modules/joexapi/src/main/resources/joex-openapi.yml @@ -9,7 +9,7 @@ servers: description: Current host paths: - /api/info: + /api/info/version: get: tags: [ Api Info ] summary: Get basic information about this software. From a70e9ab614e2a52684f754d4d563c1d3b45e5274 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 15 Jan 2021 23:30:49 +0100 Subject: [PATCH 08/38] Store used language for processing on attachmentmeta Issue: #570 --- .../joex/process/AttachmentPageCount.scala | 9 ++++- .../docspell/joex/process/ConvertPdf.scala | 13 ++++++- .../docspell/joex/process/ItemData.scala | 15 +++++--- .../joex/process/TextExtraction.scala | 3 +- .../migration/h2/V1.17.0__meta_language.sql | 35 +++++++++++++++++++ .../mariadb/V1.17.0__meta_language.sql | 14 ++++++++ .../postgresql/V1.17.0__meta_language.sql | 15 ++++++++ .../docspell/store/queries/QAttachment.scala | 10 +++++- .../store/records/RAttachmentMeta.scala | 13 ++++--- 9 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala index f3cf7b0e..0373db8a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -78,7 +78,14 @@ object AttachmentPageCount { s"No attachmentmeta record exists for ${ra.id.id}. Creating new." 
) *> ctx.store.transact( RAttachmentMeta.insert( - RAttachmentMeta(ra.id, None, Nil, MetaProposalList.empty, md.pageCount.some) + RAttachmentMeta( + ra.id, + None, + Nil, + MetaProposalList.empty, + md.pageCount.some, + None + ) ) ) else 0.pure[F] diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 65ff0dda..56c27666 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -108,7 +108,18 @@ object ConvertPdf { ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *> storePDF(ctx, cfg, ra, pdf) .flatMap(r => - txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some)) + txt.map(t => + ( + r, + item + .changeMeta( + ra.id, + ctx.args.meta.language, + _.setContentIfEmpty(t.some) + ) + .some + ) + ) ) case ConversionResult.UnsupportedFormat(mt) => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index af9a3db2..0435e37c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -32,8 +32,12 @@ case class ItemData( def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] = dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty) - def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = { - val item = changeMeta(attachId, f) + def mapMeta( + attachId: Ident, + lang: Language, + f: RAttachmentMeta => RAttachmentMeta + ): ItemData = { + val item = changeMeta(attachId, lang, f) val next = metas.map(a => if (a.id == attachId) item else a) copy(metas = next) } @@ -43,13 +47,14 @@ case class ItemData( def changeMeta( attachId: Ident, + lang: Language, f: RAttachmentMeta => RAttachmentMeta ): RAttachmentMeta = - 
f(findOrCreate(attachId)) + f(findOrCreate(attachId, lang)) - def findOrCreate(attachId: Ident): RAttachmentMeta = + def findOrCreate(attachId: Ident, lang: Language): RAttachmentMeta = metas.find(_.id == attachId).getOrElse { - RAttachmentMeta.empty(attachId) + RAttachmentMeta.empty(attachId, lang) } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index db2988b8..fee7d323 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -78,7 +78,7 @@ object TextExtraction { pair._2 ) - val rm = item.findOrCreate(ra.id) + val rm = item.findOrCreate(ra.id, lang) rm.content match { case Some(_) => ctx.logger.info("TextExtraction skipped, since text is already available.") *> @@ -102,6 +102,7 @@ object TextExtraction { res <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, + lang, rm => rm.setContentIfEmpty( res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql new file mode 100644 index 00000000..35004e08 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql @@ -0,0 +1,35 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "language" varchar(254); + +update "attachmentmeta" +set "language" = 'deu' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'deu' +); + +update "attachmentmeta" +set "language" = 'eng' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on 
"a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'eng' +); + +update "attachmentmeta" +set "language" = 'fra' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'fra' +); diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql new file mode 100644 index 00000000..bd12e732 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql @@ -0,0 +1,14 @@ +ALTER TABLE `attachmentmeta` +ADD COLUMN (`language` varchar(254)); + +update `attachmentmeta` `m` +inner join ( + select `m`.`attachid`, `c`.`doclang` + from `attachmentmeta` m + inner join `attachment` a on `a`.`attachid` = `m`.`attachid` + inner join `item` i on `a`.`itemid` = `i`.`itemid` + inner join `collective` c on `c`.`cid` = `i`.`cid` + ) as `c` +set `m`.`language` = `c`.`doclang` +where `m`.`attachid` = `c`.`attachid` and `m`.`language` is null; + diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql new file mode 100644 index 00000000..ba84fc2a --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql @@ -0,0 +1,15 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "language" varchar(254); + +with + "attachlang" as ( + select "m"."attachid", "m"."language", "c"."doclang" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" 
+ ) +update "attachmentmeta" as "m" +set "language" = "c"."doclang" +from "attachlang" c +where "m"."attachid" = "c"."attachid" and "m"."language" is null; diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index 6ac9327a..a9afc0bf 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -160,7 +160,15 @@ object QAttachment { chunkSize: Int ): Stream[ConnectionIO, ContentAndName] = Select( - select(a.id, a.itemId, item.cid, item.folder, c.language, a.name, am.content), + select( + a.id.s, + a.itemId.s, + item.cid.s, + item.folder.s, + coalesce(am.language.s, c.language.s).s, + a.name.s, + am.content.s + ), from(a) .innerJoin(am, am.id === a.id) .innerJoin(item, item.id === a.itemId) diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index 4adfbad7..919a5b17 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -15,7 +15,8 @@ case class RAttachmentMeta( content: Option[String], nerlabels: List[NerLabel], proposals: MetaProposalList, - pages: Option[Int] + pages: Option[Int], + language: Option[Language] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = @@ -27,8 +28,8 @@ case class RAttachmentMeta( } object RAttachmentMeta { - def empty(attachId: Ident) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None) + def empty(attachId: Ident, lang: Language) = + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) final case class Table(alias: Option[String]) extends TableDef { val tableName = "attachmentmeta" @@ -38,7 +39,9 @@ object RAttachmentMeta { val nerlabels = 
Column[List[NerLabel]]("nerlabels", this) val proposals = Column[MetaProposalList]("itemproposals", this) val pages = Column[Int]("page_count", this) - val all = NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages) + val language = Column[Language]("language", this) + val all = + NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language) } val T = Table(None) @@ -49,7 +52,7 @@ object RAttachmentMeta { DML.insert( T, T.all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" ) def exists(attachId: Ident): ConnectionIO[Boolean] = From f01646aeb5a08246ace732fa53963a40c32fd182 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sat, 16 Jan 2021 23:43:24 +0100 Subject: [PATCH 09/38] Reorganize nlp pipeline and add nlp-unsupported language italian Improves and reorganizes how nlp pipelines are setup. Now users can choose from many options, depending on their hardware and usage scenario. This is the base to use more languages without depending on what stanford-nlp supports. Support then is involves to text extraction and simple regex-ner processing. 
--- .travis.yml | 2 +- docker/joex-base.dockerfile | 1 + .../scala/docspell/analysis/NlpSettings.scala | 7 ++ .../docspell/analysis/TextAnalyser.scala | 48 ++++---- .../docspell/analysis/date/DateFind.scala | 44 +++---- .../docspell/analysis/date/MonthName.scala | 101 +++++++++++++++ .../docspell/analysis/nlp/Annotator.scala | 98 +++++++++++++++ .../analysis/nlp/BasicCRFAnnotator.scala | 26 ++-- .../docspell/analysis/nlp/PipelineCache.scala | 65 +++------- .../docspell/analysis/nlp/Properties.scala | 32 +++-- .../analysis/nlp/StanfordNerAnnotator.scala | 27 ++-- .../analysis/nlp/StanfordNerSettings.scala | 58 +++++---- .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 3 +- .../nlp/StanfordNerAnnotatorSuite.scala | 36 ++++++ .../main/scala/docspell/common/Language.scala | 23 +++- .../main/scala/docspell/common/NlpMode.scala | 16 +-- .../docspell/common/syntax/FileSyntax.scala | 20 +++ .../docspell/common/syntax/package.scala | 7 +- .../test/resources/examples/letter-ita.txt | 13 ++ .../main/scala/docspell/ftssolr/Field.scala | 3 + .../scala/docspell/ftssolr/SolrQuery.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 19 ++- .../joex/src/main/resources/reference.conf | 85 ++++++++----- .../src/main/scala/docspell/joex/Config.scala | 15 ++- .../docspell/joex/analysis/RegexNerFile.scala | 6 +- .../docspell/joex/process/TextAnalysis.scala | 5 +- .../docspell/store/queries/QCollective.scala | 43 +++++-- modules/webapp/src/main/elm/Data/Language.elm | 11 +- nix/module-joex.nix | 116 +++++++++++++----- 29 files changed, 676 insertions(+), 255 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala create mode 100644 modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala create mode 100644 
modules/files/src/test/resources/examples/letter-ita.txt diff --git a/.travis.yml b/.travis.yml index 4d750d05..d78ff4b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ before_script: - export TZ=Europe/Berlin script: - - sbt ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test" + - sbt -J-XX:+UseG1GC ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test" diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index 0baa1973..8ebad224 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -15,6 +15,7 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr \ tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ + tesseract-ocr-data-ita \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala new file mode 100644 index 00000000..a1b426e5 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala @@ -0,0 +1,7 @@ +package docspell.analysis + +import java.nio.file.Path + +import docspell.common._ + +case class NlpSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index a9234027..c2deafce 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -10,13 +10,13 @@ import docspell.analysis.date.DateFind import docspell.analysis.nlp._ import docspell.common._ -import edu.stanford.nlp.pipeline.StanfordCoreNLP +import org.log4s.getLogger trait TextAnalyser[F[_]] { def annotate( logger: Logger[F], - settings: StanfordNerSettings, + settings: NlpSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] @@ -24,6 +24,7 @@ trait TextAnalyser[F[_]] { def classifier: 
TextClassifier[F] } object TextAnalyser { + private[this] val logger = getLogger case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) { @@ -41,13 +42,13 @@ object TextAnalyser { new TextAnalyser[F] { def annotate( logger: Logger[F], - settings: StanfordNerSettings, + settings: NlpSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] = for { input <- textLimit(logger, text) - tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input)) + tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input)) tags1 <- contactNer(input) dates <- dateNer(settings.lang, input) list = tags0 ++ tags1 @@ -77,31 +78,36 @@ object TextAnalyser { } ) + /** Provides the nlp pipeline based on the configuration. */ private object Nlp { - def apply[F[_]: Concurrent: Timer: BracketThrow]( cfg: TextAnalysisConfig.NlpConfig - ): F[Input => F[Vector[NerLabel]]] = + ): F[Input[F] => F[Vector[NerLabel]]] = cfg.mode match { - case NlpMode.Full => - PipelineCache.full(cfg.clearInterval).map(cache => full(cache)) - case NlpMode.Basic => - PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache)) case NlpMode.Disabled => - Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + Logger.log4s(logger).info("NLP is disabled as defined in config.") *> + Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + case _ => + PipelineCache(cfg.clearInterval)( + Annotator[F](cfg.mode), + Annotator.clearCaches[F] + ) + .map(annotate[F]) } - final case class Input(key: Ident, settings: StanfordNerSettings, text: String) + final case class Input[F[_]]( + key: Ident, + settings: NlpSettings, + logger: Logger[F], + text: String + ) - def full[F[_]: BracketThrow]( - cache: PipelineCache[F, StanfordCoreNLP] - )(input: Input): F[Vector[NerLabel]] = - StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) - - def basic[F[_]: BracketThrow]( - cache: PipelineCache[F, BasicCRFAnnotator.Annotator] - )(input: Input): F[Vector[NerLabel]] = - 
BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + def annotate[F[_]: BracketThrow]( + cache: PipelineCache[F] + )(input: Input[F]): F[Vector[NerLabel]] = + cache + .obtain(input.key.id, input.settings) + .use(ann => ann.nerAnnotate(input.logger)(input.text)) } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 90fcd8cd..5feb8b57 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -41,23 +41,30 @@ object DateFind { } object SimpleDate { - val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) => - List(SimpleDate(y, m, d)) + def pattern0(lang: Language) = (readYear >> readMonth(lang) >> readDay).map { + case ((y, m), d) => + List(SimpleDate(y, m, d)) } - val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) => - List(SimpleDate(y, m, d)) + def pattern1(lang: Language) = (readDay >> readMonth(lang) >> readYear).map { + case ((d, m), y) => + List(SimpleDate(y, m, d)) } - val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) => - List(SimpleDate(y, m, d)) + def pattern2(lang: Language) = (readMonth(lang) >> readDay >> readYear).map { + case ((m, d), y) => + List(SimpleDate(y, m, d)) } // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔ def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { + val p0 = pattern0(lang) + val p1 = pattern1(lang) + val p2 = pattern2(lang) val p = lang match { case Language.English => p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1) - case Language.German => p1.or(p0).or(p2) - case Language.French => p1.or(p0).or(p2) + case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) + case Language.Italian => p1.or(p0).or(p2) } p.read(parts) match { case Result.Success(sds, _) => @@ -76,9 +83,11 @@ object DateFind { } ) - def 
readMonth: Reader[Int] = + def readMonth(lang: Language): Reader[Int] = Reader.readFirst(w => - Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1) + Some(MonthName.getAll(lang).indexWhere(_.contains(w.value))) + .filter(_ >= 0) + .map(_ + 1) ) def readDay: Reader[Int] = @@ -150,20 +159,5 @@ object DateFind { Failure } } - - private val months = List( - List("jan", "january", "januar", "01"), - List("feb", "february", "februar", "02"), - List("mar", "march", "märz", "marz", "03"), - List("apr", "april", "04"), - List("may", "mai", "05"), - List("jun", "june", "juni", "06"), - List("jul", "july", "juli", "07"), - List("aug", "august", "08"), - List("sep", "september", "09"), - List("oct", "october", "oktober", "10"), - List("nov", "november", "11"), - List("dec", "december", "dezember", "12") - ) } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala new file mode 100644 index 00000000..cf61cd72 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -0,0 +1,101 @@ +package docspell.analysis.date + +import docspell.common.Language + +object MonthName { + + def getAll(lang: Language): List[List[String]] = + merge(numbers, forLang(lang)) + + private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] = + ns.foldLeft(n0) { (res, el) => + res.zip(el).map({ case (a, b) => a ++ b }) + } + + private def forLang(lang: Language): List[List[String]] = + lang match { + case Language.English => + english + case Language.German => + german + case Language.French => + french + case Language.Italian => + italian + } + + private val numbers = List( + List("01"), + List("02"), + List("03"), + List("04"), + List("05"), + List("06"), + List("07"), + List("08"), + List("09"), + List("10"), + List("11"), + List("12") + ) + + private val english = List( + List("jan", "january"), + List("feb", 
"february"), + List("mar", "march"), + List("apr", "april"), + List("may"), + List("jun", "june"), + List("jul", "july"), + List("aug", "august"), + List("sept", "september"), + List("oct", "october"), + List("nov", "november"), + List("dec", "december") + ) + + private val german = List( + List("jan", "januar"), + List("feb", "februar"), + List("märz"), + List("apr", "april"), + List("mai"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("dez", "dezember") + ) + + private val french = List( + List("janv", "janvier"), + List("févr", "fevr", "février", "fevrier"), + List("mars"), + List("avril"), + List("mai"), + List("juin"), + List("juil", "juillet"), + List("aout", "août"), + List("sept", "septembre"), + List("oct", "octobre"), + List("nov", "novembre"), + List("dec", "déc", "décembre", "decembre") + ) + + private val italian = List( + List("genn", "gennaio"), + List("febbr", "febbraio"), + List("mar", "marzo"), + List("apr", "aprile"), + List("magg", "maggio"), + List("giugno"), + List("luglio"), + List("ag", "agosto"), + List("sett", "settembre"), + List("ott", "ottobre"), + List("nov", "novembre"), + List("dic", "dicembre") + ) +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala new file mode 100644 index 00000000..d509805a --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala @@ -0,0 +1,98 @@ +package docspell.analysis.nlp + +import cats.effect.Sync +import cats.implicits._ +import cats.{Applicative, FlatMap} + +import docspell.analysis.NlpSettings +import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP + +/** Analyses a text to mark certain parts with a `NerLabel`. 
*/ +trait Annotator[F[_]] { self => + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] + + def ++(next: Annotator[F])(implicit F: FlatMap[F]): Annotator[F] = + new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + for { + n0 <- self.nerAnnotate(logger)(text) + n1 <- next.nerAnnotate(logger)(text) + } yield (n0 ++ n1).distinct + } +} + +object Annotator { + + /** Creates an annotator according to the given `mode` and `settings`. + * + * There are the following ways: + * + * - disabled: it returns a no-op annotator that always gives an empty list + * - full: the complete stanford pipeline is used + * - basic: only the ner classifier is used + * + * Additionally, if there is a regexNer-file specified, the regexner annotator is + * also run. In case the full pipeline is used, this is already included. + */ + def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] = + mode match { + case NlpMode.Disabled => + Annotator.none[F] + case NlpMode.Full => + StanfordNerSettings.fromNlpSettings(settings) match { + case Some(ss) => + Annotator.pipeline(StanfordNerAnnotator.makePipeline(ss)) + case None => + Annotator.none[F] + } + case NlpMode.Basic => + StanfordNerSettings.fromNlpSettings(settings) match { + case Some(StanfordNerSettings.Full(lang, _, Some(file))) => + Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang)) ++ + Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file)) + case Some(StanfordNerSettings.Full(lang, _, None)) => + Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang)) + case Some(StanfordNerSettings.RegexOnly(file)) => + Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file)) + case None => + Annotator.none[F] + } + case NlpMode.RegexOnly => + settings.regexNer match { + case Some(file) => + Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file)) + case None => + Annotator.none[F] + } + } + + def none[F[_]: Applicative]: Annotator[F] = + 
new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + logger.debug("Running empty annotator. NLP not supported.") *> + Vector.empty[NerLabel].pure[F] + } + + def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] = + new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + Sync[F].delay( + BasicCRFAnnotator.nerAnnotate(ann)(text) + ) + } + + def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] = + new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + Sync[F].delay(StanfordNerAnnotator.nerAnnotate(cp, text)) + + } + + def clearCaches[F[_]: Sync]: F[Unit] = + Sync[F].delay { + StanfordCoreNLP.clearAnnotatorPool() + BasicCRFAnnotator.Cache.clearCache() + } +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index a6fb6af0..76ffe7c6 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -7,9 +7,7 @@ import java.util.zip.GZIPInputStream import scala.jdk.CollectionConverters._ import scala.util.Using -import cats.Applicative -import cats.effect.BracketThrow - +import docspell.common.Language.NLPLanguage import docspell.common._ import edu.stanford.nlp.ie.AbstractSequenceClassifier @@ -30,14 +28,6 @@ object BasicCRFAnnotator { type Annotator = AbstractSequenceClassifier[CoreLabel] - def nerAnnotate[F[_]: BracketThrow]( - cacheKey: String, - cache: PipelineCache[F, Annotator] - )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = - cache - .obtain(cacheKey, settings) - .use(crf => Applicative[F].pure(nerAnnotate(crf)(text))) - def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] = nerClassifier .classify(text) @@ -52,7 +42,7 @@ object BasicCRFAnnotator { 
}) .toVector - private def makeClassifier(lang: Language): Annotator = { + def makeAnnotator(lang: NLPLanguage): Annotator = { logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") val ner = classifierResource(lang) Using(new GZIPInputStream(ner.openStream())) { in => @@ -60,7 +50,7 @@ object BasicCRFAnnotator { }.fold(throw _, identity) } - private def classifierResource(lang: Language): URL = { + private def classifierResource(lang: NLPLanguage): URL = { def check(name: String): URL = Option(getClass.getResource(name)) match { case None => @@ -79,11 +69,11 @@ object BasicCRFAnnotator { } final class Cache { - private[this] lazy val germanNerClassifier = makeClassifier(Language.German) - private[this] lazy val englishNerClassifier = makeClassifier(Language.English) - private[this] lazy val frenchNerClassifier = makeClassifier(Language.French) + private[this] lazy val germanNerClassifier = makeAnnotator(Language.German) + private[this] lazy val englishNerClassifier = makeAnnotator(Language.English) + private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French) - def forLang(language: Language): Annotator = + def forLang(language: NLPLanguage): Annotator = language match { case Language.French => frenchNerClassifier case Language.German => germanNerClassifier @@ -95,7 +85,7 @@ object BasicCRFAnnotator { private[this] val cacheRef = new AtomicReference[Cache](new Cache) - def getAnnotator(language: Language): Annotator = + def getAnnotator(language: NLPLanguage): Annotator = cacheRef.get().forLang(language) def clearCache(): Unit = diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 2b567548..3b38da22 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -3,14 +3,13 @@ package docspell.analysis.nlp 
import scala.concurrent.duration.{Duration => _, _} import cats.Applicative -import cats.data.Kleisli import cats.effect._ import cats.effect.concurrent.Ref import cats.implicits._ +import docspell.analysis.NlpSettings import docspell.common._ -import edu.stanford.nlp.pipeline.StanfordCoreNLP import org.log4s.getLogger /** Creating the StanfordCoreNLP pipeline is quite expensive as it @@ -20,58 +19,32 @@ import org.log4s.getLogger * * **This is an internal API** */ -trait PipelineCache[F[_], A] { +trait PipelineCache[F[_]] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] + def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] } object PipelineCache { private[this] val logger = getLogger - def none[F[_]: Applicative, A]( - creator: Kleisli[F, StanfordNerSettings, A] - ): PipelineCache[F, A] = - new PipelineCache[F, A] { - def obtain( - ignored: String, - settings: StanfordNerSettings - ): Resource[F, A] = - Resource.liftF(creator.run(settings)) - } - - def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)( - creator: StanfordNerSettings => A, + def apply[F[_]: Concurrent: Timer](clearInterval: Duration)( + creator: NlpSettings => Annotator[F], release: F[Unit] - ): F[PipelineCache[F, A]] = + ): F[PipelineCache[F]] = for { - data <- Ref.of(Map.empty[String, Entry[A]]) + data <- Ref.of(Map.empty[String, Entry[Annotator[F]]]) cacheClear <- CacheClearing.create(data, clearInterval, release) - } yield new Impl[F, A](data, creator, cacheClear) + _ <- Logger.log4s(logger).info("Creating nlp pipeline cache") + } yield new Impl[F](data, creator, cacheClear) - def full[F[_]: Concurrent: Timer]( - clearInterval: Duration - ): F[PipelineCache[F, StanfordCoreNLP]] = - apply(clearInterval)( - StanfordNerAnnotator.makePipeline, - StanfordNerAnnotator.clearPipelineCaches - ) - - def basic[F[_]: Concurrent: Timer]( - clearInterval: Duration - ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] = - apply(clearInterval)( - 
settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang), - Sync[F].delay(BasicCRFAnnotator.Cache.clearCache()) - ) - - final private class Impl[F[_]: Sync, A]( - data: Ref[F, Map[String, Entry[A]]], - creator: StanfordNerSettings => A, + final private class Impl[F[_]: Sync]( + data: Ref[F, Map[String, Entry[Annotator[F]]]], + creator: NlpSettings => Annotator[F], cacheClear: CacheClearing[F] - ) extends PipelineCache[F, A] { + ) extends PipelineCache[F] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] = + def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] = for { _ <- cacheClear.withCache id <- Resource.liftF(makeSettingsId(settings)) @@ -83,10 +56,10 @@ object PipelineCache { private def getOrCreate( key: String, id: String, - cache: Map[String, Entry[A]], - settings: StanfordNerSettings, - creator: StanfordNerSettings => A - ): (Map[String, Entry[A]], A) = + cache: Map[String, Entry[Annotator[F]]], + settings: NlpSettings, + creator: NlpSettings => Annotator[F] + ): (Map[String, Entry[Annotator[F]]], Annotator[F]) = cache.get(key) match { case Some(entry) => if (entry.id == id) (cache, entry.value) @@ -105,7 +78,7 @@ object PipelineCache { (cache.updated(key, e), nlp) } - private def makeSettingsId(settings: StanfordNerSettings): F[String] = { + private def makeSettingsId(settings: NlpSettings): F[String] = { val base = settings.copy(regexNer = None).toString val size: F[Long] = settings.regexNer match { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 46a614d1..75fe9d36 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -1,9 +1,11 @@ package docspell.analysis.nlp +import java.nio.file.Path import java.util.{Properties => JProps} import docspell.analysis.nlp.Properties.Implicits._ 
import docspell.common._ +import docspell.common.syntax.FileSyntax._ object Properties { @@ -17,18 +19,21 @@ object Properties { p } - def forSettings(settings: StanfordNerSettings): JProps = { - val regexNerFile = settings.regexNer - .map(p => p.normalize().toAbsolutePath().toString()) - settings.lang match { - case Language.German => - Properties.nerGerman(regexNerFile, settings.highRecall) - case Language.English => - Properties.nerEnglish(regexNerFile) - case Language.French => - Properties.nerFrench(regexNerFile, settings.highRecall) + def forSettings(settings: StanfordNerSettings): JProps = + settings match { + case StanfordNerSettings.Full(lang, highRecall, regexNer) => + val regexNerFile = regexNer.map(p => p.absolutePathAsString) + lang match { + case Language.German => + Properties.nerGerman(regexNerFile, highRecall) + case Language.English => + Properties.nerEnglish(regexNerFile) + case Language.French => + Properties.nerFrench(regexNerFile, highRecall) + } + case StanfordNerSettings.RegexOnly(path) => + Properties.regexNerOnly(path) } - } def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = Properties( @@ -76,6 +81,11 @@ object Properties { "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def regexNerOnly(regexNerMappingFile: Path): JProps = + Properties( + "annotators" -> "tokenize,ssplit" + ).withRegexNer(Some(regexNerMappingFile.absolutePathAsString)) + object Implicits { implicit final class JPropsOps(val p: JProps) extends AnyVal { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala index 37b54b40..2ec4e802 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala +++ 
b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala @@ -1,8 +1,9 @@ package docspell.analysis.nlp +import java.nio.file.Path + import scala.jdk.CollectionConverters._ -import cats.Applicative import cats.effect._ import docspell.common._ @@ -24,24 +25,24 @@ object StanfordNerAnnotator { * a new classifier must be created. It will then replace the * previous one. */ - def nerAnnotate[F[_]: BracketThrow]( - cacheKey: String, - cache: PipelineCache[F, StanfordCoreNLP] - )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = - cache - .obtain(cacheKey, settings) - .use(crf => Applicative[F].pure(nerAnnotate(crf, text))) - def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { val doc = new CoreDocument(text) nerClassifier.annotate(doc) doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = { - logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") - new StanfordCoreNLP(Properties.forSettings(settings)) - } + def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = + settings match { + case s: StanfordNerSettings.Full => + logger.info(s"Creating ${s.lang.name} Stanford NLP NER classifier...") + new StanfordCoreNLP(Properties.forSettings(settings)) + case StanfordNerSettings.RegexOnly(path) => + logger.info(s"Creating regexNer-only Stanford NLP NER classifier...") + regexNerPipeline(path) + } + + def regexNerPipeline(regexNerFile: Path): StanfordCoreNLP = + new StanfordCoreNLP(Properties.regexNerOnly(regexNerFile)) def clearPipelineCaches[F[_]: Sync]: F[Unit] = Sync[F].delay { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala index 06136a18..fd0a7ecd 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala +++ 
b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala @@ -2,25 +2,41 @@ package docspell.analysis.nlp import java.nio.file.Path -import docspell.common._ +import docspell.analysis.NlpSettings +import docspell.common.Language.NLPLanguage -/** Settings for configuring the stanford NER pipeline. - * - * The language is mandatory, only the provided ones are supported. - * The `highRecall` only applies for non-English languages. For - * non-English languages the english classifier is run as second - * classifier and if `highRecall` is true, then it will be used to - * tag untagged tokens. This may lead to a lot of false positives, - * but since English is omnipresent in other languages, too it - * depends on the use case for whether this is useful or not. - * - * The `regexNer` allows to specify a text file as described here: - * https://nlp.stanford.edu/software/regexner.html. This will be used - * as a last step to tag untagged tokens using the provided list of - * regexps. - */ -case class StanfordNerSettings( - lang: Language, - highRecall: Boolean, - regexNer: Option[Path] -) +sealed trait StanfordNerSettings + +object StanfordNerSettings { + + /** Settings for configuring the stanford NER pipeline. + * + * The language is mandatory, only the provided ones are supported. + * The `highRecall` only applies for non-English languages. For + * non-English languages the english classifier is run as second + * classifier and if `highRecall` is true, then it will be used to + * tag untagged tokens. This may lead to a lot of false positives, + * but since English is omnipresent in other languages, too it + * depends on the use case for whether this is useful or not. + * + * The `regexNer` allows to specify a text file as described here: + * https://nlp.stanford.edu/software/regexner.html. This will be used + * as a last step to tag untagged tokens using the provided list of + * regexps. 
+ */ + case class Full( + lang: NLPLanguage, + highRecall: Boolean, + regexNer: Option[Path] + ) extends StanfordNerSettings + + /** Not all languages are supported with predefined statistical models. This allows to provide regexps only. + */ + case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings + + def fromNlpSettings(ns: NlpSettings): Option[StanfordNerSettings] = + NLPLanguage.all + .find(nl => nl == ns.lang) + .map(nl => Full(nl, ns.highRecall, ns.regexNer)) + .orElse(ns.regexNer.map(nrf => RegexOnly(nrf))) +} diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala index 0abab7e9..2f0cab57 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -1,12 +1,13 @@ package docspell.analysis.nlp +import docspell.common.Language.NLPLanguage import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ object BaseCRFAnnotatorSuite extends SimpleTestSuite { - def annotate(language: Language): String => Vector[NerLabel] = + def annotate(language: NLPLanguage): String => Vector[NerLabel] = BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language)) test("find english ner labels") { diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala index 1704ef1b..416cdff7 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala @@ -1,8 +1,12 @@ package docspell.analysis.nlp +import java.nio.file.Paths + +import cats.effect.IO import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ 
+import docspell.common.syntax.FileSyntax._ import edu.stanford.nlp.pipeline.StanfordCoreNLP object StanfordNerAnnotatorSuite extends SimpleTestSuite { @@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite { assertEquals(labels, expect) StanfordCoreNLP.clearAnnotatorPool() } + + test("regexner-only annotator") { + val regexNerContent = + s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |""".stripMargin + + File + .withTempDir[IO](Paths.get("target"), "test-regex-ner") + .use { dir => + for { + out <- File.writeString[IO](dir / "regex.txt", regexNerContent) + ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out)) + labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.") + _ <- IO( + assertEquals( + labels, + Vector( + NerLabel("Andrea", NerTag.Person, 6, 12), + NerLabel("Rossi", NerTag.Person, 13, 18) + ) + ) + ) + } yield () + } + .unsafeRunSync() + StanfordCoreNLP.clearAnnotatorPool() + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 92c32f4b..f18d4adf 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -1,5 +1,7 @@ package docspell.common +import cats.data.NonEmptyList + import io.circe.{Decoder, Encoder} sealed trait Language { self: Product => @@ -11,28 +13,41 @@ sealed trait Language { self: Product => def iso3: String + val allowsNLP: Boolean = false + private[common] def allNames = Set(name, iso3, iso2) } object Language { + sealed trait NLPLanguage extends Language with Product 
{ + override val allowsNLP = true + } + object NLPLanguage { + val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French) + } - case object German extends Language { + case object German extends NLPLanguage { val iso2 = "de" val iso3 = "deu" } - case object English extends Language { + case object English extends NLPLanguage { val iso2 = "en" val iso3 = "eng" } - case object French extends Language { + case object French extends NLPLanguage { val iso2 = "fr" val iso3 = "fra" } - val all: List[Language] = List(German, English, French) + case object Italian extends Language { + val iso2 = "it" + val iso3 = "ita" + } + + val all: List[Language] = List(German, English, French, Italian) def fromString(str: String): Either[String, Language] = { val lang = str.toLowerCase diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala index 36ebf7db..013b2275 100644 --- a/modules/common/src/main/scala/docspell/common/NlpMode.scala +++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala @@ -6,16 +6,18 @@ sealed trait NlpMode { self: Product => self.productPrefix } object NlpMode { - case object Full extends NlpMode - case object Basic extends NlpMode - case object Disabled extends NlpMode + case object Full extends NlpMode + case object Basic extends NlpMode + case object RegexOnly extends NlpMode + case object Disabled extends NlpMode def fromString(name: String): Either[String, NlpMode] = name.toLowerCase match { - case "full" => Right(Full) - case "basic" => Right(Basic) - case "disabled" => Right(Disabled) - case _ => Left(s"Unknown nlp-mode: $name") + case "full" => Right(Full) + case "basic" => Right(Basic) + case "regexonly" => Right(RegexOnly) + case "disabled" => Right(Disabled) + case _ => Left(s"Unknown nlp-mode: $name") } def unsafeFromString(name: String): NlpMode = diff --git a/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala 
b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala new file mode 100644 index 00000000..6eef143b --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala @@ -0,0 +1,20 @@ +package docspell.common.syntax + +import java.nio.file.Path + +trait FileSyntax { + + implicit final class PathOps(p: Path) { + + def absolutePath: Path = + p.normalize().toAbsolutePath + + def absolutePathAsString: String = + absolutePath.toString + + def /(next: String): Path = + p.resolve(next) + } +} + +object FileSyntax extends FileSyntax diff --git a/modules/common/src/main/scala/docspell/common/syntax/package.scala b/modules/common/src/main/scala/docspell/common/syntax/package.scala index 77e17039..8d512741 100644 --- a/modules/common/src/main/scala/docspell/common/syntax/package.scala +++ b/modules/common/src/main/scala/docspell/common/syntax/package.scala @@ -2,6 +2,11 @@ package docspell.common package object syntax { - object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax + object all + extends EitherSyntax + with StreamSyntax + with StringSyntax + with LoggerSyntax + with FileSyntax } diff --git a/modules/files/src/test/resources/examples/letter-ita.txt b/modules/files/src/test/resources/examples/letter-ita.txt new file mode 100644 index 00000000..cca09122 --- /dev/null +++ b/modules/files/src/test/resources/examples/letter-ita.txt @@ -0,0 +1,13 @@ +Pontremoli, 9 aprile 2013 + +Spettabile Villa Albicocca +Via Francigena, 9 +55100 Pontetetto (LU) + +Oggetto: Prenotazione + +Gentile Direttore, + +Vorrei prenotare una camera matrimoniale ……. 
+ +In attesa di una Sua pronta risposta, La saluto cordialmente diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 2306a44d..345f4665 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -24,6 +24,7 @@ object Field { val content_de = Field("content_de") val content_en = Field("content_en") val content_fr = Field("content_fr") + val content_it = Field("content_it") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -36,6 +37,8 @@ object Field { Field.content_en case Language.French => Field.content_fr + case Language.Italian => + Field.content_it } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index 1e3b09b3..0b7e6e31 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -40,6 +40,7 @@ object SolrQuery { Field.content_de, Field.content_en, Field.content_fr, + Field.content_it, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 3deba577..769919bd 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -63,6 +63,12 @@ object SolrSetup { solrEngine, "Index all from database", FtsMigration.Result.indexAll.pure[F] + ), + FtsMigration[F]( + 7, + solrEngine, + "Add content_it field", + addContentItField.map(_ => FtsMigration.Result.reIndexAll) ) ) @@ -72,6 +78,9 @@ object SolrSetup { def addContentFrField: F[Unit] = 
addTextField(Some(Language.French))(Field.content_fr) + def addContentItField: F[Unit] = + addTextField(Some(Language.Italian))(Field.content_it) + def setupCoreSchema: F[Unit] = { val cmds0 = List( @@ -90,13 +99,15 @@ object SolrSetup { ) .traverse(addTextField(None)) - val cntLang = Language.all.traverse { + val cntLang = List(Language.German, Language.English, Language.French).traverse { case l @ Language.German => addTextField(l.some)(Field.content_de) case l @ Language.English => addTextField(l.some)(Field.content_en) case l @ Language.French => addTextField(l.some)(Field.content_fr) + case _ => + ().pure[F] } cmds0 *> cmds1 *> cntLang *> ().pure[F] @@ -125,6 +136,9 @@ object SolrSetup { case Some(Language.French) => run(DeleteField.command(DeleteField(field))).attempt *> run(AddField.command(AddField.textFR(field))) + case Some(Language.Italian) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textIT(field))) } } } @@ -161,6 +175,9 @@ object SolrSetup { def textFR(field: Field): AddField = AddField(field, "text_fr", true, true, false) + + def textIT(field: Field): AddField = + AddField(field, "text_it", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 583b40b1..a495ea5a 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -277,7 +277,39 @@ docspell.joex { # files. working-dir = ${java.io.tmpdir}"/docspell-analysis" - nlp-config { + nlp { + # The mode for configuring NLP models: + # + # 1. full – builds the complete pipeline + # 2. basic - builds only the ner annotator + # 3. regexonly - matches each entry in your address book via regexps + # 4. 
disabled - doesn't use any stanford-nlp feature + # + # The full and basic variants rely on pre-build language models + # that are available for only 3 lanugages at the moment: German, + # English and French. + # + # Memory usage varies greatly among the languages. German has + # quite large models, that require about 1G heap. So joex should + # run with -Xmx1500M at least when using mode=full. + # + # The basic variant does a quite good job for German and + # English. It might be worse for French, always depending on the + # type of text that is analysed. Joex should run with about 600M + # heap, here again lanugage German uses the most. + # + # The regexonly variant doesn't depend on a language. It roughly + # works by converting all entries in your addressbook into + # regexps and matches each one against the text. This can get + # memory intensive, too, when the addressbook grows large. This + # is included in the full and basic by default, but can be used + # independently by setting mode=regexner. + # + # When mode=disabled, then the whole nlp pipeline is disabled, + # and you won't get any suggestions. Only what the classifier + # returns (if enabled). + mode = full + # The StanfordCoreNLP library caches language models which # requires quite some amount of memory. Setting this interval to a # positive duration, the cache is cleared after this amount of @@ -287,37 +319,28 @@ docspell.joex { # This has only any effect, if mode != disabled. clear-interval = "15 minutes" - # The mode for configuring NLP models. Currently 3 are available: - # - # 1. full – builds the complete pipeline, run with -Xmx1500M or more - # 2. basic - builds only the ner annotator, run with -Xmx600M or more - # 3. disabled - doesn't use any stanford-nlp feature - # - # The basic variant does a quite good job for German and - # English. It might be worse for French, always depending on the - # type of text that is analysed. 
- mode = full - } + regex-ner { + # Whether to enable custom NER annotation. This uses the + # address book of a collective as input for NER tagging (to + # automatically find correspondent and concerned entities). If + # the address book is large, this can be quite memory + # intensive and also makes text analysis much slower. But it + # improves accuracy and can be used independent of the + # lanugage. If this is set to 0, it is effectively disabled + # and NER tagging uses only statistical models (that also work + # quite well, but are restricted to the languages mentioned + # above). + # + # Note, this is only relevant if nlp-config.mode is not + # "disabled". + max-entries = 1000 - regex-ner { - # Whether to enable custom NER annotation. This uses the address - # book of a collective as input for NER tagging (to automatically - # find correspondent and concerned entities). If the address book - # is large, this can be quite memory intensive and also makes text - # analysis slower. But it greatly improves accuracy. If this is - # false, NER tagging uses only statistical models (that also work - # quite well). - # - # This setting might be moved to the collective settings in the - # future. - # - # Note, this is only relevant if nlp-config.mode = full. - enabled = true - - # The NER annotation uses a file of patterns that is derived from - # a collective's address book. This is is the time how long this - # file will be kept until a check for a state change is done. - file-cache-time = "1 minute" + # The NER annotation uses a file of patterns that is derived + # from a collective's address book. This is is the time how + # long this data will be kept until a check for a state change + # is done. + file-cache-time = "1 minute" + } } # Settings for doing document classification. 
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 5b2bccc5..4ad72d7c 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -60,15 +60,14 @@ object Config { case class TextAnalysis( maxLength: Int, workingDir: Path, - nlpConfig: TextAnalysisConfig.NlpConfig, - regexNer: RegexNer, + nlp: NlpConfig, classification: Classification ) { def textAnalysisConfig: TextAnalysisConfig = TextAnalysisConfig( maxLength, - nlpConfig, + TextAnalysisConfig.NlpConfig(nlp.clearInterval, nlp.mode), TextClassifierConfig( workingDir, NonEmptyList @@ -78,10 +77,16 @@ object Config { ) def regexNerFileConfig: RegexNerFile.Config = - RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) + RegexNerFile.Config( + nlp.regexNer.maxEntries, + workingDir, + nlp.regexNer.fileCacheTime + ) } - case class RegexNer(enabled: Boolean, fileCacheTime: Duration) + case class NlpConfig(mode: NlpMode, clearInterval: Duration, regexNer: RegexNer) + + case class RegexNer(maxEntries: Int, fileCacheTime: Duration) case class Classification( enabled: Boolean, diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala index 24e7f6ae..56e48012 100644 --- a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala +++ b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala @@ -29,7 +29,7 @@ trait RegexNerFile[F[_]] { object RegexNerFile { private[this] val logger = getLogger - case class Config(enabled: Boolean, directory: Path, minTime: Duration) + case class Config(maxEntries: Int, directory: Path, minTime: Duration) def apply[F[_]: Concurrent: ContextShift]( cfg: Config, @@ -49,7 +49,7 @@ object RegexNerFile { ) extends RegexNerFile[F] { def makeFile(collective: Ident): F[Option[Path]] = - if (cfg.enabled) 
doMakeFile(collective) + if (cfg.maxEntries > 0) doMakeFile(collective) else (None: Option[Path]).pure[F] def doMakeFile(collective: Ident): F[Option[Path]] = @@ -127,7 +127,7 @@ object RegexNerFile { for { _ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'") - names <- store.transact(QCollective.allNames(collective)) + names <- store.transact(QCollective.allNames(collective, cfg.maxEntries)) nerFile = NerFile(collective, lastUpdate, now) _ <- update(nerFile, NerFile.mkNerConfig(names)) } yield nerFile diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 1fd2401a..f336132d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -4,9 +4,8 @@ import cats.data.OptionT import cats.effect._ import cats.implicits._ -import docspell.analysis.TextAnalyser import docspell.analysis.classifier.{ClassifierModel, TextClassifier} -import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile @@ -54,7 +53,7 @@ object TextAnalysis { analyser: TextAnalyser[F], nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordNerSettings(ctx.args.meta.language, false, None) + val settings = NlpSettings(ctx.args.meta.language, false, None) for { customNer <- nerFile.makeFile(ctx.args.meta.collective) sett = settings.copy(regexNer = customNer) diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index b9fe40c7..84caa840 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ 
b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -1,10 +1,8 @@ package docspell.store.queries -import cats.data.OptionT import fs2.Stream -import docspell.common.ContactKind -import docspell.common.{Direction, Ident} +import docspell.common._ import docspell.store.qb.DSL._ import docspell.store.qb._ import docspell.store.records._ @@ -17,6 +15,7 @@ object QCollective { private val t = RTag.as("t") private val ro = ROrganization.as("o") private val rp = RPerson.as("p") + private val re = REquipment.as("e") private val rc = RContact.as("c") private val i = RItem.as("i") @@ -25,13 +24,37 @@ object QCollective { val empty = Names(Vector.empty, Vector.empty, Vector.empty) } - def allNames(collective: Ident): ConnectionIO[Names] = - (for { - orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name)) - pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name)) - equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name)) - } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))) - .getOrElse(Names.empty) + def allNames(collective: Ident, maxEntries: Int): ConnectionIO[Names] = { + val created = Column[Timestamp]("created", TableDef("")) + union( + Select( + select(ro.name.s, lit(1).as("kind"), ro.created.as(created)), + from(ro), + ro.cid === collective + ), + Select( + select(rp.name.s, lit(2).as("kind"), rp.created.as(created)), + from(rp), + rp.cid === collective + ), + Select( + select(re.name.s, lit(3).as("kind"), re.created.as(created)), + from(re), + re.cid === collective + ) + ).orderBy(created.desc) + .limit(Batch.limit(maxEntries)) + .build + .query[(String, Int)] + .streamWithChunkSize(maxEntries) + .fold(Names.empty) { case (names, (name, kind)) => + if (kind == 1) names.copy(org = names.org :+ name) + else if (kind == 2) names.copy(pers = names.pers :+ name) + else names.copy(equip = names.equip :+ name) + } + .compile + .lastOrError + } case class InsightData( incoming: Int, diff --git 
a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 40fe5eb2..c7e04b7b 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -11,6 +11,7 @@ type Language = German | English | French + | Italian fromString : String -> Maybe Language @@ -24,6 +25,8 @@ fromString str = else if str == "fra" || str == "fr" || str == "french" then Just French + else if str == "ita" || str == "it" || str == "italian" then + Just Italian else Nothing @@ -40,6 +43,9 @@ toIso3 lang = French -> "fra" + Italian -> + "ita" + toName : Language -> String toName lang = @@ -53,7 +59,10 @@ toName lang = French -> "French" + Italian -> + "Italian" + all : List Language all = - [ German, English, French ] + [ German, English, French, Italian ] diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 373a6aed..aae8d835 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -98,9 +98,13 @@ let }; text-analysis = { max-length = 10000; - regex-ner = { - enabled = true; - file-cache-time = "1 minute"; + nlp = { + mode = "full"; + clear-interval = "15 minutes"; + regex-ner = { + max-entries = 1000; + file-cache-time = "1 minute"; + }; }; classification = { enabled = true; @@ -118,7 +122,6 @@ let ]; }; working-dir = "/tmp/docspell-analysis"; - clear-stanford-nlp-interval = "15 minutes"; }; processing = { max-due-date-years = 10; @@ -772,47 +775,96 @@ in { files. ''; }; - clear-stanford-nlp-interval = mkOption { - type = types.str; - default = defaults.text-analysis.clear-stanford-nlp-interval; - description = '' - Idle time after which the NLP caches are cleared to free - memory. If <= 0 clearing the cache is disabled. 
- ''; - }; - regex-ner = mkOption { + nlp = mkOption { type = types.submodule({ options = { - enabled = mkOption { - type = types.bool; - default = defaults.text-analysis.regex-ner.enabled; + mode = mkOption { + type = types.str; + default = defaults.text-analysis.nlp.mode; description = '' - Whether to enable custom NER annotation. This uses the address - book of a collective as input for NER tagging (to automatically - find correspondent and concerned entities). If the address book - is large, this can be quite memory intensive and also makes text - analysis slower. But it greatly improves accuracy. If this is - false, NER tagging uses only statistical models (that also work - quite well). + The mode for configuring NLP models: - This setting might be moved to the collective settings in the - future. + 1. full – builds the complete pipeline + 2. basic - builds only the ner annotator + 3. regexonly - matches each entry in your address book via regexps + 4. disabled - doesn't use any stanford-nlp feature + + The full and basic variants rely on pre-build language models + that are available for only 3 lanugages at the moment: German, + English and French. + + Memory usage varies greatly among the languages. German has + quite large models, that require about 1G heap. So joex should + run with -Xmx1500M at least when using mode=full. + + The basic variant does a quite good job for German and + English. It might be worse for French, always depending on the + type of text that is analysed. Joex should run with about 600M + heap, here again lanugage German uses the most. + + The regexonly variant doesn't depend on a language. It roughly + works by converting all entries in your addressbook into + regexps and matches each one against the text. This can get + memory intensive, too, when the addressbook grows large. This + is included in the full and basic by default, but can be used + independently by setting mode=regexner. 
+ + When mode=disabled, then the whole nlp pipeline is disabled, + and you won't get any suggestions. Only what the classifier + returns (if enabled). ''; }; - file-cache-time = mkOption { + + clear-interval = mkOption { type = types.str; - default = defaults.text-analysis.ner-file-cache-time; + default = defaults.text-analysis.nlp.clear-interval; description = '' - The NER annotation uses a file of patterns that is derived from - a collective's address book. This is is the time how long this - file will be kept until a check for a state change is done. + Idle time after which the NLP caches are cleared to free + memory. If <= 0 clearing the cache is disabled. ''; }; + + regex-ner = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.int; + default = defaults.text-analysis.regex-ner.max-entries; + description = '' + Whether to enable custom NER annotation. This uses the + address book of a collective as input for NER tagging (to + automatically find correspondent and concerned entities). If + the address book is large, this can be quite memory + intensive and also makes text analysis much slower. But it + improves accuracy and can be used independent of the + lanugage. If this is set to 0, it is effectively disabled + and NER tagging uses only statistical models (that also work + quite well, but are restricted to the languages mentioned + above). + + Note, this is only relevant if nlp-config.mode is not + "disabled". + ''; + }; + file-cache-time = mkOption { + type = types.str; + default = defaults.text-analysis.ner-file-cache-time; + description = '' + The NER annotation uses a file of patterns that is derived from + a collective's address book. This is is the time how long this + file will be kept until a check for a state change is done. 
+ ''; + }; + }; + }); + default = defaults.text-analysis.nlp.regex-ner; + description = ""; + }; }; }); - default = defaults.text-analysis.regex-ner; - description = ""; + default = defaults.text-analysis.nlp; + description = "Configure NLP"; }; classification = mkOption { From ff121d462ced2589b0d671d48c27cb135928ca98 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 01:15:21 +0100 Subject: [PATCH 10/38] Disable memory intensive tests on travis --- .../src/test/scala/docspell/analysis/Env.scala | 12 ++++++++++++ .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 9 +++++++++ .../analysis/nlp/StanfordNerAnnotatorSuite.scala | 13 +++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 modules/analysis/src/test/scala/docspell/analysis/Env.scala diff --git a/modules/analysis/src/test/scala/docspell/analysis/Env.scala b/modules/analysis/src/test/scala/docspell/analysis/Env.scala new file mode 100644 index 00000000..dec26074 --- /dev/null +++ b/modules/analysis/src/test/scala/docspell/analysis/Env.scala @@ -0,0 +1,12 @@ +package docspell.analysis + +object Env { + + def isCI = bool("CI") + + def bool(key: String): Boolean = + string(key).contains("true") + + def string(key: String): Option[String] = + Option(System.getenv(key)).filter(_.nonEmpty) +} diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala index 2f0cab57..29b3b966 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -1,5 +1,6 @@ package docspell.analysis.nlp +import docspell.analysis.Env import docspell.common.Language.NLPLanguage import minitest.SimpleTestSuite import docspell.files.TestFiles @@ -11,6 +12,10 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { 
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language)) test("find english ner labels") { + if (Env.isCI) { + ignore("Test ignored on travis.") + } + val labels = annotate(Language.English)(TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), @@ -47,6 +52,10 @@ object BaseCRFAnnotatorSuite extends SimpleTestSuite { } test("find german ner labels") { + if (Env.isCI) { + ignore("Test ignored on travis.") + } + val labels = annotate(Language.German)(TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala index 416cdff7..91ab7a39 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala @@ -3,6 +3,7 @@ package docspell.analysis.nlp import java.nio.file.Paths import cats.effect.IO +import docspell.analysis.Env import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ @@ -16,6 +17,10 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite { new StanfordCoreNLP(Properties.nerEnglish(None)) test("find english ner labels") { + if (Env.isCI) { + ignore("Test ignored on travis.") + } + val labels = StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText) val expect = Vector( @@ -53,6 +58,10 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite { } test("find german ner labels") { + if (Env.isCI) { + ignore("Test ignored on travis.") + } + val labels = StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText) val expect = Vector( @@ -74,6 +83,10 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite { } test("regexner-only annotator") { + if (Env.isCI) { + ignore("Test ignored on travis.") + } + val 
regexNerContent = s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 From 360cad3304af85fa56fccc20605a4d481c1cf6fe Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 11:19:27 +0100 Subject: [PATCH 11/38] Refactoring solr/fts migration When re-indexing everything, skip intermediate populating the index and do this as the very last step. Parameterize adding new fields by their language. --- .../docspell/ftsclient/FtsMigration.scala | 11 ++++++- .../scala/docspell/ftssolr/SolrSetup.scala | 30 ++++--------------- .../scala/docspell/joex/fts/FtsWork.scala | 20 +++++++++---- .../scala/docspell/joex/fts/Migration.scala | 5 ++++ .../scala/docspell/joex/fts/ReIndexTask.scala | 2 +- .../scala/docspell/joex/fts/package.scala | 3 ++ 6 files changed, 40 insertions(+), 31 deletions(-) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsMigration.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsMigration.scala index 3e8fae4e..22858c19 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsMigration.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsMigration.scala @@ -1,5 +1,8 @@ package docspell.ftsclient +import cats.Functor +import cats.implicits._ + import docspell.common._ final case class FtsMigration[F[_]]( @@ -7,7 +10,13 @@ final case class FtsMigration[F[_]]( engine: Ident, description: String, task: F[FtsMigration.Result] -) +) { + + def changeResult(f: FtsMigration.Result => FtsMigration.Result)(implicit + F: Functor[F] + ): FtsMigration[F] = + copy(task = task.map(f)) +} object FtsMigration { diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 769919bd..fb31d912 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ 
b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -126,19 +126,10 @@ object SolrSetup { lang match { case None => run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.text(field))) - case Some(Language.German) => + run(AddField.command(AddField.textGeneral(field))) + case Some(lang) => run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.textDE(field))) - case Some(Language.English) => - run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.textEN(field))) - case Some(Language.French) => - run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.textFR(field))) - case Some(Language.Italian) => - run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.textIT(field))) + run(AddField.command(AddField.textLang(field, lang))) } } } @@ -164,20 +155,11 @@ object SolrSetup { def string(field: Field): AddField = AddField(field, "string", true, true, false) - def text(field: Field): AddField = + def textGeneral(field: Field): AddField = AddField(field, "text_general", true, true, false) - def textDE(field: Field): AddField = - AddField(field, "text_de", true, true, false) - - def textEN(field: Field): AddField = - AddField(field, "text_en", true, true, false) - - def textFR(field: Field): AddField = - AddField(field, "text_fr", true, true, false) - - def textIT(field: Field): AddField = - AddField(field, "text_it", true, true, false) + def textLang(field: Field, lang: Language): AddField = + AddField(field, s"text_${lang.iso2}", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala b/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala index 88369f9f..7ddfa99d 100644 --- a/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala +++ b/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala @@ -14,16 
+14,26 @@ object FtsWork { def apply[F[_]](f: FtsContext[F] => F[Unit]): FtsWork[F] = Kleisli(f) - def allInitializeTasks[F[_]: Monad]: FtsWork[F] = - FtsWork[F](_ => ().pure[F]).tap[FtsContext[F]].flatMap { ctx => - NonEmptyList.fromList(ctx.fts.initialize.map(fm => from[F](fm.task))) match { + /** Runs all migration tasks unconditionally and inserts all data as last step. */ + def reInitializeTasks[F[_]: Monad]: FtsWork[F] = + FtsWork { ctx => + val migrations = + ctx.fts.initialize.map(fm => fm.changeResult(_ => FtsMigration.Result.workDone)) + + NonEmptyList.fromList(migrations) match { case Some(nel) => - nel.reduce(semigroup[F]) + nel + .map(fm => from[F](fm.task)) + .append(insertAll[F](None)) + .reduce(semigroup[F]) + .run(ctx) case None => - FtsWork[F](_ => ().pure[F]) + ().pure[F] } } + /** + */ def from[F[_]: FlatMap: Applicative](t: F[FtsMigration.Result]): FtsWork[F] = Kleisli.liftF(t).flatMap(transformResult[F]) diff --git a/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala index c47d4308..5ad9d028 100644 --- a/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala +++ b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala @@ -11,6 +11,11 @@ import docspell.joex.Config import docspell.store.records.RFtsMigration import docspell.store.{AddResult, Store} +/** Migrating the index from the previous version to this version. + * + * The sql database stores the outcome of a migration task. If this + * task has already been applied, it is skipped. 
+ */ case class Migration[F[_]]( version: Int, engine: Ident, diff --git a/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala b/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala index c1d794e4..5dd45943 100644 --- a/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala @@ -46,6 +46,6 @@ object ReIndexTask { FtsWork.log[F](_.info("Clearing data failed. Continue re-indexing.")) ) ++ FtsWork.log[F](_.info("Running index initialize")) ++ - FtsWork.allInitializeTasks[F] + FtsWork.reInitializeTasks[F] }) } diff --git a/modules/joex/src/main/scala/docspell/joex/fts/package.scala b/modules/joex/src/main/scala/docspell/joex/fts/package.scala index 784754ab..7cf8de80 100644 --- a/modules/joex/src/main/scala/docspell/joex/fts/package.scala +++ b/modules/joex/src/main/scala/docspell/joex/fts/package.scala @@ -4,6 +4,9 @@ import cats.data.Kleisli package object fts { + /** Some work that must be done to advance the schema of the fulltext + * index. 
+ */ type FtsWork[F[_]] = Kleisli[F, FtsContext[F], Unit] } From 26dff18ae0d32ce2b32b4d11ce381ada0e99314f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 12:18:23 +0100 Subject: [PATCH 12/38] Add spanish as an example Adding a new language without nlp requires now only to fill out the pieces: - define a list of month names to support date recognition - add it to joex' dockerfile to be available for tesseract - update the solr migration/field definitions - update the elm file so it shows up on the client --- docker/joex-base.dockerfile | 1 + .../scala/docspell/analysis/date/DateFind.scala | 1 + .../docspell/analysis/date/MonthName.scala | 17 +++++++++++++++++ .../main/scala/docspell/common/Language.scala | 7 ++++++- .../src/main/scala/docspell/ftssolr/Field.scala | 3 +++ .../main/scala/docspell/ftssolr/SolrQuery.scala | 1 + .../main/scala/docspell/ftssolr/SolrSetup.scala | 8 ++++++++ modules/webapp/src/main/elm/Data/Language.elm | 13 ++++++++++++- 8 files changed, 49 insertions(+), 2 deletions(-) diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index 8ebad224..b9b160ed 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -16,6 +16,7 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ tesseract-ocr-data-ita \ + tesseract-ocr-data-spa \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 5feb8b57..438bff85 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -65,6 +65,7 @@ object DateFind { case Language.German => p1.or(p0).or(p2) case Language.French => p1.or(p0).or(p2) case Language.Italian => p1.or(p0).or(p2) + case Language.Spanish => p1.or(p0).or(p2) } p.read(parts) match { case Result.Success(sds, _) => 
diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index cf61cd72..503e15e4 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -22,6 +22,8 @@ object MonthName { french case Language.Italian => italian + case Language.Spanish => + spanish } private val numbers = List( @@ -98,4 +100,19 @@ object MonthName { List("nov", "novembre"), List("dic", "dicembre") ) + + private val spanish = List( + List("ene", "enero"), + List("feb", "febrero"), + List("mar", "marzo"), + List("abr", "abril"), + List("may", "mayo"), + List("jun"), + List("jul"), + List("ago", "agosto"), + List("sep", "septiembre"), + List("oct", "octubre"), + List("nov", "noviembre"), + List("dic", "diciembre") + ) } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index f18d4adf..3a39dd11 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -47,7 +47,12 @@ object Language { val iso3 = "ita" } - val all: List[Language] = List(German, English, French, Italian) + case object Spanish extends Language { + val iso2 = "es" + val iso3 = "spa" + } + + val all: List[Language] = List(German, English, French, Italian, Spanish) def fromString(str: String): Either[String, Language] = { val lang = str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 345f4665..596d817e 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -25,6 +25,7 @@ object Field { val content_en = Field("content_en") val content_fr = Field("content_fr") val content_it = 
Field("content_it") + val content_es = Field("content_es") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -39,6 +40,8 @@ object Field { Field.content_fr case Language.Italian => Field.content_it + case Language.Spanish => + Field.content_es } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index 0b7e6e31..c6a1fd82 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -41,6 +41,7 @@ object SolrQuery { Field.content_en, Field.content_fr, Field.content_it, + Field.content_es, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index fb31d912..766b9c48 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -69,6 +69,14 @@ object SolrSetup { solrEngine, "Add content_it field", addContentItField.map(_ => FtsMigration.Result.reIndexAll) + ), + FtsMigration[F]( + 8, + solrEngine, + "Add content_es field", + addTextField(Some(Language.Spanish))(Field.content_es).map(_ => + FtsMigration.Result.reIndexAll + ) ) ) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index c7e04b7b..f6c1caee 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -12,6 +12,7 @@ type Language | English | French | Italian + | Spanish fromString : String -> Maybe Language @@ -27,6 +28,10 @@ fromString str = else if str == "ita" || str == "it" || str == "italian" then Just Italian + + else if str == "spa" || str == "es" || str == "spanish" then + Just Spanish + 
else Nothing @@ -46,6 +51,9 @@ toIso3 lang = Italian -> "ita" + Spanish -> + "spa" + toName : Language -> String toName lang = @@ -62,7 +70,10 @@ toName lang = Italian -> "Italian" + Spanish -> + "Spanish" + all : List Language all = - [ German, English, French, Italian ] + [ German, English, French, Italian, Spanish ] From 94bb18c152e61ffacecb2c0b50ee937e2f0249da Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 20:21:42 +0100 Subject: [PATCH 13/38] Refactor solr language fields --- .../main/scala/docspell/ftssolr/Field.scala | 24 ++++++------------- .../scala/docspell/ftssolr/SolrQuery.scala | 7 +----- .../scala/docspell/ftssolr/SolrSetup.scala | 17 +++++-------- 3 files changed, 14 insertions(+), 34 deletions(-) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 596d817e..a10ca0e8 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -21,28 +21,18 @@ object Field { val discriminator = Field("discriminator") val attachmentName = Field("attachmentName") val content = Field("content") - val content_de = Field("content_de") - val content_en = Field("content_en") - val content_fr = Field("content_fr") - val content_it = Field("content_it") - val content_es = Field("content_es") + val content_de = contentField(Language.German) + val content_en = contentField(Language.English) + val content_fr = contentField(Language.French) val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") + val contentLangFields = Language.all + .map(contentField) + def contentField(lang: Language): Field = - lang match { - case Language.German => - Field.content_de - case Language.English => - Field.content_en - case Language.French => - Field.content_fr - case Language.Italian => - Field.content_it - case Language.Spanish => - Field.content_es - } + 
Field(s"content_${lang.iso2}") implicit val jsonEncoder: Encoder[Field] = Encoder.encodeString.contramap(_.name) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index c6a1fd82..ae286220 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -37,15 +37,10 @@ object SolrQuery { cfg, List( Field.content, - Field.content_de, - Field.content_en, - Field.content_fr, - Field.content_it, - Field.content_es, Field.itemName, Field.itemNotes, Field.attachmentName - ), + ) ++ Field.contentLangFields, List( Field.id, Field.itemId, diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 766b9c48..63b90db9 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -56,7 +56,7 @@ object SolrSetup { 5, solrEngine, "Add content_fr field", - addContentFrField.map(_ => FtsMigration.Result.workDone) + addContentField(Language.French).map(_ => FtsMigration.Result.workDone) ), FtsMigration[F]( 6, @@ -68,27 +68,19 @@ object SolrSetup { 7, solrEngine, "Add content_it field", - addContentItField.map(_ => FtsMigration.Result.reIndexAll) + addContentField(Language.Italian).map(_ => FtsMigration.Result.reIndexAll) ), FtsMigration[F]( 8, solrEngine, "Add content_es field", - addTextField(Some(Language.Spanish))(Field.content_es).map(_ => - FtsMigration.Result.reIndexAll - ) + addContentField(Language.Spanish).map(_ => FtsMigration.Result.reIndexAll) ) ) def addFolderField: F[Unit] = addStringField(Field.folderId) - def addContentFrField: F[Unit] = - addTextField(Some(Language.French))(Field.content_fr) - - def addContentItField: F[Unit] = - addTextField(Some(Language.Italian))(Field.content_it) - def setupCoreSchema: 
F[Unit] = { val cmds0 = List( @@ -130,6 +122,9 @@ object SolrSetup { run(DeleteField.command(DeleteField(field))).attempt *> run(AddField.command(AddField.string(field))) + private def addContentField(lang: Language): F[Unit] = + addTextField(Some(lang))(Field.contentField(lang)) + private def addTextField(lang: Option[Language])(field: Field): F[Unit] = lang match { case None => From 3f75af0807627f0ec43b8120df1e5dfcdc1f905c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 22:53:12 +0100 Subject: [PATCH 14/38] Add 9 more lanugages to the list of document lanugages --- docker/joex-base.dockerfile | 9 ++ .../docspell/analysis/date/DateFind.scala | 26 ++- .../docspell/analysis/date/MonthName.scala | 152 ++++++++++++++++++ .../main/scala/docspell/common/Language.scala | 63 +++++++- .../main/scala/docspell/ftssolr/Field.scala | 3 +- .../scala/docspell/ftssolr/SolrSetup.scala | 24 ++- modules/webapp/src/main/elm/Data/Language.elm | 106 +++++++++++- 7 files changed, 371 insertions(+), 12 deletions(-) diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index b9b160ed..87633eb0 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -17,6 +17,15 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr-data-fra \ tesseract-ocr-data-ita \ tesseract-ocr-data-spa \ + tesseract-ocr-data-por \ + tesseract-ocr-data-ces \ + tesseract-ocr-data-nld \ + tesseract-ocr-data-dan \ + tesseract-ocr-data-fin \ + tesseract-ocr-data-nor \ + tesseract-ocr-data-swe \ + tesseract-ocr-data-rus \ + tesseract-ocr-data-ron \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 438bff85..698606f0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -56,16 +56,26 @@ object DateFind { // ymd ✔, 
ydm, dmy ✔, dym, myd, mdy ✔ def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { - val p0 = pattern0(lang) - val p1 = pattern1(lang) - val p2 = pattern2(lang) + val ymd = pattern0(lang) + val dmy = pattern1(lang) + val mdy = pattern2(lang) + // most is from wikipedia… val p = lang match { case Language.English => - p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1) - case Language.German => p1.or(p0).or(p2) - case Language.French => p1.or(p0).or(p2) - case Language.Italian => p1.or(p0).or(p2) - case Language.Spanish => p1.or(p0).or(p2) + mdy.alt(dmy).map(t => t._1 ++ t._2).or(mdy).or(ymd).or(dmy) + case Language.German => dmy.or(ymd).or(mdy) + case Language.French => dmy.or(ymd).or(mdy) + case Language.Italian => dmy.or(ymd).or(mdy) + case Language.Spanish => dmy.or(ymd).or(mdy) + case Language.Czech => dmy.or(ymd).or(mdy) + case Language.Danish => dmy.or(ymd).or(mdy) + case Language.Finnish => dmy.or(ymd).or(mdy) + case Language.Norwegian => dmy.or(ymd).or(mdy) + case Language.Portuguese => dmy.or(ymd).or(mdy) + case Language.Romanian => dmy.or(ymd).or(mdy) + case Language.Russian => dmy.or(ymd).or(mdy) + case Language.Swedish => ymd.or(dmy).or(mdy) + case Language.Dutch => dmy.or(ymd).or(mdy) } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 503e15e4..333275a0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -24,6 +24,24 @@ object MonthName { italian case Language.Spanish => spanish + case Language.Swedish => + swedish + case Language.Norwegian => + norwegian + case Language.Dutch => + dutch + case Language.Czech => + czech + case Language.Danish => + danish + case Language.Portuguese => + portuguese + case Language.Romanian => + romanian + case Language.Finnish 
=> + finnish + case Language.Russian => + russian } private val numbers = List( @@ -115,4 +133,138 @@ object MonthName { List("nov", "noviembre"), List("dic", "diciembre") ) + + private val swedish = List( + List("jan", "januari"), + List("febr", "februari"), + List("mars"), + List("april"), + List("maj"), + List("juni"), + List("juli"), + List("aug", "augusti"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("dec", "december") + ) + private val norwegian = List( + List("jan", "januar"), + List("febr", "februar"), + List("mars"), + List("april"), + List("mai"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("des", "desember") + ) + + private val czech = List( + List("led", "leden"), + List("un", "ún", "únor", "unor"), + List("brez", "březen", "brezen"), + List("dub", "duben"), + List("kvet", "květen"), + List("cerv", "červen"), + List("cerven", "červenec"), + List("srp", "srpen"), + List("zari", "září"), + List("ríj", "rij", "říjen"), + List("list", "listopad"), + List("pros", "prosinec") + ) + + private val romanian = List( + List("ian", "ianuarie"), + List("feb", "februarie"), + List("mar", "martie"), + List("apr", "aprilie"), + List("mai"), + List("iunie"), + List("iulie"), + List("aug", "august"), + List("sept", "septembrie"), + List("oct", "octombrie"), + List("noem", "nov", "noiembrie"), + List("dec", "decembrie") + ) + + private val danish = List( + List("jan", "januar"), + List("febr", "februar"), + List("marts"), + List("april"), + List("maj"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("dec", "december") + ) + + private val portuguese = List( + List("jan", "janeiro"), + List("fev", "fevereiro"), + List("março", "marco"), + List("abril"), + List("maio"), + List("junho"), + List("julho"), + List("agosto"), + 
List("set", "setembro"), + List("out", "outubro"), + List("nov", "novembro"), + List("dez", "dezembro") + ) + + private val finnish = List( + List("tammikuu"), + List("helmikuu"), + List("maaliskuu"), + List("huhtikuu"), + List("toukokuu"), + List("kesäkuu"), + List("heinäkuu"), + List("elokuu"), + List("syyskuu"), + List("lokakuu"), + List("marraskuu"), + List("joulukuu") + ) + + private val russian = List( + List("январь"), + List("февраль"), + List("март"), + List("апрель"), + List("май"), + List("июнь"), + List("июль"), + List("август"), + List("сентябрь"), + List("октябрь"), + List("ноябрь"), + List("декабрь") + ) + + private val dutch = List( + List("jan", "januari"), + List("feb", "februari"), + List("maart"), + List("apr", "april"), + List("mei"), + List("juni"), + List("juli"), + List("aug", "augustus"), + List("sept", "september"), + List("okt", "oct", "oktober"), + List("nov", "november"), + List("dec", "december") + ) } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 3a39dd11..72f5e0df 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -52,7 +52,68 @@ object Language { val iso3 = "spa" } - val all: List[Language] = List(German, English, French, Italian, Spanish) + case object Portuguese extends Language { + val iso2 = "pt" + val iso3 = "por" + } + + case object Czech extends Language { + val iso2 = "cs" + val iso3 = "ces" + } + + case object Danish extends Language { + val iso2 = "da" + val iso3 = "dan" + } + + case object Finnish extends Language { + val iso2 = "fi" + val iso3 = "fin" + } + + case object Norwegian extends Language { + val iso2 = "no" + val iso3 = "nor" + } + + case object Swedish extends Language { + val iso2 = "sv" + val iso3 = "swe" + } + + case object Russian extends Language { + val iso2 = "ru" + val iso3 = "rus" + } + + case object Romanian 
extends Language { + val iso2 = "ro" + val iso3 = "ron" + } + + case object Dutch extends Language { + val iso2 = "nl" + val iso3 = "nld" + } + + val all: List[Language] = + List( + German, + English, + French, + Italian, + Spanish, + Dutch, + Portuguese, + Czech, + Danish, + Finnish, + Norwegian, + Swedish, + Russian, + Romanian + ) def fromString(str: String): Either[String, Language] = { val lang = str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index a10ca0e8..ff55e5ae 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -32,7 +32,8 @@ object Field { .map(contentField) def contentField(lang: Language): Field = - Field(s"content_${lang.iso2}") + if (lang == Language.Czech) Field(s"content_cz") + else Field(s"content_${lang.iso2}") implicit val jsonEncoder: Encoder[Field] = Encoder.encodeString.contramap(_.name) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 63b90db9..e4a9df04 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -75,12 +75,33 @@ object SolrSetup { solrEngine, "Add content_es field", addContentField(Language.Spanish).map(_ => FtsMigration.Result.reIndexAll) + ), + FtsMigration[F]( + 9, + solrEngine, + "Add more content fields", + addMoreContentFields.map(_ => FtsMigration.Result.reIndexAll) ) ) def addFolderField: F[Unit] = addStringField(Field.folderId) + def addMoreContentFields: F[Unit] = { + val remain = List[Language]( + Language.Norwegian, + Language.Romanian, + Language.Swedish, + Language.Finnish, + Language.Danish, + Language.Czech, + Language.Dutch, + Language.Portuguese, + Language.Russian + ) + remain.traverse(addContentField).map(_ => ()) + } 
+ def setupCoreSchema: F[Unit] = { val cmds0 = List( @@ -162,7 +183,8 @@ object SolrSetup { AddField(field, "text_general", true, true, false) def textLang(field: Field, lang: Language): AddField = - AddField(field, s"text_${lang.iso2}", true, true, false) + if (lang == Language.Czech) AddField(field, s"text_cz", true, true, false) + else AddField(field, s"text_${lang.iso2}", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index f6c1caee..9df00fa3 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -13,6 +13,15 @@ type Language | French | Italian | Spanish + | Portuguese + | Czech + | Danish + | Finnish + | Norwegian + | Swedish + | Russian + | Romanian + | Dutch fromString : String -> Maybe Language @@ -32,6 +41,33 @@ fromString str = else if str == "spa" || str == "es" || str == "spanish" then Just Spanish + else if str == "por" || str == "pt" || str == "portuguese" then + Just Portuguese + + else if str == "ces" || str == "cs" || str == "czech" then + Just Czech + + else if str == "dan" || str == "da" || str == "danish" then + Just Danish + + else if str == "nld" || str == "nd" || str == "dutch" then + Just Dutch + + else if str == "fin" || str == "fi" || str == "finnish" then + Just Finnish + + else if str == "nor" || str == "no" || str == "norwegian" then + Just Norwegian + + else if str == "swe" || str == "sv" || str == "swedish" then + Just Swedish + + else if str == "rus" || str == "ru" || str == "russian" then + Just Russian + + else if str == "ron" || str == "ro" || str == "romanian" then + Just Romanian + else Nothing @@ -54,6 +90,33 @@ toIso3 lang = Spanish -> "spa" + Portuguese -> + "por" + + Czech -> + "ces" + + Danish -> + "dan" + + Finnish -> + "fin" + + Norwegian -> + "nor" + + Swedish -> + "swe" + + Russian -> + "rus" + + Romanian -> + "ron" + + Dutch -> + "nld" + toName : 
Language -> String toName lang = @@ -73,7 +136,48 @@ toName lang = Spanish -> "Spanish" + Portuguese -> + "Portuguese" + + Czech -> + "Czech" + + Danish -> + "Danish" + + Finnish -> + "Finnish" + + Norwegian -> + "Norwegian" + + Swedish -> + "Swedish" + + Russian -> + "Russian" + + Romanian -> + "Romanian" + + Dutch -> + "Dutch" + all : List Language all = - [ German, English, French, Italian, Spanish ] + [ German + , English + , French + , Italian + , Spanish + , Portuguese + , Czech + , Dutch + , Danish + , Finnish + , Norwegian + , Swedish + , Russian + , Romanian + ] From c5778880d919effa372ad4ffba68f84ecf4fdc40 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 00:59:37 +0100 Subject: [PATCH 15/38] Update documentation --- .../site/content/docs/joex/file-processing.md | 90 ++++++++++++++++--- website/site/content/docs/webapp/metadata.md | 22 ++++- 2 files changed, 97 insertions(+), 15 deletions(-) diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md index 68811bca..7c0f7610 100644 --- a/website/site/content/docs/joex/file-processing.md +++ b/website/site/content/docs/joex/file-processing.md @@ -334,33 +334,97 @@ images for a collective. There is also a bash script provided in the This uses the extracted text to find what could be attached to the new item. There are multiple things provided. +Docspell depends on the [Stanford NLP +Library](https://nlp.stanford.edu/software/) for its AI features. +Among other things they provide a classifier (used for guessing tags) +and NER annotators. The latter is also a classifier, that associates a +label to terms in a text. It finds out whether some term is probably +an organization, a person etc. This is then used to find matches in +your address book. + +When docspell finds several possible candidates for a match, it will +show the first few to you. 
If then the first was not the correct one, +it can usually be fixed by a single click, because it is among the +suggestions. ## Classification If you enabled classification in the config file, a model is trained -periodically from your files. This is now used to guess a tag for the -item. +periodically from your files. This is used to guess a tag for the item +for new documents. + +You can tell docspell how many documents it should use for training. +Sometimes (when moving?), documents may change and you only like to +base next guesses on the documents of last year only. This can be +found in the collective settings. + +The admin can also limit the number of documents to train with, +because it affects memory usage. ## Natural Language Processing -NLP is used to find out which terms in the text may be a company or -person that is later used to find metadata to attach to. It also uses -your address book to match terms in the text. +NLP is used to find out which terms in a text may be a company or +person that is then used to find metadata in your address book. It can +also uses your complete address book to match terms in the text. So +there are two ways: using a statistical model, terms in a text are +identified as organization or person etc. This information is then +used to search your address book. Second, regexp rules are derived +from the address book and run against the text. By default, both are +applied, where the rules are run as the last step to identify missing +terms. -This requires to load language model files in memory, which is quite a -lot. Also, the number of languages is much more restricted than for -tesseract. Currently English, German and French are supported. +The statistical model approach is good, i.e. for large address books. +Normally, a document contains only very few organizations or person +names. So it is much more efficient to check these against your +address book (in contrast to the other way around). 
It can also find +things *not* in your address book. However, it might not detect all or +there are no statistical models for your language. Then the address +book is used to automatically create rules that are run against the +document. -Another feature that is planned, but not yet provided is to propose -new companies/people you don't have yet in your address book. +These statistical models are provided by [Stanford +NLP](https://nlp.stanford.edu/software/) and are currently available +for German, English and French. All other languages can use the rule +approach. The statistcal models, however, require quite some memory – +depending on the size of the models which varies between languages. +English has a lower memory footprint than German, for example. If you +have a very large address book, the rule approach may also use a lot +memory. + +In the config file, you can specify different modes of operation for +nlp processing as follows: + +- mode `full`: creates the complete nlp pipeline, requiring the most + amount of memory, providing the best results. I'd recommend to run + joex with a heap size of a least 1.5G (for English only, it can be + lower that that). +- mode `basic`: it only loads the NER tagger. This doesn't work as + well as the complete pipeline, because some steps are simply + skipped. But it gives quite good results and uses less memory. I'd + recommend to run joex with at least 600m heap in this mode. +- mode `regexonly`: this doesn't load any statistical models and is + therefore very memory efficient (depending on the address book size, + of course). It will use the address book to create regex rules and + match them against your document. It doesn't depend on a language, + so this is available for all languages. +- mode = disabled: this disables nlp processing altogether + +Note that mode `full` and `basic` is only relevant for the languages +where models are available. For all other languages, it is effectively +the same as `regexonly`. 
The config file allows some settings. You can specify a limit for texts. Large texts result in higher memory consumption. By default, the first 10'000 characters are taken into account. +Then, for the `regexonly` mode, you can restrict the number of address +book entries that are used to create the rule set via +`regex-ner.max-entries`. This may be useful to reduce memory +footprint. + The setting `clear-stanford-nlp-interval` allows to define an idle time after which the model files are cleared from memory. This allows -to be reclaimed by the OS. The timer starts after the last file has -been processed. If you can afford it, it is recommended to disable it -by setting it to `0`. +memory to be reclaimed by the OS. The timer starts after the last file +has been processed. If you can afford it, it is recommended to disable +it by setting it to `0`. diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md index c6f03498..fb096641 100644 --- a/website/site/content/docs/webapp/metadata.md +++ b/website/site/content/docs/webapp/metadata.md @@ -130,10 +130,28 @@ page](@/docs/webapp/customfields.md) for more information. # Document Language An important setting is the language of your documents. This helps OCR -and text analysis. You can select between English, German and French -currently. The language can also specified with each [upload +and text analysis. You can select between various languages. The +language can also specified with each [upload request](@/docs/api/upload.md). Go to the *Collective Settings* page and click *Document Language*. This will set the lanugage for all your documents. It is not (yet) possible to specify it when uploading. + +The language has effects in several areas: text extraction, fulltext +search and text analysis. When extracting text from images, tesseract +(the external tool used for this) can yield better results if the +language is known. 
Also, solr (the fulltext search tool) can optimize +its index given the language, which results in better fulltext search +experience. The features of text analysis strongly depend on the +language. Docspell uses the [Stanford NLP +Library](https://nlp.stanford.edu/software/) for its great machine +learning algorithms. Some of them, like certain NLP features, are only +available for some languages – namely German, English and French. The +reason is that the required statistical models are not available for +other languages. However, docspell can still run other algorithms for +the other languages, like classification and custom rules based on the +address book. + +More information about file processing and text analysis can be found +[here](@/docs/joex/file-processing.md#text-analysis). From 249f9e6e2a22cc1250a8b968e0193c8a96abed5a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 13:35:53 +0100 Subject: [PATCH 16/38] Extend guessing tags to all tag categories --- .../classifier/StanfordTextClassifier.scala | 9 +- .../scala/docspell/joex/JoexAppImpl.scala | 2 +- .../docspell/joex/learn/ClassifierName.scala | 45 +++++++++ .../joex/learn/LearnClassifierTask.scala | 92 +++++++------------ .../docspell/joex/learn/SelectItems.scala | 39 ++++++++ .../joex/learn/StoreClassifierModel.scala | 53 +++++++++++ .../docspell/joex/process/TextAnalysis.scala | 59 +++++++----- .../src/main/resources/docspell-openapi.yml | 2 - .../restserver/routes/CollectiveRoutes.scala | 6 +- .../h2/V1.17.1__classifier_model.sql | 21 +++++ .../mariadb/V1.17.1__classifier_model.sql | 26 ++++++ .../postgresql/V1.17.1__classifier_model.sql | 21 +++++ .../scala/docspell/store/queries/QItem.scala | 5 +- .../store/records/RClassifierModel.scala | 78 ++++++++++++++++ .../store/records/RClassifierSetting.scala | 25 ++--- .../docspell/store/records/RCollective.scala | 3 +- .../scala/docspell/store/records/RTag.scala | 7 ++ .../main/elm/Comp/ClassifierSettingsForm.elm | 59 +----------- 18 
files changed, 384 insertions(+), 168 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql create mode 100644 modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala index edd1c7da..dc567695 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala @@ -11,6 +11,7 @@ import docspell.analysis.classifier import docspell.analysis.classifier.TextClassifier._ import docspell.analysis.nlp.Properties import docspell.common._ +import docspell.common.syntax.FileSyntax._ import edu.stanford.nlp.classify.ColumnDataClassifier @@ -28,7 +29,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( .use { dir => for { rawData <- writeDataFile(blocker, dir, data) - _ <- logger.info(s"Learning from ${rawData.count} items.") + _ <- logger.debug(s"Learning from ${rawData.count} items.") trainData <- splitData(logger, rawData) scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m)) sorted = scores.sortBy(-_.score) @@ -138,9 +139,9 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( props: Map[String, String] ): Map[String, String] = prepend("2.", props) ++ Map( - 
"trainFile" -> trainData.train.normalize().toAbsolutePath().toString(), - "testFile" -> trainData.test.normalize().toAbsolutePath().toString(), - "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString() + "trainFile" -> trainData.train.absolutePathAsString, + "testFile" -> trainData.test.absolutePathAsString, + "serializeTo" -> trainData.modelFile.absolutePathAsString ).toList case class RawData(count: Long, file: Path) diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index cdbb5a50..c221f187 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -169,7 +169,7 @@ object JoexAppImpl { .withTask( JobTask.json( LearnClassifierArgs.taskName, - LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser), + LearnClassifierTask[F](cfg.textAnalysis, analyser), LearnClassifierTask.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala new file mode 100644 index 00000000..6b128c24 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -0,0 +1,45 @@ +package docspell.joex.learn + +import cats.data.NonEmptyList +import cats.implicits._ +import docspell.common.Ident +import docspell.store.records.{RClassifierModel, RTag} +import doobie._ + +final class ClassifierName(val name: String) extends AnyVal + +object ClassifierName { + def apply(name: String): ClassifierName = + new ClassifierName(name) + + val noCategory: ClassifierName = + apply("__docspell_no_category__") + + val categoryPrefix = "tagcategory-" + + def tagCategory(cat: String): ClassifierName = + apply(s"${categoryPrefix}${cat}") + + val concernedPerson: ClassifierName = + apply("concernedperson") + + val concernedEquip: ClassifierName = + apply("concernedequip") + + val 
correspondentOrg: ClassifierName = + apply("correspondentorg") + + val correspondentPerson: ClassifierName = + apply("correspondentperson") + + def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = + for { + categories <- RTag.listCategories(coll, noCategory.name) + models <- NonEmptyList.fromList(categories) match { + case Some(nel) => + RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name)) + case None => + List.empty[RClassifierModel].pure[ConnectionIO] + } + } yield models +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index d5c632c3..3949a151 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -4,23 +4,16 @@ import cats.data.Kleisli import cats.data.OptionT import cats.effect._ import cats.implicits._ -import fs2.{Pipe, Stream} - import docspell.analysis.TextAnalyser -import docspell.analysis.classifier.ClassifierModel -import docspell.analysis.classifier.TextClassifier.Data import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ -import docspell.store.queries.QItem -import docspell.store.records.RClassifierSetting - -import bitpeace.MimetypeHint +import docspell.store.records.{RClassifierSetting, RTag} object LearnClassifierTask { - val noClass = "__NONE__" val pageSep = " --n-- " + val noClass = "__NONE__" type Args = LearnClassifierArgs @@ -29,67 +22,53 @@ object LearnClassifierTask { def apply[F[_]: Sync: ContextShift]( cfg: Config.TextAnalysis, - blocker: Blocker, analyser: TextAnalyser[F] ): Task[F, Args, Unit] = Task { ctx => (for { sett <- findActiveSettings[F](ctx, cfg) - data = selectItems( - ctx, - math.min(cfg.classification.itemCount, sett.itemCount).toLong, - sett.category.getOrElse("") - ) + maxItems = 
math.min(cfg.classification.itemCount, sett.itemCount) _ <- OptionT.liftF( - analyser.classifier - .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker))) + learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) ) } yield ()) .getOrElseF(logInactiveWarning(ctx.logger)) } - private def handleModel[F[_]: Sync: ContextShift]( - ctx: Context[F, Args], - blocker: Blocker - )(trainedModel: ClassifierModel): F[Unit] = - for { - oldFile <- ctx.store.transact( - RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId)) - ) - _ <- ctx.logger.info("Storing new trained model") - fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) - newFile <- - ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError - _ <- ctx.store.transact( - RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id)) - ) - _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}") - _ <- oldFile match { - case Some(fid) => - ctx.logger.debug(s"Deleting old model file ${fid.id}") *> - ctx.store.bitpeace.delete(fid.id).compile.drain - case None => ().pure[F] - } - } yield () - - private def selectItems[F[_]]( - ctx: Context[F, Args], - max: Long, + def learnTagCategory[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + )( category: String - ): Stream[F, Data] = { - val connStream = - for { - item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max)) - tt <- Stream.eval( - QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep) + ): Task[F, A, Unit] = + Task { ctx => + val data = SelectItems.forCategory(ctx, collective)(maxItems, category) + ctx.logger.info(s"Learn classifier for tag category: $category") *> + analyser.classifier.trainClassifier(ctx.logger, data)( + Kleisli( + StoreClassifierModel.handleModel( + ctx, + collective, + ClassifierName.tagCategory(category) + ) + ) ) - } yield 
Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) - ctx.store.transact(connStream.filter(_.text.nonEmpty)) - } + } - private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] = - if (max <= 0) identity - else _.take(max) + def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])( + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + Task { ctx => + for { + cats <- ctx.store.transact( + RTag.listCategories(collective, ClassifierName.noCategory.name) + ) + task = learnTagCategory[F, A](analyser, collective, maxItems) _ + _ <- cats.map(task).traverse(_.run(ctx)) + } yield () + } private def findActiveSettings[F[_]: Sync]( ctx: Context[F, Args], @@ -98,7 +77,6 @@ object LearnClassifierTask { if (cfg.classification.enabled) OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) .filter(_.enabled) - .filter(_.category.nonEmpty) .map(OCollective.Classifier.fromRecord) else OptionT.none diff --git a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala new file mode 100644 index 00000000..e7c31d7b --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala @@ -0,0 +1,39 @@ +package docspell.joex.learn + +import fs2.Stream + +import docspell.analysis.classifier.TextClassifier.Data +import docspell.common._ +import docspell.joex.scheduler.Context +import docspell.store.Store +import docspell.store.qb.Batch +import docspell.store.queries.QItem + +object SelectItems { + val pageSep = LearnClassifierTask.pageSep + val noClass = LearnClassifierTask.noClass + + def forCategory[F[_]](ctx: Context[F, _], collective: Ident)( + max: Int, + category: String + ): Stream[F, Data] = + forCategory(ctx.store, collective, max, category) + + def forCategory[F[_]]( + store: Store[F], + collective: Ident, + max: Int, + category: String + ): Stream[F, Data] = { + val limit = if (max <= 0) Batch.all else 
Batch.limit(max) + val connStream = + for { + item <- QItem.findAllNewesFirst(collective, 10, limit) + tt <- Stream.eval( + QItem.resolveTextAndTag(collective, item, category, pageSep) + ) + } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) + store.transact(connStream.filter(_.text.nonEmpty)) + } + +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala b/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala new file mode 100644 index 00000000..03d027a1 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/StoreClassifierModel.scala @@ -0,0 +1,53 @@ +package docspell.joex.learn + +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.classifier.ClassifierModel +import docspell.common._ +import docspell.joex.scheduler._ +import docspell.store.Store +import docspell.store.records.RClassifierModel + +import bitpeace.MimetypeHint + +object StoreClassifierModel { + + def handleModel[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + collective: Ident, + modelName: ClassifierName + )( + trainedModel: ClassifierModel + ): F[Unit] = + handleModel(ctx.store, ctx.blocker, ctx.logger)(collective, modelName, trainedModel) + + def handleModel[F[_]: Sync: ContextShift]( + store: Store[F], + blocker: Blocker, + logger: Logger[F] + )( + collective: Ident, + modelName: ClassifierName, + trainedModel: ClassifierModel + ): F[Unit] = + for { + oldFile <- store.transact( + RClassifierModel.findByName(collective, modelName.name).map(_.map(_.fileId)) + ) + _ <- logger.debug(s"Storing new trained model for: ${modelName.name}") + fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) + newFile <- + store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError + _ <- store.transact( + RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id)) + ) + _ <- logger.debug(s"New model stored at file ${newFile.id}") + _ <- oldFile match 
{ + case Some(fid) => + logger.debug(s"Deleting old model file ${fid.id}") *> + store.bitpeace.delete(fid.id).compile.drain + case None => ().pure[F] + } + } yield () +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index f336132d..fd7c08bc 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -9,12 +9,11 @@ import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile -import docspell.joex.learn.LearnClassifierTask +import docspell.joex.learn.{ClassifierName, LearnClassifierTask} import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task -import docspell.store.records.RAttachmentMeta -import docspell.store.records.RClassifierSetting +import docspell.store.records.{RAttachmentMeta, RClassifierSetting} import bitpeace.RangeDef @@ -42,10 +41,13 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - tag <- predictTag(ctx, cfg, item.metas, analyser.classifier).value + classifierEnabled <- getActive(ctx, cfg) + tag <- + if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) + else List.empty[String].pure[F] } yield item .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) - .appendTags(tag.toSeq) + .appendTags(tag) } def annotateAttachment[F[_]: Sync]( @@ -66,15 +68,29 @@ object TextAnalysis { } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + def predictTags[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): F[List[String]] = + for { + models <- 
ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective)) + _ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories") + tags <- models + .map(_.fileId.some) + .traverse(predictTag(ctx, cfg, metas, classifier)) + } yield tags.flatten + def predictTag[F[_]: Sync: ContextShift]( ctx: Context[F, Args], cfg: Config.TextAnalysis, metas: Vector[RAttachmentMeta], classifier: TextClassifier[F] - ): OptionT[F, String] = - for { - model <- findActiveModel(ctx, cfg) - _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …")) + )(modelFileId: Option[Ident]): F[Option[String]] = + (for { + _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …")) + model <- OptionT.fromOption[F](modelFileId) text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) modelData = ctx.store.bitpeace @@ -90,20 +106,21 @@ object TextAnalysis { .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) }).filter(_ != LearnClassifierTask.noClass) _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) - } yield cls + } yield cls).value - private def findActiveModel[F[_]: Sync]( + private def getActive[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis - ): OptionT[F, Ident] = - (if (cfg.classification.enabled) - OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))) - .filter(_.enabled) - .mapFilter(_.fileId) - else - OptionT.none[F, Ident]).orElse( - OptionT.liftF(ctx.logger.info("Classification is disabled.")) *> OptionT - .none[F, Ident] - ) + ): F[Boolean] = + if (cfg.classification.enabled) + ctx.store + .transact(RClassifierSetting.findById(ctx.args.meta.collective)) + .map(_.exists(_.enabled)) + .flatTap(enabled => + if (enabled) ().pure[F] + else ctx.logger.info("Classification is disabled. 
Check config or settings.") + ) + else + ctx.logger.info("Classification is disabled.") *> false.pure[F] } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 20ac6449..d32d2352 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -4856,8 +4856,6 @@ components: properties: enabled: type: boolean - category: - type: string itemCount: type: integer format: int32 diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index 7ecd1e90..ee868254 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -46,8 +46,7 @@ object CollectiveRoutes { OCollective.Classifier( settings.classifier.enabled, settings.classifier.schedule, - settings.classifier.itemCount, - settings.classifier.category + settings.classifier.itemCount ) ) ) @@ -65,8 +64,7 @@ object CollectiveRoutes { c.language, c.integrationEnabled, ClassifierSetting( - c.classifier.map(_.enabled).getOrElse(false), - c.classifier.flatMap(_.category), + c.classifier.exists(_.enabled), c.classifier.map(_.itemCount).getOrElse(0), c.classifier .map(_.schedule) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..11be9909 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql @@ -0,0 +1,21 @@ +CREATE TABLE "classifier_model"( + "id" varchar(254) not null primary key, + "cid" varchar(254) not null, + "name" varchar(254) not null, + "file_id" varchar(254) not null, + "created" timestamp not null, + foreign key 
("cid") references "collective"("cid"), + foreign key ("file_id") references "filemeta"("id"), + unique ("cid", "name") +); + +insert into "classifier_model" +select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name", "file_id", "created" +from "classifier_setting" +where "file_id" is not null; + +alter table "classifier_setting" +drop column "category"; + +alter table "classifier_setting" +drop column "file_id"; diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..d6f9da6e --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql @@ -0,0 +1,26 @@ +CREATE TABLE `classifier_model`( + `id` varchar(254) not null primary key, + `cid` varchar(254) not null, + `name` varchar(254) not null, + `file_id` varchar(254) not null, + `created` timestamp not null, + foreign key (`cid`) references `collective`(`cid`), + foreign key (`file_id`) references `filemeta`(`id`), + unique (`cid`, `name`) +); + +insert into `classifier_model` +select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `file_id`, `created` +from `classifier_setting` +where `file_id` is not null; + +alter table `classifier_setting` +drop column `category`; + +-- mariadb needs special treatment when dropping a column that is part +-- of an index and foreign key +alter table `classifier_setting` +drop constraint `classifier_setting_ibfk_2`; + +alter table `classifier_setting` +drop column `file_id`; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql new file mode 100644 index 00000000..81e327ff --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql @@ -0,0 +1,21 @@ +CREATE 
TABLE "classifier_model"( + "id" varchar(254) not null primary key, + "cid" varchar(254) not null, + "name" varchar(254) not null, + "file_id" varchar(254) not null, + "created" timestamp not null, + foreign key ("cid") references "collective"("cid"), + foreign key ("file_id") references "filemeta"("id"), + unique ("cid", "name") +); + +insert into "classifier_model" +select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name", "file_id", "created" +from "classifier_setting" +where "file_id" is not null; + +alter table "classifier_setting" +drop column "category"; + +alter table "classifier_setting" +drop column "file_id"; diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 3ce1af55..b68afb22 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -543,11 +543,14 @@ object QItem { def findAllNewesFirst( collective: Ident, - chunkSize: Int + chunkSize: Int, + limit: Batch ): Stream[ConnectionIO, Ident] = { + val i = RItem.as("i") Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed) .orderBy(i.created.desc) + .limit(limit) .build .query[Ident] .streamWithChunkSize(chunkSize) diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala new file mode 100644 index 00000000..2d018f81 --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala @@ -0,0 +1,78 @@ +package docspell.store.records + +import cats.effect._ +import cats.data.NonEmptyList +import cats.implicits._ + +import docspell.common._ +import docspell.store.qb.DSL._ +import docspell.store.qb._ + +import doobie._ +import doobie.implicits._ + +final case class RClassifierModel( + id: Ident, + cid: Ident, + name: String, + fileId: Ident, + 
created: Timestamp +) {} + +object RClassifierModel { + + def createNew[F[_]: Sync]( + cid: Ident, + name: String, + fileId: Ident + ): F[RClassifierModel] = + for { + id <- Ident.randomId[F] + now <- Timestamp.current[F] + } yield RClassifierModel(id, cid, name, fileId, now) + + final case class Table(alias: Option[String]) extends TableDef { + val tableName = "classifier_model" + + val id = Column[Ident]("id", this) + val cid = Column[Ident]("cid", this) + val name = Column[String]("name", this) + val fileId = Column[Ident]("file_id", this) + val created = Column[Timestamp]("created", this) + + val all = NonEmptyList.of[Column[_]](id, cid, name, fileId, created) + } + + def as(alias: String): Table = + Table(Some(alias)) + + val T = Table(None) + + def insert(v: RClassifierModel): ConnectionIO[Int] = + DML.insert( + T, + T.all, + fr"${v.id},${v.cid},${v.name},${v.fileId},${v.created}" + ) + + def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] = + for { + n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid))) + k <- + if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert) + else 0.pure[ConnectionIO] + } yield n + k + + def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = + Select(select(T.all), from(T), T.cid === cid && T.name === name).build + .query[RClassifierModel] + .option + + def findAllByName( + cid: Ident, + names: NonEmptyList[String] + ): ConnectionIO[List[RClassifierModel]] = + Select(select(T.all), from(T), T.cid === cid && T.name.in(names)).build + .query[RClassifierModel] + .to[List] +} diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index 749435d1..fe634161 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -15,9 
+15,7 @@ case class RClassifierSetting( cid: Ident, enabled: Boolean, schedule: CalEvent, - category: String, itemCount: Int, - fileId: Option[Ident], created: Timestamp ) {} @@ -28,12 +26,10 @@ object RClassifierSetting { val cid = Column[Ident]("cid", this) val enabled = Column[Boolean]("enabled", this) val schedule = Column[CalEvent]("schedule", this) - val category = Column[String]("category", this) val itemCount = Column[Int]("item_count", this) - val fileId = Column[Ident]("file_id", this) val created = Column[Timestamp]("created", this) val all = NonEmptyList - .of[Column[_]](cid, enabled, schedule, category, itemCount, fileId, created) + .of[Column[_]](cid, enabled, schedule, itemCount, created) } val T = Table(None) @@ -44,7 +40,7 @@ object RClassifierSetting { DML.insert( T, T.all, - fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}" + fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}" ) def updateAll(v: RClassifierSetting): ConnectionIO[Int] = @@ -54,15 +50,10 @@ object RClassifierSetting { DML.set( T.enabled.setTo(v.enabled), T.schedule.setTo(v.schedule), - T.category.setTo(v.category), - T.itemCount.setTo(v.itemCount), - T.fileId.setTo(v.fileId) + T.itemCount.setTo(v.itemCount) ) ) - def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] = - DML.update(T, T.cid === coll, DML.set(T.fileId.setTo(fid))) - def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = for { n1 <- DML.update( @@ -71,8 +62,7 @@ object RClassifierSetting { DML.set( T.enabled.setTo(v.enabled), T.schedule.setTo(v.schedule), - T.itemCount.setTo(v.itemCount), - T.category.setTo(v.category) + T.itemCount.setTo(v.itemCount) ) ) n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO] @@ -89,8 +79,7 @@ object RClassifierSetting { case class Classifier( enabled: Boolean, schedule: CalEvent, - itemCount: Int, - category: Option[String] + itemCount: Int ) { def toRecord(coll: Ident, created: Timestamp): RClassifierSetting = 
@@ -98,15 +87,13 @@ object RClassifierSetting { coll, enabled, schedule, - category.getOrElse(""), itemCount, - None, created ) } object Classifier { def fromRecord(r: RClassifierSetting): Classifier = - Classifier(r.enabled, r.schedule, r.itemCount, r.category.some) + Classifier(r.enabled, r.schedule, r.itemCount) } } diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala index ca3b2666..f6114a38 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala @@ -89,8 +89,7 @@ object RCollective { c.integration.s, cs.enabled.s, cs.schedule.s, - cs.itemCount.s, - cs.category.s + cs.itemCount.s ), from(c).leftJoin(cs, cs.cid === c.id), c.id === coll diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 27a30031..5bba7d67 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -148,6 +148,13 @@ object RTag { ).orderBy(T.name.asc).build.query[RTag].to[List] } + def listCategories(coll: Ident, fallback: String): ConnectionIO[List[String]] = + Select( + coalesce(T.category.s, lit(fallback)).s, + from(T), + T.cid === coll + ).distinct.build.query[String].to[List] + def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = DML.delete(T, T.tid === tagId && T.cid === coll) } diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm index 23e440cd..1181e239 100644 --- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -25,8 +25,6 @@ import Util.Tag type alias Model = { enabled : Bool - , categoryModel : Comp.FixedDropdown.Model String - , category 
: Maybe String , scheduleModel : Comp.CalEventInput.Model , schedule : Validated CalEvent , itemCountModel : Comp.IntField.Model @@ -35,10 +33,8 @@ type alias Model = type Msg - = GetTagsResp (Result Http.Error TagList) - | ScheduleMsg Comp.CalEventInput.Msg + = ScheduleMsg Comp.CalEventInput.Msg | ToggleEnabled - | CategoryMsg (Comp.FixedDropdown.Msg String) | ItemCountMsg Comp.IntField.Msg @@ -53,17 +49,12 @@ init flags sett = Comp.CalEventInput.init flags newSchedule in ( { enabled = sett.enabled - , categoryModel = Comp.FixedDropdown.initString [] - , category = sett.category , scheduleModel = cem , schedule = Data.Validated.Unknown newSchedule , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" , itemCount = Just sett.itemCount } - , Cmd.batch - [ Api.getTags flags "" GetTagsResp - , Cmd.map ScheduleMsg cec - ] + , Cmd.map ScheduleMsg cec ) @@ -72,7 +63,6 @@ getSettings model = Data.Validated.map (\sch -> { enabled = model.enabled - , category = model.category , schedule = Data.CalEvent.makeEvent sch , itemCount = Maybe.withDefault 0 model.itemCount @@ -84,27 +74,6 @@ getSettings model = update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) update flags msg model = case msg of - GetTagsResp (Ok tl) -> - let - categories = - Util.Tag.getCategories tl.items - |> List.sort - in - ( { model - | categoryModel = Comp.FixedDropdown.initString categories - , category = - if model.category == Nothing then - List.head categories - - else - model.category - } - , Cmd.none - ) - - GetTagsResp (Err _) -> - ( model, Cmd.none ) - ScheduleMsg lmsg -> let ( cm, cc, ce ) = @@ -126,23 +95,6 @@ update flags msg model = , Cmd.none ) - CategoryMsg lmsg -> - let - ( mm, ma ) = - Comp.FixedDropdown.update lmsg model.categoryModel - in - ( { model - | categoryModel = mm - , category = - if ma == Nothing then - model.category - - else - ma - } - , Cmd.none - ) - ItemCountMsg lmsg -> let ( im, iv ) = @@ -182,13 +134,6 @@ view model = , text "periodically based on a 
schedule and you need to specify a tag-group that should " , text "be used for learning." ] - , div [ class "field" ] - [ label [] [ text "Category" ] - , Html.map CategoryMsg - (Comp.FixedDropdown.viewString model.category - model.categoryModel - ) - ] , Html.map ItemCountMsg (Comp.IntField.viewWithInfo "The maximum number of items to learn from, order by date newest first. Use 0 to mean all." From 3e28ce1254202f74da9b80953ae3e416bf666178 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 21:48:05 +0100 Subject: [PATCH 17/38] Add the sql concat function to query builder --- .../store/src/main/scala/docspell/store/qb/DBFunction.scala | 4 ++++ modules/store/src/main/scala/docspell/store/qb/DSL.scala | 3 +++ .../main/scala/docspell/store/qb/impl/DBFunctionBuilder.scala | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/modules/store/src/main/scala/docspell/store/qb/DBFunction.scala b/modules/store/src/main/scala/docspell/store/qb/DBFunction.scala index dc418810..58cca850 100644 --- a/modules/store/src/main/scala/docspell/store/qb/DBFunction.scala +++ b/modules/store/src/main/scala/docspell/store/qb/DBFunction.scala @@ -1,5 +1,7 @@ package docspell.store.qb +import cats.data.NonEmptyList + sealed trait DBFunction {} object DBFunction { @@ -31,6 +33,8 @@ object DBFunction { case class Sum(expr: SelectExpr) extends DBFunction + case class Concat(exprs: NonEmptyList[SelectExpr]) extends DBFunction + sealed trait Operator object Operator { case object Plus extends Operator diff --git a/modules/store/src/main/scala/docspell/store/qb/DSL.scala b/modules/store/src/main/scala/docspell/store/qb/DSL.scala index db80e20b..b3df71dd 100644 --- a/modules/store/src/main/scala/docspell/store/qb/DSL.scala +++ b/modules/store/src/main/scala/docspell/store/qb/DSL.scala @@ -98,6 +98,9 @@ trait DSL extends DoobieMeta { def substring(expr: SelectExpr, start: Int, length: Int): DBFunction = DBFunction.Substring(expr, start, length) + def concat(expr: SelectExpr, exprs: 
SelectExpr*): DBFunction = + DBFunction.Concat(Nel.of(expr, exprs: _*)) + def lit[A](value: A)(implicit P: Put[A]): SelectExpr.SelectLit[A] = SelectExpr.SelectLit(value, None) diff --git a/modules/store/src/main/scala/docspell/store/qb/impl/DBFunctionBuilder.scala b/modules/store/src/main/scala/docspell/store/qb/impl/DBFunctionBuilder.scala index 16c3e33f..3a75569a 100644 --- a/modules/store/src/main/scala/docspell/store/qb/impl/DBFunctionBuilder.scala +++ b/modules/store/src/main/scala/docspell/store/qb/impl/DBFunctionBuilder.scala @@ -32,6 +32,10 @@ object DBFunctionBuilder extends CommonBuilder { case DBFunction.Substring(expr, start, len) => sql"SUBSTRING(" ++ SelectExprBuilder.build(expr) ++ fr" FROM $start FOR $len)" + case DBFunction.Concat(exprs) => + val inner = exprs.map(SelectExprBuilder.build).toList.reduce(_ ++ comma ++ _) + sql"CONCAT(" ++ inner ++ sql")" + case DBFunction.Calc(op, left, right) => SelectExprBuilder.build(left) ++ buildOperator(op) ++ From cce88788987d2a6c22b41dd9fba12df90a29966a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 18 Jan 2021 21:48:40 +0100 Subject: [PATCH 18/38] Exclude tags w/o category from classifying; remove obsolete models --- .../docspell/joex/learn/ClassifierName.scala | 28 ++++++++++++++++--- .../joex/learn/LearnClassifierTask.scala | 23 ++++++++++++--- .../store/records/RClassifierModel.scala | 14 +++++++++- .../scala/docspell/store/records/RTag.scala | 6 ++-- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala index 6b128c24..d667ff80 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -2,8 +2,12 @@ package docspell.joex.learn import cats.data.NonEmptyList import cats.implicits._ + import docspell.common.Ident +import docspell.store.qb.DSL._ +import 
docspell.store.qb._ import docspell.store.records.{RClassifierModel, RTag} + import doobie._ final class ClassifierName(val name: String) extends AnyVal @@ -12,9 +16,6 @@ object ClassifierName { def apply(name: String): ClassifierName = new ClassifierName(name) - val noCategory: ClassifierName = - apply("__docspell_no_category__") - val categoryPrefix = "tagcategory-" def tagCategory(cat: String): ClassifierName = @@ -34,7 +35,7 @@ object ClassifierName { def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = for { - categories <- RTag.listCategories(coll, noCategory.name) + categories <- RTag.listCategories(coll) models <- NonEmptyList.fromList(categories) match { case Some(nel) => RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name)) @@ -42,4 +43,23 @@ object ClassifierName { List.empty[RClassifierModel].pure[ConnectionIO] } } yield models + + def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = { + val model = RClassifierModel.as("m") + val tag = RTag.as("t") + val sql = + Select( + select(model.all), + from(model), + model.cid === coll && model.name.notIn( + Select( + select(concat(lit(categoryPrefix), tag.category.s)), + from(tag), + tag.cid === coll && tag.category.isNotNull + ).distinct + ) + ).build + sql.query[RClassifierModel].to[List] + } + } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 3949a151..52ee70ac 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -4,12 +4,13 @@ import cats.data.Kleisli import cats.data.OptionT import cats.effect._ import cats.implicits._ + import docspell.analysis.TextAnalyser import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ -import 
docspell.store.records.{RClassifierSetting, RTag} +import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag} object LearnClassifierTask { val pageSep = " --n-- " @@ -31,6 +32,7 @@ object LearnClassifierTask { _ <- OptionT.liftF( learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) ) + _ <- OptionT.liftF(clearObsoleteModels(ctx)) } yield ()) .getOrElseF(logInactiveWarning(ctx.logger)) } @@ -62,14 +64,27 @@ object LearnClassifierTask { ): Task[F, A, Unit] = Task { ctx => for { - cats <- ctx.store.transact( - RTag.listCategories(collective, ClassifierName.noCategory.name) - ) + cats <- ctx.store.transact(RTag.listCategories(collective)) task = learnTagCategory[F, A](analyser, collective, maxItems) _ _ <- cats.map(task).traverse(_.run(ctx)) } yield () } + private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = + for { + list <- ctx.store.transact( + ClassifierName.findOrphanTagModels(ctx.args.collective) + ) + _ <- ctx.logger.info( + s"Found ${list.size} obsolete model files that are deleted now." 
+ ) + n <- ctx.store.transact(RClassifierModel.deleteAll(list.map(_.id))) + _ <- list + .map(_.fileId.id) + .traverse(id => ctx.store.bitpeace.delete(id).compile.drain) + _ <- ctx.logger.debug(s"Deleted $n model files.") + } yield () + private def findActiveSettings[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala index 2d018f81..cca0079c 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala @@ -1,7 +1,7 @@ package docspell.store.records -import cats.effect._ import cats.data.NonEmptyList +import cats.effect._ import cats.implicits._ import docspell.common._ @@ -63,6 +63,17 @@ object RClassifierModel { else 0.pure[ConnectionIO] } yield n + k + def deleteById(id: Ident): ConnectionIO[Int] = + DML.delete(T, T.id === id) + + def deleteAll(ids: List[Ident]): ConnectionIO[Int] = + NonEmptyList.fromList(ids) match { + case Some(nel) => + DML.delete(T, T.id.in(nel)) + case None => + 0.pure[ConnectionIO] + } + def findByName(cid: Ident, name: String): ConnectionIO[Option[RClassifierModel]] = Select(select(T.all), from(T), T.cid === cid && T.name === name).build .query[RClassifierModel] @@ -75,4 +86,5 @@ object RClassifierModel { Select(select(T.all), from(T), T.cid === cid && T.name.in(names)).build .query[RClassifierModel] .to[List] + } diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 5bba7d67..51f25912 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -148,11 +148,11 @@ object RTag { ).orderBy(T.name.asc).build.query[RTag].to[List] } - def listCategories(coll: Ident, fallback: String): 
ConnectionIO[List[String]] = + def listCategories(coll: Ident): ConnectionIO[List[String]] = Select( - coalesce(T.category.s, lit(fallback)).s, + T.category.s, from(T), - T.cid === coll + T.cid === coll && T.category.isNotNull ).distinct.build.query[String].to[List] def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = From a6f29153c4f72f74b671b5c5be707e3fcda658b6 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 01:20:13 +0100 Subject: [PATCH 19/38] Control what tag categories to use for auto-tagging --- build.sbt | 6 +- .../main/scala/docspell/common/ListType.scala | 33 ++++ .../docspell/joex/learn/ClassifierName.scala | 40 +++-- .../joex/learn/LearnClassifierTask.scala | 34 ++-- .../src/main/resources/docspell-openapi.yml | 15 +- .../restserver/routes/CollectiveRoutes.scala | 12 +- .../h2/V1.17.1__classifier_model.sql | 23 +++ .../mariadb/V1.17.1__classifier_model.sql | 26 ++- .../postgresql/V1.17.1__classifier_model.sql | 23 +++ .../docspell/store/impl/DoobieMeta.scala | 3 + .../store/records/RClassifierModel.scala | 14 +- .../store/records/RClassifierSetting.scala | 103 ++++++++---- .../docspell/store/records/RCollective.scala | 22 ++- .../main/elm/Comp/ClassifierSettingsForm.elm | 148 +++++++++++++----- .../main/elm/Comp/CollectiveSettingsForm.elm | 9 +- modules/webapp/src/main/elm/Data/ListType.elm | 50 ++++++ 16 files changed, 436 insertions(+), 125 deletions(-) create mode 100644 modules/common/src/main/scala/docspell/common/ListType.scala create mode 100644 modules/webapp/src/main/elm/Data/ListType.elm diff --git a/build.sbt b/build.sbt index 91016ca0..ccdc73ca 100644 --- a/build.sbt +++ b/build.sbt @@ -131,7 +131,8 @@ val openapiScalaSettings = Seq( case "ident" => field => field.copy(typeDef = TypeDef("Ident", Imports("docspell.common.Ident"))) case "accountid" => - field => field.copy(typeDef = TypeDef("AccountId", Imports("docspell.common.AccountId"))) + field => + field.copy(typeDef = TypeDef("AccountId", 
Imports("docspell.common.AccountId"))) case "collectivestate" => field => field.copy(typeDef = @@ -190,6 +191,9 @@ val openapiScalaSettings = Seq( field.copy(typeDef = TypeDef("CustomFieldType", Imports("docspell.common.CustomFieldType")) ) + case "listtype" => + field => + field.copy(typeDef = TypeDef("ListType", Imports("docspell.common.ListType"))) })) ) diff --git a/modules/common/src/main/scala/docspell/common/ListType.scala b/modules/common/src/main/scala/docspell/common/ListType.scala new file mode 100644 index 00000000..d2b29e91 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/ListType.scala @@ -0,0 +1,33 @@ +package docspell.common + +import cats.data.NonEmptyList + +import io.circe.{Decoder, Encoder} + +sealed trait ListType { self: Product => + def name: String = + productPrefix.toLowerCase +} + +object ListType { + + case object Whitelist extends ListType + val whitelist: ListType = Whitelist + + case object Blacklist extends ListType + val blacklist: ListType = Blacklist + + val all: NonEmptyList[ListType] = NonEmptyList.of(Whitelist, Blacklist) + + def fromString(name: String): Either[String, ListType] = + all.find(_.name.equalsIgnoreCase(name)).toRight(s"Unknown list type: $name") + + def unsafeFromString(name: String): ListType = + fromString(name).fold(sys.error, identity) + + implicit val jsonEncoder: Encoder[ListType] = + Encoder.encodeString.contramap(_.name) + + implicit val jsonDecoder: Decoder[ListType] = + Decoder.decodeString.emap(fromString) +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala index d667ff80..0ed2d97e 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -4,9 +4,7 @@ import cats.data.NonEmptyList import cats.implicits._ import docspell.common.Ident -import docspell.store.qb.DSL._ -import 
docspell.store.qb._ -import docspell.store.records.{RClassifierModel, RTag} +import docspell.store.records.{RClassifierModel, RClassifierSetting} import doobie._ @@ -16,7 +14,7 @@ object ClassifierName { def apply(name: String): ClassifierName = new ClassifierName(name) - val categoryPrefix = "tagcategory-" + private val categoryPrefix = "tagcategory-" def tagCategory(cat: String): ClassifierName = apply(s"${categoryPrefix}${cat}") @@ -35,7 +33,7 @@ object ClassifierName { def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = for { - categories <- RTag.listCategories(coll) + categories <- RClassifierSetting.getActiveCategories(coll) models <- NonEmptyList.fromList(categories) match { case Some(nel) => RClassifierModel.findAllByName(coll, nel.map(tagCategory).map(_.name)) @@ -44,22 +42,20 @@ object ClassifierName { } } yield models - def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = { - val model = RClassifierModel.as("m") - val tag = RTag.as("t") - val sql = - Select( - select(model.all), - from(model), - model.cid === coll && model.name.notIn( - Select( - select(concat(lit(categoryPrefix), tag.category.s)), - from(tag), - tag.cid === coll && tag.category.isNotNull - ).distinct - ) - ).build - sql.query[RClassifierModel].to[List] - } + def findOrphanTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = + for { + cats <- RClassifierSetting.getActiveCategories(coll) + allModels = RClassifierModel.findAllByQuery(coll, s"${categoryPrefix}%") + result <- NonEmptyList.fromList(cats) match { + case Some(nel) => + allModels.flatMap(all => + RClassifierModel + .findAllByName(coll, nel.map(tagCategory).map(_.name)) + .map(active => all.diff(active)) + ) + case None => + allModels + } + } yield result } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 52ee70ac..843ee951 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -10,7 +10,7 @@ import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ -import docspell.store.records.{RClassifierModel, RClassifierSetting, RTag} +import docspell.store.records.{RClassifierModel, RClassifierSetting} object LearnClassifierTask { val pageSep = " --n-- " @@ -26,15 +26,23 @@ object LearnClassifierTask { analyser: TextAnalyser[F] ): Task[F, Args, Unit] = Task { ctx => - (for { - sett <- findActiveSettings[F](ctx, cfg) - maxItems = math.min(cfg.classification.itemCount, sett.itemCount) - _ <- OptionT.liftF( - learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) - ) - _ <- OptionT.liftF(clearObsoleteModels(ctx)) - } yield ()) - .getOrElseF(logInactiveWarning(ctx.logger)) + val learnTags = + for { + sett <- findActiveSettings[F](ctx, cfg) + maxItems = math.min(cfg.classification.itemCount, sett.itemCount) + _ <- OptionT.liftF( + learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) + ) + } yield () + + // learn classifier models from active tag categories + learnTags.getOrElseF(logInactiveWarning(ctx.logger)) *> + // delete classifier model files for categories that have been removed + clearObsoleteTagModels(ctx) *> + // when tags are deleted, categories may get removed. 
fix the json array + ctx.store + .transact(RClassifierSetting.fixCategoryList(ctx.args.collective)) + .map(_ => ()) } def learnTagCategory[F[_]: Sync: ContextShift, A]( @@ -64,13 +72,13 @@ object LearnClassifierTask { ): Task[F, A, Unit] = Task { ctx => for { - cats <- ctx.store.transact(RTag.listCategories(collective)) + cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective)) task = learnTagCategory[F, A](analyser, collective, maxItems) _ _ <- cats.map(task).traverse(_.run(ctx)) } yield () } - private def clearObsoleteModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = + private def clearObsoleteTagModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = for { list <- ctx.store.transact( ClassifierName.findOrphanTagModels(ctx.args.collective) @@ -98,6 +106,6 @@ object LearnClassifierTask { private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] = logger.warn( - "Classification is disabled. Check joex config and the collective settings." + "Auto-tagging is disabled. Check joex config and the collective settings." ) } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index d32d2352..90ce21ae 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -4850,12 +4850,11 @@ components: description: | Settings for learning a document classifier. 
required: - - enabled - schedule - itemCount + - categoryList + - listType properties: - enabled: - type: boolean itemCount: type: integer format: int32 @@ -4865,6 +4864,16 @@ components: schedule: type: string format: calevent + categoryList: + type: array + items: + type: string + listType: + type: string + format: listtype + enum: + - blacklist + - whitelist SourceList: description: | diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index ee868254..663ca46b 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -6,7 +6,7 @@ import cats.implicits._ import docspell.backend.BackendApp import docspell.backend.auth.AuthToken import docspell.backend.ops.OCollective -import docspell.common.MakePreviewArgs +import docspell.common.{ListType, MakePreviewArgs} import docspell.restapi.model._ import docspell.restserver.conv.Conversions import docspell.restserver.http4s._ @@ -44,9 +44,10 @@ object CollectiveRoutes { settings.integrationEnabled, Some( OCollective.Classifier( - settings.classifier.enabled, settings.classifier.schedule, - settings.classifier.itemCount + settings.classifier.itemCount, + settings.classifier.categoryList, + settings.classifier.listType ) ) ) @@ -64,11 +65,12 @@ object CollectiveRoutes { c.language, c.integrationEnabled, ClassifierSetting( - c.classifier.exists(_.enabled), c.classifier.map(_.itemCount).getOrElse(0), c.classifier .map(_.schedule) - .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00")) + .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00")), + c.classifier.map(_.categories).getOrElse(Nil), + c.classifier.map(_.listType).getOrElse(ListType.whitelist) ) ) ) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql 
b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql index 11be9909..d0aab38b 100644 --- a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql @@ -14,8 +14,31 @@ select random_uuid() as "id", "cid", concat('tagcategory-', "category") as "name from "classifier_setting" where "file_id" is not null; +alter table "classifier_setting" +add column "categories" text; + +alter table "classifier_setting" +add column "category_list_type" varchar(254); + +update "classifier_setting" +set "category_list_type" = 'whitelist'; + +update "classifier_setting" +set "categories" = concat('["', category, '"]') +where category is not null; + +update "classifier_setting" +set "categories" = '[]' +where category is null; + alter table "classifier_setting" drop column "category"; alter table "classifier_setting" drop column "file_id"; + +ALTER TABLE "classifier_setting" +ALTER COLUMN "categories" SET NOT NULL; + +ALTER TABLE "classifier_setting" +ALTER COLUMN "category_list_type" SET NOT NULL; diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql index d6f9da6e..59bec4b2 100644 --- a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql @@ -14,13 +14,35 @@ select md5(rand()) as id, `cid`,concat('tagcategory-', `category`) as `name`, `f from `classifier_setting` where `file_id` is not null; +alter table `classifier_setting` +add column (`categories` mediumtext); + +alter table `classifier_setting` +add column (`category_list_type` varchar(254)); + +update `classifier_setting` +set `category_list_type` = 'whitelist'; + +update `classifier_setting` +set `categories` = concat('[`', category, '`]') +where 
category is not null; + +update `classifier_setting` +set `categories` = '[]' +where category is null; + alter table `classifier_setting` drop column `category`; --- mariadb needs special treatment when dropping a column that is part --- of an index and foreign key +-- mariadb requires to drop constraint manually when dropping a column alter table `classifier_setting` drop constraint `classifier_setting_ibfk_2`; alter table `classifier_setting` drop column `file_id`; + +ALTER TABLE `classifier_setting` +MODIFY `categories` mediumtext NOT NULL; + +ALTER TABLE `classifier_setting` +MODIFY `category_list_type` varchar(254) NOT NULL; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql index 81e327ff..1e44679a 100644 --- a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql @@ -14,8 +14,31 @@ select md5(random()::text) as id, "cid",'tagcategory-' || "category" as "name", from "classifier_setting" where "file_id" is not null; +alter table "classifier_setting" +add column "categories" text; + +alter table "classifier_setting" +add column "category_list_type" varchar(254); + +update "classifier_setting" +set "category_list_type" = 'whitelist'; + +update "classifier_setting" +set "categories" = concat('["', category, '"]') +where category is not null; + +update "classifier_setting" +set "categories" = '[]' +where category is null; + alter table "classifier_setting" drop column "category"; alter table "classifier_setting" drop column "file_id"; + +ALTER TABLE "classifier_setting" +ALTER COLUMN "categories" SET NOT NULL; + +ALTER TABLE "classifier_setting" +ALTER COLUMN "category_list_type" SET NOT NULL; diff --git a/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala 
b/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala index cbe3ab0f..db60a19e 100644 --- a/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala +++ b/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala @@ -97,6 +97,9 @@ trait DoobieMeta extends EmilDoobieMeta { implicit val metaCustomFieldType: Meta[CustomFieldType] = Meta[String].timap(CustomFieldType.unsafe)(_.name) + + implicit val metaListType: Meta[ListType] = + Meta[String].timap(ListType.unsafeFromString)(_.name) } object DoobieMeta extends DoobieMeta { diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala index cca0079c..2032e61e 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierModel.scala @@ -57,7 +57,12 @@ object RClassifierModel { def updateFile(coll: Ident, name: String, fid: Ident): ConnectionIO[Int] = for { - n <- DML.update(T, T.cid === coll && T.name === name, DML.set(T.fileId.setTo(fid))) + now <- Timestamp.current[ConnectionIO] + n <- DML.update( + T, + T.cid === coll && T.name === name, + DML.set(T.fileId.setTo(fid), T.created.setTo(now)) + ) k <- if (n == 0) createNew[ConnectionIO](coll, name, fid).flatMap(insert) else 0.pure[ConnectionIO] @@ -87,4 +92,11 @@ object RClassifierModel { .query[RClassifierModel] .to[List] + def findAllByQuery( + cid: Ident, + nameQuery: String + ): ConnectionIO[List[RClassifierModel]] = + Select(select(T.all), from(T), T.cid === cid && T.name.like(nameQuery)).build + .query[RClassifierModel] + .to[List] } diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index fe634161..9c31a5c2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ 
b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -1,6 +1,6 @@ package docspell.store.records -import cats.data.NonEmptyList +import cats.data.{NonEmptyList, OptionT} import cats.implicits._ import docspell.common._ @@ -13,23 +13,38 @@ import doobie.implicits._ case class RClassifierSetting( cid: Ident, - enabled: Boolean, schedule: CalEvent, itemCount: Int, - created: Timestamp -) {} + created: Timestamp, + categoryList: List[String], + listType: ListType +) { + + def enabled: Boolean = + listType match { + case ListType.Blacklist => + true + case ListType.Whitelist => + categoryList.nonEmpty + } +} object RClassifierSetting { + // the categoryList is stored as a json array + implicit val stringListMeta: Meta[List[String]] = + jsonMeta[List[String]] + final case class Table(alias: Option[String]) extends TableDef { val tableName = "classifier_setting" - val cid = Column[Ident]("cid", this) - val enabled = Column[Boolean]("enabled", this) - val schedule = Column[CalEvent]("schedule", this) - val itemCount = Column[Int]("item_count", this) - val created = Column[Timestamp]("created", this) + val cid = Column[Ident]("cid", this) + val schedule = Column[CalEvent]("schedule", this) + val itemCount = Column[Int]("item_count", this) + val created = Column[Timestamp]("created", this) + val categories = Column[List[String]]("categories", this) + val listType = Column[ListType]("category_list_type", this) val all = NonEmptyList - .of[Column[_]](cid, enabled, schedule, itemCount, created) + .of[Column[_]](cid, schedule, itemCount, created, categories, listType) } val T = Table(None) @@ -40,29 +55,19 @@ object RClassifierSetting { DML.insert( T, T.all, - fr"${v.cid},${v.enabled},${v.schedule},${v.itemCount},${v.created}" + fr"${v.cid},${v.schedule},${v.itemCount},${v.created},${v.categoryList},${v.listType}" ) - def updateAll(v: RClassifierSetting): ConnectionIO[Int] = - DML.update( - T, - T.cid === v.cid, - DML.set( - 
T.enabled.setTo(v.enabled), - T.schedule.setTo(v.schedule), - T.itemCount.setTo(v.itemCount) - ) - ) - - def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = + def update(v: RClassifierSetting): ConnectionIO[Int] = for { n1 <- DML.update( T, T.cid === v.cid, DML.set( - T.enabled.setTo(v.enabled), T.schedule.setTo(v.schedule), - T.itemCount.setTo(v.itemCount) + T.itemCount.setTo(v.itemCount), + T.categories.setTo(v.categoryList), + T.listType.setTo(v.listType) ) ) n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO] @@ -76,24 +81,62 @@ object RClassifierSetting { def delete(coll: Ident): ConnectionIO[Int] = DML.delete(T, T.cid === coll) + /** Finds tag categories that exist and match the classifier setting. + * If the setting contains a black list, they are removed from the + * existing categories. If it is a whitelist, the intersection is + * returned. + */ + def getActiveCategories(coll: Ident): ConnectionIO[List[String]] = + (for { + sett <- OptionT(findById(coll)) + cats <- OptionT.liftF(RTag.listCategories(coll)) + res = sett.listType match { + case ListType.Blacklist => + cats.diff(sett.categoryList) + case ListType.Whitelist => + sett.categoryList.intersect(cats) + } + } yield res).getOrElse(Nil) + + /** Checks the json array of tag categories and removes those that are not present anymore. 
*/ + def fixCategoryList(coll: Ident): ConnectionIO[Int] = + (for { + sett <- OptionT(findById(coll)) + cats <- OptionT.liftF(RTag.listCategories(coll)) + fixed = sett.categoryList.intersect(cats) + n <- OptionT.liftF( + if (fixed == sett.categoryList) 0.pure[ConnectionIO] + else DML.update(T, T.cid === coll, DML.set(T.categories.setTo(fixed))) + ) + } yield n).getOrElse(0) + case class Classifier( - enabled: Boolean, schedule: CalEvent, - itemCount: Int + itemCount: Int, + categories: List[String], + listType: ListType ) { + def enabled: Boolean = + listType match { + case ListType.Blacklist => + true + case ListType.Whitelist => + categories.nonEmpty + } def toRecord(coll: Ident, created: Timestamp): RClassifierSetting = RClassifierSetting( coll, - enabled, schedule, itemCount, - created + created, + categories, + listType ) } object Classifier { def fromRecord(r: RClassifierSetting): Classifier = - Classifier(r.enabled, r.schedule, r.itemCount) + Classifier(r.schedule, r.itemCount, r.categoryList, r.listType) } } diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala index f6114a38..d1a0cb09 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala @@ -1,6 +1,6 @@ package docspell.store.records -import cats.data.NonEmptyList +import cats.data.{NonEmptyList, OptionT} import fs2.Stream import docspell.common._ @@ -73,13 +73,24 @@ object RCollective { .map(now => settings.classifier.map(_.toRecord(cid, now))) n2 <- cls match { case Some(cr) => - RClassifierSetting.updateSettings(cr) + RClassifierSetting.update(cr) case None => RClassifierSetting.delete(cid) } } yield n1 + n2 - def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = { + // this hides categories that have been deleted in the meantime + // they are finally removed from the json array once the 
learn classifier task is run + def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = + (for { + sett <- OptionT(getRawSettings(coll)) + prev <- OptionT.fromOption[ConnectionIO](sett.classifier) + cats <- OptionT.liftF(RTag.listCategories(coll)) + next = prev.copy(categories = prev.categories.intersect(cats)) + } yield sett.copy(classifier = Some(next))).value + + private def getRawSettings(coll: Ident): ConnectionIO[Option[Settings]] = { + import RClassifierSetting.stringListMeta val c = RCollective.as("c") val cs = RClassifierSetting.as("cs") @@ -87,9 +98,10 @@ object RCollective { select( c.language.s, c.integration.s, - cs.enabled.s, cs.schedule.s, - cs.itemCount.s + cs.itemCount.s, + cs.categories.s, + cs.listType.s ), from(c).leftJoin(cs, cs.cid === c.id), c.id === coll diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm index 1181e239..579506d6 100644 --- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -11,31 +11,38 @@ import Api import Api.Model.ClassifierSetting exposing (ClassifierSetting) import Api.Model.TagList exposing (TagList) import Comp.CalEventInput +import Comp.Dropdown import Comp.FixedDropdown import Comp.IntField import Data.CalEvent exposing (CalEvent) import Data.Flags exposing (Flags) +import Data.ListType exposing (ListType) +import Data.UiSettings exposing (UiSettings) import Data.Validated exposing (Validated(..)) import Html exposing (..) import Html.Attributes exposing (..) 
-import Html.Events exposing (onCheck) import Http +import Markdown import Util.Tag type alias Model = - { enabled : Bool - , scheduleModel : Comp.CalEventInput.Model + { scheduleModel : Comp.CalEventInput.Model , schedule : Validated CalEvent , itemCountModel : Comp.IntField.Model , itemCount : Maybe Int + , categoryListModel : Comp.Dropdown.Model String + , categoryListType : ListType + , categoryListTypeModel : Comp.FixedDropdown.Model ListType } type Msg = ScheduleMsg Comp.CalEventInput.Msg - | ToggleEnabled | ItemCountMsg Comp.IntField.Msg + | GetTagsResp (Result Http.Error TagList) + | CategoryListMsg (Comp.Dropdown.Msg String) + | CategoryListTypeMsg (Comp.FixedDropdown.Msg ListType) init : Flags -> ClassifierSetting -> ( Model, Cmd Msg ) @@ -48,13 +55,41 @@ init flags sett = ( cem, cec ) = Comp.CalEventInput.init flags newSchedule in - ( { enabled = sett.enabled - , scheduleModel = cem + ( { scheduleModel = cem , schedule = Data.Validated.Unknown newSchedule , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" , itemCount = Just sett.itemCount + , categoryListModel = + let + mkOption s = + { value = s, text = s, additional = "" } + + minit = + Comp.Dropdown.makeModel + { multiple = True + , searchable = \n -> n > 0 + , makeOption = mkOption + , labelColor = \_ -> \_ -> "grey " + , placeholder = "Choose categories …" + } + + lm = + Comp.Dropdown.SetSelection sett.categoryList + + ( m_, _ ) = + Comp.Dropdown.update lm minit + in + m_ + , categoryListType = + Data.ListType.fromString sett.listType + |> Maybe.withDefault Data.ListType.Whitelist + , categoryListTypeModel = + Comp.FixedDropdown.initMap Data.ListType.label Data.ListType.all } - , Cmd.map ScheduleMsg cec + , Cmd.batch + [ Api.getTags flags "" GetTagsResp + , Cmd.map ScheduleMsg cec + ] ) @@ -62,10 +97,11 @@ getSettings : Model -> Validated ClassifierSetting getSettings model = Data.Validated.map (\sch -> - { enabled = model.enabled - , schedule = + { schedule = 
Data.CalEvent.makeEvent sch , itemCount = Maybe.withDefault 0 model.itemCount + , listType = Data.ListType.toString model.categoryListType + , categoryList = Comp.Dropdown.getSelected model.categoryListModel } ) model.schedule @@ -74,6 +110,20 @@ getSettings model = update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) update flags msg model = case msg of + GetTagsResp (Ok tl) -> + let + categories = + Util.Tag.getCategories tl.items + |> List.sort + + lm = + Comp.Dropdown.SetOptions categories + in + update flags (CategoryListMsg lm) model + + GetTagsResp (Err _) -> + ( model, Cmd.none ) + ScheduleMsg lmsg -> let ( cm, cc, ce ) = @@ -90,11 +140,6 @@ update flags msg model = , Cmd.map ScheduleMsg cc ) - ToggleEnabled -> - ( { model | enabled = not model.enabled } - , Cmd.none - ) - ItemCountMsg lmsg -> let ( im, iv ) = @@ -107,32 +152,61 @@ update flags msg model = , Cmd.none ) + CategoryListMsg lm -> + let + ( m_, cmd_ ) = + Comp.Dropdown.update lm model.categoryListModel + in + ( { model | categoryListModel = m_ } + , Cmd.map CategoryListMsg cmd_ + ) -view : Model -> Html Msg -view model = + CategoryListTypeMsg lm -> + let + ( m_, sel ) = + Comp.FixedDropdown.update lm model.categoryListTypeModel + + newListType = + Maybe.withDefault model.categoryListType sel + in + ( { model + | categoryListTypeModel = m_ + , categoryListType = newListType + } + , Cmd.none + ) + + +view : UiSettings -> Model -> Html Msg +view settings model = + let + catListTypeItem = + Comp.FixedDropdown.Item + model.categoryListType + (Data.ListType.label model.categoryListType) + in div [] - [ div - [ class "field" + [ Markdown.toHtml [ class "ui basic segment" ] + """ + +Auto-tagging works by learning from existing documents. The more +documents you have correctly tagged, the better. Learning is done +periodically based on a schedule. You can specify tag-groups that +should either be used (whitelist) or not used (blacklist) for +learning. + +Use an empty whitelist to disable auto tagging. 
+ + """ + , div [ class "field" ] + [ label [] [ text "Is the following a blacklist or whitelist?" ] + , Html.map CategoryListTypeMsg + (Comp.FixedDropdown.view (Just catListTypeItem) model.categoryListTypeModel) ] - [ div [ class "ui checkbox" ] - [ input - [ type_ "checkbox" - , onCheck (\_ -> ToggleEnabled) - , checked model.enabled - ] - [] - , label [] [ text "Enable classification" ] - , span [ class "small-info" ] - [ text "Disable document classification if not needed." - ] - ] - ] - , div [ class "ui basic segment" ] - [ text "Document classification tries to predict a tag for new incoming documents. This " - , text "works by learning from existing documents in order to find common patterns within " - , text "the text. The more documents you have correctly tagged, the better. Learning is done " - , text "periodically based on a schedule and you need to specify a tag-group that should " - , text "be used for learning." + , div [ class "field" ] + [ label [] [ text "Choose tag categories for learning" ] + , Html.map CategoryListMsg + (Comp.Dropdown.view settings model.categoryListModel) ] , Html.map ItemCountMsg (Comp.IntField.viewWithInfo diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm index c73217e5..277e11bd 100644 --- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm @@ -280,7 +280,7 @@ view flags settings model = , ( "invisible hidden", not flags.config.showClassificationSettings ) ] ] - [ text "Document Classifier" + [ text "Auto-Tagging" ] , div [ classList @@ -289,13 +289,10 @@ view flags settings model = ] ] [ Html.map ClassifierSettingMsg - (Comp.ClassifierSettingsForm.view model.classifierModel) + (Comp.ClassifierSettingsForm.view settings model.classifierModel) , div [ class "ui vertical segment" ] [ button - [ classList - [ ( "ui small secondary basic button", True ) - , ( 
"disabled", not model.classifierModel.enabled ) - ] + [ class "ui small secondary basic button" , title "Starts a task to train a classifier" , onClick StartClassifierTask ] diff --git a/modules/webapp/src/main/elm/Data/ListType.elm b/modules/webapp/src/main/elm/Data/ListType.elm new file mode 100644 index 00000000..8a9a75fb --- /dev/null +++ b/modules/webapp/src/main/elm/Data/ListType.elm @@ -0,0 +1,50 @@ +module Data.ListType exposing + ( ListType(..) + , all + , fromString + , label + , toString + ) + + +type ListType + = Blacklist + | Whitelist + + +all : List ListType +all = + [ Blacklist, Whitelist ] + + +toString : ListType -> String +toString lt = + case lt of + Blacklist -> + "blacklist" + + Whitelist -> + "whitelist" + + +label : ListType -> String +label lt = + case lt of + Blacklist -> + "Blacklist" + + Whitelist -> + "Whitelist" + + +fromString : String -> Maybe ListType +fromString str = + case String.toLower str of + "blacklist" -> + Just Blacklist + + "whitelist" -> + Just Whitelist + + _ -> + Nothing From 99dcaae66b6008d7ef913f42ba646f572903cdc2 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 20:54:47 +0100 Subject: [PATCH 20/38] Learn classifiers for item entities Learns classifiers for concerned and correspondent entities. This can be used as an alternative to or after nlp. 
--- .../joex/learn/LearnClassifierTask.scala | 62 ++++++--------- .../joex/learn/LearnItemEntities.scala | 74 +++++++++++++++++ .../scala/docspell/joex/learn/LearnTags.scala | 46 +++++++++++ .../docspell/joex/learn/SelectItems.scala | 74 ++++++++++++++--- .../scala/docspell/store/queries/QItem.scala | 79 +++++++++++++++++-- 5 files changed, 284 insertions(+), 51 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 843ee951..354a8e39 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -1,6 +1,5 @@ package docspell.joex.learn -import cats.data.Kleisli import cats.data.OptionT import cats.effect._ import cats.implicits._ @@ -24,6 +23,29 @@ object LearnClassifierTask { def apply[F[_]: Sync: ContextShift]( cfg: Config.TextAnalysis, analyser: TextAnalyser[F] + ): Task[F, Args, Unit] = + learnTags(cfg, analyser) + .flatMap(_ => learnItemEntities(cfg, analyser)) + + private def learnItemEntities[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, + analyser: TextAnalyser[F] + ): Task[F, Args, Unit] = + Task { ctx => + if (cfg.classification.enabled) + LearnItemEntities + .learnAll( + analyser, + ctx.args.collective, + cfg.classification.itemCount + ) + .run(ctx) + else ().pure[F] + } + + private def learnTags[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, + analyser: TextAnalyser[F] ): Task[F, Args, Unit] = Task { ctx => val learnTags = @@ -31,10 +53,11 @@ object LearnClassifierTask { sett <- findActiveSettings[F](ctx, cfg) maxItems = math.min(cfg.classification.itemCount, sett.itemCount) _ <- OptionT.liftF( - 
learnAllTagCategories(analyser)(ctx.args.collective, maxItems).run(ctx) + LearnTags + .learnAllTagCategories(analyser)(ctx.args.collective, maxItems) + .run(ctx) ) } yield () - // learn classifier models from active tag categories learnTags.getOrElseF(logInactiveWarning(ctx.logger)) *> // delete classifier model files for categories that have been removed @@ -45,39 +68,6 @@ object LearnClassifierTask { .map(_ => ()) } - def learnTagCategory[F[_]: Sync: ContextShift, A]( - analyser: TextAnalyser[F], - collective: Ident, - maxItems: Int - )( - category: String - ): Task[F, A, Unit] = - Task { ctx => - val data = SelectItems.forCategory(ctx, collective)(maxItems, category) - ctx.logger.info(s"Learn classifier for tag category: $category") *> - analyser.classifier.trainClassifier(ctx.logger, data)( - Kleisli( - StoreClassifierModel.handleModel( - ctx, - collective, - ClassifierName.tagCategory(category) - ) - ) - ) - } - - def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])( - collective: Ident, - maxItems: Int - ): Task[F, A, Unit] = - Task { ctx => - for { - cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective)) - task = learnTagCategory[F, A](analyser, collective, maxItems) _ - _ <- cats.map(task).traverse(_.run(ctx)) - } yield () - } - private def clearObsoleteTagModels[F[_]: Sync](ctx: Context[F, Args]): F[Unit] = for { list <- ctx.store.transact( diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala new file mode 100644 index 00000000..1dc48975 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala @@ -0,0 +1,74 @@ +package docspell.joex.learn + +import cats.data.Kleisli +import cats.effect._ +import cats.implicits._ +import fs2.Stream + +import docspell.analysis.TextAnalyser +import docspell.analysis.classifier.TextClassifier.Data +import docspell.common._ +import 
docspell.joex.scheduler._ + +object LearnItemEntities { + def learnAll[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + learnCorrOrg(analyser, collective, maxItems) + .flatMap(_ => learnCorrPerson[F, A](analyser, collective, maxItems)) + .flatMap(_ => learnConcPerson(analyser, collective, maxItems)) + .flatMap(_ => learnConcEquip(analyser, collective, maxItems)) + + def learnCorrOrg[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + learn(analyser, collective)( + ClassifierName.correspondentOrg, + ctx => SelectItems.forCorrOrg(ctx.store, collective, maxItems) + ) + + def learnCorrPerson[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + learn(analyser, collective)( + ClassifierName.correspondentPerson, + ctx => SelectItems.forCorrPerson(ctx.store, collective, maxItems) + ) + + def learnConcPerson[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + learn(analyser, collective)( + ClassifierName.concernedPerson, + ctx => SelectItems.forConcPerson(ctx.store, collective, maxItems) + ) + + def learnConcEquip[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + learn(analyser, collective)( + ClassifierName.concernedEquip, + ctx => SelectItems.forConcEquip(ctx.store, collective, maxItems) + ) + + private def learn[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident + )(cname: ClassifierName, data: Context[F, _] => Stream[F, Data]): Task[F, A, Unit] = + Task { ctx => + ctx.logger.info(s"Learn classifier ${cname.name}") *> + analyser.classifier.trainClassifier(ctx.logger, data(ctx))( + Kleisli(StoreClassifierModel.handleModel(ctx, collective, cname)) + ) + } +} diff --git 
a/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala new file mode 100644 index 00000000..b24eb28d --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala @@ -0,0 +1,46 @@ +package docspell.joex.learn + +import cats.data.Kleisli +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.TextAnalyser +import docspell.common._ +import docspell.joex.scheduler._ +import docspell.store.records.RClassifierSetting + +object LearnTags { + + def learnTagCategory[F[_]: Sync: ContextShift, A]( + analyser: TextAnalyser[F], + collective: Ident, + maxItems: Int + )( + category: String + ): Task[F, A, Unit] = + Task { ctx => + val data = SelectItems.forCategory(ctx, collective)(maxItems, category) + ctx.logger.info(s"Learn classifier for tag category: $category") *> + analyser.classifier.trainClassifier(ctx.logger, data)( + Kleisli( + StoreClassifierModel.handleModel( + ctx, + collective, + ClassifierName.tagCategory(category) + ) + ) + ) + } + + def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])( + collective: Ident, + maxItems: Int + ): Task[F, A, Unit] = + Task { ctx => + for { + cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective)) + task = learnTagCategory[F, A](analyser, collective, maxItems) _ + _ <- cats.map(task).traverse(_.run(ctx)) + } yield () + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala index e7c31d7b..c6dab2f0 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala @@ -1,13 +1,15 @@ package docspell.joex.learn -import fs2.Stream +import fs2.{Pipe, Stream} import docspell.analysis.classifier.TextClassifier.Data import docspell.common._ import docspell.joex.scheduler.Context import 
docspell.store.Store import docspell.store.qb.Batch -import docspell.store.queries.QItem +import docspell.store.queries.{QItem, TextAndTag} + +import doobie._ object SelectItems { val pageSep = LearnClassifierTask.pageSep @@ -25,15 +27,67 @@ object SelectItems { max: Int, category: String ): Stream[F, Data] = { - val limit = if (max <= 0) Batch.all else Batch.limit(max) val connStream = - for { - item <- QItem.findAllNewesFirst(collective, 10, limit) - tt <- Stream.eval( - QItem.resolveTextAndTag(collective, item, category, pageSep) - ) - } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) - store.transact(connStream.filter(_.text.nonEmpty)) + allItems(collective, max) + .evalMap(item => QItem.resolveTextAndTag(collective, item, category, pageSep)) + .through(mkData) + store.transact(connStream) } + def forCorrOrg[F[_]]( + store: Store[F], + collective: Ident, + max: Int + ): Stream[F, Data] = { + val connStream = + allItems(collective, max) + .evalMap(item => QItem.resolveTextAndCorrOrg(collective, item, pageSep)) + .through(mkData) + store.transact(connStream) + } + + def forCorrPerson[F[_]]( + store: Store[F], + collective: Ident, + max: Int + ): Stream[F, Data] = { + val connStream = + allItems(collective, max) + .evalMap(item => QItem.resolveTextAndCorrPerson(collective, item, pageSep)) + .through(mkData) + store.transact(connStream) + } + + def forConcPerson[F[_]]( + store: Store[F], + collective: Ident, + max: Int + ): Stream[F, Data] = { + val connStream = + allItems(collective, max) + .evalMap(item => QItem.resolveTextAndConcPerson(collective, item, pageSep)) + .through(mkData) + store.transact(connStream) + } + + def forConcEquip[F[_]]( + store: Store[F], + collective: Ident, + max: Int + ): Stream[F, Data] = { + val connStream = + allItems(collective, max) + .evalMap(item => QItem.resolveTextAndConcEquip(collective, item, pageSep)) + .through(mkData) + store.transact(connStream) + } + + private def allItems(collective: Ident, max: 
Int): Stream[ConnectionIO, Ident] = { + val limit = if (max <= 0) Batch.all else Batch.limit(max) + QItem.findAllNewesFirst(collective, 10, limit) + } + + private def mkData[F[_]]: Pipe[F, TextAndTag, Data] = + _.map(tt => Data(tt.tag.map(_.name).getOrElse(noClass), tt.itemId.id, tt.text.trim)) + .filter(_.text.nonEmpty) } diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index b68afb22..7de59437 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -567,7 +567,7 @@ object QItem { val tagsTid = Column[Ident]("tid", tags) val tagsName = Column[String]("tname", tags) - val q = + readTextAndTag(collective, itemId, pageSep) { withCte( tags -> Select( select(ti.itemId.as(tagsItem), tag.tid.as(tagsTid), tag.name.as(tagsName)), @@ -584,18 +584,87 @@ object QItem { .leftJoin(tags, tagsItem === i.id), i.id === itemId && i.cid === collective && m.content.isNotNull && m.content <> "" ) - ).build + ) + } + } + def resolveTextAndCorrOrg( + collective: Ident, + itemId: Ident, + pageSep: String + ): ConnectionIO[TextAndTag] = + readTextAndTag(collective, itemId, pageSep) { + Select( + select(m.content, org.oid, org.name), + from(i) + .innerJoin(a, a.itemId === i.id) + .innerJoin(m, m.id === a.id) + .leftJoin(org, org.oid === i.corrOrg), + i.id === itemId && m.content.isNotNull && m.content <> "" + ) + } + + def resolveTextAndCorrPerson( + collective: Ident, + itemId: Ident, + pageSep: String + ): ConnectionIO[TextAndTag] = + readTextAndTag(collective, itemId, pageSep) { + Select( + select(m.content, pers0.pid, pers0.name), + from(i) + .innerJoin(a, a.itemId === i.id) + .innerJoin(m, m.id === a.id) + .leftJoin(pers0, pers0.pid === i.corrPerson), + i.id === itemId && m.content.isNotNull && m.content <> "" + ) + } + + def resolveTextAndConcPerson( + collective: Ident, + itemId: Ident, + pageSep: 
String + ): ConnectionIO[TextAndTag] = + readTextAndTag(collective, itemId, pageSep) { + Select( + select(m.content, pers0.pid, pers0.name), + from(i) + .innerJoin(a, a.itemId === i.id) + .innerJoin(m, m.id === a.id) + .leftJoin(pers0, pers0.pid === i.concPerson), + i.id === itemId && m.content.isNotNull && m.content <> "" + ) + } + + def resolveTextAndConcEquip( + collective: Ident, + itemId: Ident, + pageSep: String + ): ConnectionIO[TextAndTag] = + readTextAndTag(collective, itemId, pageSep) { + Select( + select(m.content, equip.eid, equip.name), + from(i) + .innerJoin(a, a.itemId === i.id) + .innerJoin(m, m.id === a.id) + .leftJoin(equip, equip.eid === i.concEquipment), + i.id === itemId && m.content.isNotNull && m.content <> "" + ) + } + + private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)( + q: Select + ): ConnectionIO[TextAndTag] = for { _ <- logger.ftrace[ConnectionIO]( - s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})" + s"query: $q (${itemId.id}, ${collective.id})" ) - texts <- q.query[(String, Option[TextAndTag.TagName])].to[List] + texts <- q.build.query[(String, Option[TextAndTag.TagName])].to[List] _ <- logger.ftrace[ConnectionIO]( s"Got ${texts.size} text and tag entries for item ${itemId.id}" ) tag = texts.headOption.flatMap(_._2) txt = texts.map(_._1).mkString(pageSep) } yield TextAndTag(itemId, txt, tag) - } + } From 5c487ef7a9f4e39d22145c6b272b5aeb51a3a1c9 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 21:30:02 +0100 Subject: [PATCH 21/38] Refactor running classifier in text analysis --- .../scala/docspell/common/MetaProposal.scala | 2 +- .../docspell/joex/learn/ClassifierName.scala | 5 ++ .../scala/docspell/joex/learn/Classify.scala | 43 ++++++++++++++ .../joex/process/ExtractArchive.scala | 2 +- .../docspell/joex/process/TextAnalysis.scala | 58 +++++++------------ 5 files changed, 70 insertions(+), 40 deletions(-) create mode 100644 
modules/joex/src/main/scala/docspell/joex/learn/Classify.scala diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala index a68affff..62a9355f 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala @@ -87,7 +87,7 @@ object MetaProposal { } } - /** Merges candidates with same `IdRef' values and concatenates their + /** Merges candidates with same `IdRef` values and concatenates their * respective labels. The candidate order is preserved. */ def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = { diff --git a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala index 0ed2d97e..c08b96db 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/ClassifierName.scala @@ -31,6 +31,11 @@ object ClassifierName { val correspondentPerson: ClassifierName = apply("correspondentperson") + def findTagClassifiers[F[_]](coll: Ident): ConnectionIO[List[ClassifierName]] = + for { + categories <- RClassifierSetting.getActiveCategories(coll) + } yield categories.map(tagCategory) + def findTagModels[F[_]](coll: Ident): ConnectionIO[List[RClassifierModel]] = for { categories <- RClassifierSetting.getActiveCategories(coll) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala new file mode 100644 index 00000000..ae34d18f --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala @@ -0,0 +1,43 @@ +package docspell.joex.learn + +import java.nio.file.Path +import cats.implicits._ +import bitpeace.RangeDef +import cats.data.OptionT +import cats.effect._ +import docspell.store.Store +import 
docspell.analysis.classifier.{ClassifierModel, TextClassifier} +import docspell.common._ +import docspell.store.records.RClassifierModel + +object Classify { + + def apply[F[_]: Sync: ContextShift]( + blocker: Blocker, + logger: Logger[F], + workingDir: Path, + store: Store[F], + classifier: TextClassifier[F], + coll: Ident, + text: String + )(cname: ClassifierName): F[Option[String]] = + (for { + _ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …")) + model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name))) + modelData = + store.bitpeace + .get(model.fileId.id) + .unNoneTerminate + .through(store.bitpeace.fetchData2(RangeDef.all)) + cls <- OptionT(File.withTempDir(workingDir, "classify").use { dir => + val modelFile = dir.resolve("model.ser.gz") + modelData + .through(fs2.io.file.writeAll(modelFile, blocker)) + .compile + .drain + .flatMap(_ => classifier.classify(logger, ClassifierModel(modelFile), text)) + }).filter(_ != LearnClassifierTask.noClass) + _ <- OptionT.liftF(logger.debug(s"Guessed: ${cls}")) + } yield cls).value + +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index c48952e2..7de6a086 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -42,7 +42,7 @@ object ExtractArchive { archive: Option[RAttachmentArchive] ): Task[F, ProcessItemArgs, (Option[RAttachmentArchive], ItemData)] = singlePass(item, archive).flatMap { t => - if (t._1 == None) Task.pure(t) + if (t._1.isEmpty) Task.pure(t) else multiPass(t._2, t._1) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index fd7c08bc..b2d50f75 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ 
b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,22 +1,18 @@ package docspell.joex.process -import cats.data.OptionT import cats.effect._ import cats.implicits._ - -import docspell.analysis.classifier.{ClassifierModel, TextClassifier} +import docspell.analysis.classifier.TextClassifier import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile -import docspell.joex.learn.{ClassifierName, LearnClassifierTask} +import docspell.joex.learn.{ClassifierName, Classify, LearnClassifierTask} import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.{RAttachmentMeta, RClassifierSetting} -import bitpeace.RangeDef - object TextAnalysis { type Args = ProcessItemArgs @@ -73,40 +69,26 @@ object TextAnalysis { cfg: Config.TextAnalysis, metas: Vector[RAttachmentMeta], classifier: TextClassifier[F] - ): F[List[String]] = + ): F[List[String]] = { + val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) + val classifyWith: ClassifierName => F[Option[String]] = + Classify[F]( + ctx.blocker, + ctx.logger, + cfg.workingDir, + ctx.store, + classifier, + ctx.args.meta.collective, + text + ) for { - models <- ctx.store.transact(ClassifierName.findTagModels(ctx.args.meta.collective)) - _ <- ctx.logger.debug(s"Guessing tags for ${models.size} categories") - tags <- models - .map(_.fileId.some) - .traverse(predictTag(ctx, cfg, metas, classifier)) + names <- ctx.store.transact( + ClassifierName.findTagClassifiers(ctx.args.meta.collective) + ) + _ <- ctx.logger.debug(s"Guessing tags for ${names.size} categories") + tags <- names.traverse(classifyWith) } yield tags.flatten - - def predictTag[F[_]: Sync: ContextShift]( - ctx: Context[F, Args], - cfg: Config.TextAnalysis, - metas: Vector[RAttachmentMeta], - classifier: TextClassifier[F] - )(modelFileId: 
Option[Ident]): F[Option[String]] = - (for { - _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag for ${modelFileId} …")) - model <- OptionT.fromOption[F](modelFileId) - text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) - modelData = - ctx.store.bitpeace - .get(model.id) - .unNoneTerminate - .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir => - val modelFile = dir.resolve("model.ser.gz") - modelData - .through(fs2.io.file.writeAll(modelFile, ctx.blocker)) - .compile - .drain - .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) - }).filter(_ != LearnClassifierTask.noClass) - _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) - } yield cls).value + } private def getActive[F[_]: Sync]( ctx: Context[F, Args], From d124f0c1a9ec257064aabfe4a751fa0de43115fe Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 22:07:48 +0100 Subject: [PATCH 22/38] Rename db changeset It's not just a fix, but adds new things --- ...1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} | 0 ...1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} | 0 ...1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename modules/store/src/main/resources/db/migration/h2/{V1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} (100%) rename modules/store/src/main/resources/db/migration/mariadb/{V1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} (100%) rename modules/store/src/main/resources/db/migration/postgresql/{V1.17.1__classifier_model.sql => V1.18.0__classifier_model.sql} (100%) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql similarity index 100% rename from 
modules/store/src/main/resources/db/migration/h2/V1.17.1__classifier_model.sql rename to modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql similarity index 100% rename from modules/store/src/main/resources/db/migration/mariadb/V1.17.1__classifier_model.sql rename to modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql similarity index 100% rename from modules/store/src/main/resources/db/migration/postgresql/V1.17.1__classifier_model.sql rename to modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql From 1cd34414628eeca481f61b5711aa6aa7c8c0557a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 22:04:13 +0100 Subject: [PATCH 23/38] Run classifier for item entities (concerned, correspondent) Store the results separately from nlp results in attachment metadata. 
--- .../scala/docspell/joex/learn/Classify.scala | 9 ++- .../joex/learn/LearnClassifierTask.scala | 2 +- .../joex/process/AttachmentPageCount.scala | 1 + .../docspell/joex/process/CreateItem.scala | 6 +- .../docspell/joex/process/ItemData.scala | 8 +- .../docspell/joex/process/ReProcessItem.scala | 3 +- .../docspell/joex/process/SaveProposals.scala | 5 +- .../docspell/joex/process/TextAnalysis.scala | 81 +++++++++++++++---- .../h2/V1.19.0__add_classify_meta.sql | 3 + .../mariadb/V1.19.0__add_classify_meta.sql | 3 + .../postgresql/V1.19.0__add_classify_meta.sql | 3 + .../store/records/RAttachmentMeta.scala | 42 +++++++--- .../store/records/RClassifierSetting.scala | 2 +- 13 files changed, 131 insertions(+), 37 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala index ae34d18f..4c65556c 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala @@ -1,15 +1,18 @@ package docspell.joex.learn import java.nio.file.Path -import cats.implicits._ -import bitpeace.RangeDef + import cats.data.OptionT import cats.effect._ -import docspell.store.Store +import cats.implicits._ + import docspell.analysis.classifier.{ClassifierModel, TextClassifier} import docspell.common._ +import docspell.store.Store import docspell.store.records.RClassifierModel +import bitpeace.RangeDef + object Classify { def apply[F[_]: Sync: ContextShift]( diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala 
index 354a8e39..e3aae66f 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -89,7 +89,7 @@ object LearnClassifierTask { ): OptionT[F, OCollective.Classifier] = if (cfg.classification.enabled) OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) - .filter(_.enabled) + .filter(_.autoTagEnabled) .map(OCollective.Classifier.fromRecord) else OptionT.none diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala index 0373db8a..15678322 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -84,6 +84,7 @@ object AttachmentPageCount { Nil, MetaProposalList.empty, md.pageCount.some, + None, None ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index fe21203b..8bc9ccc1 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -107,7 +107,8 @@ object CreateItem { Vector.empty, fm.map(a => a.id -> a.fileId).toMap, MetaProposalList.empty, - Nil + Nil, + None ) } @@ -166,7 +167,8 @@ object CreateItem { Vector.empty, origMap, MetaProposalList.empty, - Nil + Nil, + None ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 0435e37c..a151e8a6 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} * containng the source or origin file * 
@param givenMeta meta data to this item that was not "guessed" * from an attachment but given and thus is always correct + * @param classifyProposals these are proposals that were obtained by + * a trained classifier. There are no ner-tags, it will only provide a + * single label */ case class ItemData( item: RItem, @@ -23,7 +26,10 @@ case class ItemData( dateLabels: Vector[AttachmentDates], originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id givenMeta: MetaProposalList, // given meta data not associated to a specific attachment - tags: List[String] // a list of tags (names or ids) attached to the item if they exist + // a list of tags (names or ids) attached to the item if they exist + tags: List[String], + // proposals obtained from the classifier + classifyProposals: Option[MetaProposalList] ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 07fb2901..db41e901 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -65,7 +65,8 @@ object ReProcessItem { Vector.empty, asrcMap.view.mapValues(_.fileId).toMap, MetaProposalList.empty, - Nil + Nil, + None )).getOrElseF( Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}")) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala index ee4fd923..9d2f0ae3 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -17,7 +17,10 @@ object SaveProposals { data.metas .traverse(rm => ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> - ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, 
rm.proposals)) + ctx.store.transact( + RAttachmentMeta + .updateProposals(rm.id, rm.proposals, data.classifyProposals) + ) ) .map(_ => data) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index b2d50f75..a2561e07 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,9 +1,12 @@ package docspell.joex.process +import cats.Traverse import cats.effect._ import cats.implicits._ + import docspell.analysis.classifier.TextClassifier import docspell.analysis.{NlpSettings, TextAnalyser} +import docspell.common.MetaProposal.Candidate import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile @@ -37,12 +40,22 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - classifierEnabled <- getActive(ctx, cfg) + autoTagEnabled <- getActiveAutoTag(ctx, cfg) tag <- - if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) + if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) else List.empty[String].pure[F] + + classProposals <- + if (cfg.classification.enabled) + predictItemEntities(ctx, cfg, item.metas, analyser.classifier) + else MetaProposalList.empty.pure[F] + } yield item - .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + .copy( + metas = v.map(_._1), + dateLabels = v.map(_._2), + classifyProposals = classProposals.some + ) .appendTags(tag) } @@ -72,15 +85,8 @@ object TextAnalysis { ): F[List[String]] = { val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) val classifyWith: ClassifierName => F[Option[String]] = - Classify[F]( - ctx.blocker, - ctx.logger, - cfg.workingDir, - ctx.store, - classifier, - ctx.args.meta.collective, - text - ) + makeClassify(ctx, cfg, classifier)(text) + for { names <- 
ctx.store.transact( ClassifierName.findTagClassifiers(ctx.args.meta.collective) @@ -90,14 +96,61 @@ object TextAnalysis { } yield tags.flatten } - private def getActive[F[_]: Sync]( + def predictItemEntities[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): F[MetaProposalList] = { + val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) + + def classifyWith( + cname: ClassifierName, + mtype: MetaProposalType + ): F[Option[MetaProposal]] = + for { + _ <- ctx.logger.debug(s"Guessing $mtype using classifier") + label <- makeClassify(ctx, cfg, classifier)(text).apply(cname) + } yield label.map(str => + MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), str), Set.empty)) + ) + + Traverse[List] + .sequence( + List( + classifyWith(ClassifierName.correspondentOrg, MetaProposalType.CorrOrg), + classifyWith(ClassifierName.correspondentPerson, MetaProposalType.CorrPerson), + classifyWith(ClassifierName.concernedPerson, MetaProposalType.ConcPerson), + classifyWith(ClassifierName.concernedEquip, MetaProposalType.ConcEquip) + ) + ) + .map(_.flatten) + .map(MetaProposalList.apply) + } + + private def makeClassify[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + classifier: TextClassifier[F] + )(text: String): ClassifierName => F[Option[String]] = + Classify[F]( + ctx.blocker, + ctx.logger, + cfg.workingDir, + ctx.store, + classifier, + ctx.args.meta.collective, + text + ) + + private def getActiveAutoTag[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis ): F[Boolean] = if (cfg.classification.enabled) ctx.store .transact(RClassifierSetting.findById(ctx.args.meta.collective)) - .map(_.exists(_.enabled)) + .map(_.exists(_.autoTagEnabled)) .flatTap(enabled => if (enabled) ().pure[F] else ctx.logger.info("Classification is disabled. 
Check config or settings.") diff --git a/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..2513dc8d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "classify_proposals" text; + diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..fdc3c9f0 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE `attachmentmeta` +ADD COLUMN (`classify_proposals` mediumtext); + diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..2513dc8d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "classify_proposals" text; + diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index 919a5b17..f201525c 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -16,7 +16,8 @@ case class RAttachmentMeta( nerlabels: List[NerLabel], proposals: MetaProposalList, pages: Option[Int], - language: Option[Language] + language: Option[Language], + classifyProposals: Option[MetaProposalList] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = @@ -29,19 +30,28 @@ case class RAttachmentMeta( 
object RAttachmentMeta { def empty(attachId: Ident, lang: Language) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None) final case class Table(alias: Option[String]) extends TableDef { val tableName = "attachmentmeta" - val id = Column[Ident]("attachid", this) - val content = Column[String]("content", this) - val nerlabels = Column[List[NerLabel]]("nerlabels", this) - val proposals = Column[MetaProposalList]("itemproposals", this) - val pages = Column[Int]("page_count", this) - val language = Column[Language]("language", this) + val id = Column[Ident]("attachid", this) + val content = Column[String]("content", this) + val nerlabels = Column[List[NerLabel]]("nerlabels", this) + val proposals = Column[MetaProposalList]("itemproposals", this) + val pages = Column[Int]("page_count", this) + val language = Column[Language]("language", this) + val classifyProposals = Column[MetaProposalList]("classify_proposals", this) val all = - NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language) + NonEmptyList.of[Column[_]]( + id, + content, + nerlabels, + proposals, + pages, + language, + classifyProposals + ) } val T = Table(None) @@ -52,7 +62,7 @@ object RAttachmentMeta { DML.insert( T, T.all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}" ) def exists(attachId: Ident): ConnectionIO[Boolean] = @@ -80,7 +90,8 @@ object RAttachmentMeta { DML.set( T.content.setTo(v.content), T.nerlabels.setTo(v.nerlabels), - T.proposals.setTo(v.proposals) + T.proposals.setTo(v.proposals), + T.classifyProposals.setTo(v.classifyProposals) ) ) @@ -93,12 +104,17 @@ object RAttachmentMeta { ) ) - def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] = + def updateProposals( + mid: Ident, + plist: 
MetaProposalList, + clist: Option[MetaProposalList] + ): ConnectionIO[Int] = DML.update( T, T.id === mid, DML.set( - T.proposals.setTo(plist) + T.proposals.setTo(plist), + T.classifyProposals.setTo(clist) ) ) diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index 9c31a5c2..1d7fd5f6 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -20,7 +20,7 @@ case class RClassifierSetting( listType: ListType ) { - def enabled: Boolean = + def autoTagEnabled: Boolean = listType match { case ListType.Blacklist => true From 8455d1badf88d50f0cfc450e022c32a6c47e3d5e Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 22:56:01 +0100 Subject: [PATCH 24/38] Lookup results from classifier The model may be out of date, data may change. Then it should be looked up to fetch the id to be compatible with next stages. --- .../docspell/joex/process/FindProposal.scala | 82 +++++++++++++++++-- .../docspell/joex/process/SaveProposals.scala | 4 +- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index 2de0de71..4f984b10 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -5,7 +5,6 @@ import java.time.ZoneId import cats.effect.Sync import cats.implicits._ import cats.{Applicative, FlatMap} - import docspell.analysis.contact._ import docspell.common.MetaProposal.Candidate import docspell.common._ @@ -17,22 +16,93 @@ import docspell.store.records._ * by looking up values from NER in the users address book. 
*/ object FindProposal { + type Args = ProcessItemArgs def apply[F[_]: Sync]( cfg: Config.Processing - )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = + )(data: ItemData): Task[F, Args, ItemData] = Task { ctx => val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels))) - - ctx.logger.info("Starting find-proposal") *> - rmas + for { + _ <- ctx.logger.info("Starting find-proposal") + rmv <- rmas .traverse(rm => processAttachment(cfg, rm, data.findDates(rm), ctx) .map(ml => rm.copy(proposals = ml)) ) - .map(rmv => data.copy(metas = rmv)) + clp <- data.classifyProposals match { + case Some(cmp) => lookupClassifierProposals(ctx, cmp) + case None => MetaProposalList.empty.pure[F] + } + } yield data.copy(metas = rmv, classifyProposals = clp.some) } + def lookupClassifierProposals[F[_]: Sync]( + ctx: Context[F, Args], + mpList: MetaProposalList + ): F[MetaProposalList] = { + val coll = ctx.args.meta.collective + + def lookup(mp: MetaProposal): F[Option[IdRef]] = + mp.proposalType match { + case MetaProposalType.CorrOrg => + ctx.store + .transact( + ROrganization + .findLike(coll, mp.values.head.ref.name.toLowerCase) + .map(_.headOption) + ) + .flatTap(oref => + ctx.logger.debug(s"Found classifier organization for $mp: $oref") + ) + case MetaProposalType.CorrPerson => + ctx.store + .transact( + RPerson + .findLike(coll, mp.values.head.ref.name.toLowerCase, false) + .map(_.headOption) + ) + .flatTap(oref => + ctx.logger.debug(s"Found classifier corr-person for $mp: $oref") + ) + case MetaProposalType.ConcPerson => + ctx.store + .transact( + RPerson + .findLike(coll, mp.values.head.ref.name.toLowerCase, true) + .map(_.headOption) + ) + .flatTap(oref => + ctx.logger.debug(s"Found classifier conc-person for $mp: $oref") + ) + case MetaProposalType.ConcEquip => + ctx.store + .transact( + REquipment + .findLike(coll, mp.values.head.ref.name.toLowerCase) + .map(_.headOption) + ) + .flatTap(oref => + ctx.logger.debug(s"Found classifier conc-equip 
for $mp: $oref") + ) + case MetaProposalType.DocDate => + (None: Option[IdRef]).pure[F] + + case MetaProposalType.DueDate => + (None: Option[IdRef]).pure[F] + } + + def updateRef(mp: MetaProposal)(idRef: Option[IdRef]): Option[MetaProposal] = + idRef // this proposal contains a single value only, since coming from classifier + .map(ref => mp.copy(values = mp.values.map(_.copy(ref = ref)))) + + ctx.logger.debug(s"Looking up classifier results: ${mpList.proposals}") *> + mpList.proposals + .traverse(mp => lookup(mp).map(updateRef(mp))) + .map(_.flatten) + .map(MetaProposalList.apply) + } + def processAttachment[F[_]: Sync]( cfg: Config.Processing, rm: RAttachmentMeta, diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala index 9d2f0ae3..d8abf308 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -16,7 +16,9 @@ object SaveProposals { ctx.logger.info("Storing proposals") *> data.metas .traverse(rm => - ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> + ctx.logger.debug( + s"Storing attachment proposals: ${rm.proposals} and ${data.classifyProposals}" + ) *> ctx.store.transact( RAttachmentMeta .updateProposals(rm.id, rm.proposals, data.classifyProposals) From 75573c905e40a5eb57b877df9198c0deab76ab7f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 23:13:34 +0100 Subject: [PATCH 25/38] Use classifier results as fallback when linking proposed metadata --- .../joex/src/main/scala/docspell/joex/process/LinkProposal.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala index 58df16ac..6108e216 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala +++ 
b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala @@ -24,6 +24,7 @@ object LinkProposal { .flatten(data.metas.map(_.proposals)) .filter(_.proposalType != MetaProposalType.DocDate) .sortByWeights + .fillEmptyFrom(data.classifyProposals.getOrElse(MetaProposalList.empty)) ctx.logger.info(s"Starting linking proposals") *> MetaProposalType.all From 3ff9284a64429e13b67639b4d2a2e5bdbca3d803 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 23:13:51 +0100 Subject: [PATCH 26/38] Return classifier results as suggestions --- .../scala/docspell/store/queries/QAttachment.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index a9afc0bf..b1fb11b8 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -119,7 +119,7 @@ object QAttachment { def getMetaProposals(itemId: Ident, coll: Ident): ConnectionIO[MetaProposalList] = { val q = Select( - am.proposals.s, + select(am.proposals, am.classifyProposals), from(am) .innerJoin(a, a.id === am.id) .innerJoin(item, a.itemId === item.id), @@ -127,8 +127,15 @@ object QAttachment { ).build for { - ml <- q.query[MetaProposalList].to[Vector] - } yield MetaProposalList.flatten(ml) + ml <- q.query[(MetaProposalList, Option[MetaProposalList])].to[Vector] + pairs = ml.foldLeft( + (Vector.empty[MetaProposalList], Vector.empty[MetaProposalList]) + ) { case ((vl, vr), (m, o)) => + (vl.appended(m), o.map(vr.appended).getOrElse(vr)) + } + } yield MetaProposalList + .flatten(pairs._1) + .fillEmptyFrom(MetaProposalList.flatten(pairs._2)) } def getAttachmentMeta( From 9d83cb7fe461ef12823e2379b7658b92fd49c676 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 19 Jan 2021 23:48:09 +0100 Subject: [PATCH 27/38] Store item based proposals 
in separate table Classifier don't work on each attachment, but on all. So the results must not be stored at an attachment. This reverts some previous changes to put the classifier results for item entities into its own table. --- .../joex/process/AttachmentPageCount.scala | 1 - .../docspell/joex/process/SaveProposals.scala | 39 ++++++++---- .../h2/V1.19.0__add_classify_meta.sql | 10 +++- .../mariadb/V1.19.0__add_classify_meta.sql | 10 +++- .../postgresql/V1.19.0__add_classify_meta.sql | 10 +++- .../docspell/store/impl/DoobieMeta.scala | 3 + .../docspell/store/queries/QAttachment.scala | 24 ++++---- .../scala/docspell/store/queries/QItem.scala | 3 +- .../store/records/RAttachmentMeta.scala | 34 ++++------- .../store/records/RItemProposal.scala | 60 +++++++++++++++++++ 10 files changed, 142 insertions(+), 52 deletions(-) create mode 100644 modules/store/src/main/scala/docspell/store/records/RItemProposal.scala diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala index 15678322..0373db8a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -84,7 +84,6 @@ object AttachmentPageCount { Nil, MetaProposalList.empty, md.pageCount.some, - None, None ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala index d8abf308..060e718e 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -2,9 +2,9 @@ package docspell.joex.process import cats.effect.Sync import cats.implicits._ - import docspell.common._ import docspell.joex.scheduler.Task +import docspell.store.AddResult import docspell.store.records._ /** Saves the proposals in the database 
@@ -13,17 +13,36 @@ object SaveProposals { def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - ctx.logger.info("Storing proposals") *> - data.metas + for { + _ <- ctx.logger.info("Storing proposals") + _ <- data.metas .traverse(rm => ctx.logger.debug( - s"Storing attachment proposals: ${rm.proposals} and ${data.classifyProposals}" - ) *> - ctx.store.transact( - RAttachmentMeta - .updateProposals(rm.id, rm.proposals, data.classifyProposals) - ) + s"Storing attachment proposals: ${rm.proposals}" + ) *> ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) ) - .map(_ => data) + _ <- data.classifyProposals match { + case Some(clp) => + val itemId = data.item.id + ctx.logger.debug(s"Storing classifier proposals: $clp") *> + ctx.store + .add( + RItemProposal.createNew(itemId, clp), + RItemProposal.exists(itemId) + ) + .flatMap({ + case AddResult.EntityExists(_) => + ctx.store.transact(RItemProposal.updateProposals(itemId, clp)) + case AddResult.Failure(ex) => + ctx.logger + .warn(s"Could not store classifier proposals: ${ex.getMessage}") *> + 0.pure[F] + case AddResult.Success => + 1.pure[F] + }) + case None => + 0.pure[F] + } + } yield data } } diff --git a/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql index 2513dc8d..b1c6a6e4 100644 --- a/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql +++ b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql @@ -1,3 +1,7 @@ -ALTER TABLE "attachmentmeta" -ADD COLUMN "classify_proposals" text; - +CREATE TABLE "item_proposal" ( + "itemid" varchar(254) not null primary key, + "classifier_proposals" text not null, + "classifier_tags" text not null, + "created" timestamp not null, + foreign key ("itemid") references "item"("itemid") +); diff --git 
a/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql index fdc3c9f0..08f947b3 100644 --- a/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql @@ -1,3 +1,7 @@ -ALTER TABLE `attachmentmeta` -ADD COLUMN (`classify_proposals` mediumtext); - +CREATE TABLE `item_proposal` ( + `itemid` varchar(254) not null primary key, + `classifier_proposals` mediumtext not null, + `classifier_tags` mediumtext not null, + `created` timestamp not null, + foreign key (`itemid`) references `item`(`itemid`) +); diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql index 2513dc8d..b1c6a6e4 100644 --- a/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql @@ -1,3 +1,7 @@ -ALTER TABLE "attachmentmeta" -ADD COLUMN "classify_proposals" text; - +CREATE TABLE "item_proposal" ( + "itemid" varchar(254) not null primary key, + "classifier_proposals" text not null, + "classifier_tags" text not null, + "created" timestamp not null, + foreign key ("itemid") references "item"("itemid") +); diff --git a/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala b/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala index db60a19e..8952891f 100644 --- a/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala +++ b/modules/store/src/main/scala/docspell/store/impl/DoobieMeta.scala @@ -86,6 +86,9 @@ trait DoobieMeta extends EmilDoobieMeta { implicit val metaItemProposalList: Meta[MetaProposalList] = jsonMeta[MetaProposalList] + implicit val metaIdRef: Meta[List[IdRef]] = + 
jsonMeta[List[IdRef]] + implicit val metaLanguage: Meta[Language] = Meta[String].imap(Language.unsafe)(_.iso3) diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index b1fb11b8..89c11faf 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -21,6 +21,7 @@ object QAttachment { private val item = RItem.as("i") private val am = RAttachmentMeta.as("am") private val c = RCollective.as("c") + private val im = RItemProposal.as("im") def deletePreview[F[_]: Sync](store: Store[F])(attachId: Ident): F[Int] = { val findPreview = @@ -118,24 +119,27 @@ object QAttachment { } yield ns.sum def getMetaProposals(itemId: Ident, coll: Ident): ConnectionIO[MetaProposalList] = { - val q = Select( - select(am.proposals, am.classifyProposals), + val qa = Select( + select(am.proposals), from(am) .innerJoin(a, a.id === am.id) .innerJoin(item, a.itemId === item.id), a.itemId === itemId && item.cid === coll ).build + val qi = Select( + select(im.classifyProposals), + from(im) + .innerJoin(item, item.id === im.itemId), + item.cid === coll && im.itemId === itemId + ).build + for { - ml <- q.query[(MetaProposalList, Option[MetaProposalList])].to[Vector] - pairs = ml.foldLeft( - (Vector.empty[MetaProposalList], Vector.empty[MetaProposalList]) - ) { case ((vl, vr), (m, o)) => - (vl.appended(m), o.map(vr.appended).getOrElse(vr)) - } + mla <- qa.query[MetaProposalList].to[Vector] + mli <- qi.query[MetaProposalList].to[Vector] } yield MetaProposalList - .flatten(pairs._1) - .fillEmptyFrom(MetaProposalList.flatten(pairs._2)) + .flatten(mla) + .fillEmptyFrom(MetaProposalList.flatten(mli)) } def getAttachmentMeta( diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 7de59437..7a53a192 100644 --- 
a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -441,8 +441,9 @@ object QItem { tn <- store.transact(RTagItem.deleteItemTags(itemId)) mn <- store.transact(RSentMail.deleteByItem(itemId)) cf <- store.transact(RCustomFieldValue.deleteByItem(itemId)) + im <- store.transact(RItemProposal.deleteByItem(itemId)) n <- store.transact(RItem.deleteByIdAndCollective(itemId, collective)) - } yield tn + rn + n + mn + cf + } yield tn + rn + n + mn + cf + im private def findByFileIdsQuery( fileMetaIds: Nel[Ident], diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index f201525c..5bc8feea 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -16,8 +16,7 @@ case class RAttachmentMeta( nerlabels: List[NerLabel], proposals: MetaProposalList, pages: Option[Int], - language: Option[Language], - classifyProposals: Option[MetaProposalList] + language: Option[Language] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = @@ -30,18 +29,17 @@ case class RAttachmentMeta( object RAttachmentMeta { def empty(attachId: Ident, lang: Language) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None) + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) final case class Table(alias: Option[String]) extends TableDef { val tableName = "attachmentmeta" - val id = Column[Ident]("attachid", this) - val content = Column[String]("content", this) - val nerlabels = Column[List[NerLabel]]("nerlabels", this) - val proposals = Column[MetaProposalList]("itemproposals", this) - val pages = Column[Int]("page_count", this) - val language = Column[Language]("language", this) - val classifyProposals = 
Column[MetaProposalList]("classify_proposals", this) + val id = Column[Ident]("attachid", this) + val content = Column[String]("content", this) + val nerlabels = Column[List[NerLabel]]("nerlabels", this) + val proposals = Column[MetaProposalList]("itemproposals", this) + val pages = Column[Int]("page_count", this) + val language = Column[Language]("language", this) val all = NonEmptyList.of[Column[_]]( id, @@ -49,8 +47,7 @@ object RAttachmentMeta { nerlabels, proposals, pages, - language, - classifyProposals + language ) } @@ -62,7 +59,7 @@ object RAttachmentMeta { DML.insert( T, T.all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" ) def exists(attachId: Ident): ConnectionIO[Boolean] = @@ -90,8 +87,7 @@ object RAttachmentMeta { DML.set( T.content.setTo(v.content), T.nerlabels.setTo(v.nerlabels), - T.proposals.setTo(v.proposals), - T.classifyProposals.setTo(v.classifyProposals) + T.proposals.setTo(v.proposals) ) ) @@ -106,16 +102,12 @@ object RAttachmentMeta { def updateProposals( mid: Ident, - plist: MetaProposalList, - clist: Option[MetaProposalList] + plist: MetaProposalList ): ConnectionIO[Int] = DML.update( T, T.id === mid, - DML.set( - T.proposals.setTo(plist), - T.classifyProposals.setTo(clist) - ) + DML.set(T.proposals.setTo(plist)) ) def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] = diff --git a/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala b/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala new file mode 100644 index 00000000..822404ce --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala @@ -0,0 +1,60 @@ +package docspell.store.records + +import cats.data.NonEmptyList +//import cats.implicits._ + +import docspell.common._ +import docspell.store.qb.DSL._ +import docspell.store.qb._ + +import doobie._ 
+import doobie.implicits._ + +case class RItemProposal( + itemId: Ident, + classifyProposals: MetaProposalList, + classifyTags: List[IdRef], + created: Timestamp +) + +object RItemProposal { + final case class Table(alias: Option[String]) extends TableDef { + val tableName = "item_proposal" + + val itemId = Column[Ident]("itemid", this) + val classifyProposals = Column[MetaProposalList]("classifier_proposals", this) + val classifyTags = Column[List[IdRef]]("classifier_tags", this) + val created = Column[Timestamp]("created", this) + val all = NonEmptyList.of[Column[_]](itemId, classifyProposals, classifyTags, created) + } + + val T = Table(None) + def as(alias: String): Table = + Table(Some(alias)) + + def insert(v: RItemProposal): ConnectionIO[Int] = + DML.insert( + T, + T.all, + fr"${v.itemId},${v.classifyProposals},${v.classifyTags},${v.created}" + ) + + def deleteByItem(itemId: Ident): ConnectionIO[Int] = + DML.delete(T, T.itemId === itemId) + + def createNew(itemId: Ident, proposals: MetaProposalList): ConnectionIO[Int] = + for { + now <- Timestamp.current[ConnectionIO] + value = RItemProposal(itemId, proposals, Nil, now) + n <- insert(value) + } yield n + + def exists(itemId: Ident): ConnectionIO[Boolean] = + Select(select(countAll), from(T), T.itemId === itemId).build + .query[Int] + .unique + .map(_ > 0) + + def updateProposals(itemId: Ident, proposals: MetaProposalList): ConnectionIO[Int] = + DML.update(T, T.itemId === itemId, DML.set(T.classifyProposals.setTo(proposals))) +} From 27c24c128d179d0cb462cb9e48c46506c3772992 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 00:30:40 +0100 Subject: [PATCH 28/38] Store tags guessed with classifier in database --- .../docspell/joex/process/CreateItem.scala | 6 ++- .../docspell/joex/process/FindProposal.scala | 8 ++- .../docspell/joex/process/ItemData.scala | 3 +- .../docspell/joex/process/LinkProposal.scala | 2 +- .../docspell/joex/process/ReProcessItem.scala | 3 +- 
.../docspell/joex/process/SaveProposals.scala | 54 ++++++++++--------- .../docspell/joex/process/SetGivenData.scala | 3 +- .../docspell/joex/process/TextAnalysis.scala | 4 +- .../store/records/RItemProposal.scala | 20 +++---- 9 files changed, 56 insertions(+), 47 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 8bc9ccc1..c24ad98c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -108,7 +108,8 @@ object CreateItem { fm.map(a => a.id -> a.fileId).toMap, MetaProposalList.empty, Nil, - None + MetaProposalList.empty, + Nil ) } @@ -168,7 +169,8 @@ object CreateItem { origMap, MetaProposalList.empty, Nil, - None + MetaProposalList.empty, + Nil ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index 4f984b10..fa484772 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -5,6 +5,7 @@ import java.time.ZoneId import cats.effect.Sync import cats.implicits._ import cats.{Applicative, FlatMap} + import docspell.analysis.contact._ import docspell.common.MetaProposal.Candidate import docspell.common._ @@ -30,11 +31,8 @@ object FindProposal { processAttachment(cfg, rm, data.findDates(rm), ctx) .map(ml => rm.copy(proposals = ml)) ) - clp <- data.classifyProposals match { - case Some(cmp) => lookupClassifierProposals(ctx, cmp) - case None => MetaProposalList.empty.pure[F] - } - } yield data.copy(metas = rmv, classifyProposals = clp.some) + clp <- lookupClassifierProposals(ctx, data.classifyProposals) + } yield data.copy(metas = rmv, classifyProposals = clp) } def lookupClassifierProposals[F[_]: Sync]( diff --git 
a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index a151e8a6..f7f52fe5 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -29,7 +29,8 @@ case class ItemData( // a list of tags (names or ids) attached to the item if they exist tags: List[String], // proposals obtained from the classifier - classifyProposals: Option[MetaProposalList] + classifyProposals: MetaProposalList, + classifyTags: List[String] ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala index 6108e216..be8d34c8 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala @@ -24,7 +24,7 @@ object LinkProposal { .flatten(data.metas.map(_.proposals)) .filter(_.proposalType != MetaProposalType.DocDate) .sortByWeights - .fillEmptyFrom(data.classifyProposals.getOrElse(MetaProposalList.empty)) + .fillEmptyFrom(data.classifyProposals) ctx.logger.info(s"Starting linking proposals") *> MetaProposalType.all diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index db41e901..42db6033 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -66,7 +66,8 @@ object ReProcessItem { asrcMap.view.mapValues(_.fileId).toMap, MetaProposalList.empty, Nil, - None + MetaProposalList.empty, + Nil )).getOrElseF( Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}")) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala 
b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala index 060e718e..dfe4e1e2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -2,16 +2,18 @@ package docspell.joex.process import cats.effect.Sync import cats.implicits._ + import docspell.common._ -import docspell.joex.scheduler.Task +import docspell.joex.scheduler.{Context, Task} import docspell.store.AddResult import docspell.store.records._ /** Saves the proposals in the database */ object SaveProposals { + type Args = ProcessItemArgs - def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = + def apply[F[_]: Sync](data: ItemData): Task[F, Args, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Storing proposals") @@ -21,28 +23,32 @@ object SaveProposals { s"Storing attachment proposals: ${rm.proposals}" ) *> ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) ) - _ <- data.classifyProposals match { - case Some(clp) => - val itemId = data.item.id - ctx.logger.debug(s"Storing classifier proposals: $clp") *> - ctx.store - .add( - RItemProposal.createNew(itemId, clp), - RItemProposal.exists(itemId) - ) - .flatMap({ - case AddResult.EntityExists(_) => - ctx.store.transact(RItemProposal.updateProposals(itemId, clp)) - case AddResult.Failure(ex) => - ctx.logger - .warn(s"Could not store classifier proposals: ${ex.getMessage}") *> - 0.pure[F] - case AddResult.Success => - 1.pure[F] - }) - case None => - 0.pure[F] - } + _ <- + if (data.classifyProposals.isEmpty && data.classifyTags.isEmpty) 0.pure[F] + else saveItemProposal(ctx, data) } yield data } + + def saveItemProposal[F[_]: Sync](ctx: Context[F, Args], data: ItemData): F[Unit] = { + def upsert(v: RItemProposal): F[Int] = + ctx.store.add(RItemProposal.insert(v), RItemProposal.exists(v.itemId)).flatMap { + case AddResult.Success => 1.pure[F] + case AddResult.EntityExists(_) => + 
ctx.store.transact(RItemProposal.update(v)) + case AddResult.Failure(ex) => + ctx.logger.warn(s"Could not store item proposals: ${ex.getMessage}") *> 0 + .pure[F] + } + + for { + _ <- ctx.logger.debug(s"Storing classifier proposals: ${data.classifyProposals}") + tags <- ctx.store.transact( + RTag.findAllByNameOrId(data.classifyTags, ctx.args.meta.collective) + ) + tagRefs = tags.map(t => IdRef(t.tagId, t.name)) + now <- Timestamp.current[F] + value = RItemProposal(data.item.id, data.classifyProposals, tagRefs.toList, now) + _ <- upsert(value) + } yield () + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala index 99348419..b668dbe9 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala @@ -45,7 +45,8 @@ object SetGivenData { Task { ctx => val itemId = data.item.id val collective = ctx.args.meta.collective - val tags = (ctx.args.meta.tags.getOrElse(Nil) ++ data.tags).distinct + val tags = + (ctx.args.meta.tags.getOrElse(Nil) ++ data.tags ++ data.classifyTags).distinct for { _ <- ctx.logger.info(s"Set tags from given data: ${tags}") e <- ops.linkTags(itemId, tags, collective).attempt diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index a2561e07..a3c4edb5 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -54,9 +54,9 @@ object TextAnalysis { .copy( metas = v.map(_._1), dateLabels = v.map(_._2), - classifyProposals = classProposals.some + classifyProposals = classProposals, + classifyTags = tag ) - .appendTags(tag) } def annotateAttachment[F[_]: Sync]( diff --git a/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala 
b/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala index 822404ce..c5d44cc0 100644 --- a/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala +++ b/modules/store/src/main/scala/docspell/store/records/RItemProposal.scala @@ -1,7 +1,6 @@ package docspell.store.records import cats.data.NonEmptyList -//import cats.implicits._ import docspell.common._ import docspell.store.qb.DSL._ @@ -39,22 +38,23 @@ object RItemProposal { fr"${v.itemId},${v.classifyProposals},${v.classifyTags},${v.created}" ) + def update(v: RItemProposal): ConnectionIO[Int] = + DML.update( + T, + T.itemId === v.itemId, + DML.set( + T.classifyProposals.setTo(v.classifyProposals), + T.classifyTags.setTo(v.classifyTags) + ) + ) + def deleteByItem(itemId: Ident): ConnectionIO[Int] = DML.delete(T, T.itemId === itemId) - def createNew(itemId: Ident, proposals: MetaProposalList): ConnectionIO[Int] = - for { - now <- Timestamp.current[ConnectionIO] - value = RItemProposal(itemId, proposals, Nil, now) - n <- insert(value) - } yield n - def exists(itemId: Ident): ConnectionIO[Boolean] = Select(select(countAll), from(T), T.itemId === itemId).build .query[Int] .unique .map(_ > 0) - def updateProposals(itemId: Ident, proposals: MetaProposalList): ConnectionIO[Int] = - DML.update(T, T.itemId === itemId, DML.set(T.classifyProposals.setTo(proposals))) } From b12d965223eda87a1660ed4bcc4655596002be34 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 00:40:58 +0100 Subject: [PATCH 29/38] Improve logging --- .../src/main/scala/docspell/joex/learn/Classify.scala | 4 +++- .../main/scala/docspell/joex/process/TextAnalysis.scala | 6 +++--- .../main/scala/docspell/joex/process/TextExtraction.scala | 8 ++++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala index 4c65556c..4d4c2676 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala @@ -25,8 +25,9 @@ object Classify { text: String )(cname: ClassifierName): F[Option[String]] = (for { - _ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …")) + _ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …")) model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name))) + .flatTapNone(logger.debug("No classifier model found.")) modelData = store.bitpeace .get(model.fileId.id) @@ -40,6 +41,7 @@ object Classify { .drain .flatMap(_ => classifier.classify(logger, ClassifierModel(modelFile), text)) }).filter(_ != LearnClassifierTask.noClass) + .flatTapNone(logger.debug("Guessed: ")) _ <- OptionT.liftF(logger.debug(s"Guessed: ${cls}")) } yield cls).value diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index a3c4edb5..33ec72d6 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -37,8 +37,7 @@ object TextAnalysis { _ <- t.traverse(m => ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels)) ) - e <- s - _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") + v = t.toVector autoTagEnabled <- getActiveAutoTag(ctx, cfg) tag <- @@ -50,6 +49,8 @@ object TextAnalysis { predictItemEntities(ctx, cfg, item.metas, analyser.classifier) else MetaProposalList.empty.pure[F] + e <- s + _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") } yield item .copy( metas = v.map(_._1), @@ -109,7 +110,6 @@ object TextAnalysis { mtype: MetaProposalType ): F[Option[MetaProposal]] = for { - _ <- ctx.logger.debug(s"Guessing $mtype using classifier") label <- makeClassify(ctx, cfg, classifier)(text).apply(cname) } yield label.map(str => MetaProposal(mtype, 
Candidate(IdRef(Ident.unsafe(""), str), Set.empty)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index fee7d323..80b4b13e 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -46,10 +46,14 @@ object TextExtraction { ) _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*) dur <- start - _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") + extractedTags = txt.flatMap(_.tags).distinct.toList + _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}.") + _ <- + if (extractedTags.isEmpty) ().pure[F] + else ctx.logger.debug(s"Found tags in file: $extractedTags") } yield item .copy(metas = txt.map(_.am)) - .appendTags(txt.flatMap(_.tags).distinct.toList) + .appendTags(extractedTags) } // -- helpers From 205909558139cfa4b028149d0a3d672a672d09f1 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 00:59:35 +0100 Subject: [PATCH 30/38] Update README --- Contributing.md | 3 +++ README.md | 31 +++++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Contributing.md b/Contributing.md index 2e114ef8..58cb5376 100644 --- a/Contributing.md +++ b/Contributing.md @@ -17,6 +17,9 @@ If you don't like to sign up to github/matrix or like to reach me personally, you can make a mail to `info [at] docspell.org` or on matrix, via `@eikek:matrix.org`. +If you find a feature request already filed, you can vote on it. I +tend to prefer most voted requests to those without much attention. + ## Documentation diff --git a/README.md b/README.md index 459dd779..1093ecce 100644 --- a/README.md +++ b/README.md @@ -9,25 +9,28 @@ # Docspell Docspell is a personal document organizer. You'll need a scanner to -convert your papers into files. 
Docspell can then assist in -organizing the resulting mess :wink:. +convert your papers into files. Docspell can then assist in organizing +the resulting mess :wink:. It is targeted for home use, i.e. families +and households and also for (smaller) groups/companies. -You can associate tags, set correspondends, what a document is -concerned with, a name, a date and much more. If your documents are -associated with such meta data, you should be able to quickly find -them later using the search feature. But adding this manually to each -document is a tedious task. Docspell can help you by suggesting -correspondents, guessing tags or finding dates using machine learning -techniques. This makes adding metadata to your documents a lot easier. +You can associate tags, set correspondends and lots of other +predefined and custom metadata. If your documents are associated with +such meta data, you can quickly find them later using the search +feature. But adding this manually is a tedious task. Docspell can help +by suggesting correspondents, guessing tags or finding dates using +machine learning. It can learn metadata from existing documents and +find things using NLP. This makes adding metadata to your documents a +lot easier. For machine learning, it relies on the free (GPL) +[Stanford Core NLP library](https://github.com/stanfordnlp/CoreNLP). Docspell also runs OCR (if needed) on your documents, can provide fulltext search and has great e-mail integration. Everything is accessible via a REST/HTTP api. A mobile friendly SPA web application -is provided as the user interface and an [Android -app](https://github.com/docspell/android-client) for conveniently -uploading files from your phone/tablet. The [feature -overview](https://docspell.org/#feature-selection) has a more complete -list. +is the default user interface. An [Android +app](https://github.com/docspell/android-client) exists for +conveniently uploading files from your phone/tablet. 
The [feature +overview](https://docspell.org/#feature-selection) lists some more +points. ## Impressions From 5d366c3bd624ebfc77a328ab6f7a8b7746e5eb06 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 01:05:59 +0100 Subject: [PATCH 31/38] Make labels in classifier settings more clear --- .../webapp/src/main/elm/Comp/ClassifierSettingsForm.elm | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm index 579506d6..36f38d11 100644 --- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -204,7 +204,14 @@ Use an empty whitelist to disable auto tagging. (Comp.FixedDropdown.view (Just catListTypeItem) model.categoryListTypeModel) ] , div [ class "field" ] - [ label [] [ text "Choose tag categories for learning" ] + [ label [] + [ case model.categoryListType of + Data.ListType.Whitelist -> + text "Include tag categories for learning" + + Data.ListType.Blacklist -> + text "Exclude tag categories from learning" + ] , Html.map CategoryListMsg (Comp.Dropdown.view settings model.categoryListModel) ] From 85ddc61d9d8be88f22e0b5cbb708f3f17f212b2f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 19:17:29 +0100 Subject: [PATCH 32/38] Move date proposal setting to nlp config --- modules/joex/src/main/resources/reference.conf | 11 ++++------- .../joex/src/main/scala/docspell/joex/Config.scala | 12 +++++++----- .../scala/docspell/joex/process/FindProposal.scala | 8 ++++---- .../scala/docspell/joex/process/ProcessItem.scala | 2 +- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index a495ea5a..378f0b9c 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -319,6 
+319,10 @@ docspell.joex { # This has only any effect, if mode != disabled. clear-interval = "15 minutes" + # Restricts proposals for due dates. Only dates earlier than this + # number of years in the future are considered. + max-due-date-years = 10 + regex-ner { # Whether to enable custom NER annotation. This uses the # address book of a collective as input for NER tagging (to @@ -517,13 +521,6 @@ docspell.joex { } } - # General config for processing documents - processing { - # Restricts proposals for due dates. Only dates earlier than this - # number of years in the future are considered. - max-due-date-years = 10 - } - # The same section is also present in the rest-server config. It is # used when submitting files into the job queue for processing. # diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 4ad72d7c..922e83c7 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -31,8 +31,7 @@ case class Config( sendMail: MailSendConfig, files: Files, mailDebug: Boolean, - fullTextSearch: Config.FullTextSearch, - processing: Config.Processing + fullTextSearch: Config.FullTextSearch ) object Config { @@ -55,8 +54,6 @@ object Config { final case class Migration(indexAllChunk: Int) } - case class Processing(maxDueDateYears: Int) - case class TextAnalysis( maxLength: Int, workingDir: Path, @@ -84,7 +81,12 @@ object Config { ) } - case class NlpConfig(mode: NlpMode, clearInterval: Duration, regexNer: RegexNer) + case class NlpConfig( + mode: NlpMode, + clearInterval: Duration, + maxDueDateYears: Int, + regexNer: RegexNer + ) case class RegexNer(maxEntries: Int, fileCacheTime: Duration) diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index fa484772..1bb91af1 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -20,7 +20,7 @@ object FindProposal { type Args = ProcessItemArgs def apply[F[_]: Sync]( - cfg: Config.Processing + cfg: Config.TextAnalysis )(data: ItemData): Task[F, Args, ItemData] = Task { ctx => val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels))) @@ -102,7 +102,7 @@ object FindProposal { } def processAttachment[F[_]: Sync]( - cfg: Config.Processing, + cfg: Config.TextAnalysis, rm: RAttachmentMeta, rd: Vector[NerDateLabel], ctx: Context[F, ProcessItemArgs] @@ -114,11 +114,11 @@ object FindProposal { } def makeDateProposal[F[_]: Sync]( - cfg: Config.Processing, + cfg: Config.TextAnalysis, dates: Vector[NerDateLabel] ): F[MetaProposalList] = Timestamp.current[F].map { now => - val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong)) + val maxFuture = now.plus(Duration.years(cfg.nlp.maxDueDateYears.toLong)) val latestFirst = dates .filter(_.date.isBefore(maxFuture.toUtcDate)) .sortWith((l1, l2) => l1.date.isAfter(l2.date)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index b6cc493e..c119b467 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -41,7 +41,7 @@ object ProcessItem { regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item) - .flatMap(FindProposal[F](cfg.processing)) + .flatMap(FindProposal[F](cfg.textAnalysis)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) From a6c31be22f7fb6580534cfe807e0edc508e71e8c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 21:35:54 +0100 Subject: [PATCH 33/38] Update documentation --- 
.../joex/src/main/resources/reference.conf | 31 ++-- nix/module-joex.nix | 2 +- website/site/content/docs/configure/_index.md | 53 +++++++ website/site/content/docs/install/rpi.md | 19 ++- website/site/content/docs/install/running.md | 44 ++++- .../site/content/docs/joex/file-processing.md | 150 +++++++++++------- 6 files changed, 206 insertions(+), 93 deletions(-) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 378f0b9c..00f8d435 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -286,16 +286,13 @@ docspell.joex { # 4. disabled - doesn't use any stanford-nlp feature # # The full and basic variants rely on pre-build language models - # that are available for only 3 lanugages at the moment: German, - # English and French. - # - # Memory usage varies greatly among the languages. German has - # quite large models, that require about 1G heap. So joex should - # run with -Xmx1500M at least when using mode=full. + # that are available for only a few languages. Memory usage + # varies among the languages. So joex should run with -Xmx1400M + # at least when using mode=full. # # The basic variant does a quite good job for German and # English. It might be worse for French, always depending on the - # type of text that is analysed. Joex should run with about 600M + # type of text that is analysed. Joex should run with about 500M # heap, here again lanugage German uses the most. # # The regexonly variant doesn't depend on a language. It roughly @@ -349,25 +346,23 @@ docspell.joex { # Settings for doing document classification. # - # This works by learning from existing documents. A collective can - # specify a tag category and the system will try to predict a tag - # from this category for new incoming documents. - # - # This requires a satstical model that is computed from all - # existing documents. 
This process is run periodically as - # configured by the collective. It may require a lot of memory, - # depending on the amount of data. + # This works by learning from existing documents. This requires a + # satstical model that is computed from all existing documents. + # This process is run periodically as configured by the + # collective. It may require more memory, depending on the amount + # of data. # # It utilises this NLP library: https://nlp.stanford.edu/. classification { # Whether to enable classification globally. Each collective can - # decide to disable it. If it is disabled here, no collective - # can use classification. + # enable/disable auto-tagging. The classifier is also used for + # finding correspondents and concerned entities, if enabled + # here. enabled = true # If concerned with memory consumption, this restricts the # number of items to consider. More are better for training. A - # negative value or zero means no train on all items. + # negative value or zero means to train on all items. item-count = 0 # These settings are used to configure the classifier. If diff --git a/nix/module-joex.nix b/nix/module-joex.nix index aae8d835..32c663e9 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -796,7 +796,7 @@ in { Memory usage varies greatly among the languages. German has quite large models, that require about 1G heap. So joex should - run with -Xmx1500M at least when using mode=full. + run with -Xmx1400M at least when using mode=full. The basic variant does a quite good job for German and English. It might be worse for French, always depending on the diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md index 1ac7b928..dccce7d9 100644 --- a/website/site/content/docs/configure/_index.md +++ b/website/site/content/docs/configure/_index.md @@ -20,6 +20,9 @@ The configuration of both components uses separate namespaces. 
The configuration for the REST server is below `docspell.server`, while the one for joex is below `docspell.joex`. +You can therefore use two separate config files or one single file +containing both namespaces. + ## JDBC This configures the connection to the database. This has to be @@ -281,6 +284,56 @@ just some minutes, the web application obtains new ones periodically. So a short time is recommended. +## File Processing + +Files are being processed by the joex component. So all the respective +configuration is in this config only. + +File processing involves several stages, detailed information can be +found [here](@/docs/joex/file-processing.md#text-analysis). + +Configuration allows to define the external tools and set some +limitations to control memory usage. The sections are: + +- `docspell.joex.extraction` +- `docspell.joex.text-analysis` +- `docspell.joex.convert` + +Options to external commands can use variables that are replaced by +values at runtime. Variables are enclosed in double braces `{{…}}`. +Please see the default configuration for more details. + +### `text-analysis.nlp.mode` + +This setting defines which NLP mode to use. It defaults to `full`, +which requires more memory for certain languages (with the advantage +of better results). Other values are `basic`, `regexonly` and +`disabled`. The modes `full` and `basic` use pre-defined lanugage +models for procesing documents of languaes German, English and French. +These require some amount of memory (see below). + +The mode `basic` is like the "light" variant to `full`. It doesn't use +all NLP features, which makes memory consumption much lower, but comes +with the compromise of less accurate results. + +The mode `regexonly` doesn't use pre-defined lanuage models, even if +available. It checks your address book against a document to find +metadata. That means, it is language independent. 
Also, when using +`full` or `basic` with lanugages where no pre-defined models exist, it +will degrade to `regexonly` for these. + +The mode `disabled` skips NLP processing completely. This has least +impact in memory consumption, obviously, but then only the classifier +is used to find metadata. + +You might want to try different modes and see what combination suits +best your usage pattern and machine running joex. If a powerful +machine is used, simply leave the defaults. When running on an older +raspberry pi, for example, you might need to adjust things. The +corresponding sections in [joex default config](#joex) and the [file +processing](@/docs/joex/file-processing.md#text-analysis) page provide more +details. + # File Format The format of the configuration files can be diff --git a/website/site/content/docs/install/rpi.md b/website/site/content/docs/install/rpi.md index ac7d85a7..edf35e88 100644 --- a/website/site/content/docs/install/rpi.md +++ b/website/site/content/docs/install/rpi.md @@ -25,19 +25,18 @@ work is done by the joex components. Running the joex component on the Raspberry Pi is possible, but will result in long processing times for OCR and text analysis. The board should provide 4G of RAM (like the current RPi4), especially if also a -database and solr are running next to it. I recommend to give joex a -heap of 1.5G (`-J-Xmx1536M`). You should also set the joex pool size -to 1. - -When joex processes the first file, some models are built loaded into -memory which can take a while. Subsequent processing times are faster -then. +database and solr are running next to it. The memory required by joex +depends on the config and document language. Please pick a value that +suits your setup from [here](@/docs/install/running.md#memory-usage). +For boards like the RPi, it might be necessary to use +`nlp.mode=basic`, rather than `nlp.mode=full`. You should also set the +joex pool size to 1. 
An example: on this [UP board](https://up-board.org/up/specifications/) with an Intel Atom -x5-Z8350 CPU (@1.44Ghz) and 4G RAM, a scanned (300dpi) pdf file with 6 -pages took *3:20 min* to process. This board also runs the SOLR and a -postgresql database. +x5-Z8350 CPU (@1.44Ghz) and 4G RAM, a scanned (300dpi, in German) pdf +file with 6 pages took *3:20 min* to process. This board also runs the +SOLR and a postgresql database. The same file was processed in 55s on a qemu virtual machine on my i7 notebook, using 1 CPU and 4G RAM (and identical config for joex). The diff --git a/website/site/content/docs/install/running.md b/website/site/content/docs/install/running.md index 2493e697..6a303b87 100644 --- a/website/site/content/docs/install/running.md +++ b/website/site/content/docs/install/running.md @@ -35,6 +35,42 @@ You should be able to create a new account and sign in. Check the [configuration page](@/docs/configure/_index.md) to further customize docspell. +## Memory Usage + +The memory requirements for the joex component depends on the document +language and the configuration for [file +processing](@/docs/configure/_index.md#file-processing). The +`nlp.mode` setting has significant impact, especially when your +documents are in German. Here are some rough numbers on jvm heap usage +(the same small jpeg file was used for all tries): + + + + + + + + + + + +
+  <tr><th>nlp.mode</th><th>English</th><th>German</th><th>French</th></tr>
+  <tr><td>full</td><td>420M</td><td>950M</td><td>490M</td></tr>
+  <tr><td>basic</td><td>170M</td><td>380M</td><td>390M</td></tr>
+ +When using `mode=full`, a heap setting of at least `-Xmx1400M` is +recommended. For `mode=basic` a heap setting of at least `-Xmx500M` is +recommended. + +Other languages can't use these two modes, and so don't require this +amount of memory (but don't have as good results). Then you can go +with less heap. + +More details about these modes can be found +[here](@/docs/joex/file-processing.md#text-analysis). + + +The restserver component is very lightweight, here you can use +defaults. + ## Options @@ -65,10 +101,10 @@ $ ./docspell-restserver*/bin/docspell-restserver -h gives an overview of supported options. -It is recommended to run joex with 1.5G heap space or more and with -the G1GC enabled. If you use java8, you need to add an option to use -G1GC, for java11 this is not necessary (but doesn't hurt either). This -could look like this: +It is recommended to run joex with the G1GC enabled. If you use java8, +you need to add an option to use G1GC (`-XX:+UseG1GC`), for java11 +this is not necessary (but doesn't hurt either). This could look like +this: ``` ./docspell-joex-{{version()}}/bin/docspell-joex -J-Xmx1596M -J-XX:+UseG1GC -- /path/to/joex.conf diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md index 7c0f7610..8deb83f5 100644 --- a/website/site/content/docs/joex/file-processing.md +++ b/website/site/content/docs/joex/file-processing.md @@ -331,91 +331,121 @@ images for a collective. There is also a bash script provided in the # Text Analysis -This uses the extracted text to find what could be attached to the new -item. There are multiple things provided. +Finally, the extracted text is analysed to find possible metadata that +can be attached to the new item. There are two different approaches +provided. 
-Docspell depends on the [Stanford NLP +The basic idea here is, that instead of *you defining textual rules* to +apply tags and other things, these rules *are found for you* based on +what you have provided so far. + +Docspell relies on the [Stanford NLP Library](https://nlp.stanford.edu/software/) for its AI features. -Among other things they provide a classifier (used for guessing tags) -and NER annotators. The latter is also a classifier, that associates a -label to terms in a text. It finds out whether some term is probably -an organization, a person etc. This is then used to find matches in -your address book. +Among other things they provide a classifier and NER annotators. The +latter is also a classifier, that associates a label to terms in a +text. It finds out whether some term is probably an organization, a +person etc. It tries to “understand” the structure of the text, like +verb, nouns and their relation. -When docspell finds several possible candidates for a match, it will -show the first few to you. If then the first was not the correct one, -it can usually be fixed by a single click, because it is among the -suggestions. +The two approaches used are sketched below. They have both advantages +and disadvantages and are by default used together. However, depending +on the document languages, not all approaches are possible. They also +have different memory footprints, and you might want to disable some +features if running on low-end machines. ## Classification If you enabled classification in the config file, a model is trained -periodically from your files. This is used to guess a tag for the item -for new documents. +periodically from a collective's files. Very roughly speaking… this +model contains the essence of "patterns" in the text that are likeley +related to a tag, a corrpesondent etc. -You can tell docspell how many documents it should use for training. 
-Sometimes (when moving?), documents may change and you only like to -base next guesses on the documents of last year only. This can be -found in the collective settings. +When a new document arrives, this model is used to ask for what +metadata (tag, correspondent, etc) it thinks is likely to apply here. -The admin can also limit the number of documents to train with, -because it affects memory usage. +Training the model is a rather resource intensive process, but using +an existing model is quite cheap and fast. A model is trained +periodically, the schedule can be defined in your collective settings. +For tags, you can define the tag categories that should be trained (or +that should not be trained). Docspell assigns one tag from all tags in +a category to a new document. + +Note that tags that can not be derived from the text only, should +probably be excluded from learning. For example, if you tag all your +items with `Done` at some point, it may falsly learn patterns to this +tag and tag your new documents with `Done`. + +The admin can also limit the number of documents to train with in the +config file to control the memory footprint when training. + +Classification is used in Docspell once for guessing tags and also for +finding correspondent and concerned entities. For correspondent and +concerned entities, the NLP approach is used first and the classifier +results then fill missing values. ## Natural Language Processing -NLP is used to find out which terms in a text may be a company or -person that is then used to find metadata in your address book. It can -also uses your complete address book to match terms in the text. So -there are two ways: using a statistical model, terms in a text are -identified as organization or person etc. This information is then -used to search your address book. Second, regexp rules are derived -from the address book and run against the text. 
By default, both are -applied, where the rules are run as the last step to identify missing -terms. +NLP is the other approach that works a bit differently. In this +approach, algorithms are used that find lanugage properties from the +given text, for example which terms are nouns, organization or person +names etc. This also requires a statistical model, but this time for a +whole language. These are also provided by [Stanford +NLP](https://nlp.stanford.edu/software/), but not for all languages. +So whether this can be used depends on the document language. Models +exist for German, English and French currently. -The statistical model approach is good, i.e. for large address books. -Normally, a document contains only very few organizations or person -names. So it is much more efficient to check these against your -address book (in contrast to the other way around). It can also find -things *not* in your address book. However, it might not detect all or -there are no statistical models for your language. Then the address -book is used to automatically create rules that are run against the -document. +Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to +run custom rules against a text. This can be used as a fallback for +terms where the statistical model didn't succeed. But it can also be +used by itself. Docspell derives these rules from your address book, +so it can find terms in the document text that match your organization +and person names. This does not depend on the document language. -These statistical models are provided by [Stanford -NLP](https://nlp.stanford.edu/software/) and are currently available -for German, English and French. All other languages can use the rule -approach. The statistcal models, however, require quite some memory – -depending on the size of the models which varies between languages. -English has a lower memory footprint than German, for example. 
If you -have a very large address book, the rule approach may also use a lot -memory. +By default, Docspell does both: it first uses the statistical language +model (if available for the given language) and then runs the +address-book derived rules as a last step on so far unclassified +terms. This allows for the best results. If more than one candidate is +found, the "most likely" one is set on the item and others are stored +as suggestions. +The statistical model approach works generally very well, i.e. for +large address books. Normally, a document contains only very few +organizations or person names. So it is more efficient to check these +few against your (probably large) address book; in contrast to testing +hundreds of company names against a single document. It can also find +things *not* in your address book (but this is unused in Docspell +currently). However, it might not detect all or there are no +statistical models for your language. Then the address book is used to +automatically create rules that are run against the document. + +Both ways require memory, it depends on the size of your address book +and on the size of the language models (they vary for each language). In the config file, you can specify different modes of operation for nlp processing as follows: - mode `full`: creates the complete nlp pipeline, requiring the most amount of memory, providing the best results. I'd recommend to run - joex with a heap size of a least 1.5G (for English only, it can be + joex with a heap size of a least 1.4G (for English only, it can be lower that that). - mode `basic`: it only loads the NER tagger. This doesn't work as - well as the complete pipeline, because some steps are simply - skipped. But it gives quite good results and uses less memory. I'd - recommend to run joex with at least 600m heap in this mode. + well as the complete pipeline, because some NLP steps are simply + skipped. But it gives quite good results already and uses less + memory. 
I'd recommend to run joex with at least 500m heap in this + mode. - mode `regexonly`: this doesn't load any statistical models and is - therefore very memory efficient (depending on the address book size, - of course). It will use the address book to create regex rules and - match them against your document. It doesn't depend on a language, - so this is available for all languages. -- mode = disabled: this disables nlp processing altogether + therefore much lighter on memory (depending on the address book + size, of course). It will use the address book to create regex rules + and match them against your document. +- mode = disabled: this disables nlp processing altogether. Then only + the classifier is run (unless disabled). Note that mode `full` and `basic` is only relevant for the languages where models are available. For all other languages, it is effectively the same as `regexonly`. -The config file allows some settings. You can specify a limit for -texts. Large texts result in higher memory consumption. By default, +The config file allows to specify a limit for texts to analyse in +general. Large texts result in higher memory consumption. By default, the first 10'000 characters are taken into account. Then, for the `regexonly` mode, you can restrict the number of address @@ -424,7 +454,7 @@ book entries that are used to create the rule set via footprint. The setting `clear-stanford-nlp-interval` allows to define an idle -time after which the model files are cleared from memory. This allows -memory to be reclaimed by the OS. The timer starts after the last file -has been processed. If you can afford it, it is recommended to disable -it by setting it to `0`. +time after which the language models are cleared from memory. This +allows memory to be reclaimed by the OS. The timer starts after the +last file has been processed. If you can afford it, it is recommended +to disable it by setting it to `0`. 
From 38387e00a0210656140b545e0e9369d12c12927a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 20 Jan 2021 23:30:35 +0100 Subject: [PATCH 34/38] Fix mariadb migration --- .../db/migration/mariadb/V1.18.0__classifier_model.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql index 59bec4b2..573dc9d1 100644 --- a/modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.18.0__classifier_model.sql @@ -24,7 +24,7 @@ update `classifier_setting` set `category_list_type` = 'whitelist'; update `classifier_setting` -set `categories` = concat('[`', category, '`]') +set `categories` = concat('["', `category`, '"]') where category is not null; update `classifier_setting` From 363cf5aef07be97af2135948846000d2ad1654a4 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 21 Jan 2021 00:22:58 +0100 Subject: [PATCH 35/38] Quote names in sql changesets --- .../resources/db/migration/h2/V1.18.0__classifier_model.sql | 2 +- .../db/migration/postgresql/V1.18.0__classifier_model.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql b/modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql index d0aab38b..5aa517aa 100644 --- a/modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/h2/V1.18.0__classifier_model.sql @@ -24,7 +24,7 @@ update "classifier_setting" set "category_list_type" = 'whitelist'; update "classifier_setting" -set "categories" = concat('["', category, '"]') +set "categories" = concat('["', "category", '"]') where category is not null; update "classifier_setting" diff --git 
a/modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql index 1e44679a..830fcc72 100644 --- a/modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.18.0__classifier_model.sql @@ -24,7 +24,7 @@ update "classifier_setting" set "category_list_type" = 'whitelist'; update "classifier_setting" -set "categories" = concat('["', category, '"]') +set "categories" = concat('["', "category", '"]') where category is not null; update "classifier_setting" From 9957c3267ed56e30fdb363def8e821d0ae8a0d2f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 21 Jan 2021 17:46:39 +0100 Subject: [PATCH 36/38] Add constraints from config to classifier training For large and/or many documents, training the classifier can lead to OOM errors. Some limits have been set by default. --- .../joex/src/main/resources/reference.conf | 8 +-- .../src/main/scala/docspell/joex/Config.scala | 7 ++- .../joex/learn/LearnClassifierTask.scala | 11 ++-- .../joex/learn/LearnItemEntities.scala | 31 ++++++----- .../scala/docspell/joex/learn/LearnTags.scala | 10 ++-- .../docspell/joex/learn/SelectItems.scala | 54 ++++++++++++------- .../scala/docspell/store/queries/QItem.scala | 16 +++--- 7 files changed, 87 insertions(+), 50 deletions(-) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 00f8d435..7f2ee7d0 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -269,9 +269,9 @@ docspell.joex { # All text to analyse must fit into RAM. A large document may take # too much heap. Also, most important information is at the # beginning of a document, so in most cases the first two pages - # should suffice. 
Default is 10000, which are about 2-3 pages - # (just a rough guess, of course). - max-length = 10000 + # should suffice. Default is 8000, which are about 2-3 pages (just + # a rough guess, of course). + max-length = 8000 # A working directory for the analyser to store temporary/working # files. @@ -363,7 +363,7 @@ docspell.joex { # If concerned with memory consumption, this restricts the # number of items to consider. More are better for training. A # negative value or zero means to train on all items. - item-count = 0 + item-count = 600 # These settings are used to configure the classifier. If # multiple are given, they are all tried and the "best" is diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 922e83c7..e995e757 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -94,5 +94,10 @@ object Config { enabled: Boolean, itemCount: Int, classifiers: List[Map[String, String]] - ) + ) { + + def itemCountOrWhenLower(other: Int): Int = + if (itemCount <= 0 || (itemCount > other && other > 0)) other + else itemCount + } } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index e3aae66f..be3d7143 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -37,7 +37,8 @@ object LearnClassifierTask { .learnAll( analyser, ctx.args.collective, - cfg.classification.itemCount + cfg.classification.itemCount, + cfg.maxLength ) .run(ctx) else ().pure[F] @@ -51,10 +52,14 @@ object LearnClassifierTask { val learnTags = for { sett <- findActiveSettings[F](ctx, cfg) - maxItems = math.min(cfg.classification.itemCount, sett.itemCount) + maxItems = cfg.classification.itemCountOrWhenLower(sett.itemCount) _ <- 
OptionT.liftF( LearnTags - .learnAllTagCategories(analyser)(ctx.args.collective, maxItems) + .learnAllTagCategories(analyser)( + ctx.args.collective, + maxItems, + cfg.maxLength + ) .run(ctx) ) } yield () diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala index 1dc48975..f47f1e9c 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnItemEntities.scala @@ -14,51 +14,56 @@ object LearnItemEntities { def learnAll[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = - learnCorrOrg(analyser, collective, maxItems) - .flatMap(_ => learnCorrPerson[F, A](analyser, collective, maxItems)) - .flatMap(_ => learnConcPerson(analyser, collective, maxItems)) - .flatMap(_ => learnConcEquip(analyser, collective, maxItems)) + learnCorrOrg(analyser, collective, maxItems, maxTextLen) + .flatMap(_ => learnCorrPerson[F, A](analyser, collective, maxItems, maxTextLen)) + .flatMap(_ => learnConcPerson(analyser, collective, maxItems, maxTextLen)) + .flatMap(_ => learnConcEquip(analyser, collective, maxItems, maxTextLen)) def learnCorrOrg[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = learn(analyser, collective)( ClassifierName.correspondentOrg, - ctx => SelectItems.forCorrOrg(ctx.store, collective, maxItems) + ctx => SelectItems.forCorrOrg(ctx.store, collective, maxItems, maxTextLen) ) def learnCorrPerson[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = learn(analyser, collective)( ClassifierName.correspondentPerson, - ctx => SelectItems.forCorrPerson(ctx.store, collective, maxItems) + ctx => 
SelectItems.forCorrPerson(ctx.store, collective, maxItems, maxTextLen) ) def learnConcPerson[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = learn(analyser, collective)( ClassifierName.concernedPerson, - ctx => SelectItems.forConcPerson(ctx.store, collective, maxItems) + ctx => SelectItems.forConcPerson(ctx.store, collective, maxItems, maxTextLen) ) def learnConcEquip[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = learn(analyser, collective)( ClassifierName.concernedEquip, - ctx => SelectItems.forConcEquip(ctx.store, collective, maxItems) + ctx => SelectItems.forConcEquip(ctx.store, collective, maxItems, maxTextLen) ) private def learn[F[_]: Sync: ContextShift, A]( diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala index b24eb28d..234a548f 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnTags.scala @@ -14,12 +14,13 @@ object LearnTags { def learnTagCategory[F[_]: Sync: ContextShift, A]( analyser: TextAnalyser[F], collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int )( category: String ): Task[F, A, Unit] = Task { ctx => - val data = SelectItems.forCategory(ctx, collective)(maxItems, category) + val data = SelectItems.forCategory(ctx, collective)(maxItems, category, maxTextLen) ctx.logger.info(s"Learn classifier for tag category: $category") *> analyser.classifier.trainClassifier(ctx.logger, data)( Kleisli( @@ -34,12 +35,13 @@ object LearnTags { def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])( collective: Ident, - maxItems: Int + maxItems: Int, + maxTextLen: Int ): Task[F, A, Unit] = Task { ctx => for { cats <- 
ctx.store.transact(RClassifierSetting.getActiveCategories(collective)) - task = learnTagCategory[F, A](analyser, collective, maxItems) _ + task = learnTagCategory[F, A](analyser, collective, maxItems, maxTextLen) _ _ <- cats.map(task).traverse(_.run(ctx)) } yield () } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala index c6dab2f0..8ce77f62 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/SelectItems.scala @@ -16,20 +16,24 @@ object SelectItems { val noClass = LearnClassifierTask.noClass def forCategory[F[_]](ctx: Context[F, _], collective: Ident)( - max: Int, - category: String + maxItems: Int, + category: String, + maxTextLen: Int ): Stream[F, Data] = - forCategory(ctx.store, collective, max, category) + forCategory(ctx.store, collective, maxItems, category, maxTextLen) def forCategory[F[_]]( store: Store[F], collective: Ident, - max: Int, - category: String + maxItems: Int, + category: String, + maxTextLen: Int ): Stream[F, Data] = { val connStream = - allItems(collective, max) - .evalMap(item => QItem.resolveTextAndTag(collective, item, category, pageSep)) + allItems(collective, maxItems) + .evalMap(item => + QItem.resolveTextAndTag(collective, item, category, maxTextLen, pageSep) + ) .through(mkData) store.transact(connStream) } @@ -37,11 +41,14 @@ object SelectItems { def forCorrOrg[F[_]]( store: Store[F], collective: Ident, - max: Int + maxItems: Int, + maxTextLen: Int ): Stream[F, Data] = { val connStream = - allItems(collective, max) - .evalMap(item => QItem.resolveTextAndCorrOrg(collective, item, pageSep)) + allItems(collective, maxItems) + .evalMap(item => + QItem.resolveTextAndCorrOrg(collective, item, maxTextLen, pageSep) + ) .through(mkData) store.transact(connStream) } @@ -49,11 +56,14 @@ object SelectItems { def forCorrPerson[F[_]]( store: Store[F], collective: Ident, - 
max: Int + maxItems: Int, + maxTextLen: Int ): Stream[F, Data] = { val connStream = - allItems(collective, max) - .evalMap(item => QItem.resolveTextAndCorrPerson(collective, item, pageSep)) + allItems(collective, maxItems) + .evalMap(item => + QItem.resolveTextAndCorrPerson(collective, item, maxTextLen, pageSep) + ) .through(mkData) store.transact(connStream) } @@ -61,11 +71,14 @@ object SelectItems { def forConcPerson[F[_]]( store: Store[F], collective: Ident, - max: Int + maxItems: Int, + maxTextLen: Int ): Stream[F, Data] = { val connStream = - allItems(collective, max) - .evalMap(item => QItem.resolveTextAndConcPerson(collective, item, pageSep)) + allItems(collective, maxItems) + .evalMap(item => + QItem.resolveTextAndConcPerson(collective, item, maxTextLen, pageSep) + ) .through(mkData) store.transact(connStream) } @@ -73,11 +86,14 @@ object SelectItems { def forConcEquip[F[_]]( store: Store[F], collective: Ident, - max: Int + maxItems: Int, + maxTextLen: Int ): Stream[F, Data] = { val connStream = - allItems(collective, max) - .evalMap(item => QItem.resolveTextAndConcEquip(collective, item, pageSep)) + allItems(collective, maxItems) + .evalMap(item => + QItem.resolveTextAndConcEquip(collective, item, maxTextLen, pageSep) + ) .through(mkData) store.transact(connStream) } diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 7a53a192..b8ee49e2 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -547,7 +547,6 @@ object QItem { chunkSize: Int, limit: Batch ): Stream[ConnectionIO, Ident] = { - val i = RItem.as("i") Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed) .orderBy(i.created.desc) @@ -561,6 +560,7 @@ object QItem { collective: Ident, itemId: Ident, tagCategory: String, + maxLen: Int, pageSep: String ): ConnectionIO[TextAndTag] = { val 
tags = TableDef("tags").as("tt") @@ -578,7 +578,7 @@ object QItem { ) )( Select( - select(m.content, tagsTid, tagsName), + select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s), from(i) .innerJoin(a, a.itemId === i.id) .innerJoin(m, a.id === m.id) @@ -592,11 +592,12 @@ object QItem { def resolveTextAndCorrOrg( collective: Ident, itemId: Ident, + maxLen: Int, pageSep: String ): ConnectionIO[TextAndTag] = readTextAndTag(collective, itemId, pageSep) { Select( - select(m.content, org.oid, org.name), + select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s), from(i) .innerJoin(a, a.itemId === i.id) .innerJoin(m, m.id === a.id) @@ -608,11 +609,12 @@ object QItem { def resolveTextAndCorrPerson( collective: Ident, itemId: Ident, + maxLen: Int, pageSep: String ): ConnectionIO[TextAndTag] = readTextAndTag(collective, itemId, pageSep) { Select( - select(m.content, pers0.pid, pers0.name), + select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s), from(i) .innerJoin(a, a.itemId === i.id) .innerJoin(m, m.id === a.id) @@ -624,11 +626,12 @@ object QItem { def resolveTextAndConcPerson( collective: Ident, itemId: Ident, + maxLen: Int, pageSep: String ): ConnectionIO[TextAndTag] = readTextAndTag(collective, itemId, pageSep) { Select( - select(m.content, pers0.pid, pers0.name), + select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s), from(i) .innerJoin(a, a.itemId === i.id) .innerJoin(m, m.id === a.id) @@ -640,11 +643,12 @@ object QItem { def resolveTextAndConcEquip( collective: Ident, itemId: Ident, + maxLen: Int, pageSep: String ): ConnectionIO[TextAndTag] = readTextAndTag(collective, itemId, pageSep) { Select( - select(m.content, equip.eid, equip.name), + select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s), from(i) .innerJoin(a, a.itemId === i.id) .innerJoin(m, m.id === a.id) From 021ac568ae7698466e7cfc6256ef8b5155c7d58f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 21 Jan 2021 20:06:53 +0100 
Subject: [PATCH 37/38] Update documentation for text analysis --- website/site/content/docs/configure/_index.md | 28 ++++++++++++++----- .../site/content/docs/joex/file-processing.md | 17 +++++++---- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md index dccce7d9..81e697a6 100644 --- a/website/site/content/docs/configure/_index.md +++ b/website/site/content/docs/configure/_index.md @@ -290,7 +290,8 @@ Files are being processed by the joex component. So all the respective configuration is in this config only. File processing involves several stages, detailed information can be -found [here](@/docs/joex/file-processing.md#text-analysis). +found [here](@/docs/joex/file-processing.md#text-analysis) and in the +corresponding sections in [joex default config](#joex). Configuration allows to define the external tools and set some limitations to control memory usage. The sections are: @@ -301,9 +302,25 @@ limitations to control memory usage. The sections are: Options to external commands can use variables that are replaced by values at runtime. Variables are enclosed in double braces `{{…}}`. -Please see the default configuration for more details. +Please see the default configuration for what variables exist per +command. -### `text-analysis.nlp.mode` +### Classification + +In `text-analysis.classification` you can define how many documents at +most should be used for learning. The default settings should work +well for most cases. However, it always depends on the amount of data +and the machine that runs joex. For example, by default the documents +to learn from are limited to 600 (`classification.item-count`) and +every text is cut after 8000 characters (`text-analysis.max-length`). +This is fine if *most* of your documents are small and only a few are +near 8000 characters.
But if *all* your documents are very large, you +probably need to either assign more heap memory or go down with the +limits. + +Classification can be disabled, too, for when it's not needed. + +### NLP This setting defines which NLP mode to use. It defaults to `full`, which requires more memory for certain languages (with the advantage @@ -329,10 +346,7 @@ is used to find metadata. You might want to try different modes and see what combination suits best your usage pattern and machine running joex. If a powerful machine is used, simply leave the defaults. When running on an older -raspberry pi, for example, you might need to adjust things. The -corresponding sections in [joex default config](#joex) and the [file -processing](@/docs/joex/file-processing.md#text-analysis) page provide more -details. +raspberry pi, for example, you might need to adjust things. # File Format diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md index 8deb83f5..506dd8e0 100644 --- a/website/site/content/docs/joex/file-processing.md +++ b/website/site/content/docs/joex/file-processing.md @@ -363,12 +363,17 @@ related to a tag, a corrpesondent etc. When a new document arrives, this model is used to ask for what metadata (tag, correspondent, etc) it thinks is likely to apply here. -Training the model is a rather resource intensive process, but using -an existing model is quite cheap and fast. A model is trained -periodically, the schedule can be defined in your collective settings. -For tags, you can define the tag categories that should be trained (or -that should not be trained). Docspell assigns one tag from all tags in -a category to a new document. +Training the model is a rather resource intensive process. How much +memory is needed, depends on the number of documents to learn from and +the size of text to consider. Both can be limited in the config file. 
+The default values might require a heap of 1.4G if you have many and +large documents. The maximum text length is about 8000 characters, if +*all* your documents would be that large, adjusting these values might +be necessary. But using an existing model is quite cheap and fast. A +model is trained periodically, the schedule can be defined in your +collective settings. For tags, you can define the tag categories that +should be trained (or that should not be trained). Docspell assigns +one tag from all tags in a category to a new document. Note that tags that can not be derived from the text only, should probably be excluded from learning. For example, if you tag all your From 4cba96f39081e6e4ac5ce1d663ebebe1e7804461 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 21 Jan 2021 21:05:28 +0100 Subject: [PATCH 38/38] Always return classifier results as suggestion The classifier results are spliced into the suggestion list at second place. When linking they are only used if nlp didn't find anything. 
--- .../scala/docspell/backend/ops/OItem.scala | 2 +- .../docspell/common/MetaProposalList.scala | 44 +++++++++++++------ .../common/MetaProposalListTest.scala | 31 +++++++++++++ .../joex/learn/LearnClassifierTask.scala | 1 + .../docspell/store/queries/QAttachment.scala | 2 +- .../src/main/elm/Comp/ItemDetail/View.elm | 13 +++--- 6 files changed, 71 insertions(+), 22 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index bcefe0e5..53acd38d 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -591,7 +591,7 @@ object OItem { for { itemIds <- store.transact(RItem.filterItems(items, collective)) results <- itemIds.traverse(item => deleteItem(item, collective)) - n = results.fold(0)(_ + _) + n = results.sum } yield n def getProposals(item: Ident, collective: Ident): F[MetaProposalList] = diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index d75693d8..04cedb30 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -45,6 +45,19 @@ case class MetaProposalList private (proposals: List[MetaProposal]) { def sortByWeights: MetaProposalList = change(_.sortByWeight) + + def insertSecond(ml: MetaProposalList): MetaProposalList = + MetaProposalList.flatten0( + Seq(this, ml), + (map, next) => + map.get(next.proposalType) match { + case Some(MetaProposal(mt, values)) => + val cand = NonEmptyList(values.head, next.values.toList ++ values.tail) + map.updated(next.proposalType, MetaProposal(mt, MetaProposal.flatten(cand))) + case None => + map.updated(next.proposalType, next) + } + ) } object MetaProposalList { @@ -74,20 +87,25 @@ object MetaProposalList { * is preserved and 
candidates of proposals are appended as given * by the order of the given `seq'. */ - def flatten(ml: Seq[MetaProposalList]): MetaProposalList = { - val init: Map[MetaProposalType, MetaProposal] = Map.empty - - def updateMap( - map: Map[MetaProposalType, MetaProposal], - mp: MetaProposal - ): Map[MetaProposalType, MetaProposal] = - map.get(mp.proposalType) match { - case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList)) - case None => map.updated(mp.proposalType, mp) - } - - val merged = ml.foldLeft(init)((map, el) => el.proposals.foldLeft(map)(updateMap)) + def flatten(ml: Seq[MetaProposalList]): MetaProposalList = + flatten0( + ml, + (map, mp) => + map.get(mp.proposalType) match { + case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList)) + case None => map.updated(mp.proposalType, mp) + } + ) + private def flatten0( + ml: Seq[MetaProposalList], + merge: ( + Map[MetaProposalType, MetaProposal], + MetaProposal + ) => Map[MetaProposalType, MetaProposal] + ): MetaProposalList = { + val init = Map.empty[MetaProposalType, MetaProposal] + val merged = ml.foldLeft(init)((map, el) => el.proposals.foldLeft(map)(merge)) fromMap(merged) } diff --git a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala index 4b652f62..44a6cfc2 100644 --- a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala +++ b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala @@ -68,4 +68,35 @@ object MetaProposalListTest extends SimpleTestSuite { assertEquals(candidates.head, cand1) assertEquals(candidates.tail.head, cand2) } + + test("insert second") { + val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty) + val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty) + val cand3 = Candidate(IdRef(Ident.unsafe("789"), "name"), Set.empty) + val cand4 = Candidate(IdRef(Ident.unsafe("abc"), "name"), 
Set.empty) + val cand5 = Candidate(IdRef(Ident.unsafe("def"), "name"), Set.empty) + + val mpl1 = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1, cand2)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand3)) + ) + + val mpl2 = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand4)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand5)) + ) + + val result = mpl1.insertSecond(mpl2) + assertEquals( + result, + MetaProposalList( + List( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1, cand4, cand2)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand3, cand5)) + ) + ) + ) + } } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index be3d7143..89d7886a 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -26,6 +26,7 @@ object LearnClassifierTask { ): Task[F, Args, Unit] = learnTags(cfg, analyser) .flatMap(_ => learnItemEntities(cfg, analyser)) + .flatMap(_ => Task(_ => Sync[F].delay(System.gc()))) private def learnItemEntities[F[_]: Sync: ContextShift]( cfg: Config.TextAnalysis, diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index 89c11faf..1b6fa8ab 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -139,7 +139,7 @@ object QAttachment { mli <- qi.query[MetaProposalList].to[Vector] } yield MetaProposalList .flatten(mla) - .fillEmptyFrom(MetaProposalList.flatten(mli)) + .insertSecond(MetaProposalList.flatten(mli)) } def getAttachmentMeta( diff --git a/modules/webapp/src/main/elm/Comp/ItemDetail/View.elm 
b/modules/webapp/src/main/elm/Comp/ItemDetail/View.elm index d9355d05..022ed1f4 100644 --- a/modules/webapp/src/main/elm/Comp/ItemDetail/View.elm +++ b/modules/webapp/src/main/elm/Comp/ItemDetail/View.elm @@ -958,7 +958,6 @@ renderSuggestions model mkName idnames tagger = ] , div [ class "menu" ] <| (idnames - |> List.take 5 |> List.map (\p -> a [ class "item", href "#", onClick (tagger p) ] [ text (mkName p) ]) ) ] @@ -969,7 +968,7 @@ renderOrgSuggestions : Model -> Html Msg renderOrgSuggestions model = renderSuggestions model .name - (List.take 5 model.itemProposals.corrOrg) + (List.take 6 model.itemProposals.corrOrg) SetCorrOrgSuggestion @@ -977,7 +976,7 @@ renderCorrPersonSuggestions : Model -> Html Msg renderCorrPersonSuggestions model = renderSuggestions model .name - (List.take 5 model.itemProposals.corrPerson) + (List.take 6 model.itemProposals.corrPerson) SetCorrPersonSuggestion @@ -985,7 +984,7 @@ renderConcPersonSuggestions : Model -> Html Msg renderConcPersonSuggestions model = renderSuggestions model .name - (List.take 5 model.itemProposals.concPerson) + (List.take 6 model.itemProposals.concPerson) SetConcPersonSuggestion @@ -993,7 +992,7 @@ renderConcEquipSuggestions : Model -> Html Msg renderConcEquipSuggestions model = renderSuggestions model .name - (List.take 5 model.itemProposals.concEquipment) + (List.take 6 model.itemProposals.concEquipment) SetConcEquipSuggestion @@ -1001,7 +1000,7 @@ renderItemDateSuggestions : Model -> Html Msg renderItemDateSuggestions model = renderSuggestions model Util.Time.formatDate - (List.take 5 model.itemProposals.itemDate) + (List.take 6 model.itemProposals.itemDate) SetItemDateSuggestion @@ -1009,7 +1008,7 @@ renderDueDateSuggestions : Model -> Html Msg renderDueDateSuggestions model = renderSuggestions model Util.Time.formatDate - (List.take 5 model.itemProposals.dueDate) + (List.take 6 model.itemProposals.dueDate) SetDueDateSuggestion