From fdb46da26d7be6ce457a5b8dbe4f104b198d034f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 21 Apr 2020 23:33:15 +0200 Subject: [PATCH 1/7] Add french language and upgrade stanford-nlp to 4.0.0 --- README.md | 2 +- .../docspell/analysis/date/DateFind.scala | 1 + .../analysis/nlp/LabelConverter.scala | 25 +++++ .../docspell/analysis/nlp/Properties.scala | 97 +++++++++++++++++++ .../analysis/nlp/StanfordNerClassifier.scala | 54 ++++------- .../analysis/nlp/TextAnalyserSuite.scala | 24 +++-- .../main/scala/docspell/common/Language.scala | 7 +- .../main/scala/docspell/ftssolr/Field.scala | 3 + .../scala/docspell/ftssolr/SolrQuery.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 8 ++ modules/webapp/src/main/elm/Data/Language.elm | 12 ++- project/Dependencies.scala | 13 ++- project/NerModelsPlugin.scala | 15 ++- 13 files changed, 208 insertions(+), 54 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala diff --git a/README.md b/README.md index 88928bef..6ad5e9e1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell) +[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell) [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org) 
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt) [![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 86fea719..f2170d31 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -54,6 +54,7 @@ object DateFind { val p = lang match { case Language.English => p2.or(p0).or(p1) case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) } p.read(parts).toOption } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala new file mode 100644 index 00000000..c32a532d --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala @@ -0,0 +1,25 @@ +package docspell.analysis.nlp + +import docspell.common.{NerLabel, NerTag} + +import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel} + +object LabelConverter { + + private def tagFromLabel[A <: CoreAnnotation[String]]( + label: CoreLabel, + annot: Class[A] + ): Option[NerTag] = { + val tag = label.get(annot) + Option(tag).flatMap(s => NerTag.fromString(s).toOption) + } + + def findTag(label: CoreLabel): Option[NerTag] = + tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation]) + .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation])) + + def toNerLabel(label: CoreLabel): Option[NerLabel] = + findTag(label).map(t => + NerLabel(label.word(), t, label.beginPosition(), label.endPosition()) + ) +} diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala new file mode 100644 index 00000000..75ee7040 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -0,0 +1,97 @@ +package docspell.analysis.nlp + +import java.util.{Properties => JProps} + +import docspell.analysis.nlp.Properties.Implicits._ + +object Properties { + + def apply(ps: (String, String)*): JProps = { + val p = new JProps() + for ((k, v) <- ps) + p.setProperty(k, v) + p + } + + def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "de", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently + "ner.useSUTime" -> "false", //only english, unused in docspell + "ner.language" -> "de", + "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + def nerEnglish(regexNerMappingFile: Option[String]): JProps = + Properties( + "annotators" -> "tokenize,ssplit,pos,lemma,ner", + "tokenize.language" -> "en", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "en", + "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile) + + def 
nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "fr", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv", + "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger", + "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "de", + "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + object Implicits { + implicit final class JPropsOps(val p: JProps) extends AnyVal { + + def set(name: String, value: Option[String]): JProps = + value match { + case Some(v) => + p.setProperty(name, v) + p + case None => + p + } + + def change(name: String, f: String => String): JProps = + Option(p.getProperty(name)) match { + case Some(current) => + p.setProperty(name, f(current)) + p + case None => + p + } + + def withRegexNer(mappingFile: Option[String]): JProps = + set("regexner.mapping", mappingFile) + .change( + "annotators", + v => if (mappingFile.isDefined) v + ",regexner" else v + ) + + def withHighRecall(flag: Boolean): JProps = { + if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL") + else p.setProperty("ner.combinationMode", "NORMAL") + p + } + } + } +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 094abcca..32c165f5 100644 --- 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,16 +1,12 @@ package docspell.analysis.nlp -import java.net.URL -import java.util.zip.GZIPInputStream +import java.util.{Properties => JProps} import scala.jdk.CollectionConverters._ -import scala.util.Using import docspell.common._ -import edu.stanford.nlp.ie.AbstractSequenceClassifier -import edu.stanford.nlp.ie.crf.CRFClassifier -import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} import org.log4s.getLogger object StanfordNerClassifier { @@ -18,48 +14,32 @@ object StanfordNerClassifier { lazy val germanNerClassifier = makeClassifier(Language.German) lazy val englishNerClassifier = makeClassifier(Language.English) + lazy val frenchNerClassifier = makeClassifier(Language.French) def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { val nerClassifier = lang match { case Language.English => englishNerClassifier case Language.German => germanNerClassifier + case Language.French => frenchNerClassifier } - nerClassifier - .classify(text) - .asScala - .flatMap(a => a.asScala) - .collect(Function.unlift { label => - val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) - NerTag - .fromString(Option(tag).getOrElse("")) - .toOption - .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) - }) - .toVector + val doc = new CoreDocument(text) + nerClassifier.annotate(doc) + + doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + private def makeClassifier(lang: Language): StanfordCoreNLP = { logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...") - val ner = classifierResource(lang) - Using(new GZIPInputStream(ner.openStream())) { in => - 
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] - }.fold(throw _, identity) + new StanfordCoreNLP(classifierProperties(lang)) } - private def classifierResource(lang: Language): URL = { - def check(u: URL): URL = - if (u == null) sys.error(s"NER model url not found for language ${lang.name}") - else u - - check(lang match { + private def classifierProperties(lang: Language): JProps = + lang match { case Language.German => - getClass.getResource( - "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz" - ) + Properties.nerGerman(None, false) case Language.English => - getClass.getResource( - "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz" - ) - }) - } + Properties.nerEnglish(None) + case Language.French => + Properties.nerFrench(None, false) + } } diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index c851edce..b7c083a1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -12,22 +12,30 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), - NerLabel("Treesville", NerTag.Person, 27, 37), + NerLabel("Elm", NerTag.Misc, 17, 20), + NerLabel("Ave.", NerTag.Misc, 21, 25), + NerLabel("Treesville", NerTag.Misc, 27, 37), NerLabel("Derek", NerTag.Person, 68, 73), NerLabel("Jeter", NerTag.Person, 74, 79), - NerLabel("Treesville", NerTag.Location, 95, 105), + NerLabel("Elm", NerTag.Misc, 85, 88), + NerLabel("Ave.", NerTag.Misc, 89, 93), + NerLabel("Treesville", NerTag.Person, 95, 105), + NerLabel("Leaf", NerTag.Organization, 144, 148), + NerLabel("Chief", NerTag.Organization, 150, 155), + NerLabel("of", NerTag.Organization, 156, 158), NerLabel("Syrup", 
NerTag.Organization, 159, 164), NerLabel("Production", NerTag.Organization, 165, 175), NerLabel("Old", NerTag.Organization, 176, 179), NerLabel("Sticky", NerTag.Organization, 180, 186), NerLabel("Pancake", NerTag.Organization, 187, 194), NerLabel("Company", NerTag.Organization, 195, 202), - NerLabel("Maple", NerTag.Location, 207, 212), - NerLabel("Lane", NerTag.Location, 213, 217), - NerLabel("Forest", NerTag.Location, 219, 225), + NerLabel("Maple", NerTag.Organization, 207, 212), + NerLabel("Lane", NerTag.Organization, 213, 217), + NerLabel("Forest", NerTag.Organization, 219, 225), NerLabel("Hemptown", NerTag.Location, 239, 247), - NerLabel("Little", NerTag.Organization, 347, 353), - NerLabel("League", NerTag.Organization, 354, 360), + NerLabel("Leaf", NerTag.Person, 276, 280), + NerLabel("Little", NerTag.Misc, 347, 353), + NerLabel("League", NerTag.Misc, 354, 360), NerLabel("Derek", NerTag.Person, 1117, 1122), NerLabel("Jeter", NerTag.Person, 1123, 1128) ) @@ -40,7 +48,7 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), - NerLabel("Lilienweg", NerTag.Location, 16, 25), + NerLabel("Lilienweg", NerTag.Person, 16, 25), NerLabel("Max", NerTag.Person, 77, 80), NerLabel("Mustermann", NerTag.Person, 81, 91), NerLabel("Lilienweg", NerTag.Location, 93, 102), diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 7d836347..92c32f4b 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -27,7 +27,12 @@ object Language { val iso3 = "eng" } - val all: List[Language] = List(German, English) + case object French extends Language { + val iso2 = "fr" + val iso3 = "fra" + } + + val all: List[Language] = List(German, English, French) def fromString(str: String): Either[String, Language] = { val lang = 
str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 6031cd61..2306a44d 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -23,6 +23,7 @@ object Field { val content = Field("content") val content_de = Field("content_de") val content_en = Field("content_en") + val content_fr = Field("content_fr") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -33,6 +34,8 @@ object Field { Field.content_de case Language.English => Field.content_en + case Language.French => + Field.content_fr } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index e07e9c36..1e3b09b3 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -39,6 +39,7 @@ object SolrQuery { Field.content, Field.content_de, Field.content_en, + Field.content_fr, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 932519c8..efb94a09 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -80,6 +80,8 @@ object SolrSetup { addTextField(l.some)(Field.content_de) case l @ Language.English => addTextField(l.some)(Field.content_en) + case l @ Language.French => + addTextField(l.some)(Field.content_fr) } cmds0 *> cmds1 *> cntLang *> ().pure[F] @@ -105,6 +107,9 @@ object SolrSetup { case Some(Language.English) => run(DeleteField.command(DeleteField(field))).attempt *> 
run(AddField.command(AddField.textEN(field))) + case Some(Language.French) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textFR(field))) } } } @@ -138,6 +143,9 @@ object SolrSetup { def textEN(field: Field): AddField = AddField(field, "text_en", true, true, false) + + def textFR(field: Field): AddField = + AddField(field, "text_fr", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 6704ec3e..40fe5eb2 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -10,6 +10,7 @@ module Data.Language exposing type Language = German | English + | French fromString : String -> Maybe Language @@ -20,6 +21,9 @@ fromString str = else if str == "eng" || str == "en" || str == "english" then Just English + else if str == "fra" || str == "fr" || str == "french" then + Just French + else Nothing @@ -33,6 +37,9 @@ toIso3 lang = English -> "eng" + French -> + "fra" + toName : Language -> String toName lang = @@ -43,7 +50,10 @@ toName lang = English -> "English" + French -> + "French" + all : List Language all = - [ German, English ] + [ German, English, French ] diff --git a/project/Dependencies.scala b/project/Dependencies.scala index ddcfa155..7ab0e4ad 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -31,7 +31,7 @@ object Dependencies { val PostgresVersion = "42.2.16" val PureConfigVersion = "0.13.0" val Slf4jVersion = "1.7.30" - val StanfordNlpVersion = "3.9.2" + val StanfordNlpVersion = "4.0.0" val TikaVersion = "1.24.1" val YamuscaVersion = "0.6.2" val SwaggerUIVersion = "3.32.3" @@ -135,11 +135,16 @@ object Dependencies { ) val stanfordNlpModels = Seq( + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models"), ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) .classifier("models-german"), - 
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier( - "models-english" - ) + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models-french"), + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier( + "models-english" + ) ) val tika = Seq( diff --git a/project/NerModelsPlugin.scala b/project/NerModelsPlugin.scala index cb658615..8d8fbb2c 100644 --- a/project/NerModelsPlugin.scala +++ b/project/NerModelsPlugin.scala @@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin { } private val nerModels = List( - "german.conll.germeval2014.hgc_175m_600.crf.ser.gz", - "english.all.3class.distsim.crf.ser.gz" + "german.distsim.crf.ser.gz", + "english.conll.4class.distsim.crf.ser.gz", + "french-wikiner-4class.crf.ser.gz", + "french-mwt-statistical.tsv", + "french-mwt.tagger", + "french-mwt.tsv", + "german-mwt.tsv", + "german-ud.tagger", + "german-ud.tagger.props", + "french-ud.tagger", + "french-ud.tagger.props", + "english-left3words-distsim.tagger", + "english-left3words-distsim.tagger.props" ) } From 14f646f6a2135e135f1500e50f9eeaf884adefcf Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 23 Aug 2020 10:25:04 +0200 Subject: [PATCH 2/7] Make new coursier cache available to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index fb6d1e7d..4d750d05 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ cache: - $HOME/.ivy2/cache - $HOME/.sbt/boot - $HOME/.coursier/cache + - $HOME/.cache/coursier - sysconfcpus install: From 4e7c00c3457e02f86651cd86b5c2b50eb5b5835a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 23 Aug 2020 17:40:37 +0200 Subject: [PATCH 3/7] Don't ignore updates for stanford-nlp anymore --- .scala-steward.conf | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .scala-steward.conf diff --git a/.scala-steward.conf b/.scala-steward.conf deleted file mode 100644 index 2bbb5c09..00000000 --- a/.scala-steward.conf 
+++ /dev/null @@ -1,3 +0,0 @@ -updates.ignore = [ - { groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" } -] \ No newline at end of file From 8628a0a8b3d3eb2efc9a58a2f3c61b2fe5c1b190 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 00:56:25 +0200 Subject: [PATCH 4/7] Allow configuring stanford-ner and cache based on collective --- .../docspell/analysis/TextAnalyser.scala | 83 +++++++++-------- .../docspell/analysis/nlp/PipelineCache.scala | 90 +++++++++++++++++++ .../docspell/analysis/nlp/Properties.scala | 14 +++ .../analysis/nlp/StanfordNerClassifier.scala | 50 +++++------ .../analysis/nlp/StanfordSettings.scala | 22 +++++ .../src/main/scala/docspell/common/File.scala | 3 + .../scala/docspell/joex/JoexAppImpl.scala | 22 ++--- .../docspell/joex/process/ItemHandler.scala | 14 +-- .../docspell/joex/process/ProcessItem.scala | 19 ++-- .../docspell/joex/process/ReProcessItem.scala | 16 ++-- .../docspell/joex/process/TextAnalysis.scala | 55 ++++++------ 11 files changed, 271 insertions(+), 117 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 443fd47d..75d07eef 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -5,12 +5,19 @@ import cats.implicits._ import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind +import docspell.analysis.nlp.PipelineCache import docspell.analysis.nlp.StanfordNerClassifier +import docspell.analysis.nlp.StanfordSettings import docspell.common._ trait TextAnalyser[F[_]] { - def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result] + def annotate( + 
logger: Logger[F], + settings: StanfordSettings, + cacheKey: Ident, + text: String + ): F[TextAnalyser.Result] } object TextAnalyser { @@ -22,43 +29,47 @@ object TextAnalyser { } def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] = - Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] { - def annotate( - logger: Logger[F], - lang: Language, - text: String - ): F[TextAnalyser.Result] = - for { - input <- textLimit(logger, text) - tags0 <- stanfordNer(lang, input) - tags1 <- contactNer(input) - dates <- dateNer(lang, input) - list = tags0 ++ tags1 - spans = NerLabelSpan.build(list) - } yield Result(spans ++ list, dates) + Resource + .liftF(PipelineCache[F]()) + .map(cache => + new TextAnalyser[F] { + def annotate( + logger: Logger[F], + settings: StanfordSettings, + cacheKey: Ident, + text: String + ): F[TextAnalyser.Result] = + for { + input <- textLimit(logger, text) + tags0 <- stanfordNer(cacheKey, settings, input) + tags1 <- contactNer(input) + dates <- dateNer(settings.lang, input) + list = tags0 ++ tags1 + spans = NerLabelSpan.build(list) + } yield Result(spans ++ list, dates) - private def textLimit(logger: Logger[F], text: String): F[String] = - if (text.length <= cfg.maxLength) text.pure[F] - else - logger.info( - s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." + - s" Analysing only first ${cfg.maxLength} characters." - ) *> text.take(cfg.maxLength).pure[F] + private def textLimit(logger: Logger[F], text: String): F[String] = + if (text.length <= cfg.maxLength) text.pure[F] + else + logger.info( + s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." + + s" Analysing only first ${cfg.maxLength} characters." 
+ ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] = - Sync[F].delay { - StanfordNerClassifier.nerAnnotate(lang)(text) + private def stanfordNer(key: Ident, settings: StanfordSettings, text: String) + : F[Vector[NerLabel]] = + StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) + + private def contactNer(text: String): F[Vector[NerLabel]] = + Sync[F].delay { + Contact.annotate(text) + } + + private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] = + Sync[F].delay { + DateFind.findDates(text, lang).toVector + } } - - private def contactNer(text: String): F[Vector[NerLabel]] = - Sync[F].delay { - Contact.annotate(text) - } - - private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] = - Sync[F].delay { - DateFind.findDates(text, lang).toVector - } - }) + ) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala new file mode 100644 index 00000000..9787563f --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -0,0 +1,90 @@ +package docspell.analysis.nlp + +import cats.Applicative +import cats.effect._ +import cats.effect.concurrent.Ref +import cats.implicits._ + +import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP +import org.log4s.getLogger + +/** Creating the StanfordCoreNLP pipeline is quite expensive as it + * involves IO and initializing large objects. + * + * Therefore, the instances are cached, because they are thread-safe. 
+ * + * **This is an internal API** + */ +trait PipelineCache[F[_]] { + + def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] + +} + +object PipelineCache { + private[this] val logger = getLogger + + def none[F[_]: Applicative]: PipelineCache[F] = + new PipelineCache[F] { + def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] = + makeClassifier(settings).pure[F] + } + + def apply[F[_]: Sync](): F[PipelineCache[F]] = + Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F])) + + final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]]) + extends PipelineCache[F] { + + def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] = + for { + id <- makeSettingsId(settings) + nlp <- data.modify(cache => getOrCreate(key, id, cache, settings)) + } yield nlp + + private def getOrCreate( + key: String, + id: String, + cache: Map[String, Entry], + settings: StanfordSettings + ): (Map[String, Entry], StanfordCoreNLP) = + cache.get(key) match { + case Some(entry) => + if (entry.id == id) (cache, entry.value) + else { + logger.info( + s"StanfordNLP settings changed for key $key. 
Creating new classifier" + ) + val nlp = makeClassifier(settings) + val e = Entry(id, nlp) + (cache.updated(key, e), nlp) + } + + case None => + val nlp = makeClassifier(settings) + val e = Entry(id, nlp) + (cache.updated(key, e), nlp) + } + + private def makeSettingsId(settings: StanfordSettings): F[String] = { + val base = settings.copy(regexNer = None).toString + val size: F[Long] = + settings.regexNer match { + case Some(p) => + File.size(p) + case None => + 0L.pure[F] + } + size.map(len => s"$base-$len") + } + + } + private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = { + logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...") + new StanfordCoreNLP(Properties.forSettings(settings)) + } + + private case class Entry(id: String, value: StanfordCoreNLP) +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 75ee7040..314f04fb 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -3,6 +3,7 @@ package docspell.analysis.nlp import java.util.{Properties => JProps} import docspell.analysis.nlp.Properties.Implicits._ +import docspell.common._ object Properties { @@ -13,6 +14,19 @@ object Properties { p } + def forSettings(settings: StanfordSettings): JProps = { + val regexNerFile = settings.regexNer + .map(p => p.normalize().toAbsolutePath().toString()) + settings.lang match { + case Language.German => + Properties.nerGerman(regexNerFile, settings.highRecall) + case Language.English => + Properties.nerEnglish(regexNerFile) + case Language.French => + Properties.nerFrench(regexNerFile, settings.highRecall) + } + } + def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = Properties( "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 32c165f5..424396e5 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,45 +1,39 @@ package docspell.analysis.nlp -import java.util.{Properties => JProps} - import scala.jdk.CollectionConverters._ +import cats.Applicative +import cats.implicits._ + import docspell.common._ import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} -import org.log4s.getLogger object StanfordNerClassifier { - private[this] val logger = getLogger - lazy val germanNerClassifier = makeClassifier(Language.German) - lazy val englishNerClassifier = makeClassifier(Language.English) - lazy val frenchNerClassifier = makeClassifier(Language.French) + /** Runs named entity recognition on the given `text`. + * + * This uses the classifier pipeline from stanford-nlp, see + * https://nlp.stanford.edu/software/CRF-NER.html. Creating these + * classifiers is quite expensive, it involves loading large model + * files. The classifiers are thread-safe and so they are cached. + * The `cacheKey` defines the "slot" where classifiers are stored + * and retrieved. If for a given `cacheKey` the `settings` change, + * a new classifier must be created. It will then replace the + * previous one. 
+ */ + def nerAnnotate[F[_]: Applicative]( + cacheKey: String, + cache: PipelineCache[F] + )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] = + cache + .obtain(cacheKey, settings) + .map(crf => runClassifier(crf, text)) - def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { - val nerClassifier = lang match { - case Language.English => englishNerClassifier - case Language.German => germanNerClassifier - case Language.French => frenchNerClassifier - } + def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { val doc = new CoreDocument(text) nerClassifier.annotate(doc) - doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - private def makeClassifier(lang: Language): StanfordCoreNLP = { - logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...") - new StanfordCoreNLP(classifierProperties(lang)) - } - - private def classifierProperties(lang: Language): JProps = - lang match { - case Language.German => - Properties.nerGerman(None, false) - case Language.English => - Properties.nerEnglish(None) - case Language.French => - Properties.nerFrench(None, false) - } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala new file mode 100644 index 00000000..c2f6f98c --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala @@ -0,0 +1,22 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import docspell.common._ + +/** Settings for configuring the stanford NER pipeline. + * + * The language is mandatory, only the provided ones are supported. + * The `highRecall` only applies for non-English languages. For + * non-English languages the english classifier is run as second + * classifier and if `highRecall` is true, then it will be used to + * tag untagged tokens. 
This may lead to a lot of false positives, + * but since English is omnipresent in other languages, too it + * depends on the use case for whether this is useful or not. + * + * The `regexNer` allows to specify a text file as described here: + * https://nlp.stanford.edu/software/regexner.html. This will be used + * as a last step to tag untagged tokens using the provided list of + * regexps. + */ +case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index e9596fa8..0efc552a 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -55,6 +55,9 @@ object File { def exists[F[_]: Sync](file: Path): F[Boolean] = Sync[F].delay(Files.exists(file)) + def size[F[_]: Sync](file: Path): F[Long] = + Sync[F].delay(Files.size(file)) + def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] = Sync[F].delay(Files.exists(file) && Files.size(file) > minSize) diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index bc415446..dcea79df 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -6,6 +6,7 @@ import cats.effect._ import cats.implicits._ import fs2.concurrent.SignallingRef +import docspell.analysis.TextAnalyser import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient @@ -80,14 +81,15 @@ object JoexAppImpl { for { httpClient <- BlazeClientBuilder[F](clientEC).resource client = JoexClient(httpClient) - store <- Store.create(cfg.jdbc, connectEC, blocker) - queue <- JobQueue(store) - pstore <- PeriodicTaskStore.create(store) - nodeOps <- ONode(store) - joex <- OJoex(client, store) - upload <- OUpload(store, queue, 
cfg.files, joex) - fts <- createFtsClient(cfg)(httpClient) - itemOps <- OItem(store, fts, queue, joex) + store <- Store.create(cfg.jdbc, connectEC, blocker) + queue <- JobQueue(store) + pstore <- PeriodicTaskStore.create(store) + nodeOps <- ONode(store) + joex <- OJoex(client, store) + upload <- OUpload(store, queue, cfg.files, joex) + fts <- createFtsClient(cfg)(httpClient) + itemOps <- OItem(store, fts, queue, joex) + analyser <- TextAnalyser.create[F](cfg.textAnalysis) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) sch <- SchedulerBuilder(cfg.scheduler, blocker, store) @@ -95,14 +97,14 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler.newItem[F](cfg, itemOps, fts), + ItemHandler.newItem[F](cfg, itemOps, fts, analyser), ItemHandler.onCancel[F] ) ) .withTask( JobTask.json( ReProcessItemArgs.taskName, - ReProcessItem[F](cfg, fts), + ReProcessItem[F](cfg, fts, analyser), ReProcessItem.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 4da8f779..240e7f54 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -5,6 +5,7 @@ import cats.effect._ import cats.implicits._ import fs2.Stream +import docspell.analysis.TextAnalyser import docspell.backend.ops.OItem import docspell.common.{ItemState, ProcessItemArgs} import docspell.ftsclient.FtsClient @@ -29,11 +30,12 @@ object ItemHandler { def newItem[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] ): Task[F, Args, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) - .flatMap(safeProcess[F](cfg, itemOps, fts)) + .flatMap(safeProcess[F](cfg, itemOps, fts, analyser)) .map(_ => ()) def itemStateTask[F[_]: Sync, A]( @@ 
-51,11 +53,12 @@ object ItemHandler { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({ + ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({ case Right(d) => Task.pure(d) case Left(ex) => @@ -65,7 +68,8 @@ object ItemHandler { .andThen(_ => Sync[F].raiseError(ex)) }) case false => - ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created)) + ProcessItem[F](cfg, itemOps, fts, analyser)(data) + .flatMap(itemStateTask(ItemState.Created)) } private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 9b4d050f..cd76e095 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -2,6 +2,7 @@ package docspell.joex.process import cats.effect._ +import docspell.analysis.TextAnalyser import docspell.backend.ops.OItem import docspell.common.ProcessItemArgs import docspell.ftsclient.FtsClient @@ -13,25 +14,28 @@ object ProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, itemOps: OItem[F], - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ExtractArchive(item) .flatMap(Task.setProgress(20)) - .flatMap(processAttachments0(cfg, fts, (40, 60, 80))) + .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80))) .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] 
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - processAttachments0[F](cfg, fts, (30, 60, 90))(item) + processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item) def analysisOnly[F[_]: Sync]( - cfg: Config + cfg: Config, + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](cfg.textAnalysis)(item) + TextAnalysis[F](analyser)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) @@ -39,12 +43,13 @@ object ProcessItem { private def processAttachments0[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], + analyser: TextAnalyser[F], progress: (Int, Int, Int) )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(progress._2)) - .flatMap(analysisOnly[F](cfg)) + .flatMap(analysisOnly[F](cfg, analyser)) .flatMap(Task.setProgress(progress._3)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 8f5e11f2..53282539 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -4,6 +4,7 @@ import cats.data.OptionT import cats.effect._ import cats.implicits._ +import docspell.analysis.TextAnalyser import docspell.common._ import docspell.ftsclient.FtsClient import docspell.joex.Config @@ -19,10 +20,11 @@ object ReProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] ): Task[F, Args, Unit] = loadItem[F] - .flatMap(safeProcess[F](cfg, fts)) + .flatMap(safeProcess[F](cfg, fts, analyser)) .map(_ => ()) def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] = @@ -70,6 +72,7 @@ object ReProcessItem { def 
processFiles[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], + analyser: TextAnalyser[F], data: ItemData ): Task[F, Args, ItemData] = { @@ -91,7 +94,7 @@ object ReProcessItem { getLanguage[F].flatMap { lang => ProcessItem - .processAttachments[F](cfg, fts)(data) + .processAttachments[F](cfg, fts, analyser)(data) .contramap[Args](convertArgs(lang)) } } @@ -109,11 +112,12 @@ object ReProcessItem { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, - fts: FtsClient[F] + fts: FtsClient[F], + analyser: TextAnalyser[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - processFiles[F](cfg, fts, data).attempt + processFiles[F](cfg, fts, analyser, data).attempt .flatMap({ case Right(d) => Task.pure(d) @@ -123,7 +127,7 @@ object ReProcessItem { ).andThen(_ => Sync[F].raiseError(ex)) }) case false => - processFiles[F](cfg, fts, data) + processFiles[F](cfg, fts, analyser, data) } private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 5e31e2d9..625738ef 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,9 +1,10 @@ package docspell.joex.process -import cats.effect.Sync +import cats.effect._ import cats.implicits._ -import docspell.analysis.{TextAnalyser, TextAnalysisConfig} +import docspell.analysis.TextAnalyser +import docspell.analysis.nlp.StanfordSettings import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Task @@ -12,36 +13,40 @@ import docspell.store.records.RAttachmentMeta object TextAnalysis { def apply[F[_]: Sync]( - cfg: TextAnalysisConfig + analyser: TextAnalyser[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - 
TextAnalyser.create[F](cfg).use { analyser => - for { - _ <- ctx.logger.info("Starting text analysis") - s <- Duration.stopTime[F] - t <- - item.metas.toList - .traverse( - annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser) - ) - _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") - _ <- t.traverse(m => - ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels)) - ) - e <- s - _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") - v = t.toVector - } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) - } + for { + _ <- ctx.logger.info("Starting text analysis") + s <- Duration.stopTime[F] + t <- + item.metas.toList + .traverse( + annotateAttachment[F](ctx.args, ctx.logger, analyser) + ) + _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") + _ <- t.traverse(m => + ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels)) + ) + e <- s + _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") + v = t.toVector + } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) } def annotateAttachment[F[_]: Sync]( - lang: Language, + args: ProcessItemArgs, logger: Logger[F], analyser: TextAnalyser[F] - )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = + )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { + val settings = StanfordSettings(args.meta.language, false, None) for { - labels <- analyser.annotate(logger, lang, rm.content.getOrElse("")) + labels <- analyser.annotate( + logger, + settings, + args.meta.collective, + rm.content.getOrElse("") + ) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) - + } } From 96d2f948f2af5a0e11859fc1101ced9de36d862b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 14:35:56 +0200 Subject: [PATCH 5/7] Use collective's addressbook to configure regexner --- .../analysis/nlp/TextAnalyserSuite.scala | 9 ++- 
.../src/main/scala/docspell/common/File.scala | 4 ++ .../docspell/joex/process/TextAnalysis.scala | 65 +++++++++++++++++-- .../docspell/store/queries/QCollective.scala | 15 +++++ 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index b7c083a1..b22093f1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -3,12 +3,17 @@ package docspell.analysis.nlp import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ +import edu.stanford.nlp.pipeline.StanfordCoreNLP object TextAnalyserSuite extends SimpleTestSuite { + lazy val germanClassifier = + new StanfordCoreNLP(Properties.nerGerman(None, false)) + lazy val englishClassifier = + new StanfordCoreNLP(Properties.nerEnglish(None)) test("find english ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText) + StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find german ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText) + StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 0efc552a..2d5cfb8a 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -1,6 +1,7 @@ 
package docspell.common import java.io.IOException +import java.nio.charset.StandardCharsets import java.nio.file._ import java.nio.file.attribute.BasicFileAttributes import java.util.concurrent.atomic.AtomicInteger @@ -87,4 +88,7 @@ object File { def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid + + def writeString[F[_]: Sync](file: Path, content: String): F[Path] = + Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 625738ef..9ee3850c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,13 +1,18 @@ package docspell.joex.process +import java.nio.file.Paths + import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.split.TextSplitter import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates +import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task +import docspell.store.queries.QCollective import docspell.store.records.RAttachmentMeta object TextAnalysis { @@ -22,7 +27,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx.args, ctx.logger, analyser) + annotateAttachment[F](ctx, analyser) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -35,18 +40,64 @@ object TextAnalysis { } def annotateAttachment[F[_]: Sync]( - args: ProcessItemArgs, - logger: Logger[F], + ctx: Context[F, ProcessItemArgs], analyser: TextAnalyser[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordSettings(args.meta.language, false, 
None) + val settings = StanfordSettings(ctx.args.meta.language, false, None) for { + names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) + temp <- File.mkTempFile(Paths.get("."), "textanalysis") + _ <- File.writeString(temp, mkNerConfig(names)) + sett = settings.copy(regexNer = Some(temp)) labels <- analyser.annotate( - logger, - settings, - args.meta.collective, + ctx.logger, + sett, + ctx.args.meta.collective, rm.content.getOrElse("") ) + _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") + val tokens = + TextSplitter + .splitToken(str, delims) + .toVector + .take(3) + .map(w => s"(?i)${w.toLower.value}") + + tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } } diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index 2dc94e05..80b40207 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -1,5 +1,6 @@ package docspell.store.queries +import cats.data.OptionT 
import fs2.Stream import docspell.common.ContactKind @@ -11,6 +12,20 @@ import doobie._ import doobie.implicits._ object QCollective { + + case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String]) + object Names { + val empty = Names(Vector.empty, Vector.empty, Vector.empty) + } + + def allNames(collective: Ident): ConnectionIO[Names] = + (for { + orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name)) + pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name)) + equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name)) + } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))) + .getOrElse(Names.empty) + case class TagCount(tag: RTag, count: Int) case class InsightData( From de5b33c40ddc50cd901eaf6e2d8d2f8f07290562 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 16:09:11 +0200 Subject: [PATCH 6/7] Add `updated` column to some tables --- .../restserver/conv/Conversions.scala | 16 ++++-- .../restserver/routes/EquipmentRoutes.scala | 8 +-- .../mariadb/V1.9.0__updated_column.sql | 29 ++++++++++ .../postgresql/V1.9.0__updated_column.sql | 29 ++++++++++ .../docspell/store/records/REquipment.scala | 35 ++++++++---- .../store/records/ROrganization.scala | 39 ++++++++------ .../docspell/store/records/RPerson.scala | 53 +++++++++++++------ 7 files changed, 156 insertions(+), 53 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index f2f131f0..539ec3eb 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -341,6 +341,7 @@ trait 
Conversions { v.address.city, v.address.country, v.notes, + now, now ) } yield OOrganization.OrgAndContacts(org, cont) @@ -353,6 +354,7 @@ trait Conversions { def contacts(oid: Ident) = v.contacts.traverse(c => newContact(c, oid.some, None)) for { + now <- Timestamp.current[F] cont <- contacts(v.id) org = ROrganization( v.id, @@ -363,7 +365,8 @@ trait Conversions { v.address.city, v.address.country, v.notes, - v.created + v.created, + now ) } yield OOrganization.OrgAndContacts(org, cont) } @@ -398,6 +401,7 @@ trait Conversions { v.address.country, v.notes, v.concerning, + now, now ) } yield OOrganization.PersonAndContacts(org, cont) @@ -410,6 +414,7 @@ trait Conversions { def contacts(pid: Ident) = v.contacts.traverse(c => newContact(c, None, pid.some)) for { + now <- Timestamp.current[F] cont <- contacts(v.id) org = RPerson( v.id, @@ -421,7 +426,8 @@ trait Conversions { v.address.country, v.notes, v.concerning, - v.created + v.created, + now ) } yield OOrganization.PersonAndContacts(org, cont) } @@ -536,11 +542,11 @@ trait Conversions { def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] = timeId.map({ case (id, now) => - REquipment(id, cid, e.name, now) + REquipment(id, cid, e.name, now, now) }) - def changeEquipment(e: Equipment, cid: Ident): REquipment = - REquipment(e.id, cid, e.name, e.created) + def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] = + Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now)) // idref diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala index edfc7521..a8db67ba 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/EquipmentRoutes.scala @@ -39,10 +39,10 @@ object EquipmentRoutes { case req @ PUT -> Root => for { - data <- 
req.as[Equipment] - equip = changeEquipment(data, user.account.collective) - res <- backend.equipment.update(equip) - resp <- Ok(basicResult(res, "Equipment updated.")) + data <- req.as[Equipment] + equip <- changeEquipment(data, user.account.collective) + res <- backend.equipment.update(equip) + resp <- Ok(basicResult(res, "Equipment updated.")) } yield resp case DELETE -> Root / Ident(id) => diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql new file mode 100644 index 00000000..72b6b152 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.9.0__updated_column.sql @@ -0,0 +1,29 @@ +-- organization +ALTER TABLE `organization` +ADD COLUMN (`updated` timestamp); + +UPDATE `organization` SET `updated` = `created`; + +ALTER TABLE `organization` +MODIFY `updated` timestamp NOT NULL; + +-- person +ALTER TABLE `person` +MODIFY `created` timestamp; + +ALTER TABLE `person` +ADD COLUMN (`updated` timestamp); + +UPDATE `person` SET `updated` = `created`; + +ALTER TABLE `person` +MODIFY `updated` timestamp NOT NULL; + +-- equipment +ALTER TABLE `equipment` +ADD COLUMN (`updated` timestamp); + +UPDATE `equipment` SET `updated` = `created`; + +ALTER TABLE `equipment` +MODIFY `updated` timestamp NOT NULL; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql new file mode 100644 index 00000000..34c57718 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.9.0__updated_column.sql @@ -0,0 +1,29 @@ +-- organization +ALTER TABLE "organization" +ADD COLUMN "updated" timestamp; + +UPDATE "organization" SET "updated" = "created"; + +ALTER TABLE "organization" +ALTER COLUMN "updated" SET NOT NULL; + +-- person +ALTER TABLE "person" ALTER COLUMN "created" + TYPE timestamp 
USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp); + +ALTER TABLE "person" +ADD COLUMN "updated" timestamp; + +UPDATE "person" SET "updated" = "created"; + +ALTER TABLE "person" +ALTER COLUMN "updated" SET NOT NULL; + +-- equipment +ALTER TABLE "equipment" +ADD COLUMN "updated" timestamp; + +UPDATE "equipment" SET "updated" = "created"; + +ALTER TABLE "equipment" +ALTER COLUMN "updated" SET NOT NULL; diff --git a/modules/store/src/main/scala/docspell/store/records/REquipment.scala b/modules/store/src/main/scala/docspell/store/records/REquipment.scala index 78d2e7f8..3a7f6d2f 100644 --- a/modules/store/src/main/scala/docspell/store/records/REquipment.scala +++ b/modules/store/src/main/scala/docspell/store/records/REquipment.scala @@ -7,7 +7,13 @@ import docspell.store.impl._ import doobie._ import doobie.implicits._ -case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {} +case class REquipment( + eid: Ident, + cid: Ident, + name: String, + created: Timestamp, + updated: Timestamp +) {} object REquipment { @@ -18,25 +24,32 @@ object REquipment { val cid = Column("cid") val name = Column("name") val created = Column("created") - val all = List(eid, cid, name, created) + val updated = Column("updated") + val all = List(eid, cid, name, created, updated) } import Columns._ def insert(v: REquipment): ConnectionIO[Int] = { - val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}") + val sql = + insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}") sql.update.run } def update(v: REquipment): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(eid.is(v.eid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name) + def sql(now: Timestamp) = + updateRow( + table, + and(eid.is(v.eid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- Timestamp.current[ConnectionIO] + n <- 
sql(now).update.run + } yield n } def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = { diff --git a/modules/store/src/main/scala/docspell/store/records/ROrganization.scala b/modules/store/src/main/scala/docspell/store/records/ROrganization.scala index 17fe4845..8eb07e29 100644 --- a/modules/store/src/main/scala/docspell/store/records/ROrganization.scala +++ b/modules/store/src/main/scala/docspell/store/records/ROrganization.scala @@ -19,7 +19,8 @@ case class ROrganization( city: String, country: String, notes: Option[String], - created: Timestamp + created: Timestamp, + updated: Timestamp ) {} object ROrganization { @@ -38,7 +39,8 @@ object ROrganization { val country = Column("country") val notes = Column("notes") val created = Column("created") - val all = List(oid, cid, name, street, zip, city, country, notes, created) + val updated = Column("updated") + val all = List(oid, cid, name, street, zip, city, country, notes, created, updated) } import Columns._ @@ -47,26 +49,31 @@ object ROrganization { val sql = insertRow( table, all, - fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}" + fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}" ) sql.update.run } def update(v: ROrganization): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(oid.is(v.oid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name), - street.setTo(v.street), - zip.setTo(v.zip), - city.setTo(v.city), - country.setTo(v.country), - notes.setTo(v.notes) + def sql(now: Timestamp) = + updateRow( + table, + and(oid.is(v.oid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + street.setTo(v.street), + zip.setTo(v.zip), + city.setTo(v.city), + country.setTo(v.country), + notes.setTo(v.notes), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- Timestamp.current[ConnectionIO] + n <- sql(now).update.run + } yield n } def 
existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] = diff --git a/modules/store/src/main/scala/docspell/store/records/RPerson.scala b/modules/store/src/main/scala/docspell/store/records/RPerson.scala index eb9a9872..0c2bdcd9 100644 --- a/modules/store/src/main/scala/docspell/store/records/RPerson.scala +++ b/modules/store/src/main/scala/docspell/store/records/RPerson.scala @@ -20,7 +20,8 @@ case class RPerson( country: String, notes: Option[String], concerning: Boolean, - created: Timestamp + created: Timestamp, + updated: Timestamp ) {} object RPerson { @@ -40,7 +41,20 @@ object RPerson { val notes = Column("notes") val concerning = Column("concerning") val created = Column("created") - val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created) + val updated = Column("updated") + val all = List( + pid, + cid, + name, + street, + zip, + city, + country, + notes, + concerning, + created, + updated + ) } import Columns._ @@ -49,27 +63,32 @@ object RPerson { val sql = insertRow( table, all, - fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}" + fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}" ) sql.update.run } def update(v: RPerson): ConnectionIO[Int] = { - val sql = updateRow( - table, - and(pid.is(v.pid), cid.is(v.cid)), - commas( - cid.setTo(v.cid), - name.setTo(v.name), - street.setTo(v.street), - zip.setTo(v.zip), - city.setTo(v.city), - country.setTo(v.country), - concerning.setTo(v.concerning), - notes.setTo(v.notes) + def sql(now: Timestamp) = + updateRow( + table, + and(pid.is(v.pid), cid.is(v.cid)), + commas( + cid.setTo(v.cid), + name.setTo(v.name), + street.setTo(v.street), + zip.setTo(v.zip), + city.setTo(v.city), + country.setTo(v.country), + concerning.setTo(v.concerning), + notes.setTo(v.notes), + updated.setTo(now) + ) ) - ) - sql.update.run + for { + now <- 
Timestamp.current[ConnectionIO] + n <- sql(now).update.run + } yield n } def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] = From 3473cbb773b6ee11f92e523f6920b684ac82912b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 23:25:57 +0200 Subject: [PATCH 7/7] Use collective data with NER annotation --- .../main/scala/docspell/common/Duration.scala | 9 + .../src/main/scala/docspell/common/File.scala | 10 ++ .../joex/src/main/resources/reference.conf | 23 +++ .../src/main/scala/docspell/joex/Config.scala | 20 ++- .../scala/docspell/joex/JoexAppImpl.scala | 8 +- .../docspell/joex/analysis/NerFile.scala | 99 +++++++++++ .../docspell/joex/analysis/RegexNerFile.scala | 164 ++++++++++++++++++ .../docspell/joex/process/ItemHandler.scala | 13 +- .../docspell/joex/process/ProcessItem.scala | 19 +- .../docspell/joex/process/ReProcessItem.scala | 16 +- .../docspell/joex/process/TextAnalysis.scala | 62 +------ nix/module-joex.nix | 46 +++++ 12 files changed, 413 insertions(+), 76 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index f154a292..1c290c95 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -20,6 +20,12 @@ case class Duration(nanos: Long) { def hours: Long = minutes / 60 + def >(other: Duration): Boolean = + nanos > other.nanos + + def <(other: Duration): Boolean = + nanos < other.nanos + def toScala: FiniteDuration = FiniteDuration(nanos, TimeUnit.NANOSECONDS) @@ -62,6 +68,9 @@ object Duration { def nanos(n: Long): Duration = Duration(n) + def between(start: Timestamp, end: Timestamp): Duration = + apply(JDur.between(start.value, end.value)) + def stopTime[F[_]: Sync]: 
F[F[Duration]] = for { now <- Timestamp.current[F] diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 2d5cfb8a..572291c5 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -12,6 +12,10 @@ import cats.effect._ import cats.implicits._ import fs2.Stream +import docspell.common.syntax.all._ + +import io.circe.Decoder + object File { def mkDir[F[_]: Sync](dir: Path): F[Path] = @@ -91,4 +95,10 @@ object File { def writeString[F[_]: Sync](file: Path, content: String): F[Path] = Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) + + def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit + d: Decoder[A] + ): F[A] = + readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow + } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index bd0de234..115d2893 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -248,6 +248,29 @@ docspell.joex { # should suffice. Default is 10000, which are about 2-3 pages # (just a rough guess, of course). max-length = 10000 + + # A working directory for the analyser to store temporary/working + # files. + working-dir = ${java.io.tmpdir}"/docspell-analysis" + + regex-ner { + # Whether to enable custom NER annotation. This uses the address + # book of a collective as input for NER tagging (to automatically + # find correspondent and concerned entities). If the address book + # is large, this can be quite memory intensive and also makes text + # analysis slower. But it greatly improves accuracy. If this is + # false, NER tagging uses only statistical models (that also work + # quite well). + # + # This setting might be moved to the collective settings in the + # future. 
+ enabled = true + + # The NER annotation uses a file of patterns that is derived from + a collective's address book. This is how long this + file will be kept until a check for a state change is done. + file-cache-time = "1 minute" + } } # Configuration for converting files into PDFs. diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 3625ffb1..cb6bb9f3 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -1,11 +1,14 @@ package docspell.joex +import java.nio.file.Path + import docspell.analysis.TextAnalysisConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig import docspell.extract.ExtractConfig import docspell.ftssolr.SolrConfig +import docspell.joex.analysis.RegexNerFile import docspell.joex.hk.HouseKeepingConfig import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig} import docspell.store.JdbcConfig @@ -20,7 +23,7 @@ case class Config( userTasks: Config.UserTasks, houseKeeping: HouseKeepingConfig, extraction: ExtractConfig, - textAnalysis: TextAnalysisConfig, + textAnalysis: Config.TextAnalysis, convert: ConvertConfig, sendMail: MailSendConfig, files: Files, @@ -50,4 +53,19 @@ object Config { } case class Processing(maxDueDateYears: Int) + + case class TextAnalysis( + maxLength: Int, + workingDir: Path, + regexNer: RegexNer + ) { + + def textAnalysisConfig: TextAnalysisConfig = + TextAnalysisConfig(maxLength) + + def regexNerFileConfig: RegexNerFile.Config = + RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) + } + + case class RegexNer(enabled: Boolean, fileCacheTime: Duration) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index dcea79df..2fa94c25 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -11,6 +11,7 @@ import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftssolr.SolrFtsClient +import docspell.joex.analysis.RegexNerFile import docspell.joex.fts.{MigrationTask, ReIndexTask} import docspell.joex.hk._ import docspell.joex.notify._ @@ -89,7 +90,8 @@ object JoexAppImpl { upload <- OUpload(store, queue, cfg.files, joex) fts <- createFtsClient(cfg)(httpClient) itemOps <- OItem(store, fts, queue, joex) - analyser <- TextAnalyser.create[F](cfg.textAnalysis) + analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig) + regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) sch <- SchedulerBuilder(cfg.scheduler, blocker, store) @@ -97,14 +99,14 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler.newItem[F](cfg, itemOps, fts, analyser), + ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer), ItemHandler.onCancel[F] ) ) .withTask( JobTask.json( ReProcessItemArgs.taskName, - ReProcessItem[F](cfg, fts, analyser), + ReProcessItem[F](cfg, fts, analyser, regexNer), ReProcessItem.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala new file mode 100644 index 00000000..f7abe029 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala @@ -0,0 +1,99 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.split.TextSplitter +import docspell.common._ +import docspell.store.queries.QCollective + +import io.circe.generic.semiauto._ +import io.circe.{Decoder, Encoder} + +case class NerFile(collective: Ident, 
updated: Timestamp, creation: Timestamp) { + def nerFilePath(directory: Path): Path = + NerFile.nerFilePath(directory, collective) + + def jsonFilePath(directory: Path) = + NerFile.jsonFilePath(directory, collective) +} + +object NerFile { + implicit val jsonDecoder: Decoder[NerFile] = + deriveDecoder[NerFile] + + implicit val jsonEncoder: Encoder[NerFile] = + deriveEncoder[NerFile] + + private def nerFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.txt") + + private def jsonFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.json") + + def find[F[_]: Sync: ContextShift]( + collective: Ident, + directory: Path, + blocker: Blocker + ): F[Option[NerFile]] = { + val file = jsonFilePath(directory, collective) + File.existsNonEmpty[F](file).flatMap { + case true => + File + .readJson[F, NerFile](file, blocker) + .map(_.some) + case false => + (None: Option[NerFile]).pure[F] + } + } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter + .split(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .map(w => s"(?i)${w}") + val tokens = + TextSplitter + .splitToken(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .take(3) + .map(w => s"(?i)${w}") + + tokens.map(t => Pattern(t, 
weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala new file mode 100644 index 00000000..570fc659 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala @@ -0,0 +1,164 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.effect.concurrent.Semaphore +import cats.implicits._ + +import docspell.common._ +import docspell.common.syntax.all._ +import docspell.store.Store +import docspell.store.queries.QCollective +import docspell.store.records.REquipment +import docspell.store.records.ROrganization +import docspell.store.records.RPerson + +import io.circe.syntax._ +import org.log4s.getLogger + +/** Maintains a custom regex-ner file per collective for stanford's + * regexner annotator. + */ +trait RegexNerFile[F[_]] { + + def makeFile(collective: Ident): F[Option[Path]] + +} + +object RegexNerFile { + private[this] val logger = getLogger + + case class Config(enabled: Boolean, directory: Path, minTime: Duration) + + def apply[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F] + ): Resource[F, RegexNerFile[F]] = + for { + dir <- File.withTempDir[F](cfg.directory, "regexner-") + writer <- Resource.liftF(Semaphore(1)) + } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer) + + final private class Impl[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F], + writer: Semaphore[F] //TODO allow parallelism per collective + ) extends RegexNerFile[F] { + + def makeFile(collective: Ident): F[Option[Path]] = + if (cfg.enabled) doMakeFile(collective) + else (None: Option[Path]).pure[F] + + def doMakeFile(collective: Ident): F[Option[Path]] = + for { + now <- Timestamp.current[F] + existing <- NerFile.find[F](collective, cfg.directory, blocker) + result 
<- existing match { + case Some(nf) => + val dur = Duration.between(nf.creation, now) + if (dur > cfg.minTime) + logger.fdebug( + s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state." + ) *> updateFile( + collective, + now, + Some(nf) + ) + else nf.nerFilePath(cfg.directory).some.pure[F] + case None => + updateFile(collective, now, None) + } + } yield result + + private def updateFile( + collective: Ident, + now: Timestamp, + current: Option[NerFile] + ): F[Option[Path]] = + for { + lastUpdate <- store.transact(Sql.latestUpdate(collective)) + result <- lastUpdate match { + case None => + (None: Option[Path]).pure[F] + case Some(lup) => + current match { + case Some(cur) => + val nerf = + if (cur.updated == lup) + logger.fdebug(s"No state change detected.") *> updateTimestamp( + cur, + now + ) *> cur.pure[F] + else + logger.fdebug( + s"There have been state changes for collective '${collective.id}'. Reload NER file." + ) *> createFile(lup, collective, now) + nerf.map(_.nerFilePath(cfg.directory).some) + case None => + createFile(lup, collective, now) + .map(_.nerFilePath(cfg.directory).some) + } + } + } yield result + + private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] = + writer.withPermit(for { + file <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- File.mkDir(file.getParent) + _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2) + } yield ()) + + private def createFile( + lastUpdate: Timestamp, + collective: Ident, + now: Timestamp + ): F[NerFile] = { + def update(nf: NerFile, text: String): F[Unit] = + writer.withPermit(for { + jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'") + _ <- File.mkDir(jsonFile.getParent) + _ <- File.writeString(nf.nerFilePath(cfg.directory), text) + _ <- File.writeString(jsonFile, nf.asJson.spaces2) + } yield ()) + + for { + _ <- logger.finfo(s"Generating custom NER file for collective 
'${collective.id}'") + names <- store.transact(QCollective.allNames(collective)) + nerFile = NerFile(collective, lastUpdate, now) + _ <- update(nerFile, NerFile.mkNerConfig(names)) + } yield nerFile + } + } + + object Sql { + import doobie._ + import doobie.implicits._ + import docspell.store.impl.Implicits._ + import docspell.store.impl.Column + + def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = { + def max(col: Column, table: Fragment, cidCol: Column): Fragment = + selectSimple(col.max ++ fr"as t", table, cidCol.is(collective)) + + val sql = + List( + max( + ROrganization.Columns.updated, + ROrganization.table, + ROrganization.Columns.cid + ), + max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid), + max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid) + ) + .reduce(_ ++ fr"UNION ALL" ++ _) + + selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty) + .query[Timestamp] + .option + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 240e7f54..acbf810b 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -10,6 +10,7 @@ import docspell.backend.ops.OItem import docspell.common.{ItemState, ProcessItemArgs} import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task import docspell.store.queries.QItem import docspell.store.records.RItem @@ -31,11 +32,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] ): Task[F, Args, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) - .flatMap(safeProcess[F](cfg, itemOps, fts, analyser)) + .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, 
regexNer)) .map(_ => ()) def itemStateTask[F[_]: Sync, A]( @@ -54,11 +56,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({ + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({ case Right(d) => Task.pure(d) case Left(ex) => @@ -68,7 +71,7 @@ object ItemHandler { .andThen(_ => Sync[F].raiseError(ex)) }) case false => - ProcessItem[F](cfg, itemOps, fts, analyser)(data) + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data) .flatMap(itemStateTask(ItemState.Created)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index cd76e095..7b8b6431 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -7,6 +7,7 @@ import docspell.backend.ops.OItem import docspell.common.ProcessItemArgs import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task object ProcessItem { @@ -15,11 +16,12 @@ object ProcessItem { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ExtractArchive(item) .flatMap(Task.setProgress(20)) - .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80))) + .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80))) .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) @@ -27,15 +29,17 @@ object ProcessItem { def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: 
FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item) + processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item) def analysisOnly[F[_]: Sync]( cfg: Config, - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](analyser)(item) + TextAnalysis[F](analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) @@ -44,12 +48,13 @@ object ProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], progress: (Int, Int, Int) )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(progress._2)) - .flatMap(analysisOnly[F](cfg, analyser)) + .flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(Task.setProgress(progress._3)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 53282539..bf6d2467 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser import docspell.common._ import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.RAttachment @@ -21,10 +22,11 @@ object ReProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: 
RegexNerFile[F] ): Task[F, Args, Unit] = loadItem[F] - .flatMap(safeProcess[F](cfg, fts, analyser)) + .flatMap(safeProcess[F](cfg, fts, analyser, regexNer)) .map(_ => ()) def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] = @@ -73,6 +75,7 @@ object ReProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], data: ItemData ): Task[F, Args, ItemData] = { @@ -94,7 +97,7 @@ object ReProcessItem { getLanguage[F].flatMap { lang => ProcessItem - .processAttachments[F](cfg, fts, analyser)(data) + .processAttachments[F](cfg, fts, analyser, regexNer)(data) .contramap[Args](convertArgs(lang)) } } @@ -113,11 +116,12 @@ object ReProcessItem { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - processFiles[F](cfg, fts, analyser, data).attempt + processFiles[F](cfg, fts, analyser, regexNer, data).attempt .flatMap({ case Right(d) => Task.pure(d) @@ -127,7 +131,7 @@ object ReProcessItem { ).andThen(_ => Sync[F].raiseError(ex)) }) case false => - processFiles[F](cfg, fts, analyser, data) + processFiles[F](cfg, fts, analyser, regexNer, data) } private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 9ee3850c..abbb6870 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,24 +1,22 @@ package docspell.joex.process -import java.nio.file.Paths - import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings -import docspell.analysis.split.TextSplitter import docspell.common._ +import 
docspell.joex.analysis.RegexNerFile import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task -import docspell.store.queries.QCollective import docspell.store.records.RAttachmentMeta object TextAnalysis { def apply[F[_]: Sync]( - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { @@ -27,7 +25,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx, analyser) + annotateAttachment[F](ctx, analyser, nerFile) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -41,63 +39,19 @@ object TextAnalysis { def annotateAttachment[F[_]: Sync]( ctx: Context[F, ProcessItemArgs], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { val settings = StanfordSettings(ctx.args.meta.language, false, None) for { - names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) - temp <- File.mkTempFile(Paths.get("."), "textanalysis") - _ <- File.writeString(temp, mkNerConfig(names)) - sett = settings.copy(regexNer = Some(temp)) + customNer <- nerFile.makeFile(ctx.args.meta.collective) + sett = settings.copy(regexNer = customNer) labels <- analyser.annotate( ctx.logger, sett, ctx.args.meta.collective, rm.content.getOrElse("") ) - _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } - - def mkNerConfig(names: QCollective.Names): String = { - val orgs = names.org - .flatMap(Pattern(3)) - .distinct - .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) - - val pers = - names.pers - .flatMap(Pattern(2)) - .distinct - .map(_.toRow("PERSON", "LOCATION,MISC")) - - val equips = - names.equip - .flatMap(Pattern(1)) - .distinct - .map(_.toRow("MISC", "LOCATION")) - - (orgs ++ 
pers ++ equips).mkString("\n") - } - - case class Pattern(value: String, weight: Int) { - def toRow(tag: String, overrideTags: String): String = - s"$value\t$tag\t$overrideTags\t$weight" - } - - object Pattern { - def apply(weight: Int)(str: String): Vector[Pattern] = { - val delims = " \t\n\r".toSet - val words = - TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") - val tokens = - TextSplitter - .splitToken(str, delims) - .toVector - .take(3) - .map(w => s"(?i)${w.toLower.value}") - - tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) - } - } } diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 6e16581f..d550c2d3 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -91,6 +91,11 @@ let }; text-analysis = { max-length = 10000; + regex-ner = { + enabled = true; + file-cache-time = "1 minute"; + }; + working-dir = "/tmp/docspell-analysis"; }; processing = { max-due-date-years = 10; @@ -689,7 +694,48 @@ in { (a rough guess). ''; }; + working-dir = mkOption { + type = types.str; + default = defaults.text-analysis.working-dir; + description = '' + A working directory for the analyser to store temporary/working + files. + ''; + }; + regex-ner = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.text-analysis.regex-ner.enabled; + description = '' + Whether to enable custom NER annotation. This uses the address + book of a collective as input for NER tagging (to automatically + find correspondent and concerned entities). If the address book + is large, this can be quite memory intensive and also makes text + analysis slower. But it greatly improves accuracy. If this is + false, NER tagging uses only statistical models (that also work + quite well). + + This setting might be moved to the collective settings in the + future. 
+ ''; + }; + file-cache-time = mkOption { + type = types.str; + default = defaults.text-analysis.ner-file-cache-time; + description = '' + The NER annotation uses a file of patterns that is derived from + a collective's address book. This is is the time how long this + file will be kept until a check for a state change is done. + ''; + }; + }; + }); + default = defaults.text-analysis.regex-ner; + description = ""; + }; }; }); default = defaults.text-analysis;