Merge pull request #238 from eikek/stanford-nlp4

Stanford nlp4
mergify[bot] 2020-08-25 19:02:43 +00:00 committed by GitHub
commit 31544240fb
38 changed files with 1040 additions and 219 deletions

View File

@@ -1,3 +0,0 @@
-updates.ignore = [
-  { groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" }
-]

View File

@@ -10,6 +10,7 @@ cache:
 - $HOME/.ivy2/cache
 - $HOME/.sbt/boot
 - $HOME/.coursier/cache
+- $HOME/.cache/coursier
 - sysconfcpus
 install:

View File

@@ -1,6 +1,6 @@
 <img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>
-[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell)
+[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell)
 [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org)
 [![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
 [![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell)

View File

@@ -5,12 +5,19 @@ import cats.implicits._
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._

 trait TextAnalyser[F[_]] {
-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]
 }

 object TextAnalyser {
@@ -22,43 +29,47 @@ object TextAnalyser {
   }

   def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
-      def annotate(
-          logger: Logger[F],
-          lang: Language,
-          text: String
-      ): F[TextAnalyser.Result] =
-        for {
-          input <- textLimit(logger, text)
-          tags0 <- stanfordNer(lang, input)
-          tags1 <- contactNer(input)
-          dates <- dateNer(lang, input)
-          list  = tags0 ++ tags1
-          spans = NerLabelSpan.build(list)
-        } yield Result(spans ++ list, dates)
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
+          def annotate(
+              logger: Logger[F],
+              settings: StanfordSettings,
+              cacheKey: Ident,
+              text: String
+          ): F[TextAnalyser.Result] =
+            for {
+              input <- textLimit(logger, text)
+              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags1 <- contactNer(input)
+              dates <- dateNer(settings.lang, input)
+              list  = tags0 ++ tags1
+              spans = NerLabelSpan.build(list)
+            } yield Result(spans ++ list, dates)

-      private def textLimit(logger: Logger[F], text: String): F[String] =
-        if (text.length <= cfg.maxLength) text.pure[F]
-        else
-          logger.info(
-            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
-              s" Analysing only first ${cfg.maxLength} characters."
-          ) *> text.take(cfg.maxLength).pure[F]
+          private def textLimit(logger: Logger[F], text: String): F[String] =
+            if (text.length <= cfg.maxLength) text.pure[F]
+            else
+              logger.info(
+                s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+                  s" Analysing only first ${cfg.maxLength} characters."
+              ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          StanfordNerClassifier.nerAnnotate(lang)(text)
-        }
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)

-      private def contactNer(text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          Contact.annotate(text)
-        }
+          private def contactNer(text: String): F[Vector[NerLabel]] =
+            Sync[F].delay {
+              Contact.annotate(text)
+            }

-      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
-        Sync[F].delay {
-          DateFind.findDates(text, lang).toVector
-        }
-    })
+          private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+            Sync[F].delay {
+              DateFind.findDates(text, lang).toVector
+            }
+        }
+      )
 }

View File

@@ -54,6 +54,7 @@ object DateFind {
     val p = lang match {
       case Language.English => p2.or(p0).or(p1)
       case Language.German  => p1.or(p0).or(p2)
+      case Language.French  => p1.or(p0).or(p2)
     }
     p.read(parts).toOption
   }

View File

@@ -0,0 +1,25 @@
package docspell.analysis.nlp

import docspell.common.{NerLabel, NerTag}

import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}

object LabelConverter {

  private def tagFromLabel[A <: CoreAnnotation[String]](
      label: CoreLabel,
      annot: Class[A]
  ): Option[NerTag] = {
    val tag = label.get(annot)
    Option(tag).flatMap(s => NerTag.fromString(s).toOption)
  }

  def findTag(label: CoreLabel): Option[NerTag] =
    tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
      .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))

  def toNerLabel(label: CoreLabel): Option[NerLabel] =
    findTag(label).map(t =>
      NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
    )
}
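
As an illustration (not part of the commit), a minimal sketch of what this converter does. The `"PERSON"` tag string is an assumption about what `NerTag.fromString` accepts; the word and offsets are invented:

```scala
import docspell.analysis.nlp.LabelConverter
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}

// Build a CoreLabel by hand and convert it. Assumes NerTag.fromString
// recognizes Stanford's "PERSON" tag string.
val label = new CoreLabel()
label.setWord("Jeter")
label.setBeginPosition(6)
label.setEndPosition(11)
label.set(classOf[CoreAnnotations.NamedEntityTagAnnotation], "PERSON")

// Some(NerLabel("Jeter", NerTag.Person, 6, 11)) if the tag parses
println(LabelConverter.toNerLabel(label))
```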

View File

@@ -0,0 +1,90 @@
package docspell.analysis.nlp

import cats.Applicative
import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._

import docspell.common._

import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger

/** Creating the StanfordCoreNLP pipeline is quite expensive as it
  * involves IO and initializing large objects.
  *
  * Therefore the instances are cached; sharing them is safe because
  * they are thread-safe.
  *
  * **This is an internal API**
  */
trait PipelineCache[F[_]] {

  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]

}

object PipelineCache {
  private[this] val logger = getLogger

  def none[F[_]: Applicative]: PipelineCache[F] =
    new PipelineCache[F] {
      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
        makeClassifier(settings).pure[F]
    }

  def apply[F[_]: Sync](): F[PipelineCache[F]] =
    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))

  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
      extends PipelineCache[F] {

    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
      for {
        id  <- makeSettingsId(settings)
        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
      } yield nlp

    private def getOrCreate(
        key: String,
        id: String,
        cache: Map[String, Entry],
        settings: StanfordSettings
    ): (Map[String, Entry], StanfordCoreNLP) =
      cache.get(key) match {
        case Some(entry) =>
          if (entry.id == id) (cache, entry.value)
          else {
            logger.info(
              s"StanfordNLP settings changed for key $key. Creating new classifier"
            )
            val nlp = makeClassifier(settings)
            val e   = Entry(id, nlp)
            (cache.updated(key, e), nlp)
          }
        case None =>
          val nlp = makeClassifier(settings)
          val e   = Entry(id, nlp)
          (cache.updated(key, e), nlp)
      }

    private def makeSettingsId(settings: StanfordSettings): F[String] = {
      val base = settings.copy(regexNer = None).toString
      val size: F[Long] =
        settings.regexNer match {
          case Some(p) =>
            File.size(p)
          case None =>
            0L.pure[F]
        }
      size.map(len => s"$base-$len")
    }
  }

  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
    new StanfordCoreNLP(Properties.forSettings(settings))
  }

  private case class Entry(id: String, value: StanfordCoreNLP)
}
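
To illustrate the contract of this cache, a minimal sketch (not from the commit) using cats-effect `IO`, assuming the Stanford model jars are on the classpath: a second `obtain` with unchanged settings returns the cached pipeline, while changed settings under the same key would rebuild it.

```scala
import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

val settings = StanfordSettings(Language.English, highRecall = false, regexNer = None)

val demo: IO[Boolean] =
  for {
    cache <- PipelineCache[IO]()
    nlp1  <- cache.obtain("collective-1", settings) // creates the pipeline
    nlp2  <- cache.obtain("collective-1", settings) // served from the cache
  } yield nlp1 eq nlp2 // true: the same StanfordCoreNLP instance
```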

View File

@@ -0,0 +1,111 @@
package docspell.analysis.nlp

import java.util.{Properties => JProps}

import docspell.analysis.nlp.Properties.Implicits._
import docspell.common._

object Properties {

  def apply(ps: (String, String)*): JProps = {
    val p = new JProps()
    for ((k, v) <- ps)
      p.setProperty(k, v)
    p
  }

  def forSettings(settings: StanfordSettings): JProps = {
    val regexNerFile = settings.regexNer
      .map(p => p.normalize().toAbsolutePath().toString())
    settings.lang match {
      case Language.German =>
        Properties.nerGerman(regexNerFile, settings.highRecall)
      case Language.English =>
        Properties.nerEnglish(regexNerFile)
      case Language.French =>
        Properties.nerFrench(regexNerFile, settings.highRecall)
    }
  }

  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "de",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
      "ner.useSUTime" -> "false", //only english, unused in docspell
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
      "tokenize.language" -> "en",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "en",
      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile)

  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "fr",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
      "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  object Implicits {
    implicit final class JPropsOps(val p: JProps) extends AnyVal {

      def set(name: String, value: Option[String]): JProps =
        value match {
          case Some(v) =>
            p.setProperty(name, v)
            p
          case None =>
            p
        }

      def change(name: String, f: String => String): JProps =
        Option(p.getProperty(name)) match {
          case Some(current) =>
            p.setProperty(name, f(current))
            p
          case None =>
            p
        }

      def withRegexNer(mappingFile: Option[String]): JProps =
        set("regexner.mapping", mappingFile)
          .change(
            "annotators",
            v => if (mappingFile.isDefined) v + ",regexner" else v
          )

      def withHighRecall(flag: Boolean): JProps = {
        if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
        else p.setProperty("ner.combinationMode", "NORMAL")
        p
      }
    }
  }
}
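
A small sketch of how these builders compose (not from the commit): `forSettings` picks the per-language properties and the implicit ops adjust them.

```scala
import docspell.analysis.nlp.{Properties, StanfordSettings}
import docspell.common.Language

// German pipeline with high-recall combination mode and no regexner file
val props = Properties.forSettings(
  StanfordSettings(Language.German, highRecall = true, regexNer = None)
)
assert(props.getProperty("ner.combinationMode") == "HIGH_RECALL")
// With a mapping file, withRegexNer would set "regexner.mapping" and
// append ",regexner" to the "annotators" list.
```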

View File

@@ -1,65 +1,39 @@
 package docspell.analysis.nlp

-import java.net.URL
-import java.util.zip.GZIPInputStream
-
 import scala.jdk.CollectionConverters._
-import scala.util.Using
+
+import cats.Applicative
+import cats.implicits._

 import docspell.common._

-import edu.stanford.nlp.ie.AbstractSequenceClassifier
-import edu.stanford.nlp.ie.crf.CRFClassifier
-import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
-import org.log4s.getLogger
+import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}

 object StanfordNerClassifier {
-  private[this] val logger = getLogger

-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, it involves loading large model
+    * files. The classifiers are thread-safe and so they are cached.
+    * The `cacheKey` defines the "slot" where classifiers are stored
+    * and retrieved. If for a given `cacheKey` the `settings` change,
+    * a new classifier must be created. It will then replace the
+    * previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))

-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-    }
-    nerClassifier
-      .classify(text)
-      .asScala
-      .flatMap(a => a.asScala)
-      .collect(Function.unlift { label =>
-        val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
-        NerTag
-          .fromString(Option(tag).getOrElse(""))
-          .toOption
-          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
-      })
-      .toVector
-  }
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
+    val doc = new CoreDocument(text)
+    nerClassifier.annotate(doc)
+    doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
+  }

-  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    val ner = classifierResource(lang)
-    Using(new GZIPInputStream(ner.openStream())) { in =>
-      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
-    }.fold(throw _, identity)
-  }
-
-  private def classifierResource(lang: Language): URL = {
-    def check(u: URL): URL =
-      if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
-      else u
-    check(lang match {
-      case Language.German =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
-        )
-      case Language.English =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
-        )
-    })
-  }
 }
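
A usage sketch of the new entry point (not from the commit; assumes the English models are on the classpath). `PipelineCache.none` builds a fresh pipeline instead of caching, which keeps the example self-contained:

```scala
import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordNerClassifier, StanfordSettings}
import docspell.common.{Language, NerLabel}

val labels: IO[Vector[NerLabel]] =
  StanfordNerClassifier.nerAnnotate[IO](
    "some-cache-key",
    PipelineCache.none[IO]
  )(
    StanfordSettings(Language.English, highRecall = false, regexNer = None),
    "Derek Jeter lives in Treesville."
  )
```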

View File

@@ -0,0 +1,22 @@
package docspell.analysis.nlp

import java.nio.file.Path

import docspell.common._

/** Settings for configuring the stanford NER pipeline.
  *
  * The language is mandatory; only the provided ones are supported.
  * The `highRecall` option only applies to non-English languages:
  * there the English classifier runs as a second classifier, and if
  * `highRecall` is true, it is also used to tag tokens the first
  * classifier left untagged. This may produce many false positives,
  * but since English terms are ubiquitous in other languages, whether
  * it is useful depends on the use case.
  *
  * The `regexNer` option allows specifying a text file as described
  * here: https://nlp.stanford.edu/software/regexner.html. It is used
  * as a last step to tag remaining untagged tokens using the given
  * list of regexps.
  */
case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])

View File

@@ -3,31 +3,44 @@ package docspell.analysis.nlp
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import edu.stanford.nlp.pipeline.StanfordCoreNLP

 object TextAnalyserSuite extends SimpleTestSuite {
+  lazy val germanClassifier =
+    new StanfordCoreNLP(Properties.nerGerman(None, false))
+  lazy val englishClassifier =
+    new StanfordCoreNLP(Properties.nerEnglish(None))

   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
+      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
-      NerLabel("Treesville", NerTag.Person, 27, 37),
+      NerLabel("Elm", NerTag.Misc, 17, 20),
+      NerLabel("Ave.", NerTag.Misc, 21, 25),
+      NerLabel("Treesville", NerTag.Misc, 27, 37),
       NerLabel("Derek", NerTag.Person, 68, 73),
       NerLabel("Jeter", NerTag.Person, 74, 79),
-      NerLabel("Treesville", NerTag.Location, 95, 105),
+      NerLabel("Elm", NerTag.Misc, 85, 88),
+      NerLabel("Ave.", NerTag.Misc, 89, 93),
+      NerLabel("Treesville", NerTag.Person, 95, 105),
+      NerLabel("Leaf", NerTag.Organization, 144, 148),
+      NerLabel("Chief", NerTag.Organization, 150, 155),
+      NerLabel("of", NerTag.Organization, 156, 158),
       NerLabel("Syrup", NerTag.Organization, 159, 164),
       NerLabel("Production", NerTag.Organization, 165, 175),
       NerLabel("Old", NerTag.Organization, 176, 179),
       NerLabel("Sticky", NerTag.Organization, 180, 186),
       NerLabel("Pancake", NerTag.Organization, 187, 194),
       NerLabel("Company", NerTag.Organization, 195, 202),
-      NerLabel("Maple", NerTag.Location, 207, 212),
-      NerLabel("Lane", NerTag.Location, 213, 217),
-      NerLabel("Forest", NerTag.Location, 219, 225),
+      NerLabel("Maple", NerTag.Organization, 207, 212),
+      NerLabel("Lane", NerTag.Organization, 213, 217),
+      NerLabel("Forest", NerTag.Organization, 219, 225),
       NerLabel("Hemptown", NerTag.Location, 239, 247),
-      NerLabel("Little", NerTag.Organization, 347, 353),
-      NerLabel("League", NerTag.Organization, 354, 360),
+      NerLabel("Leaf", NerTag.Person, 276, 280),
+      NerLabel("Little", NerTag.Misc, 347, 353),
+      NerLabel("League", NerTag.Misc, 354, 360),
       NerLabel("Derek", NerTag.Person, 1117, 1122),
       NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )
@@ -36,11 +49,11 @@ object TextAnalyserSuite extends SimpleTestSuite {
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
+      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
-      NerLabel("Lilienweg", NerTag.Location, 16, 25),
+      NerLabel("Lilienweg", NerTag.Person, 16, 25),
       NerLabel("Max", NerTag.Person, 77, 80),
       NerLabel("Mustermann", NerTag.Person, 81, 91),
       NerLabel("Lilienweg", NerTag.Location, 93, 102),

View File

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {

   def hours: Long = minutes / 60

+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@@ -62,6 +68,9 @@ object Duration {
   def nanos(n: Long): Duration =
     Duration(n)

+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]
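
A tiny sketch of the new helpers (illustrative values, not from the commit):

```scala
import docspell.common.Duration

val shorter = Duration.nanos(1000L)
val longer  = Duration.nanos(5000L)
assert(longer > shorter && shorter < longer)

// Duration.between measures the span between two timestamps, e.g.
// Duration.between(createdAt, Timestamp.now) in the cache-expiry check below.
```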

View File

@@ -1,6 +1,7 @@
 package docspell.common

 import java.io.IOException
+import java.nio.charset.StandardCharsets
 import java.nio.file._
 import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.atomic.AtomicInteger
@@ -11,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
+
+import io.circe.Decoder
+
 object File {

   def mkDir[F[_]: Sync](dir: Path): F[Path] =
@@ -55,6 +60,9 @@ object File {
   def exists[F[_]: Sync](file: Path): F[Boolean] =
     Sync[F].delay(Files.exists(file))

+  def size[F[_]: Sync](file: Path): F[Long] =
+    Sync[F].delay(Files.size(file))
+
   def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
     Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
@@ -84,4 +92,13 @@ object File {

   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
     readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
+
+  def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
+    Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
 }
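
A sketch of the new helpers round-tripping a small JSON file (not from the commit; cats-effect 2 style with a `Blocker`, the path is illustrative):

```scala
import java.nio.file.Paths

import cats.effect.{Blocker, ContextShift, IO}
import docspell.common.File

implicit val cs: ContextShift[IO] =
  IO.contextShift(scala.concurrent.ExecutionContext.global)

val name: IO[String] =
  Blocker[IO].use { blocker =>
    val file = Paths.get("/tmp/demo.json") // illustrative path
    for {
      _ <- File.writeString[IO](file, """{"name":"demo"}""")
      m <- File.readJson[IO, Map[String, String]](file, blocker)
    } yield m("name")
  }
```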

View File

@@ -27,7 +27,12 @@ object Language {
     val iso3 = "eng"
   }

-  val all: List[Language] = List(German, English)
+  case object French extends Language {
+    val iso2 = "fr"
+    val iso3 = "fra"
+  }
+
+  val all: List[Language] = List(German, English, French)

   def fromString(str: String): Either[String, Language] = {
     val lang = str.toLowerCase

View File

@@ -23,6 +23,7 @@ object Field {
   val content    = Field("content")
   val content_de = Field("content_de")
   val content_en = Field("content_en")
+  val content_fr = Field("content_fr")
   val itemName   = Field("itemName")
   val itemNotes  = Field("itemNotes")
   val folderId   = Field("folder")
@@ -33,6 +34,8 @@ object Field {
       Field.content_de
     case Language.English =>
       Field.content_en
+    case Language.French =>
+      Field.content_fr
   }

   implicit val jsonEncoder: Encoder[Field] =

View File

@@ -39,6 +39,7 @@ object SolrQuery {
       Field.content,
       Field.content_de,
       Field.content_en,
+      Field.content_fr,
       Field.itemName,
       Field.itemNotes,
       Field.attachmentName

View File

@@ -80,6 +80,8 @@ object SolrSetup {
           addTextField(l.some)(Field.content_de)
         case l @ Language.English =>
           addTextField(l.some)(Field.content_en)
+        case l @ Language.French =>
+          addTextField(l.some)(Field.content_fr)
       }

     cmds0 *> cmds1 *> cntLang *> ().pure[F]
@@ -105,6 +107,9 @@ object SolrSetup {
         case Some(Language.English) =>
           run(DeleteField.command(DeleteField(field))).attempt *>
             run(AddField.command(AddField.textEN(field)))
+        case Some(Language.French) =>
+          run(DeleteField.command(DeleteField(field))).attempt *>
+            run(AddField.command(AddField.textFR(field)))
       }
     }
   }
@@ -138,6 +143,9 @@ object SolrSetup {
     def textEN(field: Field): AddField =
       AddField(field, "text_en", true, true, false)

+    def textFR(field: Field): AddField =
+      AddField(field, "text_fr", true, true, false)
   }

   case class DeleteField(name: Field)

View File

@@ -248,6 +248,29 @@ docspell.joex {
       # should suffice. Default is 10000, which are about 2-3 pages
       # (just a rough guess, of course).
       max-length = 10000
+
+      # A working directory for the analyser to store temporary/working
+      # files.
+      working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the address
+        # book of a collective as input for NER tagging (to automatically
+        # find correspondent and concerned entities). If the address book
+        # is large, this can be quite memory intensive and also makes text
+        # analysis slower. But it greatly improves accuracy. If this is
+        # false, NER tagging uses only statistical models (that also work
+        # quite well).
+        #
+        # This setting might be moved to the collective settings in the
+        # future.
+        enabled = true
+
+        # The NER annotation uses a file of patterns that is derived from
+        # a collective's address book. This is the time for how long this
+        # file is kept until a check for a state change is done.
+        file-cache-time = "1 minute"
+      }
     }

     # Configuration for converting files into PDFs.

View File

@@ -1,11 +1,14 @@
 package docspell.joex

+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,
@@ -50,4 +53,19 @@ object Config {
   }

   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }

View File

@@ -6,10 +6,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.concurrent.SignallingRef

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
@@ -80,14 +82,16 @@ object JoexAppImpl {
     for {
       httpClient <- BlazeClientBuilder[F](clientEC).resource
       client = JoexClient(httpClient)
       store   <- Store.create(cfg.jdbc, connectEC, blocker)
       queue   <- JobQueue(store)
       pstore  <- PeriodicTaskStore.create(store)
       nodeOps <- ONode(store)
       joex    <- OJoex(client, store)
       upload  <- OUpload(store, queue, cfg.files, joex)
       fts     <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -95,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )

View File

@@ -0,0 +1,99 @@
package docspell.joex.analysis

import java.nio.file.Path

import cats.effect._
import cats.implicits._

import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.store.queries.QCollective

import io.circe.generic.semiauto._
import io.circe.{Decoder, Encoder}

case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {

  def nerFilePath(directory: Path): Path =
    NerFile.nerFilePath(directory, collective)

  def jsonFilePath(directory: Path) =
    NerFile.jsonFilePath(directory, collective)
}

object NerFile {
  implicit val jsonDecoder: Decoder[NerFile] =
    deriveDecoder[NerFile]

  implicit val jsonEncoder: Encoder[NerFile] =
    deriveEncoder[NerFile]

  private def nerFilePath(directory: Path, collective: Ident): Path =
    directory.resolve(s"${collective.id}.txt")

  private def jsonFilePath(directory: Path, collective: Ident): Path =
    directory.resolve(s"${collective.id}.json")

  def find[F[_]: Sync: ContextShift](
      collective: Ident,
      directory: Path,
      blocker: Blocker
  ): F[Option[NerFile]] = {
    val file = jsonFilePath(directory, collective)
    File.existsNonEmpty[F](file).flatMap {
      case true =>
        File
          .readJson[F, NerFile](file, blocker)
          .map(_.some)
      case false =>
        (None: Option[NerFile]).pure[F]
    }
  }

  def mkNerConfig(names: QCollective.Names): String = {
    val orgs = names.org
      .flatMap(Pattern(3))
      .distinct
      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))

    val pers =
      names.pers
        .flatMap(Pattern(2))
        .distinct
        .map(_.toRow("PERSON", "LOCATION,MISC"))

    val equips =
      names.equip
        .flatMap(Pattern(1))
        .distinct
        .map(_.toRow("MISC", "LOCATION"))

    (orgs ++ pers ++ equips).mkString("\n")
  }

  case class Pattern(value: String, weight: Int) {
    def toRow(tag: String, overrideTags: String): String =
      s"$value\t$tag\t$overrideTags\t$weight"
  }

  object Pattern {
    def apply(weight: Int)(str: String): Vector[Pattern] = {
      val delims = " \t\n\r".toSet
      val words =
        TextSplitter
          .split(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
          .toVector
          .map(w => s"(?i)${w}")
      val tokens =
        TextSplitter
          .splitToken(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
          .toVector
          .take(3)
          .map(w => s"(?i)${w}")

      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
}
}
}
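
To make the generated file format concrete, a sketch with invented names: each row is `pattern<TAB>tag<TAB>tags-to-override<TAB>weight`, which is the format Stanford's regexner annotator expects.

```scala
import docspell.joex.analysis.NerFile
import docspell.store.queries.QCollective

val names = QCollective.Names(
  org = Vector("Acme AG"),
  pers = Vector("Max Mustermann"),
  equip = Vector.empty
)

// Yields case-insensitive rows like:
//   (?i)acme (?i)ag<TAB>ORGANIZATION<TAB>LOCATION,PERSON,MISC<TAB>3
println(NerFile.mkNerConfig(names))
```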

View File

@@ -0,0 +1,164 @@
package docspell.joex.analysis

import java.nio.file.Path

import cats.effect._
import cats.effect.concurrent.Semaphore
import cats.implicits._

import docspell.common._
import docspell.common.syntax.all._
import docspell.store.Store
import docspell.store.queries.QCollective
import docspell.store.records.REquipment
import docspell.store.records.ROrganization
import docspell.store.records.RPerson

import io.circe.syntax._
import org.log4s.getLogger

/** Maintains a custom regex-ner file per collective for stanford's
  * regexner annotator.
  */
trait RegexNerFile[F[_]] {

  def makeFile(collective: Ident): F[Option[Path]]

}

object RegexNerFile {
  private[this] val logger = getLogger

  case class Config(enabled: Boolean, directory: Path, minTime: Duration)

  def apply[F[_]: Concurrent: ContextShift](
      cfg: Config,
      blocker: Blocker,
      store: Store[F]
  ): Resource[F, RegexNerFile[F]] =
    for {
      dir    <- File.withTempDir[F](cfg.directory, "regexner-")
      writer <- Resource.liftF(Semaphore(1))
    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)

  final private class Impl[F[_]: Concurrent: ContextShift](
      cfg: Config,
      blocker: Blocker,
      store: Store[F],
      writer: Semaphore[F] //TODO allow parallelism per collective
  ) extends RegexNerFile[F] {

    def makeFile(collective: Ident): F[Option[Path]] =
      if (cfg.enabled) doMakeFile(collective)
      else (None: Option[Path]).pure[F]

    def doMakeFile(collective: Ident): F[Option[Path]] =
      for {
        now      <- Timestamp.current[F]
        existing <- NerFile.find[F](collective, cfg.directory, blocker)
        result <- existing match {
          case Some(nf) =>
            val dur = Duration.between(nf.creation, now)
            if (dur > cfg.minTime)
              logger.fdebug(
                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
              ) *> updateFile(
                collective,
                now,
                Some(nf)
              )
            else nf.nerFilePath(cfg.directory).some.pure[F]
          case None =>
            updateFile(collective, now, None)
        }
      } yield result

    private def updateFile(
        collective: Ident,
        now: Timestamp,
        current: Option[NerFile]
    ): F[Option[Path]] =
      for {
        lastUpdate <- store.transact(Sql.latestUpdate(collective))
        result <- lastUpdate match {
          case None =>
            (None: Option[Path]).pure[F]
          case Some(lup) =>
            current match {
              case Some(cur) =>
                val nerf =
                  if (cur.updated == lup)
                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
                      cur,
                      now
                    ) *> cur.pure[F]
                  else
                    logger.fdebug(
                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
                    ) *> createFile(lup, collective, now)
                nerf.map(_.nerFilePath(cfg.directory).some)
              case None =>
                createFile(lup, collective, now)
                  .map(_.nerFilePath(cfg.directory).some)
            }
        }
      } yield result

    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
      writer.withPermit(for {
        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
        _    <- File.mkDir(file.getParent)
        _    <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
      } yield ())

    private def createFile(
        lastUpdate: Timestamp,
        collective: Ident,
        now: Timestamp
    ): F[NerFile] = {
      def update(nf: NerFile, text: String): F[Unit] =
        writer.withPermit(for {
          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
          _ <- File.mkDir(jsonFile.getParent)
          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
        } yield ())

      for {
        _     <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
        names <- store.transact(QCollective.allNames(collective))
        nerFile = NerFile(collective, lastUpdate, now)
        _ <- update(nerFile, NerFile.mkNerConfig(names))
      } yield nerFile
    }
  }

  object Sql {
    import doobie._
    import doobie.implicits._
    import docspell.store.impl.Implicits._
    import docspell.store.impl.Column

    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))

      val sql =
        List(
          max(
            ROrganization.Columns.updated,
            ROrganization.table,
            ROrganization.Columns.cid
          ),
          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
        )
          .reduce(_ ++ fr"UNION ALL" ++ _)

      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
        .query[Timestamp]
        .option
    }
  }
}

View File

@@ -5,10 +5,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem
@@ -29,11 +31,13 @@ object ItemHandler {
   def newItem[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())

   def itemStateTask[F[_]: Sync, A](
@@ -51,11 +55,13 @@ object ItemHandler {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>
@@ -65,7 +71,8 @@ object ItemHandler {
               .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
+          .flatMap(itemStateTask(ItemState.Created))
     }

   private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =

View File

@@ -2,10 +2,12 @@ package docspell.joex.process

 import cats.effect._

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task

 object ProcessItem {
@@ -13,25 +15,31 @@ object ProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
      .flatMap(SetGivenData[F](itemOps))
       .flatMap(Task.setProgress(99))

   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

   def analysisOnly[F[_]: Sync](
-      cfg: Config
+      cfg: Config,
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg.textAnalysis)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -39,12 +47,14 @@ object ProcessItem {
   private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }

View File

@@ -4,9 +4,11 @@ import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

+import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment
@@ -19,10 +21,12 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())

   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -70,6 +74,8 @@ object ReProcessItem {
   def processFiles[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -91,7 +97,7 @@ object ReProcessItem {
     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -109,11 +115,13 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        processFiles[F](cfg, fts, data).attempt
+        processFiles[F](cfg, fts, analyser, regexNer, data).attempt
           .flatMap({
           case Right(d) =>
             Task.pure(d)
@@ -123,7 +131,7 @@ object ReProcessItem {
           ).andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        processFiles[F](cfg, fts, data)
+        processFiles[F](cfg, fts, analyser, regexNer, data)
     }

   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@@ -1,47 +1,57 @@
 package docspell.joex.process

-import cats.effect.Sync
+import cats.effect._
 import cats.implicits._

-import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
+import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta

 object TextAnalysis {

   def apply[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      TextAnalyser.create[F](cfg).use { analyser =>
-        for {
-          _ <- ctx.logger.info("Starting text analysis")
-          s <- Duration.stopTime[F]
-          t <-
-            item.metas.toList
-              .traverse(
-                annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
-              )
-          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-          _ <- t.traverse(m =>
-            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-          )
-          e <- s
-          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-          v = t.toVector
-        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
-      }
+      for {
+        _ <- ctx.logger.info("Starting text analysis")
+        s <- Duration.stopTime[F]
+        t <-
+          item.metas.toList
+            .traverse(
+              annotateAttachment[F](ctx, analyser, nerFile)
+            )
+        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
+        _ <- t.traverse(m =>
+          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+        )
+        e <- s
+        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+        v = t.toVector
+      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
     }

   def annotateAttachment[F[_]: Sync](
-      lang: Language,
-      logger: Logger[F],
-      analyser: TextAnalyser[F]
-  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
+      ctx: Context[F, ProcessItemArgs],
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
+  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
+    val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
+      labels <- analyser.annotate(
+        ctx.logger,
+        sett,
+        ctx.args.meta.collective,
+        rm.content.getOrElse("")
+      )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
+  }
 }

View File

@@ -341,6 +341,7 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
+        now,
         now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
@@ -353,6 +354,7 @@ trait Conversions {
     def contacts(oid: Ident) =
       v.contacts.traverse(c => newContact(c, oid.some, None))
     for {
+      now  <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = ROrganization(
         v.id,
@@ -363,7 +365,8 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
   }
@@ -398,6 +401,7 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
+        now,
         now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
@@ -410,6 +414,7 @@ trait Conversions {
     def contacts(pid: Ident) =
       v.contacts.traverse(c => newContact(c, None, pid.some))
     for {
+      now  <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = RPerson(
         v.id,
@@ -421,7 +426,8 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
   }
@@ -536,11 +542,11 @@ trait Conversions {
   def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
     timeId.map({
       case (id, now) =>
-        REquipment(id, cid, e.name, now)
+        REquipment(id, cid, e.name, now, now)
     })

-  def changeEquipment(e: Equipment, cid: Ident): REquipment =
-    REquipment(e.id, cid, e.name, e.created)
+  def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
+    Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now))

   // idref

View File

@@ -39,10 +39,10 @@ object EquipmentRoutes {
       case req @ PUT -> Root =>
         for {
           data <- req.as[Equipment]
-          equip = changeEquipment(data, user.account.collective)
+          equip <- changeEquipment(data, user.account.collective)
           res  <- backend.equipment.update(equip)
           resp <- Ok(basicResult(res, "Equipment updated."))
         } yield resp

       case DELETE -> Root / Ident(id) =>

View File

@@ -0,0 +1,29 @@
-- organization
ALTER TABLE `organization`
ADD COLUMN (`updated` timestamp);
UPDATE `organization` SET `updated` = `created`;
ALTER TABLE `organization`
MODIFY `updated` timestamp NOT NULL;
-- person
ALTER TABLE `person`
MODIFY `created` timestamp;
ALTER TABLE `person`
ADD COLUMN (`updated` timestamp);
UPDATE `person` SET `updated` = `created`;
ALTER TABLE `person`
MODIFY `updated` timestamp NOT NULL;
-- equipment
ALTER TABLE `equipment`
ADD COLUMN (`updated` timestamp);
UPDATE `equipment` SET `updated` = `created`;
ALTER TABLE `equipment`
MODIFY `updated` timestamp NOT NULL;

View File

@@ -0,0 +1,29 @@
-- organization
ALTER TABLE "organization"
ADD COLUMN "updated" timestamp;
UPDATE "organization" SET "updated" = "created";
ALTER TABLE "organization"
ALTER COLUMN "updated" SET NOT NULL;
-- person
ALTER TABLE "person" ALTER COLUMN "created"
TYPE timestamp USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp);
ALTER TABLE "person"
ADD COLUMN "updated" timestamp;
UPDATE "person" SET "updated" = "created";
ALTER TABLE "person"
ALTER COLUMN "updated" SET NOT NULL;
-- equipment
ALTER TABLE "equipment"
ADD COLUMN "updated" timestamp;
UPDATE "equipment" SET "updated" = "created";
ALTER TABLE "equipment"
ALTER COLUMN "updated" SET NOT NULL;

View File

@@ -1,5 +1,6 @@
 package docspell.store.queries

+import cats.data.OptionT
 import fs2.Stream

 import docspell.common.ContactKind
@@ -11,6 +12,20 @@ import doobie._
 import doobie.implicits._

 object QCollective {

+  case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
+  object Names {
+    val empty = Names(Vector.empty, Vector.empty, Vector.empty)
+  }
+
+  def allNames(collective: Ident): ConnectionIO[Names] =
+    (for {
+      orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
+      pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
+      equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
+    } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
+      .getOrElse(Names.empty)
+
   case class TagCount(tag: RTag, count: Int)

   case class InsightData(
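
A short sketch of how the new query is meant to be run (via the app's `Store`, as `RegexNerFile` does; not from the commit):

```scala
import docspell.common.Ident
import docspell.store.queries.QCollective
import doobie.ConnectionIO

def namesOf(collective: Ident): ConnectionIO[QCollective.Names] =
  QCollective.allNames(collective)
// e.g. store.transact(namesOf(collective))
```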

View File

@@ -7,7 +7,13 @@ import docspell.store.impl._
 import doobie._
 import doobie.implicits._

-case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {}
+case class REquipment(
+    eid: Ident,
+    cid: Ident,
+    name: String,
+    created: Timestamp,
+    updated: Timestamp
+) {}

 object REquipment {
@@ -18,25 +24,32 @@ object REquipment {
     val cid     = Column("cid")
     val name    = Column("name")
     val created = Column("created")
-    val all = List(eid, cid, name, created)
+    val updated = Column("updated")
+    val all     = List(eid, cid, name, created, updated)
   }

   import Columns._

   def insert(v: REquipment): ConnectionIO[Int] = {
-    val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}")
+    val sql =
+      insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}")
     sql.update.run
   }

   def update(v: REquipment): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(eid.is(v.eid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(eid.is(v.eid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = {

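The same created/updated scheme recurs in the records below. A hedged sketch of how a caller would insert a row under it, setting created = updated = now (matching the migrations above, which backfill updated from created); the idents are placeholders for illustration:

import doobie._
import doobie.implicits._
import docspell.common.{Ident, Timestamp}
import docspell.store.records.REquipment

// On insert both timestamps start out equal; update() later bumps
// only `updated`.
def insertNow(eid: Ident, cid: Ident, name: String): ConnectionIO[Int] =
  for {
    now <- Timestamp.current[ConnectionIO]
    n   <- REquipment.insert(REquipment(eid, cid, name, now, now))
  } yield n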
View File

@@ -19,7 +19,8 @@ case class ROrganization(
   city: String,
   country: String,
   notes: Option[String],
-  created: Timestamp
+  created: Timestamp,
+  updated: Timestamp
 ) {}

 object ROrganization {
@@ -38,7 +39,8 @@ object ROrganization {
     val country = Column("country")
     val notes = Column("notes")
     val created = Column("created")
-    val all = List(oid, cid, name, street, zip, city, country, notes, created)
+    val updated = Column("updated")
+    val all = List(oid, cid, name, street, zip, city, country, notes, created, updated)
   }
   import Columns._
@@ -47,26 +49,31 @@ object ROrganization {
     val sql = insertRow(
       table,
       all,
-      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}"
+      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: ROrganization): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(oid.is(v.oid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(oid.is(v.oid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] =

View File

@@ -20,7 +20,8 @@ case class RPerson(
   country: String,
   notes: Option[String],
   concerning: Boolean,
-  created: Timestamp
+  created: Timestamp,
+  updated: Timestamp
 ) {}

 object RPerson {
@@ -40,7 +41,20 @@ object RPerson {
     val notes = Column("notes")
     val concerning = Column("concerning")
     val created = Column("created")
-    val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created)
+    val updated = Column("updated")
+    val all = List(
+      pid,
+      cid,
+      name,
+      street,
+      zip,
+      city,
+      country,
+      notes,
+      concerning,
+      created,
+      updated
+    )
   }
   import Columns._
@@ -49,27 +63,32 @@ object RPerson {
     val sql = insertRow(
       table,
       all,
-      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}"
+      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: RPerson): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(pid.is(v.pid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        concerning.setTo(v.concerning),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(pid.is(v.pid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          concerning.setTo(v.concerning),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n   <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] =

View File

@@ -10,6 +10,7 @@ module Data.Language exposing
 type Language
     = German
     | English
+    | French

 fromString : String -> Maybe Language
@@ -20,6 +21,9 @@ fromString str =
     else if str == "eng" || str == "en" || str == "english" then
         Just English

+    else if str == "fra" || str == "fr" || str == "french" then
+        Just French
+
     else
         Nothing
@@ -33,6 +37,9 @@ toIso3 lang =
         English ->
             "eng"

+        French ->
+            "fra"
+
 toName : Language -> String
 toName lang =
@@ -43,7 +50,10 @@ toName lang =
         English ->
             "English"

+        French ->
+            "French"
+
 all : List Language
 all =
-    [ German, English ]
+    [ German, English, French ]

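The server side keeps a matching Language ADT (docspell.common.Language, not shown in this excerpt). A hedged sketch of what the French addition looks like there, assuming the same iso codes as the Elm code above:

sealed trait Language {
  def iso2: String
  def iso3: String
}
object Language {
  case object German  extends Language { val iso2 = "de"; val iso3 = "deu" }
  case object English extends Language { val iso2 = "en"; val iso3 = "eng" }
  case object French  extends Language { val iso2 = "fr"; val iso3 = "fra" }

  val all: List[Language] = List(German, English, French)

  // Accepts iso2, iso3 or the english name, mirroring Elm's fromString.
  def fromString(str: String): Option[Language] = {
    val s = str.toLowerCase
    all.find(l => l.iso2 == s || l.iso3 == s) orElse
      all.find(_.toString.toLowerCase == s)
  }
}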
View File

@@ -91,6 +91,11 @@ let
   };
   text-analysis = {
     max-length = 10000;
+    regex-ner = {
+      enabled = true;
+      file-cache-time = "1 minute";
+    };
+    working-dir = "/tmp/docspell-analysis";
   };
   processing = {
     max-due-date-years = 10;
@@ -689,7 +694,48 @@ in {
             (a rough guess).
           '';
         };
+        working-dir = mkOption {
+          type = types.str;
+          default = defaults.text-analysis.working-dir;
+          description = ''
+            A working directory for the analyser to store temporary/working
+            files.
+          '';
+        };
+        regex-ner = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.regex-ner.enabled;
+                description = ''
+                  Whether to enable custom NER annotation. This uses the address
+                  book of a collective as input for NER tagging (to automatically
+                  find correspondent and concerned entities). If the address book
+                  is large, this can be quite memory intensive and also makes text
+                  analysis slower. But it greatly improves accuracy. If this is
+                  false, NER tagging uses only statistical models (which also work
+                  quite well).
+                  This setting might be moved to the collective settings in the
+                  future.
+                '';
+              };
+              file-cache-time = mkOption {
+                type = types.str;
+                default = defaults.text-analysis.regex-ner.file-cache-time;
+                description = ''
+                  The NER annotation uses a file of patterns that is derived from
+                  a collective's address book. This setting defines how long that
+                  file is kept before checking for a state change.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.regex-ner;
+          description = "Configuration for the regex-ner annotator.";
+        };
       };
     });
     default = defaults.text-analysis;

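The regex-ner option described above works by turning the address book into a CoreNLP regexner mapping file: tab-separated rows of pattern and NER tag, read by the TokensRegexNERAnnotator. A hedged sketch of how such a file could be produced from QCollective.Names; the tag choices (and tagging equipment as MISC) are assumptions for illustration:

import docspell.store.queries.QCollective

// Builds the contents of a regexner mapping file. Each line maps a
// token pattern to a NER tag.
def mappingFile(names: QCollective.Names): String = {
  def row(pattern: String, tag: String) = s"$pattern\t$tag"
  (names.org.map(row(_, "ORGANIZATION")) ++
    names.pers.map(row(_, "PERSON")) ++
    names.equip.map(row(_, "MISC"))).mkString("\n")
}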
View File

@@ -31,7 +31,7 @@ object Dependencies {
   val PostgresVersion = "42.2.16"
   val PureConfigVersion = "0.13.0"
   val Slf4jVersion = "1.7.30"
-  val StanfordNlpVersion = "3.9.2"
+  val StanfordNlpVersion = "4.0.0"
   val TikaVersion = "1.24.1"
   val YamuscaVersion = "0.6.2"
   val SwaggerUIVersion = "3.32.3"
@@ -135,11 +135,16 @@ object Dependencies {
   )

   val stanfordNlpModels = Seq(
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models"),
     ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
       .classifier("models-german"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
-      "models-english"
-    )
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models-french"),
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier(
+        "models-english"
+      )
   )

   val tika = Seq(

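A hedged note on the wiring, since build.sbt itself is not in this excerpt: a module would pull these classifier artifacts in like any other dependency list:

// in build.sbt (illustrative; the real module names may differ)
libraryDependencies ++= Dependencies.stanfordNlpModels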
View File

@@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
   }

   private val nerModels = List(
-    "german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
-    "english.all.3class.distsim.crf.ser.gz"
+    "german.distsim.crf.ser.gz",
+    "english.conll.4class.distsim.crf.ser.gz",
+    "french-wikiner-4class.crf.ser.gz",
+    "french-mwt-statistical.tsv",
+    "french-mwt.tagger",
+    "french-mwt.tsv",
+    "german-mwt.tsv",
+    "german-ud.tagger",
+    "german-ud.tagger.props",
+    "french-ud.tagger",
+    "french-ud.tagger.props",
+    "english-left3words-distsim.tagger",
+    "english-left3words-distsim.tagger.props"
   )
 }
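What a plugin like this typically does is extract just the listed model files from the large stanford models jars into managed resources. A hedged, plugin-free sketch of that extraction step; names and layout are illustrative, not the plugin's actual API:

import java.io.File
import java.nio.file.{Files, StandardCopyOption}
import java.util.zip.ZipFile
import scala.jdk.CollectionConverters._

// Copies every entry whose name matches one of the wanted model files
// out of the given jars into `out`, flattening the directory layout.
def extractModels(modelJars: Seq[File], out: File, wanted: List[String]): Seq[File] =
  modelJars.flatMap { jar =>
    val zip = new ZipFile(jar)
    try
      zip.entries.asScala.toList.collect {
        case e if wanted.exists(w => e.getName.endsWith(w)) =>
          val target = new File(out, new File(e.getName).getName)
          target.getParentFile.mkdirs()
          Files.copy(zip.getInputStream(e), target.toPath, StandardCopyOption.REPLACE_EXISTING)
          target
      }
    finally zip.close()
  }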