Merge pull request #238 from eikek/stanford-nlp4

Stanford nlp4
mergify[bot] 2020-08-25 19:02:43 +00:00 committed by GitHub
commit 31544240fb
38 changed files with 1040 additions and 219 deletions

View File

@ -1,3 +0,0 @@
updates.ignore = [
{ groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" }
]

View File

@ -10,6 +10,7 @@ cache:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot
- $HOME/.coursier/cache
- $HOME/.cache/coursier
- sysconfcpus
install:

View File

@ -1,6 +1,6 @@
<img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>
[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org)
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
[![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell)

View File

@ -5,12 +5,19 @@ import cats.implicits._
import docspell.analysis.contact.Contact
import docspell.analysis.date.DateFind
import docspell.analysis.nlp.PipelineCache
import docspell.analysis.nlp.StanfordNerClassifier
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
trait TextAnalyser[F[_]] {
def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
def annotate(
logger: Logger[F],
settings: StanfordSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result]
}
object TextAnalyser {
@ -22,43 +29,47 @@ object TextAnalyser {
}
def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
def annotate(
logger: Logger[F],
lang: Language,
text: String
): F[TextAnalyser.Result] =
for {
input <- textLimit(logger, text)
tags0 <- stanfordNer(lang, input)
tags1 <- contactNer(input)
dates <- dateNer(lang, input)
list = tags0 ++ tags1
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)
Resource
.liftF(PipelineCache[F]())
.map(cache =>
new TextAnalyser[F] {
def annotate(
logger: Logger[F],
settings: StanfordSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result] =
for {
input <- textLimit(logger, text)
tags0 <- stanfordNer(cacheKey, settings, input)
tags1 <- contactNer(input)
dates <- dateNer(settings.lang, input)
list = tags0 ++ tags1
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)
private def textLimit(logger: Logger[F], text: String): F[String] =
if (text.length <= cfg.maxLength) text.pure[F]
else
logger.info(
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def textLimit(logger: Logger[F], text: String): F[String] =
if (text.length <= cfg.maxLength) text.pure[F]
else
logger.info(
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
Sync[F].delay {
StanfordNerClassifier.nerAnnotate(lang)(text)
private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
: F[Vector[NerLabel]] =
StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {
Contact.annotate(text)
}
private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
Sync[F].delay {
DateFind.findDates(text, lang).toVector
}
}
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {
Contact.annotate(text)
}
private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
Sync[F].delay {
DateFind.findDates(text, lang).toVector
}
})
)
}

View File

@ -54,6 +54,7 @@ object DateFind {
val p = lang match {
case Language.English => p2.or(p0).or(p1)
case Language.German => p1.or(p0).or(p2)
case Language.French => p1.or(p0).or(p2)
}
p.read(parts).toOption
}

View File

@ -0,0 +1,25 @@
package docspell.analysis.nlp
import docspell.common.{NerLabel, NerTag}
import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}
object LabelConverter {
private def tagFromLabel[A <: CoreAnnotation[String]](
label: CoreLabel,
annot: Class[A]
): Option[NerTag] = {
val tag = label.get(annot)
Option(tag).flatMap(s => NerTag.fromString(s).toOption)
}
def findTag(label: CoreLabel): Option[NerTag] =
tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
.orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))
def toNerLabel(label: CoreLabel): Option[NerLabel] =
findTag(label).map(t =>
NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
)
}
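
A minimal sketch of the conversion above, using a hand-built token (in the pipeline the CoreLabel comes from the ner/regexner annotators; AnswerAnnotation is checked first, NamedEntityTagAnnotation is the fallback). The expected result assumes NerTag.fromString accepts the CoreNLP tag name.

import docspell.analysis.nlp.LabelConverter
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}

// hand-built token, as the ner annotator would produce it
val token = new CoreLabel()
token.setWord("Jeter")
token.setBeginPosition(6)
token.setEndPosition(11)
token.set(classOf[CoreAnnotations.NamedEntityTagAnnotation], "PERSON")

LabelConverter.toNerLabel(token)
// expected: Some(NerLabel("Jeter", NerTag.Person, 6, 11))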

View File

@ -0,0 +1,90 @@
package docspell.analysis.nlp
import cats.Applicative
import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
* involves IO and initializing large objects.
*
* Therefore, the instances are created once and cached; being thread-safe, they can be safely shared.
*
* **This is an internal API**
*/
trait PipelineCache[F[_]] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
}
object PipelineCache {
private[this] val logger = getLogger
def none[F[_]: Applicative]: PipelineCache[F] =
new PipelineCache[F] {
def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
makeClassifier(settings).pure[F]
}
def apply[F[_]: Sync](): F[PipelineCache[F]] =
Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
extends PipelineCache[F] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
for {
id <- makeSettingsId(settings)
nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
} yield nlp
private def getOrCreate(
key: String,
id: String,
cache: Map[String, Entry],
settings: StanfordSettings
): (Map[String, Entry], StanfordCoreNLP) =
cache.get(key) match {
case Some(entry) =>
if (entry.id == id) (cache, entry.value)
else {
logger.info(
s"StanfordNLP settings changed for key $key. Creating new classifier"
)
val nlp = makeClassifier(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
case None =>
val nlp = makeClassifier(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
private def makeSettingsId(settings: StanfordSettings): F[String] = {
val base = settings.copy(regexNer = None).toString
val size: F[Long] =
settings.regexNer match {
case Some(p) =>
File.size(p)
case None =>
0L.pure[F]
}
size.map(len => s"$base-$len")
}
}
private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
new StanfordCoreNLP(Properties.forSettings(settings))
}
private case class Entry(id: String, value: StanfordCoreNLP)
}
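
A minimal usage sketch of the cache, assuming a single shared instance keyed per collective; the key string is illustrative. Repeated calls with the same key and unchanged settings return the memoized pipeline, while changed settings rebuild and replace it.

import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

val program: IO[Unit] =
  for {
    cache   <- PipelineCache[IO]()
    settings = StanfordSettings(Language.German, highRecall = false, regexNer = None)
    nlp1    <- cache.obtain("collective-1", settings) // builds the pipeline (expensive)
    nlp2    <- cache.obtain("collective-1", settings) // same key and settings: cached instance
    _       <- IO(assert(nlp1 eq nlp2))
  } yield ()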

View File

@ -0,0 +1,111 @@
package docspell.analysis.nlp
import java.util.{Properties => JProps}
import docspell.analysis.nlp.Properties.Implicits._
import docspell.common._
object Properties {
def apply(ps: (String, String)*): JProps = {
val p = new JProps()
for ((k, v) <- ps)
p.setProperty(k, v)
p
}
def forSettings(settings: StanfordSettings): JProps = {
val regexNerFile = settings.regexNer
.map(p => p.normalize().toAbsolutePath().toString())
settings.lang match {
case Language.German =>
Properties.nerGerman(regexNerFile, settings.highRecall)
case Language.English =>
Properties.nerEnglish(regexNerFile)
case Language.French =>
Properties.nerFrench(regexNerFile, settings.highRecall)
}
}
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
"ner.useSUTime" -> "false", //only english, unused in docspell
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerEnglish(regexNerMappingFile: Option[String]): JProps =
Properties(
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"tokenize.language" -> "en",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "en",
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile)
def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "fr",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
object Implicits {
implicit final class JPropsOps(val p: JProps) extends AnyVal {
def set(name: String, value: Option[String]): JProps =
value match {
case Some(v) =>
p.setProperty(name, v)
p
case None =>
p
}
def change(name: String, f: String => String): JProps =
Option(p.getProperty(name)) match {
case Some(current) =>
p.setProperty(name, f(current))
p
case None =>
p
}
def withRegexNer(mappingFile: Option[String]): JProps =
set("regexner.mapping", mappingFile)
.change(
"annotators",
v => if (mappingFile.isDefined) v + ",regexner" else v
)
def withHighRecall(flag: Boolean): JProps = {
if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
else p.setProperty("ner.combinationMode", "NORMAL")
p
}
}
}
}
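
A short sketch of what forSettings produces, assuming German settings and a hypothetical regexner mapping file: withRegexNer registers the file and appends the regexner annotator, and withHighRecall switches the combination mode.

import java.nio.file.Paths
import docspell.analysis.nlp.{Properties, StanfordSettings}
import docspell.common.Language

val settings = StanfordSettings(
  Language.German,
  highRecall = true,
  regexNer = Some(Paths.get("/tmp/docspell-analysis/coll.txt")) // hypothetical path
)
val props = Properties.forSettings(settings)

props.getProperty("annotators")          // "tokenize,ssplit,mwt,pos,lemma,ner,regexner"
props.getProperty("regexner.mapping")    // "/tmp/docspell-analysis/coll.txt"
props.getProperty("ner.combinationMode") // "HIGH_RECALL"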

View File

@ -1,65 +1,39 @@
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import scala.jdk.CollectionConverters._
import scala.util.Using
import cats.Applicative
import cats.implicits._
import docspell.common._
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
object StanfordNerClassifier {
private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
/** Runs named entity recognition on the given `text`.
*
* This uses the classifier pipeline from stanford-nlp, see
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these
* classifiers is quite expensive, as it involves loading large model
* files. The classifiers are thread-safe, so they are cached.
* The `cacheKey` defines the "slot" where classifiers are stored
* and retrieved. If the `settings` change for a given `cacheKey`,
* a new classifier is created and replaces the previous one.
*/
def nerAnnotate[F[_]: Applicative](
cacheKey: String,
cache: PipelineCache[F]
)(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
cache
.obtain(cacheKey, settings)
.map(crf => runClassifier(crf, text))
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
val nerClassifier = lang match {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
}
nerClassifier
.classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag
.fromString(Option(tag).getOrElse(""))
.toOption
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
val doc = new CoreDocument(text)
nerClassifier.annotate(doc)
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
val ner = classifierResource(lang)
Using(new GZIPInputStream(ner.openStream())) { in =>
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
}.fold(throw _, identity)
}
private def classifierResource(lang: Language): URL = {
def check(u: URL): URL =
if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
else u
check(lang match {
case Language.German =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
case Language.English =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
)
})
}
}
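
A usage sketch of the new cache-based entry point (the key and texts are made up): both calls share the cache slot "coll-1"; because the settings differ between the calls, the second one replaces the classifier stored under that key.

import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordNerClassifier, StanfordSettings}
import docspell.common.Language

val run: IO[Unit] =
  for {
    cache <- PipelineCache[IO]()
    en     = StanfordSettings(Language.English, highRecall = false, regexNer = None)
    de     = StanfordSettings(Language.German, highRecall = false, regexNer = None)
    l1    <- StanfordNerClassifier.nerAnnotate[IO]("coll-1", cache)(en, "Derek Jeter lives in Treesville.")
    l2    <- StanfordNerClassifier.nerAnnotate[IO]("coll-1", cache)(de, "Max Mustermann wohnt im Lilienweg 21.")
    _     <- IO(println(l1 ++ l2)) // Vector[NerLabel] with tags and character offsets
  } yield ()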

View File

@ -0,0 +1,22 @@
package docspell.analysis.nlp
import java.nio.file.Path
import docspell.common._
/** Settings for configuring the stanford NER pipeline.
*
* The language is mandatory; only the provided languages are supported.
* The `highRecall` flag only applies to non-English languages. For
* those, the English classifier is run as a second classifier, and if
* `highRecall` is true, it is also used to tag tokens left untagged by
* the primary classifier. This may produce many false positives, but
* since English terms are common in other languages too, whether this
* is useful depends on the use case.
*
* The `regexNer` option allows specifying a text file as described at
* https://nlp.stanford.edu/software/regexner.html. It is used as a
* last step to tag remaining untagged tokens using the provided list
* of regexps.
*/
case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
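
Three illustrative configurations following the description above (the mapping file path is hypothetical):

import java.nio.file.Paths
import docspell.analysis.nlp.StanfordSettings
import docspell.common.Language

// English: a single classifier, highRecall has no effect
val english = StanfordSettings(Language.English, highRecall = false, regexNer = None)

// German: the English classifier runs second; with highRecall it also tags
// tokens the primary classifier left untagged
val german = StanfordSettings(Language.German, highRecall = true, regexNer = None)

// additionally apply a collective-specific regexner mapping file
val withCustomNer = german.copy(regexNer = Some(Paths.get("/tmp/docspell-analysis/coll.txt")))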

View File

@ -3,31 +3,44 @@ package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
object TextAnalyserSuite extends SimpleTestSuite {
lazy val germanClassifier =
new StanfordCoreNLP(Properties.nerGerman(None, false))
lazy val englishClassifier =
new StanfordCoreNLP(Properties.nerEnglish(None))
test("find english ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
NerLabel("Elm", NerTag.Misc, 17, 20),
NerLabel("Ave.", NerTag.Misc, 21, 25),
NerLabel("Treesville", NerTag.Misc, 27, 37),
NerLabel("Derek", NerTag.Person, 68, 73),
NerLabel("Jeter", NerTag.Person, 74, 79),
NerLabel("Treesville", NerTag.Location, 95, 105),
NerLabel("Elm", NerTag.Misc, 85, 88),
NerLabel("Ave.", NerTag.Misc, 89, 93),
NerLabel("Treesville", NerTag.Person, 95, 105),
NerLabel("Leaf", NerTag.Organization, 144, 148),
NerLabel("Chief", NerTag.Organization, 150, 155),
NerLabel("of", NerTag.Organization, 156, 158),
NerLabel("Syrup", NerTag.Organization, 159, 164),
NerLabel("Production", NerTag.Organization, 165, 175),
NerLabel("Old", NerTag.Organization, 176, 179),
NerLabel("Sticky", NerTag.Organization, 180, 186),
NerLabel("Pancake", NerTag.Organization, 187, 194),
NerLabel("Company", NerTag.Organization, 195, 202),
NerLabel("Maple", NerTag.Location, 207, 212),
NerLabel("Lane", NerTag.Location, 213, 217),
NerLabel("Forest", NerTag.Location, 219, 225),
NerLabel("Maple", NerTag.Organization, 207, 212),
NerLabel("Lane", NerTag.Organization, 213, 217),
NerLabel("Forest", NerTag.Organization, 219, 225),
NerLabel("Hemptown", NerTag.Location, 239, 247),
NerLabel("Little", NerTag.Organization, 347, 353),
NerLabel("League", NerTag.Organization, 354, 360),
NerLabel("Leaf", NerTag.Person, 276, 280),
NerLabel("Little", NerTag.Misc, 347, 353),
NerLabel("League", NerTag.Misc, 354, 360),
NerLabel("Derek", NerTag.Person, 1117, 1122),
NerLabel("Jeter", NerTag.Person, 1123, 1128)
)
@ -36,11 +49,11 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find german ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),
NerLabel("Lilienweg", NerTag.Location, 16, 25),
NerLabel("Lilienweg", NerTag.Person, 16, 25),
NerLabel("Max", NerTag.Person, 77, 80),
NerLabel("Mustermann", NerTag.Person, 81, 91),
NerLabel("Lilienweg", NerTag.Location, 93, 102),

View File

@ -20,6 +20,12 @@ case class Duration(nanos: Long) {
def hours: Long = minutes / 60
def >(other: Duration): Boolean =
nanos > other.nanos
def <(other: Duration): Boolean =
nanos < other.nanos
def toScala: FiniteDuration =
FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@ -62,6 +68,9 @@ object Duration {
def nanos(n: Long): Duration =
Duration(n)
def between(start: Timestamp, end: Timestamp): Duration =
apply(JDur.between(start.value, end.value))
def stopTime[F[_]: Sync]: F[F[Duration]] =
for {
now <- Timestamp.current[F]
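
A small sketch (illustrative only) of how the new between helper and the existing stopTime combine; the text-analysis task later in this commit uses the same stopTime pattern to report elapsed time.

import cats.effect.IO
import cats.implicits._
import docspell.common.{Duration, Timestamp}

val timed: IO[Duration] =
  for {
    stop    <- Duration.stopTime[IO]  // capture the start timestamp
    _       <- IO(Thread.sleep(50))   // some work
    elapsed <- stop                   // evaluate: Duration elapsed since start
  } yield elapsed

// between measures the span between two explicit timestamps
val span: IO[Duration] =
  (Timestamp.current[IO], Timestamp.current[IO]).mapN(Duration.between)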

View File

@ -1,6 +1,7 @@
package docspell.common
import java.io.IOException
import java.nio.charset.StandardCharsets
import java.nio.file._
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.atomic.AtomicInteger
@ -11,6 +12,10 @@ import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.common.syntax.all._
import io.circe.Decoder
object File {
def mkDir[F[_]: Sync](dir: Path): F[Path] =
@ -55,6 +60,9 @@ object File {
def exists[F[_]: Sync](file: Path): F[Boolean] =
Sync[F].delay(Files.exists(file))
def size[F[_]: Sync](file: Path): F[Long] =
Sync[F].delay(Files.size(file))
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
@ -84,4 +92,13 @@ object File {
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
d: Decoder[A]
): F[A] =
readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
}

View File

@ -27,7 +27,12 @@ object Language {
val iso3 = "eng"
}
val all: List[Language] = List(German, English)
case object French extends Language {
val iso2 = "fr"
val iso3 = "fra"
}
val all: List[Language] = List(German, English, French)
def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase

View File

@ -23,6 +23,7 @@ object Field {
val content = Field("content")
val content_de = Field("content_de")
val content_en = Field("content_en")
val content_fr = Field("content_fr")
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")
val folderId = Field("folder")
@ -33,6 +34,8 @@ object Field {
Field.content_de
case Language.English =>
Field.content_en
case Language.French =>
Field.content_fr
}
implicit val jsonEncoder: Encoder[Field] =

View File

@ -39,6 +39,7 @@ object SolrQuery {
Field.content,
Field.content_de,
Field.content_en,
Field.content_fr,
Field.itemName,
Field.itemNotes,
Field.attachmentName

View File

@ -80,6 +80,8 @@ object SolrSetup {
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
case l @ Language.French =>
addTextField(l.some)(Field.content_fr)
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
@ -105,6 +107,9 @@ object SolrSetup {
case Some(Language.English) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textEN(field)))
case Some(Language.French) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textFR(field)))
}
}
}
@ -138,6 +143,9 @@ object SolrSetup {
def textEN(field: Field): AddField =
AddField(field, "text_en", true, true, false)
def textFR(field: Field): AddField =
AddField(field, "text_fr", true, true, false)
}
case class DeleteField(name: Field)

View File

@ -248,6 +248,29 @@ docspell.joex {
# should suffice. The default is 10000, which is about 2-3 pages
# (just a rough guess, of course).
max-length = 10000
# A working directory for the analyser to store temporary/working
# files.
working-dir = ${java.io.tmpdir}"/docspell-analysis"
regex-ner {
# Whether to enable custom NER annotation. This uses the address
# book of a collective as input for NER tagging (to automatically
# find correspondent and concerned entities). If the address book
# is large, this can be quite memory intensive and also makes text
# analysis slower. But it greatly improves accuracy. If this is
# false, NER tagging uses only statistical models (that also work
# quite well).
#
# This setting might be moved to the collective settings in the
# future.
enabled = true
# The NER annotation uses a file of patterns that is derived from
# a collective's address book. This setting defines how long this
# file is kept before it is checked for a state change.
file-cache-time = "1 minute"
}
}
# Configuration for converting files into PDFs.

View File

@ -1,11 +1,14 @@
package docspell.joex
import java.nio.file.Path
import docspell.analysis.TextAnalysisConfig
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
import docspell.extract.ExtractConfig
import docspell.ftssolr.SolrConfig
import docspell.joex.analysis.RegexNerFile
import docspell.joex.hk.HouseKeepingConfig
import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
import docspell.store.JdbcConfig
@ -20,7 +23,7 @@ case class Config(
userTasks: Config.UserTasks,
houseKeeping: HouseKeepingConfig,
extraction: ExtractConfig,
textAnalysis: TextAnalysisConfig,
textAnalysis: Config.TextAnalysis,
convert: ConvertConfig,
sendMail: MailSendConfig,
files: Files,
@ -50,4 +53,19 @@ object Config {
}
case class Processing(maxDueDateYears: Int)
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
regexNer: RegexNer
) {
def textAnalysisConfig: TextAnalysisConfig =
TextAnalysisConfig(maxLength)
def regexNerFileConfig: RegexNerFile.Config =
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
}
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
}
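
A sketch of the new joex config section in code; the values mirror the defaults from docspell.conf above, and the Duration is built with the nanos constructor shown earlier in this commit.

import java.nio.file.Paths
import docspell.common.Duration
import docspell.joex.Config

val ta = Config.TextAnalysis(
  maxLength = 10000,
  workingDir = Paths.get("/tmp/docspell-analysis"),
  regexNer = Config.RegexNer(enabled = true, fileCacheTime = Duration.nanos(60L * 1000000000L))
)

ta.textAnalysisConfig // TextAnalysisConfig(10000), passed to TextAnalyser.create
ta.regexNerFileConfig // RegexNerFile.Config(true, /tmp/docspell-analysis, 1 minute)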

View File

@ -6,10 +6,12 @@ import cats.effect._
import cats.implicits._
import fs2.concurrent.SignallingRef
import docspell.analysis.TextAnalyser
import docspell.backend.ops._
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.ftssolr.SolrFtsClient
import docspell.joex.analysis.RegexNerFile
import docspell.joex.fts.{MigrationTask, ReIndexTask}
import docspell.joex.hk._
import docspell.joex.notify._
@ -80,14 +82,16 @@ object JoexAppImpl {
for {
httpClient <- BlazeClientBuilder[F](clientEC).resource
client = JoexClient(httpClient)
store <- Store.create(cfg.jdbc, connectEC, blocker)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
fts <- createFtsClient(cfg)(httpClient)
itemOps <- OItem(store, fts, queue, joex)
store <- Store.create(cfg.jdbc, connectEC, blocker)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
fts <- createFtsClient(cfg)(httpClient)
itemOps <- OItem(store, fts, queue, joex)
analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
javaEmil =
JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@ -95,14 +99,14 @@ object JoexAppImpl {
.withTask(
JobTask.json(
ProcessItemArgs.taskName,
ItemHandler.newItem[F](cfg, itemOps, fts),
ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
ItemHandler.onCancel[F]
)
)
.withTask(
JobTask.json(
ReProcessItemArgs.taskName,
ReProcessItem[F](cfg, fts),
ReProcessItem[F](cfg, fts, analyser, regexNer),
ReProcessItem.onCancel[F]
)
)

View File

@ -0,0 +1,99 @@
package docspell.joex.analysis
import java.nio.file.Path
import cats.effect._
import cats.implicits._
import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.store.queries.QCollective
import io.circe.generic.semiauto._
import io.circe.{Decoder, Encoder}
case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
def nerFilePath(directory: Path): Path =
NerFile.nerFilePath(directory, collective)
def jsonFilePath(directory: Path) =
NerFile.jsonFilePath(directory, collective)
}
object NerFile {
implicit val jsonDecoder: Decoder[NerFile] =
deriveDecoder[NerFile]
implicit val jsonEncoder: Encoder[NerFile] =
deriveEncoder[NerFile]
private def nerFilePath(directory: Path, collective: Ident): Path =
directory.resolve(s"${collective.id}.txt")
private def jsonFilePath(directory: Path, collective: Ident): Path =
directory.resolve(s"${collective.id}.json")
def find[F[_]: Sync: ContextShift](
collective: Ident,
directory: Path,
blocker: Blocker
): F[Option[NerFile]] = {
val file = jsonFilePath(directory, collective)
File.existsNonEmpty[F](file).flatMap {
case true =>
File
.readJson[F, NerFile](file, blocker)
.map(_.some)
case false =>
(None: Option[NerFile]).pure[F]
}
}
def mkNerConfig(names: QCollective.Names): String = {
val orgs = names.org
.flatMap(Pattern(3))
.distinct
.map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
val pers =
names.pers
.flatMap(Pattern(2))
.distinct
.map(_.toRow("PERSON", "LOCATION,MISC"))
val equips =
names.equip
.flatMap(Pattern(1))
.distinct
.map(_.toRow("MISC", "LOCATION"))
(orgs ++ pers ++ equips).mkString("\n")
}
case class Pattern(value: String, weight: Int) {
def toRow(tag: String, overrideTags: String): String =
s"$value\t$tag\t$overrideTags\t$weight"
}
object Pattern {
def apply(weight: Int)(str: String): Vector[Pattern] = {
val delims = " \t\n\r".toSet
val words =
TextSplitter
.split(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.map(w => s"(?i)${w}")
val tokens =
TextSplitter
.splitToken(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.take(3)
.map(w => s"(?i)${w}")
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
}
}
}
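
The file produced by mkNerConfig uses the plain regexner format: one tab-separated rule per line with the pattern, the tag to assign, the tags it may override, and a priority weight. A small sketch with a made-up organization name:

import docspell.joex.analysis.NerFile

NerFile.Pattern("(?i)acme (?i)gmbh", 3).toRow("ORGANIZATION", "LOCATION,PERSON,MISC")
// => "(?i)acme (?i)gmbh\tORGANIZATION\tLOCATION,PERSON,MISC\t3"

// Pattern(weight) derives such case-insensitive patterns from an address book
// entry: the whole lower-cased name plus up to three of its leading tokens
val patterns = NerFile.Pattern(3)("Acme GmbH")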

View File

@ -0,0 +1,164 @@
package docspell.joex.analysis
import java.nio.file.Path
import cats.effect._
import cats.effect.concurrent.Semaphore
import cats.implicits._
import docspell.common._
import docspell.common.syntax.all._
import docspell.store.Store
import docspell.store.queries.QCollective
import docspell.store.records.REquipment
import docspell.store.records.ROrganization
import docspell.store.records.RPerson
import io.circe.syntax._
import org.log4s.getLogger
/** Maintains a custom regex-ner file per collective for Stanford's
* regexner annotator.
*/
trait RegexNerFile[F[_]] {
def makeFile(collective: Ident): F[Option[Path]]
}
object RegexNerFile {
private[this] val logger = getLogger
case class Config(enabled: Boolean, directory: Path, minTime: Duration)
def apply[F[_]: Concurrent: ContextShift](
cfg: Config,
blocker: Blocker,
store: Store[F]
): Resource[F, RegexNerFile[F]] =
for {
dir <- File.withTempDir[F](cfg.directory, "regexner-")
writer <- Resource.liftF(Semaphore(1))
} yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
final private class Impl[F[_]: Concurrent: ContextShift](
cfg: Config,
blocker: Blocker,
store: Store[F],
writer: Semaphore[F] //TODO allow parallelism per collective
) extends RegexNerFile[F] {
def makeFile(collective: Ident): F[Option[Path]] =
if (cfg.enabled) doMakeFile(collective)
else (None: Option[Path]).pure[F]
def doMakeFile(collective: Ident): F[Option[Path]] =
for {
now <- Timestamp.current[F]
existing <- NerFile.find[F](collective, cfg.directory, blocker)
result <- existing match {
case Some(nf) =>
val dur = Duration.between(nf.creation, now)
if (dur > cfg.minTime)
logger.fdebug(
s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
) *> updateFile(
collective,
now,
Some(nf)
)
else nf.nerFilePath(cfg.directory).some.pure[F]
case None =>
updateFile(collective, now, None)
}
} yield result
private def updateFile(
collective: Ident,
now: Timestamp,
current: Option[NerFile]
): F[Option[Path]] =
for {
lastUpdate <- store.transact(Sql.latestUpdate(collective))
result <- lastUpdate match {
case None =>
(None: Option[Path]).pure[F]
case Some(lup) =>
current match {
case Some(cur) =>
val nerf =
if (cur.updated == lup)
logger.fdebug(s"No state change detected.") *> updateTimestamp(
cur,
now
) *> cur.pure[F]
else
logger.fdebug(
s"There have been state changes for collective '${collective.id}'. Reload NER file."
) *> createFile(lup, collective, now)
nerf.map(_.nerFilePath(cfg.directory).some)
case None =>
createFile(lup, collective, now)
.map(_.nerFilePath(cfg.directory).some)
}
}
} yield result
private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
writer.withPermit(for {
file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
_ <- File.mkDir(file.getParent)
_ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
} yield ())
private def createFile(
lastUpdate: Timestamp,
collective: Ident,
now: Timestamp
): F[NerFile] = {
def update(nf: NerFile, text: String): F[Unit] =
writer.withPermit(for {
jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
_ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
_ <- File.mkDir(jsonFile.getParent)
_ <- File.writeString(nf.nerFilePath(cfg.directory), text)
_ <- File.writeString(jsonFile, nf.asJson.spaces2)
} yield ())
for {
_ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
names <- store.transact(QCollective.allNames(collective))
nerFile = NerFile(collective, lastUpdate, now)
_ <- update(nerFile, NerFile.mkNerConfig(names))
} yield nerFile
}
}
object Sql {
import doobie._
import doobie.implicits._
import docspell.store.impl.Implicits._
import docspell.store.impl.Column
def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
def max(col: Column, table: Fragment, cidCol: Column): Fragment =
selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
val sql =
List(
max(
ROrganization.Columns.updated,
ROrganization.table,
ROrganization.Columns.cid
),
max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
)
.reduce(_ ++ fr"UNION ALL" ++ _)
selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
.query[Timestamp]
.option
}
}
}
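
A wiring sketch, assuming store and blocker are available as in JoexAppImpl above and using illustrative config values: makeFile returns the path to the collective's current mapping file, or None when the feature is disabled or the collective has no address book entries yet.

import java.nio.file.{Path, Paths}
import cats.effect.{Blocker, Concurrent, ContextShift}
import docspell.common.{Duration, Ident}
import docspell.joex.analysis.RegexNerFile
import docspell.store.Store

def nerFileFor[F[_]: Concurrent: ContextShift](
    store: Store[F],
    blocker: Blocker,
    collective: Ident
): F[Option[Path]] = {
  val cfg = RegexNerFile.Config(
    enabled = true,
    directory = Paths.get("/tmp/docspell-analysis"),
    minTime = Duration.nanos(60L * 1000000000L) // re-check the address book after one minute
  )
  RegexNerFile(cfg, blocker, store).use(_.makeFile(collective))
}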

View File

@ -5,10 +5,12 @@ import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.analysis.TextAnalyser
import docspell.backend.ops.OItem
import docspell.common.{ItemState, ProcessItemArgs}
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Task
import docspell.store.queries.QItem
import docspell.store.records.RItem
@ -29,11 +31,13 @@ object ItemHandler {
def newItem[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
): Task[F, Args, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg, itemOps, fts))
.flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
.map(_ => ())
def itemStateTask[F[_]: Sync, A](
@ -51,11 +55,13 @@ object ItemHandler {
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(data: ItemData): Task[F, Args, ItemData] =
isLastRetry[F].flatMap {
case true =>
ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
case Right(d) =>
Task.pure(d)
case Left(ex) =>
@ -65,7 +71,8 @@ object ItemHandler {
.andThen(_ => Sync[F].raiseError(ex))
})
case false =>
ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
.flatMap(itemStateTask(ItemState.Created))
}
private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =

View File

@ -2,10 +2,12 @@ package docspell.joex.process
import cats.effect._
import docspell.analysis.TextAnalyser
import docspell.backend.ops.OItem
import docspell.common.ProcessItemArgs
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Task
object ProcessItem {
@ -13,25 +15,31 @@ object ProcessItem {
def apply[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
itemOps: OItem[F],
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ExtractArchive(item)
.flatMap(Task.setProgress(20))
.flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
.flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
.flatMap(LinkProposal[F])
.flatMap(SetGivenData[F](itemOps))
.flatMap(Task.setProgress(99))
def processAttachments[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
processAttachments0[F](cfg, fts, (30, 60, 90))(item)
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
def analysisOnly[F[_]: Sync](
cfg: Config
cfg: Config,
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](cfg.textAnalysis)(item)
TextAnalysis[F](analyser, regexNer)(item)
.flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
@ -39,12 +47,14 @@ object ProcessItem {
private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F],
progress: (Int, Int, Int)
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3))
}

View File

@ -4,9 +4,11 @@ import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment
@ -19,10 +21,12 @@ object ReProcessItem {
def apply[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
): Task[F, Args, Unit] =
loadItem[F]
.flatMap(safeProcess[F](cfg, fts))
.flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
.map(_ => ())
def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@ -70,6 +74,8 @@ object ReProcessItem {
def processFiles[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F],
data: ItemData
): Task[F, Args, ItemData] = {
@ -91,7 +97,7 @@ object ReProcessItem {
getLanguage[F].flatMap { lang =>
ProcessItem
.processAttachments[F](cfg, fts)(data)
.processAttachments[F](cfg, fts, analyser, regexNer)(data)
.contramap[Args](convertArgs(lang))
}
}
@ -109,11 +115,13 @@ object ReProcessItem {
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
fts: FtsClient[F]
fts: FtsClient[F],
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(data: ItemData): Task[F, Args, ItemData] =
isLastRetry[F].flatMap {
case true =>
processFiles[F](cfg, fts, data).attempt
processFiles[F](cfg, fts, analyser, regexNer, data).attempt
.flatMap({
case Right(d) =>
Task.pure(d)
@ -123,7 +131,7 @@ object ReProcessItem {
).andThen(_ => Sync[F].raiseError(ex))
})
case false =>
processFiles[F](cfg, fts, data)
processFiles[F](cfg, fts, analyser, regexNer, data)
}
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@ -1,47 +1,57 @@
package docspell.joex.process
import cats.effect.Sync
import cats.effect._
import cats.implicits._
import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
import docspell.joex.analysis.RegexNerFile
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
object TextAnalysis {
def apply[F[_]: Sync](
cfg: TextAnalysisConfig
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
TextAnalyser.create[F](cfg).use { analyser =>
for {
_ <- ctx.logger.info("Starting text analysis")
s <- Duration.stopTime[F]
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
)
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
}
for {
_ <- ctx.logger.info("Starting text analysis")
s <- Duration.stopTime[F]
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx, analyser, nerFile)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
)
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
}
def annotateAttachment[F[_]: Sync](
lang: Language,
logger: Logger[F],
analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
ctx: Context[F, ProcessItemArgs],
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(ctx.args.meta.language, false, None)
for {
labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
customNer <- nerFile.makeFile(ctx.args.meta.collective)
sett = settings.copy(regexNer = customNer)
labels <- analyser.annotate(
ctx.logger,
sett,
ctx.args.meta.collective,
rm.content.getOrElse("")
)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
}

View File

@ -341,6 +341,7 @@ trait Conversions {
v.address.city,
v.address.country,
v.notes,
now,
now
)
} yield OOrganization.OrgAndContacts(org, cont)
@ -353,6 +354,7 @@ trait Conversions {
def contacts(oid: Ident) =
v.contacts.traverse(c => newContact(c, oid.some, None))
for {
now <- Timestamp.current[F]
cont <- contacts(v.id)
org = ROrganization(
v.id,
@ -363,7 +365,8 @@ trait Conversions {
v.address.city,
v.address.country,
v.notes,
v.created
v.created,
now
)
} yield OOrganization.OrgAndContacts(org, cont)
}
@ -398,6 +401,7 @@ trait Conversions {
v.address.country,
v.notes,
v.concerning,
now,
now
)
} yield OOrganization.PersonAndContacts(org, cont)
@ -410,6 +414,7 @@ trait Conversions {
def contacts(pid: Ident) =
v.contacts.traverse(c => newContact(c, None, pid.some))
for {
now <- Timestamp.current[F]
cont <- contacts(v.id)
org = RPerson(
v.id,
@ -421,7 +426,8 @@ trait Conversions {
v.address.country,
v.notes,
v.concerning,
v.created
v.created,
now
)
} yield OOrganization.PersonAndContacts(org, cont)
}
@ -536,11 +542,11 @@ trait Conversions {
def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
timeId.map({
case (id, now) =>
REquipment(id, cid, e.name, now)
REquipment(id, cid, e.name, now, now)
})
def changeEquipment(e: Equipment, cid: Ident): REquipment =
REquipment(e.id, cid, e.name, e.created)
def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now))
// idref

View File

@ -39,10 +39,10 @@ object EquipmentRoutes {
case req @ PUT -> Root =>
for {
data <- req.as[Equipment]
equip = changeEquipment(data, user.account.collective)
res <- backend.equipment.update(equip)
resp <- Ok(basicResult(res, "Equipment updated."))
data <- req.as[Equipment]
equip <- changeEquipment(data, user.account.collective)
res <- backend.equipment.update(equip)
resp <- Ok(basicResult(res, "Equipment updated."))
} yield resp
case DELETE -> Root / Ident(id) =>

View File

@ -0,0 +1,29 @@
-- organization
ALTER TABLE `organization`
ADD COLUMN (`updated` timestamp);
UPDATE `organization` SET `updated` = `created`;
ALTER TABLE `organization`
MODIFY `updated` timestamp NOT NULL;
-- person
ALTER TABLE `person`
MODIFY `created` timestamp;
ALTER TABLE `person`
ADD COLUMN (`updated` timestamp);
UPDATE `person` SET `updated` = `created`;
ALTER TABLE `person`
MODIFY `updated` timestamp NOT NULL;
-- equipment
ALTER TABLE `equipment`
ADD COLUMN (`updated` timestamp);
UPDATE `equipment` SET `updated` = `created`;
ALTER TABLE `equipment`
MODIFY `updated` timestamp NOT NULL;

View File

@ -0,0 +1,29 @@
-- organization
ALTER TABLE "organization"
ADD COLUMN "updated" timestamp;
UPDATE "organization" SET "updated" = "created";
ALTER TABLE "organization"
ALTER COLUMN "updated" SET NOT NULL;
-- person
ALTER TABLE "person" ALTER COLUMN "created"
TYPE timestamp USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp);
ALTER TABLE "person"
ADD COLUMN "updated" timestamp;
UPDATE "person" SET "updated" = "created";
ALTER TABLE "person"
ALTER COLUMN "updated" SET NOT NULL;
-- equipment
ALTER TABLE "equipment"
ADD COLUMN "updated" timestamp;
UPDATE "equipment" SET "updated" = "created";
ALTER TABLE "equipment"
ALTER COLUMN "updated" SET NOT NULL;

View File

@ -1,5 +1,6 @@
package docspell.store.queries
import cats.data.OptionT
import fs2.Stream
import docspell.common.ContactKind
@ -11,6 +12,20 @@ import doobie._
import doobie.implicits._
object QCollective {
case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
object Names {
val empty = Names(Vector.empty, Vector.empty, Vector.empty)
}
def allNames(collective: Ident): ConnectionIO[Names] =
(for {
orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
} yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
.getOrElse(Names.empty)
case class TagCount(tag: RTag, count: Int)
case class InsightData(

View File

@ -7,7 +7,13 @@ import docspell.store.impl._
import doobie._
import doobie.implicits._
case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {}
case class REquipment(
eid: Ident,
cid: Ident,
name: String,
created: Timestamp,
updated: Timestamp
) {}
object REquipment {
@ -18,25 +24,32 @@ object REquipment {
val cid = Column("cid")
val name = Column("name")
val created = Column("created")
val all = List(eid, cid, name, created)
val updated = Column("updated")
val all = List(eid, cid, name, created, updated)
}
import Columns._
def insert(v: REquipment): ConnectionIO[Int] = {
val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}")
val sql =
insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}")
sql.update.run
}
def update(v: REquipment): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(eid.is(v.eid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name)
def sql(now: Timestamp) =
updateRow(
table,
and(eid.is(v.eid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = {

View File

@ -19,7 +19,8 @@ case class ROrganization(
city: String,
country: String,
notes: Option[String],
created: Timestamp
created: Timestamp,
updated: Timestamp
) {}
object ROrganization {
@ -38,7 +39,8 @@ object ROrganization {
val country = Column("country")
val notes = Column("notes")
val created = Column("created")
val all = List(oid, cid, name, street, zip, city, country, notes, created)
val updated = Column("updated")
val all = List(oid, cid, name, street, zip, city, country, notes, created, updated)
}
import Columns._
@ -47,26 +49,31 @@ object ROrganization {
val sql = insertRow(
table,
all,
fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}"
fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}"
)
sql.update.run
}
def update(v: ROrganization): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(oid.is(v.oid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
notes.setTo(v.notes)
def sql(now: Timestamp) =
updateRow(
table,
and(oid.is(v.oid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
notes.setTo(v.notes),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] =

View File

@ -20,7 +20,8 @@ case class RPerson(
country: String,
notes: Option[String],
concerning: Boolean,
created: Timestamp
created: Timestamp,
updated: Timestamp
) {}
object RPerson {
@ -40,7 +41,20 @@ object RPerson {
val notes = Column("notes")
val concerning = Column("concerning")
val created = Column("created")
val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created)
val updated = Column("updated")
val all = List(
pid,
cid,
name,
street,
zip,
city,
country,
notes,
concerning,
created,
updated
)
}
import Columns._
@ -49,27 +63,32 @@ object RPerson {
val sql = insertRow(
table,
all,
fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}"
fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}"
)
sql.update.run
}
def update(v: RPerson): ConnectionIO[Int] = {
val sql = updateRow(
table,
and(pid.is(v.pid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
concerning.setTo(v.concerning),
notes.setTo(v.notes)
def sql(now: Timestamp) =
updateRow(
table,
and(pid.is(v.pid), cid.is(v.cid)),
commas(
cid.setTo(v.cid),
name.setTo(v.name),
street.setTo(v.street),
zip.setTo(v.zip),
city.setTo(v.city),
country.setTo(v.country),
concerning.setTo(v.concerning),
notes.setTo(v.notes),
updated.setTo(now)
)
)
)
sql.update.run
for {
now <- Timestamp.current[ConnectionIO]
n <- sql(now).update.run
} yield n
}
def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] =

View File

@ -10,6 +10,7 @@ module Data.Language exposing
type Language
= German
| English
| French
fromString : String -> Maybe Language
@ -20,6 +21,9 @@ fromString str =
else if str == "eng" || str == "en" || str == "english" then
Just English
else if str == "fra" || str == "fr" || str == "french" then
Just French
else
Nothing
@ -33,6 +37,9 @@ toIso3 lang =
English ->
"eng"
French ->
"fra"
toName : Language -> String
toName lang =
@ -43,7 +50,10 @@ toName lang =
English ->
"English"
French ->
"French"
all : List Language
all =
[ German, English ]
[ German, English, French ]

View File

@ -91,6 +91,11 @@ let
};
text-analysis = {
max-length = 10000;
regex-ner = {
enabled = true;
file-cache-time = "1 minute";
};
working-dir = "/tmp/docspell-analysis";
};
processing = {
max-due-date-years = 10;
@ -689,7 +694,48 @@ in {
(a rough guess).
'';
};
working-dir = mkOption {
type = types.str;
default = defaults.text-analysis.working-dir;
description = ''
A working directory for the analyser to store temporary/working
files.
'';
};
regex-ner = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.text-analysis.regex-ner.enabled;
description = ''
Whether to enable custom NER annotation. This uses the address
book of a collective as input for NER tagging (to automatically
find correspondent and concerned entities). If the address book
is large, this can be quite memory intensive and also makes text
analysis slower. But it greatly improves accuracy. If this is
false, NER tagging uses only statistical models (that also work
quite well).
This setting might be moved to the collective settings in the
future.
'';
};
file-cache-time = mkOption {
type = types.str;
default = defaults.text-analysis.regex-ner.file-cache-time;
description = ''
The NER annotation uses a file of patterns that is derived from
a collective's address book. This setting defines how long this
file is kept before it is checked for a state change.
'';
};
};
});
default = defaults.text-analysis.regex-ner;
description = "";
};
};
});
default = defaults.text-analysis;

View File

@ -31,7 +31,7 @@ object Dependencies {
val PostgresVersion = "42.2.16"
val PureConfigVersion = "0.13.0"
val Slf4jVersion = "1.7.30"
val StanfordNlpVersion = "3.9.2"
val StanfordNlpVersion = "4.0.0"
val TikaVersion = "1.24.1"
val YamuscaVersion = "0.6.2"
val SwaggerUIVersion = "3.32.3"
@ -135,11 +135,16 @@ object Dependencies {
)
val stanfordNlpModels = Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
"models-english"
)
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-french"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier(
"models-english"
)
)
val tika = Seq(

View File

@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
}
private val nerModels = List(
"german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
"english.all.3class.distsim.crf.ser.gz"
"german.distsim.crf.ser.gz",
"english.conll.4class.distsim.crf.ser.gz",
"french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv",
"french-mwt.tagger",
"french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger",
"french-ud.tagger.props",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props"
)
}