Mirror of https://github.com/TheAnachronism/docspell.git
Commit 31544240fb
@@ -1,3 +0,0 @@
-updates.ignore = [
-  { groupId = "edu.stanford.nlp", artifactId = "stanford-corenlp" }
-]

@@ -10,6 +10,7 @@ cache:
   - $HOME/.ivy2/cache
   - $HOME/.sbt/boot
   - $HOME/.coursier/cache
+  - $HOME/.cache/coursier
   - sysconfcpus

 install:

@@ -1,6 +1,6 @@
 <img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>

 [](https://travis-ci.org/eikek/docspell)
 [](https://scala-steward.org)
 [](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
 [](https://hub.docker.com/r/eikek0/docspell)

@@ -5,12 +5,19 @@ import cats.implicits._

 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._

 trait TextAnalyser[F[_]] {

-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]

 }
 object TextAnalyser {

@@ -22,43 +29,47 @@ object TextAnalyser {
   }

   def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
-      def annotate(
-          logger: Logger[F],
-          lang: Language,
-          text: String
-      ): F[TextAnalyser.Result] =
-        for {
-          input <- textLimit(logger, text)
-          tags0 <- stanfordNer(lang, input)
-          tags1 <- contactNer(input)
-          dates <- dateNer(lang, input)
-          list  = tags0 ++ tags1
-          spans = NerLabelSpan.build(list)
-        } yield Result(spans ++ list, dates)
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
+          def annotate(
+              logger: Logger[F],
+              settings: StanfordSettings,
+              cacheKey: Ident,
+              text: String
+          ): F[TextAnalyser.Result] =
+            for {
+              input <- textLimit(logger, text)
+              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags1 <- contactNer(input)
+              dates <- dateNer(settings.lang, input)
+              list  = tags0 ++ tags1
+              spans = NerLabelSpan.build(list)
+            } yield Result(spans ++ list, dates)

           private def textLimit(logger: Logger[F], text: String): F[String] =
             if (text.length <= cfg.maxLength) text.pure[F]
             else
               logger.info(
                 s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
                   s" Analysing only first ${cfg.maxLength} characters."
               ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          StanfordNerClassifier.nerAnnotate(lang)(text)
-        }
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)

           private def contactNer(text: String): F[Vector[NerLabel]] =
             Sync[F].delay {
               Contact.annotate(text)
             }

           private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
             Sync[F].delay {
               DateFind.findDates(text, lang).toVector
             }
-    })
+        }
+      )

 }

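A note on the new signature (not part of the diff itself): callers now pass a StanfordSettings value and a cache key instead of a bare Language, so the expensive CoreNLP pipeline can be reused across calls. A minimal sketch, assuming the collective id serves as the cache key (the concrete values are illustrative):

import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.common._

// Sketch only: an existing analyser, logger, collective id and extracted text are assumed.
def analyse[F[_]](
    analyser: TextAnalyser[F],
    logger: Logger[F],
    collective: Ident,
    text: String
): F[TextAnalyser.Result] = {
  // highRecall/regexNer values are chosen for illustration; regexNer would
  // normally point at the per-collective mapping file (see RegexNerFile below).
  val settings = StanfordSettings(Language.German, highRecall = false, regexNer = None)
  analyser.annotate(logger, settings, collective, text)
}
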
@@ -54,6 +54,7 @@ object DateFind {
     val p = lang match {
       case Language.English => p2.or(p0).or(p1)
       case Language.German  => p1.or(p0).or(p2)
+      case Language.French  => p1.or(p0).or(p2)
     }
     p.read(parts).toOption
   }

@@ -0,0 +1,25 @@
+package docspell.analysis.nlp
+
+import docspell.common.{NerLabel, NerTag}
+
+import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}
+
+object LabelConverter {
+
+  private def tagFromLabel[A <: CoreAnnotation[String]](
+      label: CoreLabel,
+      annot: Class[A]
+  ): Option[NerTag] = {
+    val tag = label.get(annot)
+    Option(tag).flatMap(s => NerTag.fromString(s).toOption)
+  }
+
+  def findTag(label: CoreLabel): Option[NerTag] =
+    tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
+      .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))
+
+  def toNerLabel(label: CoreLabel): Option[NerLabel] =
+    findTag(label).map(t =>
+      NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
+    )
+}

@@ -0,0 +1,90 @@
+package docspell.analysis.nlp
+
+import cats.Applicative
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+
+import docspell.common._
+
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.log4s.getLogger
+
+/** Creating the StanfordCoreNLP pipeline is quite expensive as it
+  * involves IO and initializing large objects.
+  *
+  * Therefore, the instances are cached, because they are thread-safe.
+  *
+  * **This is an internal API**
+  */
+trait PipelineCache[F[_]] {
+
+  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+
+}
+
+object PipelineCache {
+  private[this] val logger = getLogger
+
+  def none[F[_]: Applicative]: PipelineCache[F] =
+    new PipelineCache[F] {
+      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+        makeClassifier(settings).pure[F]
+    }
+
+  def apply[F[_]: Sync](): F[PipelineCache[F]] =
+    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
+
+  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
+      extends PipelineCache[F] {
+
+    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      for {
+        id  <- makeSettingsId(settings)
+        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
+      } yield nlp
+
+    private def getOrCreate(
+        key: String,
+        id: String,
+        cache: Map[String, Entry],
+        settings: StanfordSettings
+    ): (Map[String, Entry], StanfordCoreNLP) =
+      cache.get(key) match {
+        case Some(entry) =>
+          if (entry.id == id) (cache, entry.value)
+          else {
+            logger.info(
+              s"StanfordNLP settings changed for key $key. Creating new classifier"
+            )
+            val nlp = makeClassifier(settings)
+            val e   = Entry(id, nlp)
+            (cache.updated(key, e), nlp)
+          }
+
+        case None =>
+          val nlp = makeClassifier(settings)
+          val e   = Entry(id, nlp)
+          (cache.updated(key, e), nlp)
+      }
+
+    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+      val base = settings.copy(regexNer = None).toString
+      val size: F[Long] =
+        settings.regexNer match {
+          case Some(p) =>
+            File.size(p)
+          case None =>
+            0L.pure[F]
+        }
+      size.map(len => s"$base-$len")
+    }
+
+  }
+  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
+    new StanfordCoreNLP(Properties.forSettings(settings))
+  }
+
+  private case class Entry(id: String, value: StanfordCoreNLP)
+}

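A short usage sketch (illustrative; the real call site is StanfordNerClassifier.nerAnnotate below): obtaining the same key with unchanged settings returns the cached pipeline, while changed settings (including a change in the size of the referenced regexNer file) rebuild and replace it.

import cats.effect.IO
import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

// Sketch only; key and settings values are made up for illustration.
val settings = StanfordSettings(Language.English, highRecall = false, regexNer = None)
val program = for {
  cache <- PipelineCache[IO]()
  nlp1  <- cache.obtain("collective-1", settings)                         // builds the pipeline
  nlp2  <- cache.obtain("collective-1", settings)                         // served from the cache
  nlp3  <- cache.obtain("collective-1", settings.copy(highRecall = true)) // settings changed: rebuilt
} yield (nlp1, nlp2, nlp3)
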
@@ -0,0 +1,111 @@
+package docspell.analysis.nlp
+
+import java.util.{Properties => JProps}
+
+import docspell.analysis.nlp.Properties.Implicits._
+import docspell.common._
+
+object Properties {
+
+  def apply(ps: (String, String)*): JProps = {
+    val p = new JProps()
+    for ((k, v) <- ps)
+      p.setProperty(k, v)
+    p
+  }
+
+  def forSettings(settings: StanfordSettings): JProps = {
+    val regexNerFile = settings.regexNer
+      .map(p => p.normalize().toAbsolutePath().toString())
+    settings.lang match {
+      case Language.German =>
+        Properties.nerGerman(regexNerFile, settings.highRecall)
+      case Language.English =>
+        Properties.nerEnglish(regexNerFile)
+      case Language.French =>
+        Properties.nerFrench(regexNerFile, settings.highRecall)
+    }
+  }
+
+  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
+    Properties(
+      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "tokenize.language" -> "de",
+      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
+      "ner.applyFineGrained" -> "false",
+      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
+      "ner.useSUTime" -> "false", //only english, unused in docspell
+      "ner.language" -> "de",
+      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
+
+  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
+    Properties(
+      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
+      "tokenize.language" -> "en",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
+      "ner.applyFineGrained" -> "false",
+      "ner.applyNumericClassifiers" -> "false",
+      "ner.useSUTime" -> "false",
+      "ner.language" -> "en",
+      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+    ).withRegexNer(regexNerMappingFile)
+
+  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
+    Properties(
+      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "tokenize.language" -> "fr",
+      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
+      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
+      "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
+      "ner.applyFineGrained" -> "false",
+      "ner.applyNumericClassifiers" -> "false",
+      "ner.useSUTime" -> "false",
+      "ner.language" -> "de",
+      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
+
+  object Implicits {
+    implicit final class JPropsOps(val p: JProps) extends AnyVal {
+
+      def set(name: String, value: Option[String]): JProps =
+        value match {
+          case Some(v) =>
+            p.setProperty(name, v)
+            p
+          case None =>
+            p
+        }
+
+      def change(name: String, f: String => String): JProps =
+        Option(p.getProperty(name)) match {
+          case Some(current) =>
+            p.setProperty(name, f(current))
+            p
+          case None =>
+            p
+        }
+
+      def withRegexNer(mappingFile: Option[String]): JProps =
+        set("regexner.mapping", mappingFile)
+          .change(
+            "annotators",
+            v => if (mappingFile.isDefined) v + ",regexner" else v
+          )
+
+      def withHighRecall(flag: Boolean): JProps = {
+        if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
+        else p.setProperty("ner.combinationMode", "NORMAL")
+        p
+      }
+    }
+  }
+}

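The Implicits.JPropsOps helpers mutate the underlying java.util.Properties in place and return it, so the calls chain. A small illustration (the mapping file path is hypothetical):

import docspell.analysis.nlp.Properties

// With a mapping file present, withRegexNer sets "regexner.mapping" and appends
// ",regexner" to the "annotators" property; with None both are left untouched.
val props = Properties.nerEnglish(Some("/tmp/collective-abc.txt"))
// expected: props.getProperty("annotators") == "tokenize,ssplit,pos,lemma,ner,regexner"
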
@@ -1,65 +1,39 @@
 package docspell.analysis.nlp

-import java.net.URL
-import java.util.zip.GZIPInputStream
-
 import scala.jdk.CollectionConverters._
-import scala.util.Using
+
+import cats.Applicative
+import cats.implicits._

 import docspell.common._

-import edu.stanford.nlp.ie.AbstractSequenceClassifier
-import edu.stanford.nlp.ie.crf.CRFClassifier
-import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
-import org.log4s.getLogger
+import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}

 object StanfordNerClassifier {
-  private[this] val logger = getLogger
-
-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
-
-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-    }
-    nerClassifier
-      .classify(text)
-      .asScala
-      .flatMap(a => a.asScala)
-      .collect(Function.unlift { label =>
-        val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
-        NerTag
-          .fromString(Option(tag).getOrElse(""))
-          .toOption
-          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
-      })
-      .toVector
-  }
-
-  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    val ner = classifierResource(lang)
-    Using(new GZIPInputStream(ner.openStream())) { in =>
-      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
-    }.fold(throw _, identity)
-  }
-
-  private def classifierResource(lang: Language): URL = {
-    def check(u: URL): URL =
-      if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
-      else u
-
-    check(lang match {
-      case Language.German =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
-        )
-      case Language.English =>
-        getClass.getResource(
-          "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
-        )
-    })
-  }
+
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, it involves loading large model
+    * files. The classifiers are thread-safe and so they are cached.
+    * The `cacheKey` defines the "slot" where classifiers are stored
+    * and retrieved. If for a given `cacheKey` the `settings` change,
+    * a new classifier must be created. It will then replace the
+    * previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))
+
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
+    val doc = new CoreDocument(text)
+    nerClassifier.annotate(doc)
+    doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
+  }
 }

@@ -0,0 +1,22 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import docspell.common._
+
+/** Settings for configuring the stanford NER pipeline.
+  *
+  * The language is mandatory, only the provided ones are supported.
+  * The `highRecall` only applies for non-English languages. For
+  * non-English languages the english classifier is run as second
+  * classifier and if `highRecall` is true, then it will be used to
+  * tag untagged tokens. This may lead to a lot of false positives,
+  * but since English is omnipresent in other languages, too it
+  * depends on the use case for whether this is useful or not.
+  *
+  * The `regexNer` allows to specify a text file as described here:
+  * https://nlp.stanford.edu/software/regexner.html. This will be used
+  * as a last step to tag untagged tokens using the provided list of
+  * regexps.
+  */
+case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])

@@ -3,31 +3,44 @@ package docspell.analysis.nlp
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import edu.stanford.nlp.pipeline.StanfordCoreNLP

 object TextAnalyserSuite extends SimpleTestSuite {
+  lazy val germanClassifier =
+    new StanfordCoreNLP(Properties.nerGerman(None, false))
+  lazy val englishClassifier =
+    new StanfordCoreNLP(Properties.nerEnglish(None))

   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
+      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
-      NerLabel("Treesville", NerTag.Person, 27, 37),
+      NerLabel("Elm", NerTag.Misc, 17, 20),
+      NerLabel("Ave.", NerTag.Misc, 21, 25),
+      NerLabel("Treesville", NerTag.Misc, 27, 37),
       NerLabel("Derek", NerTag.Person, 68, 73),
       NerLabel("Jeter", NerTag.Person, 74, 79),
-      NerLabel("Treesville", NerTag.Location, 95, 105),
+      NerLabel("Elm", NerTag.Misc, 85, 88),
+      NerLabel("Ave.", NerTag.Misc, 89, 93),
+      NerLabel("Treesville", NerTag.Person, 95, 105),
+      NerLabel("Leaf", NerTag.Organization, 144, 148),
+      NerLabel("Chief", NerTag.Organization, 150, 155),
+      NerLabel("of", NerTag.Organization, 156, 158),
       NerLabel("Syrup", NerTag.Organization, 159, 164),
       NerLabel("Production", NerTag.Organization, 165, 175),
       NerLabel("Old", NerTag.Organization, 176, 179),
       NerLabel("Sticky", NerTag.Organization, 180, 186),
       NerLabel("Pancake", NerTag.Organization, 187, 194),
       NerLabel("Company", NerTag.Organization, 195, 202),
-      NerLabel("Maple", NerTag.Location, 207, 212),
-      NerLabel("Lane", NerTag.Location, 213, 217),
-      NerLabel("Forest", NerTag.Location, 219, 225),
+      NerLabel("Maple", NerTag.Organization, 207, 212),
+      NerLabel("Lane", NerTag.Organization, 213, 217),
+      NerLabel("Forest", NerTag.Organization, 219, 225),
       NerLabel("Hemptown", NerTag.Location, 239, 247),
-      NerLabel("Little", NerTag.Organization, 347, 353),
-      NerLabel("League", NerTag.Organization, 354, 360),
+      NerLabel("Leaf", NerTag.Person, 276, 280),
+      NerLabel("Little", NerTag.Misc, 347, 353),
+      NerLabel("League", NerTag.Misc, 354, 360),
       NerLabel("Derek", NerTag.Person, 1117, 1122),
       NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )

@@ -36,11 +49,11 @@ object TextAnalyserSuite extends SimpleTestSuite {

   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
+      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
-      NerLabel("Lilienweg", NerTag.Location, 16, 25),
+      NerLabel("Lilienweg", NerTag.Person, 16, 25),
       NerLabel("Max", NerTag.Person, 77, 80),
       NerLabel("Mustermann", NerTag.Person, 81, 91),
       NerLabel("Lilienweg", NerTag.Location, 93, 102),

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {

   def hours: Long = minutes / 60

+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)

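The new comparison operators together with Duration.between (next hunk) are what RegexNerFile further down uses to decide whether a cached NER file is stale. A tiny illustration, assuming Timestamp wraps a java.time.Instant (as its .value accessor suggests):

import java.time.Instant
import docspell.common._

val created = Timestamp(Instant.parse("2020-08-01T10:00:00Z"))
val now     = Timestamp(Instant.parse("2020-08-01T10:02:00Z"))
// two minutes elapsed, compared against a one-minute limit (expressed in nanos)
val stale = Duration.between(created, now) > Duration.nanos(60L * 1000000000L) // true
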
@@ -62,6 +68,9 @@ object Duration {
   def nanos(n: Long): Duration =
     Duration(n)

+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]

@@ -1,6 +1,7 @@
 package docspell.common

 import java.io.IOException
+import java.nio.charset.StandardCharsets
 import java.nio.file._
 import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.atomic.AtomicInteger

@@ -11,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
+
+import io.circe.Decoder
+
 object File {

   def mkDir[F[_]: Sync](dir: Path): F[Path] =

@@ -55,6 +60,9 @@ object File {
   def exists[F[_]: Sync](file: Path): F[Boolean] =
     Sync[F].delay(Files.exists(file))

+  def size[F[_]: Sync](file: Path): F[Long] =
+    Sync[F].delay(Files.size(file))
+
   def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
     Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)

@@ -84,4 +92,13 @@ object File {

   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
     readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
+
+  def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
+    Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
+
 }

@@ -27,7 +27,12 @@ object Language {
     val iso3 = "eng"
   }

-  val all: List[Language] = List(German, English)
+  case object French extends Language {
+    val iso2 = "fr"
+    val iso3 = "fra"
+  }
+
+  val all: List[Language] = List(German, English, French)

   def fromString(str: String): Either[String, Language] = {
     val lang = str.toLowerCase

@@ -23,6 +23,7 @@ object Field {
   val content = Field("content")
   val content_de = Field("content_de")
   val content_en = Field("content_en")
+  val content_fr = Field("content_fr")
   val itemName = Field("itemName")
   val itemNotes = Field("itemNotes")
   val folderId = Field("folder")

@@ -33,6 +34,8 @@ object Field {
       Field.content_de
     case Language.English =>
       Field.content_en
+    case Language.French =>
+      Field.content_fr
   }

   implicit val jsonEncoder: Encoder[Field] =

@@ -39,6 +39,7 @@ object SolrQuery {
       Field.content,
       Field.content_de,
      Field.content_en,
+      Field.content_fr,
      Field.itemName,
      Field.itemNotes,
      Field.attachmentName

@@ -80,6 +80,8 @@ object SolrSetup {
           addTextField(l.some)(Field.content_de)
         case l @ Language.English =>
           addTextField(l.some)(Field.content_en)
+        case l @ Language.French =>
+          addTextField(l.some)(Field.content_fr)
       }

     cmds0 *> cmds1 *> cntLang *> ().pure[F]

@@ -105,6 +107,9 @@ object SolrSetup {
         case Some(Language.English) =>
           run(DeleteField.command(DeleteField(field))).attempt *>
             run(AddField.command(AddField.textEN(field)))
+        case Some(Language.French) =>
+          run(DeleteField.command(DeleteField(field))).attempt *>
+            run(AddField.command(AddField.textFR(field)))
       }
     }
   }

@@ -138,6 +143,9 @@ object SolrSetup {

     def textEN(field: Field): AddField =
       AddField(field, "text_en", true, true, false)
+
+    def textFR(field: Field): AddField =
+      AddField(field, "text_fr", true, true, false)
   }

   case class DeleteField(name: Field)

|
|||||||
# should suffice. Default is 10000, which are about 2-3 pages
|
# should suffice. Default is 10000, which are about 2-3 pages
|
||||||
# (just a rough guess, of course).
|
# (just a rough guess, of course).
|
||||||
max-length = 10000
|
max-length = 10000
|
||||||
|
|
||||||
|
# A working directory for the analyser to store temporary/working
|
||||||
|
# files.
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-analysis"
|
||||||
|
|
||||||
|
regex-ner {
|
||||||
|
# Whether to enable custom NER annotation. This uses the address
|
||||||
|
# book of a collective as input for NER tagging (to automatically
|
||||||
|
# find correspondent and concerned entities). If the address book
|
||||||
|
# is large, this can be quite memory intensive and also makes text
|
||||||
|
# analysis slower. But it greatly improves accuracy. If this is
|
||||||
|
# false, NER tagging uses only statistical models (that also work
|
||||||
|
# quite well).
|
||||||
|
#
|
||||||
|
# This setting might be moved to the collective settings in the
|
||||||
|
# future.
|
||||||
|
enabled = true
|
||||||
|
|
||||||
|
# The NER annotation uses a file of patterns that is derived from
|
||||||
|
# a collective's address book. This is is the time how long this
|
||||||
|
# file will be kept until a check for a state change is done.
|
||||||
|
file-cache-time = "1 minute"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Configuration for converting files into PDFs.
|
# Configuration for converting files into PDFs.
|
||||||
|
@@ -1,11 +1,14 @@
 package docspell.joex

+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig

@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,

@@ -50,4 +53,19 @@ object Config {
   }

   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }

@@ -6,10 +6,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.concurrent.SignallingRef

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._

@@ -80,14 +82,16 @@ object JoexAppImpl {
     for {
       httpClient <- BlazeClientBuilder[F](clientEC).resource
       client = JoexClient(httpClient)
       store <- Store.create(cfg.jdbc, connectEC, blocker)
       queue <- JobQueue(store)
       pstore <- PeriodicTaskStore.create(store)
       nodeOps <- ONode(store)
       joex <- OJoex(client, store)
       upload <- OUpload(store, queue, cfg.files, joex)
       fts <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)

@@ -95,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )

@@ -0,0 +1,99 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.implicits._
+
+import docspell.analysis.split.TextSplitter
+import docspell.common._
+import docspell.store.queries.QCollective
+
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
+  def nerFilePath(directory: Path): Path =
+    NerFile.nerFilePath(directory, collective)
+
+  def jsonFilePath(directory: Path) =
+    NerFile.jsonFilePath(directory, collective)
+}
+
+object NerFile {
+  implicit val jsonDecoder: Decoder[NerFile] =
+    deriveDecoder[NerFile]
+
+  implicit val jsonEncoder: Encoder[NerFile] =
+    deriveEncoder[NerFile]
+
+  private def nerFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.txt")
+
+  private def jsonFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.json")
+
+  def find[F[_]: Sync: ContextShift](
+      collective: Ident,
+      directory: Path,
+      blocker: Blocker
+  ): F[Option[NerFile]] = {
+    val file = jsonFilePath(directory, collective)
+    File.existsNonEmpty[F](file).flatMap {
+      case true =>
+        File
+          .readJson[F, NerFile](file, blocker)
+          .map(_.some)
+      case false =>
+        (None: Option[NerFile]).pure[F]
+    }
+  }
+
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+  case class Pattern(value: String, weight: Int) {
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter
+          .split(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .map(w => s"(?i)${w}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${w}")
+
+      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
+}

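The generated file is a plain regexner mapping: one tab-separated row per pattern with the tag to assign, the tags it may override, and a priority weight (see Pattern.toRow above). Roughly what a single organization name might expand to (illustrative, not output produced by this commit):

import docspell.joex.analysis.NerFile

val rows = NerFile.Pattern(3)("Acme Corp").map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
// roughly: Vector(
//   "(?i)acme (?i)corp\tORGANIZATION\tLOCATION,PERSON,MISC\t3",
//   "(?i)acme\tORGANIZATION\tLOCATION,PERSON,MISC\t3",
//   "(?i)corp\tORGANIZATION\tLOCATION,PERSON,MISC\t3"
// )
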
@@ -0,0 +1,164 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Semaphore
+import cats.implicits._
+
+import docspell.common._
+import docspell.common.syntax.all._
+import docspell.store.Store
+import docspell.store.queries.QCollective
+import docspell.store.records.REquipment
+import docspell.store.records.ROrganization
+import docspell.store.records.RPerson
+
+import io.circe.syntax._
+import org.log4s.getLogger
+
+/** Maintains a custom regex-ner file per collective for stanford's
+  * regexner annotator.
+  */
+trait RegexNerFile[F[_]] {
+
+  def makeFile(collective: Ident): F[Option[Path]]
+
+}
+
+object RegexNerFile {
+  private[this] val logger = getLogger
+
+  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+
+  def apply[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F]
+  ): Resource[F, RegexNerFile[F]] =
+    for {
+      dir <- File.withTempDir[F](cfg.directory, "regexner-")
+      writer <- Resource.liftF(Semaphore(1))
+    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
+
+  final private class Impl[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F],
+      writer: Semaphore[F] //TODO allow parallelism per collective
+  ) extends RegexNerFile[F] {
+
+    def makeFile(collective: Ident): F[Option[Path]] =
+      if (cfg.enabled) doMakeFile(collective)
+      else (None: Option[Path]).pure[F]
+
+    def doMakeFile(collective: Ident): F[Option[Path]] =
+      for {
+        now <- Timestamp.current[F]
+        existing <- NerFile.find[F](collective, cfg.directory, blocker)
+        result <- existing match {
+          case Some(nf) =>
+            val dur = Duration.between(nf.creation, now)
+            if (dur > cfg.minTime)
+              logger.fdebug(
+                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
+              ) *> updateFile(
+                collective,
+                now,
+                Some(nf)
+              )
+            else nf.nerFilePath(cfg.directory).some.pure[F]
+          case None =>
+            updateFile(collective, now, None)
+        }
+      } yield result
+
+    private def updateFile(
+        collective: Ident,
+        now: Timestamp,
+        current: Option[NerFile]
+    ): F[Option[Path]] =
+      for {
+        lastUpdate <- store.transact(Sql.latestUpdate(collective))
+        result <- lastUpdate match {
+          case None =>
+            (None: Option[Path]).pure[F]
+          case Some(lup) =>
+            current match {
+              case Some(cur) =>
+                val nerf =
+                  if (cur.updated == lup)
+                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
+                      cur,
+                      now
+                    ) *> cur.pure[F]
+                  else
+                    logger.fdebug(
+                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
+                    ) *> createFile(lup, collective, now)
+                nerf.map(_.nerFilePath(cfg.directory).some)
+              case None =>
+                createFile(lup, collective, now)
+                  .map(_.nerFilePath(cfg.directory).some)
+            }
+        }
+      } yield result
+
+    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
+      writer.withPermit(for {
+        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+        _ <- File.mkDir(file.getParent)
+        _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
+      } yield ())
+
+    private def createFile(
+        lastUpdate: Timestamp,
+        collective: Ident,
+        now: Timestamp
+    ): F[NerFile] = {
+      def update(nf: NerFile, text: String): F[Unit] =
+        writer.withPermit(for {
+          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
+          _ <- File.mkDir(jsonFile.getParent)
+          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
+          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
+        } yield ())
+
+      for {
+        _ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
+        names <- store.transact(QCollective.allNames(collective))
+        nerFile = NerFile(collective, lastUpdate, now)
+        _ <- update(nerFile, NerFile.mkNerConfig(names))
+      } yield nerFile
+    }
+  }
+
+  object Sql {
+    import doobie._
+    import doobie.implicits._
+    import docspell.store.impl.Implicits._
+    import docspell.store.impl.Column
+
+    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
+      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
+        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
+
+      val sql =
+        List(
+          max(
+            ROrganization.Columns.updated,
+            ROrganization.table,
+            ROrganization.Columns.cid
+          ),
+          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
+          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
+        )
+          .reduce(_ ++ fr"UNION ALL" ++ _)
+
+      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
+        .query[Timestamp]
+        .option
+    }
+  }
+}

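How the generated file feeds back into analysis is only implied here; a hypothetical helper for illustration (the actual wiring is done via TextAnalysis.annotateAttachment further down): the per-collective path from makeFile becomes the regexNer of the StanfordSettings handed to the analyser.

import cats.effect.Sync
import cats.implicits._
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
import docspell.joex.analysis.RegexNerFile

// Hypothetical helper, not part of this commit.
def settingsFor[F[_]: Sync](
    nerFile: RegexNerFile[F],
    collective: Ident,
    lang: Language
): F[StanfordSettings] =
  nerFile
    .makeFile(collective)
    .map(path => StanfordSettings(lang, highRecall = false, regexNer = path))
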
@@ -5,10 +5,12 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem

@@ -29,11 +31,13 @@ object ItemHandler {
   def newItem[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())

   def itemStateTask[F[_]: Sync, A](

@@ -51,11 +55,13 @@ object ItemHandler {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>

@@ -65,7 +71,8 @@ object ItemHandler {
             .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
+          .flatMap(itemStateTask(ItemState.Created))
     }

   private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =

@@ -2,10 +2,12 @@ package docspell.joex.process

 import cats.effect._

+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task

 object ProcessItem {

@@ -13,25 +15,31 @@ object ProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
       .flatMap(SetGivenData[F](itemOps))
       .flatMap(Task.setProgress(99))

   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

   def analysisOnly[F[_]: Sync](
-      cfg: Config
+      cfg: Config,
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg.textAnalysis)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])

@@ -39,12 +47,14 @@ object ProcessItem {
   private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }

@@ -4,9 +4,11 @@ import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

+import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment

@@ -19,10 +21,12 @@ object ReProcessItem {

   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())

   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =

|
|||||||
def processFiles[F[_]: ConcurrentEffect: ContextShift](
|
def processFiles[F[_]: ConcurrentEffect: ContextShift](
|
||||||
cfg: Config,
|
cfg: Config,
|
||||||
fts: FtsClient[F],
|
fts: FtsClient[F],
|
||||||
|
analyser: TextAnalyser[F],
|
||||||
|
regexNer: RegexNerFile[F],
|
||||||
data: ItemData
|
data: ItemData
|
||||||
): Task[F, Args, ItemData] = {
|
): Task[F, Args, ItemData] = {
|
||||||
|
|
||||||
@@ -91,7 +97,7 @@ object ReProcessItem {

     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }

@ -109,11 +115,13 @@ object ReProcessItem {
|
|||||||
|
|
||||||
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
|
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
|
||||||
cfg: Config,
|
cfg: Config,
|
||||||
fts: FtsClient[F]
|
fts: FtsClient[F],
|
||||||
|
analyser: TextAnalyser[F],
|
||||||
|
regexNer: RegexNerFile[F]
|
||||||
)(data: ItemData): Task[F, Args, ItemData] =
|
)(data: ItemData): Task[F, Args, ItemData] =
|
||||||
isLastRetry[F].flatMap {
|
isLastRetry[F].flatMap {
|
||||||
case true =>
|
case true =>
|
||||||
processFiles[F](cfg, fts, data).attempt
|
processFiles[F](cfg, fts, analyser, regexNer, data).attempt
|
||||||
.flatMap({
|
.flatMap({
|
||||||
case Right(d) =>
|
case Right(d) =>
|
||||||
Task.pure(d)
|
Task.pure(d)
|
||||||
@ -123,7 +131,7 @@ object ReProcessItem {
|
|||||||
).andThen(_ => Sync[F].raiseError(ex))
|
).andThen(_ => Sync[F].raiseError(ex))
|
||||||
})
|
})
|
||||||
case false =>
|
case false =>
|
||||||
processFiles[F](cfg, fts, data)
|
processFiles[F](cfg, fts, analyser, regexNer, data)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
|
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
|
||||||
|
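Both the initial processing chain and the re-process task now receive the shared TextAnalyser and the per-collective RegexNerFile as parameters instead of building an analyser inside the task. A minimal sketch of the resulting entry point, assuming cfg, fts, analyser and regexNer are constructed once by the joex wiring (that wiring is not part of this excerpt):

import cats.effect.{ConcurrentEffect, ContextShift}

import docspell.analysis.TextAnalyser
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.process.{ItemData, ProcessItem}
import docspell.joex.scheduler.Task

// Sketch only: the four services are assumed to be created at application
// start-up and shared by every run of the task; nothing inside the task
// allocates an NLP pipeline of its own anymore.
def processItemAttachments[F[_]: ConcurrentEffect: ContextShift](
    cfg: Config,
    fts: FtsClient[F],
    analyser: TextAnalyser[F],
    regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
  ProcessItem.processAttachments[F](cfg, fts, analyser, regexNer)(item)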
@ -1,47 +1,57 @@
 package docspell.joex.process

-import cats.effect.Sync
+import cats.effect._
 import cats.implicits._

-import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
+import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta

 object TextAnalysis {

   def apply[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      TextAnalyser.create[F](cfg).use { analyser =>
-        for {
-          _ <- ctx.logger.info("Starting text analysis")
-          s <- Duration.stopTime[F]
-          t <-
-            item.metas.toList
-              .traverse(
-                annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
-              )
-          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-          _ <- t.traverse(m =>
-            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-          )
-          e <- s
-          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-          v = t.toVector
-        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
-      }
+      for {
+        _ <- ctx.logger.info("Starting text analysis")
+        s <- Duration.stopTime[F]
+        t <-
+          item.metas.toList
+            .traverse(
+              annotateAttachment[F](ctx, analyser, nerFile)
+            )
+        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
+        _ <- t.traverse(m =>
+          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+        )
+        e <- s
+        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+        v = t.toVector
+      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
     }

   def annotateAttachment[F[_]: Sync](
-      lang: Language,
-      logger: Logger[F],
-      analyser: TextAnalyser[F]
-  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
+      ctx: Context[F, ProcessItemArgs],
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
+  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
+    val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
+      labels <- analyser.annotate(
+        ctx.logger,
+        sett,
+        ctx.args.meta.collective,
+        rm.content.getOrElse("")
+      )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
+  }
 }
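TextAnalysis now builds a StanfordSettings value per attachment and passes the collective id as the cache key for the shared NLP pipeline (see the new annotate signature on the TextAnalyser trait). A minimal sketch of that call outside of a task context, assuming only the types touched by this commit:

import cats.effect.Sync
import cats.implicits._

import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.common._
import docspell.joex.analysis.RegexNerFile

// Sketch: derive the per-collective regex NER file, merge it into the
// Stanford settings and annotate the text, using the collective id as the
// pipeline cache key.
def annotateText[F[_]: Sync](
    analyser: TextAnalyser[F],
    nerFile: RegexNerFile[F],
    logger: Logger[F],
    collective: Ident,
    language: Language,
    text: String
): F[TextAnalyser.Result] =
  for {
    customNer <- nerFile.makeFile(collective)
    settings = StanfordSettings(language, false, None).copy(regexNer = customNer)
    labels <- analyser.annotate(logger, settings, collective, text)
  } yield labels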
@ -341,6 +341,7 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
+        now,
         now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
@ -353,6 +354,7 @@ trait Conversions {
     def contacts(oid: Ident) =
       v.contacts.traverse(c => newContact(c, oid.some, None))
     for {
+      now <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = ROrganization(
         v.id,
@ -363,7 +365,8 @@ trait Conversions {
         v.address.city,
         v.address.country,
         v.notes,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.OrgAndContacts(org, cont)
   }
@ -398,6 +401,7 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
+        now,
         now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
@ -410,6 +414,7 @@ trait Conversions {
     def contacts(pid: Ident) =
       v.contacts.traverse(c => newContact(c, None, pid.some))
     for {
+      now <- Timestamp.current[F]
       cont <- contacts(v.id)
       org = RPerson(
         v.id,
@ -421,7 +426,8 @@ trait Conversions {
         v.address.country,
         v.notes,
         v.concerning,
-        v.created
+        v.created,
+        now
       )
     } yield OOrganization.PersonAndContacts(org, cont)
   }
@ -536,11 +542,11 @@ trait Conversions {
   def newEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
     timeId.map({
       case (id, now) =>
-        REquipment(id, cid, e.name, now)
+        REquipment(id, cid, e.name, now, now)
     })

-  def changeEquipment(e: Equipment, cid: Ident): REquipment =
-    REquipment(e.id, cid, e.name, e.created)
+  def changeEquipment[F[_]: Sync](e: Equipment, cid: Ident): F[REquipment] =
+    Timestamp.current[F].map(now => REquipment(e.id, cid, e.name, e.created, now))

   // idref

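The pattern in these hunks is consistent: on creation, created and updated are set to the same instant, while on a change the original created value is kept and only updated receives a fresh Timestamp.current[F]; this is also why the previously pure changeEquipment now returns F[REquipment]. A compressed sketch of the two cases, with the timeId helper of the Conversions trait passed in explicitly as an assumption:

import cats.effect.Sync
import cats.implicits._

import docspell.common._
import docspell.store.records.REquipment

// Sketch: creation vs. change of an equipment record.
// `timeId: F[(Ident, Timestamp)]` is assumed to come from the Conversions trait.
def createEquipment[F[_]: Sync](
    timeId: F[(Ident, Timestamp)],
    cid: Ident,
    name: String
): F[REquipment] =
  timeId.map { case (id, now) => REquipment(id, cid, name, now, now) }

def editEquipment[F[_]: Sync](existing: REquipment, newName: String): F[REquipment] =
  Timestamp.current[F].map(now => existing.copy(name = newName, updated = now))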
@ -39,10 +39,10 @@ object EquipmentRoutes {

       case req @ PUT -> Root =>
         for {
           data <- req.as[Equipment]
-          equip = changeEquipment(data, user.account.collective)
+          equip <- changeEquipment(data, user.account.collective)
           res <- backend.equipment.update(equip)
           resp <- Ok(basicResult(res, "Equipment updated."))
         } yield resp

       case DELETE -> Root / Ident(id) =>
@ -0,0 +1,29 @@
+-- organization
+ALTER TABLE `organization`
+ADD COLUMN (`updated` timestamp);
+
+UPDATE `organization` SET `updated` = `created`;
+
+ALTER TABLE `organization`
+MODIFY `updated` timestamp NOT NULL;
+
+-- person
+ALTER TABLE `person`
+MODIFY `created` timestamp;
+
+ALTER TABLE `person`
+ADD COLUMN (`updated` timestamp);
+
+UPDATE `person` SET `updated` = `created`;
+
+ALTER TABLE `person`
+MODIFY `updated` timestamp NOT NULL;
+
+-- equipment
+ALTER TABLE `equipment`
+ADD COLUMN (`updated` timestamp);
+
+UPDATE `equipment` SET `updated` = `created`;
+
+ALTER TABLE `equipment`
+MODIFY `updated` timestamp NOT NULL;
@ -0,0 +1,29 @@
+-- organization
+ALTER TABLE "organization"
+ADD COLUMN "updated" timestamp;
+
+UPDATE "organization" SET "updated" = "created";
+
+ALTER TABLE "organization"
+ALTER COLUMN "updated" SET NOT NULL;
+
+-- person
+ALTER TABLE "person" ALTER COLUMN "created"
+TYPE timestamp USING(to_timestamp("created", 'YYYY-MM-DD HH24:MI:SS')::timestamp);
+
+ALTER TABLE "person"
+ADD COLUMN "updated" timestamp;
+
+UPDATE "person" SET "updated" = "created";
+
+ALTER TABLE "person"
+ALTER COLUMN "updated" SET NOT NULL;
+
+-- equipment
+ALTER TABLE "equipment"
+ADD COLUMN "updated" timestamp;
+
+UPDATE "equipment" SET "updated" = "created";
+
+ALTER TABLE "equipment"
+ALTER COLUMN "updated" SET NOT NULL;
@ -1,5 +1,6 @@
 package docspell.store.queries

+import cats.data.OptionT
 import fs2.Stream

 import docspell.common.ContactKind
@ -11,6 +12,20 @@ import doobie._
 import doobie.implicits._

 object QCollective {

+  case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
+  object Names {
+    val empty = Names(Vector.empty, Vector.empty, Vector.empty)
+  }
+
+  def allNames(collective: Ident): ConnectionIO[Names] =
+    (for {
+      orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
+      pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
+      equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
+    } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
+      .getOrElse(Names.empty)
+
   case class TagCount(tag: RTag, count: Int)

   case class InsightData(
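allNames collects the organization, person and equipment names of a collective in a single ConnectionIO value; given the regex-ner option added further down in the nix module, this is presumably what feeds the per-collective NER pattern file. A minimal usage sketch, assuming a doobie-backed Store[F] with a transact method as used elsewhere in this commit:

import cats.effect.Sync
import cats.implicits._

import docspell.common._
import docspell.store.Store
import docspell.store.queries.QCollective

// Sketch: fetch all address-book names of a collective and log the counts.
def logNameCounts[F[_]: Sync](store: Store[F], logger: Logger[F], collective: Ident): F[Unit] =
  for {
    names <- store.transact(QCollective.allNames(collective))
    _ <- logger.info(
      s"Names for ${collective.id}: ${names.org.size} orgs, " +
        s"${names.pers.size} persons, ${names.equip.size} equipment"
    )
  } yield ()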
@ -7,7 +7,13 @@ import docspell.store.impl._
 import doobie._
 import doobie.implicits._

-case class REquipment(eid: Ident, cid: Ident, name: String, created: Timestamp) {}
+case class REquipment(
+    eid: Ident,
+    cid: Ident,
+    name: String,
+    created: Timestamp,
+    updated: Timestamp
+) {}

 object REquipment {

@ -18,25 +24,32 @@ object REquipment {
     val cid = Column("cid")
     val name = Column("name")
     val created = Column("created")
-    val all = List(eid, cid, name, created)
+    val updated = Column("updated")
+    val all = List(eid, cid, name, created, updated)
   }
   import Columns._

   def insert(v: REquipment): ConnectionIO[Int] = {
-    val sql = insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created}")
+    val sql =
+      insertRow(table, all, fr"${v.eid},${v.cid},${v.name},${v.created},${v.updated}")
     sql.update.run
   }

   def update(v: REquipment): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(eid.is(v.eid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(eid.is(v.eid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, ename: String): ConnectionIO[Boolean] = {
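The update statement no longer trusts the updated value carried in the record: it reads Timestamp.current[ConnectionIO] and sets the column itself, and the same def sql(now: ...) shape is applied to ROrganization and RPerson in the hunks below. A small sketch of a caller, which consequently never has to touch updated:

import doobie._

import docspell.store.records.REquipment

// Sketch: rename an equipment row; REquipment.update stamps `updated`
// internally, so whatever the passed record holds for that field is ignored.
def rename(equip: REquipment, newName: String): ConnectionIO[Int] =
  REquipment.update(equip.copy(name = newName))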
@ -19,7 +19,8 @@ case class ROrganization(
     city: String,
     country: String,
     notes: Option[String],
-    created: Timestamp
+    created: Timestamp,
+    updated: Timestamp
 ) {}

 object ROrganization {
@ -38,7 +39,8 @@ object ROrganization {
     val country = Column("country")
     val notes = Column("notes")
     val created = Column("created")
-    val all = List(oid, cid, name, street, zip, city, country, notes, created)
+    val updated = Column("updated")
+    val all = List(oid, cid, name, street, zip, city, country, notes, created, updated)
   }

   import Columns._
@ -47,26 +49,31 @@ object ROrganization {
     val sql = insertRow(
       table,
       all,
-      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created}"
+      fr"${v.oid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: ROrganization): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(oid.is(v.oid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(oid.is(v.oid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, oname: String): ConnectionIO[Boolean] =
@ -20,7 +20,8 @@ case class RPerson(
     country: String,
     notes: Option[String],
     concerning: Boolean,
-    created: Timestamp
+    created: Timestamp,
+    updated: Timestamp
 ) {}

 object RPerson {
@ -40,7 +41,20 @@ object RPerson {
     val notes = Column("notes")
     val concerning = Column("concerning")
     val created = Column("created")
-    val all = List(pid, cid, name, street, zip, city, country, notes, concerning, created)
+    val updated = Column("updated")
+    val all = List(
+      pid,
+      cid,
+      name,
+      street,
+      zip,
+      city,
+      country,
+      notes,
+      concerning,
+      created,
+      updated
+    )
   }

   import Columns._
@ -49,27 +63,32 @@ object RPerson {
     val sql = insertRow(
       table,
       all,
-      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created}"
+      fr"${v.pid},${v.cid},${v.name},${v.street},${v.zip},${v.city},${v.country},${v.notes},${v.concerning},${v.created},${v.updated}"
     )
     sql.update.run
   }

   def update(v: RPerson): ConnectionIO[Int] = {
-    val sql = updateRow(
-      table,
-      and(pid.is(v.pid), cid.is(v.cid)),
-      commas(
-        cid.setTo(v.cid),
-        name.setTo(v.name),
-        street.setTo(v.street),
-        zip.setTo(v.zip),
-        city.setTo(v.city),
-        country.setTo(v.country),
-        concerning.setTo(v.concerning),
-        notes.setTo(v.notes)
-      )
-    )
-    sql.update.run
+    def sql(now: Timestamp) =
+      updateRow(
+        table,
+        and(pid.is(v.pid), cid.is(v.cid)),
+        commas(
+          cid.setTo(v.cid),
+          name.setTo(v.name),
+          street.setTo(v.street),
+          zip.setTo(v.zip),
+          city.setTo(v.city),
+          country.setTo(v.country),
+          concerning.setTo(v.concerning),
+          notes.setTo(v.notes),
+          updated.setTo(now)
+        )
+      )
+    for {
+      now <- Timestamp.current[ConnectionIO]
+      n <- sql(now).update.run
+    } yield n
   }

   def existsByName(coll: Ident, pname: String): ConnectionIO[Boolean] =
@ -10,6 +10,7 @@ module Data.Language exposing
 type Language
     = German
     | English
+    | French


 fromString : String -> Maybe Language
@ -20,6 +21,9 @@ fromString str =
     else if str == "eng" || str == "en" || str == "english" then
         Just English

+    else if str == "fra" || str == "fr" || str == "french" then
+        Just French
+
     else
         Nothing

@ -33,6 +37,9 @@ toIso3 lang =
         English ->
             "eng"

+        French ->
+            "fra"
+

 toName : Language -> String
 toName lang =
@ -43,7 +50,10 @@ toName lang =
         English ->
             "English"

+        French ->
+            "French"
+

 all : List Language
 all =
-    [ German, English ]
+    [ German, English, French ]
@ -91,6 +91,11 @@ let
       };
       text-analysis = {
         max-length = 10000;
+        regex-ner = {
+          enabled = true;
+          file-cache-time = "1 minute";
+        };
+        working-dir = "/tmp/docspell-analysis";
       };
       processing = {
         max-due-date-years = 10;
@ -689,7 +694,48 @@ in {
             (a rough guess).
           '';
         };
+        working-dir = mkOption {
+          type = types.str;
+          default = defaults.text-analysis.working-dir;
+          description = ''
+            A working directory for the analyser to store temporary/working
+            files.
+          '';
+        };
+
+        regex-ner = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.regex-ner.enabled;
+                description = ''
+                  Whether to enable custom NER annotation. This uses the address
+                  book of a collective as input for NER tagging (to automatically
+                  find correspondent and concerned entities). If the address book
+                  is large, this can be quite memory intensive and also makes text
+                  analysis slower. But it greatly improves accuracy. If this is
+                  false, NER tagging uses only statistical models (that also work
+                  quite well).

+                  This setting might be moved to the collective settings in the
+                  future.
+                '';
+              };
+              file-cache-time = mkOption {
+                type = types.str;
+                default = defaults.text-analysis.ner-file-cache-time;
+                description = ''
+                  The NER annotation uses a file of patterns that is derived from
+                  a collective's address book. This is the time for how long this
+                  file will be kept until a check for a state change is done.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.regex-ner;
+          description = "";
+        };
       };
     });
     default = defaults.text-analysis;
@ -31,7 +31,7 @@ object Dependencies {
   val PostgresVersion = "42.2.16"
   val PureConfigVersion = "0.13.0"
   val Slf4jVersion = "1.7.30"
-  val StanfordNlpVersion = "3.9.2"
+  val StanfordNlpVersion = "4.0.0"
   val TikaVersion = "1.24.1"
   val YamuscaVersion = "0.6.2"
   val SwaggerUIVersion = "3.32.3"
@ -135,11 +135,16 @@ object Dependencies {
   )

   val stanfordNlpModels = Seq(
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models"),
     ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
       .classifier("models-german"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
-      "models-english"
-    )
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier("models-french"),
+    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
+      .classifier(
+        "models-english"
+      )
   )

   val tika = Seq(
@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
   }

   private val nerModels = List(
-    "german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
-    "english.all.3class.distsim.crf.ser.gz"
+    "german.distsim.crf.ser.gz",
+    "english.conll.4class.distsim.crf.ser.gz",
+    "french-wikiner-4class.crf.ser.gz",
+    "french-mwt-statistical.tsv",
+    "french-mwt.tagger",
+    "french-mwt.tsv",
+    "german-mwt.tsv",
+    "german-ud.tagger",
+    "german-ud.tagger.props",
+    "french-ud.tagger",
+    "french-ud.tagger.props",
+    "english-left3words-distsim.tagger",
+    "english-left3words-distsim.tagger.props"
   )
 }