mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-07-29 11:04:53 +00:00
Reorganize nlp pipeline and add nlp-unsupported language italian
Improves and reorganizes how nlp pipelines are set up. Users can now choose from many options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. Support for such languages is then limited to text extraction and simple regex-ner processing.
This commit is contained in:
@ -24,4 +24,4 @@ before_script:
|
|||||||
- export TZ=Europe/Berlin
|
- export TZ=Europe/Berlin
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- sbt ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test"
|
- sbt -J-XX:+UseG1GC ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test"
|
||||||
|
@ -15,6 +15,7 @@ RUN apk add --no-cache openjdk11-jre \
|
|||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-data-deu \
|
tesseract-ocr-data-deu \
|
||||||
tesseract-ocr-data-fra \
|
tesseract-ocr-data-fra \
|
||||||
|
tesseract-ocr-data-ita \
|
||||||
unpaper \
|
unpaper \
|
||||||
wkhtmltopdf \
|
wkhtmltopdf \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
package docspell.analysis
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
case class NlpSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
|
@ -10,13 +10,13 @@ import docspell.analysis.date.DateFind
|
|||||||
import docspell.analysis.nlp._
|
import docspell.analysis.nlp._
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
import org.log4s.getLogger
|
||||||
|
|
||||||
trait TextAnalyser[F[_]] {
|
trait TextAnalyser[F[_]] {
|
||||||
|
|
||||||
def annotate(
|
def annotate(
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
settings: StanfordNerSettings,
|
settings: NlpSettings,
|
||||||
cacheKey: Ident,
|
cacheKey: Ident,
|
||||||
text: String
|
text: String
|
||||||
): F[TextAnalyser.Result]
|
): F[TextAnalyser.Result]
|
||||||
@ -24,6 +24,7 @@ trait TextAnalyser[F[_]] {
|
|||||||
def classifier: TextClassifier[F]
|
def classifier: TextClassifier[F]
|
||||||
}
|
}
|
||||||
object TextAnalyser {
|
object TextAnalyser {
|
||||||
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
|
case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
|
||||||
|
|
||||||
@ -41,13 +42,13 @@ object TextAnalyser {
|
|||||||
new TextAnalyser[F] {
|
new TextAnalyser[F] {
|
||||||
def annotate(
|
def annotate(
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
settings: StanfordNerSettings,
|
settings: NlpSettings,
|
||||||
cacheKey: Ident,
|
cacheKey: Ident,
|
||||||
text: String
|
text: String
|
||||||
): F[TextAnalyser.Result] =
|
): F[TextAnalyser.Result] =
|
||||||
for {
|
for {
|
||||||
input <- textLimit(logger, text)
|
input <- textLimit(logger, text)
|
||||||
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
|
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input))
|
||||||
tags1 <- contactNer(input)
|
tags1 <- contactNer(input)
|
||||||
dates <- dateNer(settings.lang, input)
|
dates <- dateNer(settings.lang, input)
|
||||||
list = tags0 ++ tags1
|
list = tags0 ++ tags1
|
||||||
@ -77,31 +78,36 @@ object TextAnalyser {
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Provides the nlp pipeline based on the configuration. */
|
||||||
private object Nlp {
|
private object Nlp {
|
||||||
|
|
||||||
def apply[F[_]: Concurrent: Timer: BracketThrow](
|
def apply[F[_]: Concurrent: Timer: BracketThrow](
|
||||||
cfg: TextAnalysisConfig.NlpConfig
|
cfg: TextAnalysisConfig.NlpConfig
|
||||||
): F[Input => F[Vector[NerLabel]]] =
|
): F[Input[F] => F[Vector[NerLabel]]] =
|
||||||
cfg.mode match {
|
cfg.mode match {
|
||||||
case NlpMode.Full =>
|
|
||||||
PipelineCache.full(cfg.clearInterval).map(cache => full(cache))
|
|
||||||
case NlpMode.Basic =>
|
|
||||||
PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache))
|
|
||||||
case NlpMode.Disabled =>
|
case NlpMode.Disabled =>
|
||||||
|
Logger.log4s(logger).info("NLP is disabled as defined in config.") *>
|
||||||
Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
|
Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
|
||||||
|
case _ =>
|
||||||
|
PipelineCache(cfg.clearInterval)(
|
||||||
|
Annotator[F](cfg.mode),
|
||||||
|
Annotator.clearCaches[F]
|
||||||
|
)
|
||||||
|
.map(annotate[F])
|
||||||
}
|
}
|
||||||
|
|
||||||
final case class Input(key: Ident, settings: StanfordNerSettings, text: String)
|
final case class Input[F[_]](
|
||||||
|
key: Ident,
|
||||||
|
settings: NlpSettings,
|
||||||
|
logger: Logger[F],
|
||||||
|
text: String
|
||||||
|
)
|
||||||
|
|
||||||
def full[F[_]: BracketThrow](
|
def annotate[F[_]: BracketThrow](
|
||||||
cache: PipelineCache[F, StanfordCoreNLP]
|
cache: PipelineCache[F]
|
||||||
)(input: Input): F[Vector[NerLabel]] =
|
)(input: Input[F]): F[Vector[NerLabel]] =
|
||||||
StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
|
cache
|
||||||
|
.obtain(input.key.id, input.settings)
|
||||||
def basic[F[_]: BracketThrow](
|
.use(ann => ann.nerAnnotate(input.logger)(input.text))
|
||||||
cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
|
|
||||||
)(input: Input): F[Vector[NerLabel]] =
|
|
||||||
BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,23 +41,30 @@ object DateFind {
|
|||||||
}
|
}
|
||||||
|
|
||||||
object SimpleDate {
|
object SimpleDate {
|
||||||
val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) =>
|
def pattern0(lang: Language) = (readYear >> readMonth(lang) >> readDay).map {
|
||||||
|
case ((y, m), d) =>
|
||||||
List(SimpleDate(y, m, d))
|
List(SimpleDate(y, m, d))
|
||||||
}
|
}
|
||||||
val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) =>
|
def pattern1(lang: Language) = (readDay >> readMonth(lang) >> readYear).map {
|
||||||
|
case ((d, m), y) =>
|
||||||
List(SimpleDate(y, m, d))
|
List(SimpleDate(y, m, d))
|
||||||
}
|
}
|
||||||
val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) =>
|
def pattern2(lang: Language) = (readMonth(lang) >> readDay >> readYear).map {
|
||||||
|
case ((m, d), y) =>
|
||||||
List(SimpleDate(y, m, d))
|
List(SimpleDate(y, m, d))
|
||||||
}
|
}
|
||||||
|
|
||||||
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
|
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
|
||||||
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
|
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
|
||||||
|
val p0 = pattern0(lang)
|
||||||
|
val p1 = pattern1(lang)
|
||||||
|
val p2 = pattern2(lang)
|
||||||
val p = lang match {
|
val p = lang match {
|
||||||
case Language.English =>
|
case Language.English =>
|
||||||
p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1)
|
p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1)
|
||||||
case Language.German => p1.or(p0).or(p2)
|
case Language.German => p1.or(p0).or(p2)
|
||||||
case Language.French => p1.or(p0).or(p2)
|
case Language.French => p1.or(p0).or(p2)
|
||||||
|
case Language.Italian => p1.or(p0).or(p2)
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
@ -76,9 +83,11 @@ object DateFind {
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
def readMonth: Reader[Int] =
|
def readMonth(lang: Language): Reader[Int] =
|
||||||
Reader.readFirst(w =>
|
Reader.readFirst(w =>
|
||||||
Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1)
|
Some(MonthName.getAll(lang).indexWhere(_.contains(w.value)))
|
||||||
|
.filter(_ >= 0)
|
||||||
|
.map(_ + 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
def readDay: Reader[Int] =
|
def readDay: Reader[Int] =
|
||||||
@ -150,20 +159,5 @@ object DateFind {
|
|||||||
Failure
|
Failure
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private val months = List(
|
|
||||||
List("jan", "january", "januar", "01"),
|
|
||||||
List("feb", "february", "februar", "02"),
|
|
||||||
List("mar", "march", "märz", "marz", "03"),
|
|
||||||
List("apr", "april", "04"),
|
|
||||||
List("may", "mai", "05"),
|
|
||||||
List("jun", "june", "juni", "06"),
|
|
||||||
List("jul", "july", "juli", "07"),
|
|
||||||
List("aug", "august", "08"),
|
|
||||||
List("sep", "september", "09"),
|
|
||||||
List("oct", "october", "oktober", "10"),
|
|
||||||
List("nov", "november", "11"),
|
|
||||||
List("dec", "december", "dezember", "12")
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,101 @@
|
|||||||
|
package docspell.analysis.date
|
||||||
|
|
||||||
|
import docspell.common.Language
|
||||||
|
|
||||||
|
object MonthName {
|
||||||
|
|
||||||
|
def getAll(lang: Language): List[List[String]] =
|
||||||
|
merge(numbers, forLang(lang))
|
||||||
|
|
||||||
|
private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
|
||||||
|
ns.foldLeft(n0) { (res, el) =>
|
||||||
|
res.zip(el).map({ case (a, b) => a ++ b })
|
||||||
|
}
|
||||||
|
|
||||||
|
private def forLang(lang: Language): List[List[String]] =
|
||||||
|
lang match {
|
||||||
|
case Language.English =>
|
||||||
|
english
|
||||||
|
case Language.German =>
|
||||||
|
german
|
||||||
|
case Language.French =>
|
||||||
|
french
|
||||||
|
case Language.Italian =>
|
||||||
|
italian
|
||||||
|
}
|
||||||
|
|
||||||
|
private val numbers = List(
|
||||||
|
List("01"),
|
||||||
|
List("02"),
|
||||||
|
List("03"),
|
||||||
|
List("04"),
|
||||||
|
List("05"),
|
||||||
|
List("06"),
|
||||||
|
List("07"),
|
||||||
|
List("08"),
|
||||||
|
List("09"),
|
||||||
|
List("10"),
|
||||||
|
List("11"),
|
||||||
|
List("12")
|
||||||
|
)
|
||||||
|
|
||||||
|
private val english = List(
|
||||||
|
List("jan", "january"),
|
||||||
|
List("feb", "february"),
|
||||||
|
List("mar", "march"),
|
||||||
|
List("apr", "april"),
|
||||||
|
List("may"),
|
||||||
|
List("jun", "june"),
|
||||||
|
List("jul", "july"),
|
||||||
|
List("aug", "august"),
|
||||||
|
List("sept", "september"),
|
||||||
|
List("oct", "october"),
|
||||||
|
List("nov", "november"),
|
||||||
|
List("dec", "december")
|
||||||
|
)
|
||||||
|
|
||||||
|
private val german = List(
|
||||||
|
List("jan", "januar"),
|
||||||
|
List("feb", "februar"),
|
||||||
|
List("märz"),
|
||||||
|
List("apr", "april"),
|
||||||
|
List("mai"),
|
||||||
|
List("juni"),
|
||||||
|
List("juli"),
|
||||||
|
List("aug", "august"),
|
||||||
|
List("sept", "september"),
|
||||||
|
List("okt", "oktober"),
|
||||||
|
List("nov", "november"),
|
||||||
|
List("dez", "dezember")
|
||||||
|
)
|
||||||
|
|
||||||
|
private val french = List(
|
||||||
|
List("janv", "janvier"),
|
||||||
|
List("févr", "fevr", "février", "fevrier"),
|
||||||
|
List("mars"),
|
||||||
|
List("avril"),
|
||||||
|
List("mai"),
|
||||||
|
List("juin"),
|
||||||
|
List("juil", "juillet"),
|
||||||
|
List("aout", "août"),
|
||||||
|
List("sept", "septembre"),
|
||||||
|
List("oct", "octobre"),
|
||||||
|
List("nov", "novembre"),
|
||||||
|
List("dec", "déc", "décembre", "decembre")
|
||||||
|
)
|
||||||
|
|
||||||
|
private val italian = List(
|
||||||
|
List("genn", "gennaio"),
|
||||||
|
List("febbr", "febbraio"),
|
||||||
|
List("mar", "marzo"),
|
||||||
|
List("apr", "aprile"),
|
||||||
|
List("magg", "maggio"),
|
||||||
|
List("giugno"),
|
||||||
|
List("luglio"),
|
||||||
|
List("ag", "agosto"),
|
||||||
|
List("sett", "settembre"),
|
||||||
|
List("ott", "ottobre"),
|
||||||
|
List("nov", "novembre"),
|
||||||
|
List("dic", "dicembre")
|
||||||
|
)
|
||||||
|
}
|
@ -0,0 +1,98 @@
|
|||||||
|
package docspell.analysis.nlp
|
||||||
|
|
||||||
|
import cats.effect.Sync
|
||||||
|
import cats.implicits._
|
||||||
|
import cats.{Applicative, FlatMap}
|
||||||
|
|
||||||
|
import docspell.analysis.NlpSettings
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||||
|
|
||||||
|
/** Analyses a text to mark certain parts with a `NerLabel`. */
|
||||||
|
trait Annotator[F[_]] { self =>
|
||||||
|
def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]]
|
||||||
|
|
||||||
|
def ++(next: Annotator[F])(implicit F: FlatMap[F]): Annotator[F] =
|
||||||
|
new Annotator[F] {
|
||||||
|
def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
|
||||||
|
for {
|
||||||
|
n0 <- self.nerAnnotate(logger)(text)
|
||||||
|
n1 <- next.nerAnnotate(logger)(text)
|
||||||
|
} yield (n0 ++ n1).distinct
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
object Annotator {
|
||||||
|
|
||||||
|
/** Creates an annotator according to the given `mode` and `settings`.
|
||||||
|
*
|
||||||
|
* There are the following ways:
|
||||||
|
*
|
||||||
|
* - disabled: it returns a no-op annotator that always gives an empty list
|
||||||
|
* - full: the complete stanford pipeline is used
|
||||||
|
* - basic: only the ner classifier is used
|
||||||
|
*
|
||||||
|
* Additionally, if there is a regexNer-file specified, the regexner annotator is
|
||||||
|
* also run. In case the full pipeline is used, this is already included.
|
||||||
|
*/
|
||||||
|
def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
|
||||||
|
mode match {
|
||||||
|
case NlpMode.Disabled =>
|
||||||
|
Annotator.none[F]
|
||||||
|
case NlpMode.Full =>
|
||||||
|
StanfordNerSettings.fromNlpSettings(settings) match {
|
||||||
|
case Some(ss) =>
|
||||||
|
Annotator.pipeline(StanfordNerAnnotator.makePipeline(ss))
|
||||||
|
case None =>
|
||||||
|
Annotator.none[F]
|
||||||
|
}
|
||||||
|
case NlpMode.Basic =>
|
||||||
|
StanfordNerSettings.fromNlpSettings(settings) match {
|
||||||
|
case Some(StanfordNerSettings.Full(lang, _, Some(file))) =>
|
||||||
|
Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang)) ++
|
||||||
|
Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
|
||||||
|
case Some(StanfordNerSettings.Full(lang, _, None)) =>
|
||||||
|
Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang))
|
||||||
|
case Some(StanfordNerSettings.RegexOnly(file)) =>
|
||||||
|
Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
|
||||||
|
case None =>
|
||||||
|
Annotator.none[F]
|
||||||
|
}
|
||||||
|
case NlpMode.RegexOnly =>
|
||||||
|
settings.regexNer match {
|
||||||
|
case Some(file) =>
|
||||||
|
Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
|
||||||
|
case None =>
|
||||||
|
Annotator.none[F]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def none[F[_]: Applicative]: Annotator[F] =
|
||||||
|
new Annotator[F] {
|
||||||
|
def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
|
||||||
|
logger.debug("Running empty annotator. NLP not supported.") *>
|
||||||
|
Vector.empty[NerLabel].pure[F]
|
||||||
|
}
|
||||||
|
|
||||||
|
def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] =
|
||||||
|
new Annotator[F] {
|
||||||
|
def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
|
||||||
|
Sync[F].delay(
|
||||||
|
BasicCRFAnnotator.nerAnnotate(ann)(text)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] =
|
||||||
|
new Annotator[F] {
|
||||||
|
def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
|
||||||
|
Sync[F].delay(StanfordNerAnnotator.nerAnnotate(cp, text))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def clearCaches[F[_]: Sync]: F[Unit] =
|
||||||
|
Sync[F].delay {
|
||||||
|
StanfordCoreNLP.clearAnnotatorPool()
|
||||||
|
BasicCRFAnnotator.Cache.clearCache()
|
||||||
|
}
|
||||||
|
}
|
@ -7,9 +7,7 @@ import java.util.zip.GZIPInputStream
|
|||||||
import scala.jdk.CollectionConverters._
|
import scala.jdk.CollectionConverters._
|
||||||
import scala.util.Using
|
import scala.util.Using
|
||||||
|
|
||||||
import cats.Applicative
|
import docspell.common.Language.NLPLanguage
|
||||||
import cats.effect.BracketThrow
|
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
import edu.stanford.nlp.ie.AbstractSequenceClassifier
|
import edu.stanford.nlp.ie.AbstractSequenceClassifier
|
||||||
@ -30,14 +28,6 @@ object BasicCRFAnnotator {
|
|||||||
|
|
||||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||||
|
|
||||||
def nerAnnotate[F[_]: BracketThrow](
|
|
||||||
cacheKey: String,
|
|
||||||
cache: PipelineCache[F, Annotator]
|
|
||||||
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
|
|
||||||
cache
|
|
||||||
.obtain(cacheKey, settings)
|
|
||||||
.use(crf => Applicative[F].pure(nerAnnotate(crf)(text)))
|
|
||||||
|
|
||||||
def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
|
def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
|
||||||
nerClassifier
|
nerClassifier
|
||||||
.classify(text)
|
.classify(text)
|
||||||
@ -52,7 +42,7 @@ object BasicCRFAnnotator {
|
|||||||
})
|
})
|
||||||
.toVector
|
.toVector
|
||||||
|
|
||||||
private def makeClassifier(lang: Language): Annotator = {
|
def makeAnnotator(lang: NLPLanguage): Annotator = {
|
||||||
logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
|
logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
|
||||||
val ner = classifierResource(lang)
|
val ner = classifierResource(lang)
|
||||||
Using(new GZIPInputStream(ner.openStream())) { in =>
|
Using(new GZIPInputStream(ner.openStream())) { in =>
|
||||||
@ -60,7 +50,7 @@ object BasicCRFAnnotator {
|
|||||||
}.fold(throw _, identity)
|
}.fold(throw _, identity)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def classifierResource(lang: Language): URL = {
|
private def classifierResource(lang: NLPLanguage): URL = {
|
||||||
def check(name: String): URL =
|
def check(name: String): URL =
|
||||||
Option(getClass.getResource(name)) match {
|
Option(getClass.getResource(name)) match {
|
||||||
case None =>
|
case None =>
|
||||||
@ -79,11 +69,11 @@ object BasicCRFAnnotator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
final class Cache {
|
final class Cache {
|
||||||
private[this] lazy val germanNerClassifier = makeClassifier(Language.German)
|
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||||
private[this] lazy val englishNerClassifier = makeClassifier(Language.English)
|
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||||
private[this] lazy val frenchNerClassifier = makeClassifier(Language.French)
|
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||||
|
|
||||||
def forLang(language: Language): Annotator =
|
def forLang(language: NLPLanguage): Annotator =
|
||||||
language match {
|
language match {
|
||||||
case Language.French => frenchNerClassifier
|
case Language.French => frenchNerClassifier
|
||||||
case Language.German => germanNerClassifier
|
case Language.German => germanNerClassifier
|
||||||
@ -95,7 +85,7 @@ object BasicCRFAnnotator {
|
|||||||
|
|
||||||
private[this] val cacheRef = new AtomicReference[Cache](new Cache)
|
private[this] val cacheRef = new AtomicReference[Cache](new Cache)
|
||||||
|
|
||||||
def getAnnotator(language: Language): Annotator =
|
def getAnnotator(language: NLPLanguage): Annotator =
|
||||||
cacheRef.get().forLang(language)
|
cacheRef.get().forLang(language)
|
||||||
|
|
||||||
def clearCache(): Unit =
|
def clearCache(): Unit =
|
||||||
|
@ -3,14 +3,13 @@ package docspell.analysis.nlp
|
|||||||
import scala.concurrent.duration.{Duration => _, _}
|
import scala.concurrent.duration.{Duration => _, _}
|
||||||
|
|
||||||
import cats.Applicative
|
import cats.Applicative
|
||||||
import cats.data.Kleisli
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.effect.concurrent.Ref
|
import cats.effect.concurrent.Ref
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
|
import docspell.analysis.NlpSettings
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
|
||||||
import org.log4s.getLogger
|
import org.log4s.getLogger
|
||||||
|
|
||||||
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
|
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
|
||||||
@ -20,58 +19,32 @@ import org.log4s.getLogger
|
|||||||
*
|
*
|
||||||
* **This is an internal API**
|
* **This is an internal API**
|
||||||
*/
|
*/
|
||||||
trait PipelineCache[F[_], A] {
|
trait PipelineCache[F[_]] {
|
||||||
|
|
||||||
def obtain(key: String, settings: StanfordNerSettings): Resource[F, A]
|
def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object PipelineCache {
|
object PipelineCache {
|
||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
def none[F[_]: Applicative, A](
|
def apply[F[_]: Concurrent: Timer](clearInterval: Duration)(
|
||||||
creator: Kleisli[F, StanfordNerSettings, A]
|
creator: NlpSettings => Annotator[F],
|
||||||
): PipelineCache[F, A] =
|
|
||||||
new PipelineCache[F, A] {
|
|
||||||
def obtain(
|
|
||||||
ignored: String,
|
|
||||||
settings: StanfordNerSettings
|
|
||||||
): Resource[F, A] =
|
|
||||||
Resource.liftF(creator.run(settings))
|
|
||||||
}
|
|
||||||
|
|
||||||
def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)(
|
|
||||||
creator: StanfordNerSettings => A,
|
|
||||||
release: F[Unit]
|
release: F[Unit]
|
||||||
): F[PipelineCache[F, A]] =
|
): F[PipelineCache[F]] =
|
||||||
for {
|
for {
|
||||||
data <- Ref.of(Map.empty[String, Entry[A]])
|
data <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
|
||||||
cacheClear <- CacheClearing.create(data, clearInterval, release)
|
cacheClear <- CacheClearing.create(data, clearInterval, release)
|
||||||
} yield new Impl[F, A](data, creator, cacheClear)
|
_ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
|
||||||
|
} yield new Impl[F](data, creator, cacheClear)
|
||||||
|
|
||||||
def full[F[_]: Concurrent: Timer](
|
final private class Impl[F[_]: Sync](
|
||||||
clearInterval: Duration
|
data: Ref[F, Map[String, Entry[Annotator[F]]]],
|
||||||
): F[PipelineCache[F, StanfordCoreNLP]] =
|
creator: NlpSettings => Annotator[F],
|
||||||
apply(clearInterval)(
|
|
||||||
StanfordNerAnnotator.makePipeline,
|
|
||||||
StanfordNerAnnotator.clearPipelineCaches
|
|
||||||
)
|
|
||||||
|
|
||||||
def basic[F[_]: Concurrent: Timer](
|
|
||||||
clearInterval: Duration
|
|
||||||
): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] =
|
|
||||||
apply(clearInterval)(
|
|
||||||
settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang),
|
|
||||||
Sync[F].delay(BasicCRFAnnotator.Cache.clearCache())
|
|
||||||
)
|
|
||||||
|
|
||||||
final private class Impl[F[_]: Sync, A](
|
|
||||||
data: Ref[F, Map[String, Entry[A]]],
|
|
||||||
creator: StanfordNerSettings => A,
|
|
||||||
cacheClear: CacheClearing[F]
|
cacheClear: CacheClearing[F]
|
||||||
) extends PipelineCache[F, A] {
|
) extends PipelineCache[F] {
|
||||||
|
|
||||||
def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] =
|
def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] =
|
||||||
for {
|
for {
|
||||||
_ <- cacheClear.withCache
|
_ <- cacheClear.withCache
|
||||||
id <- Resource.liftF(makeSettingsId(settings))
|
id <- Resource.liftF(makeSettingsId(settings))
|
||||||
@ -83,10 +56,10 @@ object PipelineCache {
|
|||||||
private def getOrCreate(
|
private def getOrCreate(
|
||||||
key: String,
|
key: String,
|
||||||
id: String,
|
id: String,
|
||||||
cache: Map[String, Entry[A]],
|
cache: Map[String, Entry[Annotator[F]]],
|
||||||
settings: StanfordNerSettings,
|
settings: NlpSettings,
|
||||||
creator: StanfordNerSettings => A
|
creator: NlpSettings => Annotator[F]
|
||||||
): (Map[String, Entry[A]], A) =
|
): (Map[String, Entry[Annotator[F]]], Annotator[F]) =
|
||||||
cache.get(key) match {
|
cache.get(key) match {
|
||||||
case Some(entry) =>
|
case Some(entry) =>
|
||||||
if (entry.id == id) (cache, entry.value)
|
if (entry.id == id) (cache, entry.value)
|
||||||
@ -105,7 +78,7 @@ object PipelineCache {
|
|||||||
(cache.updated(key, e), nlp)
|
(cache.updated(key, e), nlp)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
|
private def makeSettingsId(settings: NlpSettings): F[String] = {
|
||||||
val base = settings.copy(regexNer = None).toString
|
val base = settings.copy(regexNer = None).toString
|
||||||
val size: F[Long] =
|
val size: F[Long] =
|
||||||
settings.regexNer match {
|
settings.regexNer match {
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
package docspell.analysis.nlp
|
package docspell.analysis.nlp
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
import java.util.{Properties => JProps}
|
import java.util.{Properties => JProps}
|
||||||
|
|
||||||
import docspell.analysis.nlp.Properties.Implicits._
|
import docspell.analysis.nlp.Properties.Implicits._
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.common.syntax.FileSyntax._
|
||||||
|
|
||||||
object Properties {
|
object Properties {
|
||||||
|
|
||||||
@ -17,17 +19,20 @@ object Properties {
|
|||||||
p
|
p
|
||||||
}
|
}
|
||||||
|
|
||||||
def forSettings(settings: StanfordNerSettings): JProps = {
|
def forSettings(settings: StanfordNerSettings): JProps =
|
||||||
val regexNerFile = settings.regexNer
|
settings match {
|
||||||
.map(p => p.normalize().toAbsolutePath().toString())
|
case StanfordNerSettings.Full(lang, highRecall, regexNer) =>
|
||||||
settings.lang match {
|
val regexNerFile = regexNer.map(p => p.absolutePathAsString)
|
||||||
|
lang match {
|
||||||
case Language.German =>
|
case Language.German =>
|
||||||
Properties.nerGerman(regexNerFile, settings.highRecall)
|
Properties.nerGerman(regexNerFile, highRecall)
|
||||||
case Language.English =>
|
case Language.English =>
|
||||||
Properties.nerEnglish(regexNerFile)
|
Properties.nerEnglish(regexNerFile)
|
||||||
case Language.French =>
|
case Language.French =>
|
||||||
Properties.nerFrench(regexNerFile, settings.highRecall)
|
Properties.nerFrench(regexNerFile, highRecall)
|
||||||
}
|
}
|
||||||
|
case StanfordNerSettings.RegexOnly(path) =>
|
||||||
|
Properties.regexNerOnly(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
||||||
@ -76,6 +81,11 @@ object Properties {
|
|||||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||||
|
|
||||||
|
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||||
|
Properties(
|
||||||
|
"annotators" -> "tokenize,ssplit"
|
||||||
|
).withRegexNer(Some(regexNerMappingFile.absolutePathAsString))
|
||||||
|
|
||||||
object Implicits {
|
object Implicits {
|
||||||
implicit final class JPropsOps(val p: JProps) extends AnyVal {
|
implicit final class JPropsOps(val p: JProps) extends AnyVal {
|
||||||
|
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
package docspell.analysis.nlp
|
package docspell.analysis.nlp
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
import scala.jdk.CollectionConverters._
|
import scala.jdk.CollectionConverters._
|
||||||
|
|
||||||
import cats.Applicative
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -24,25 +25,25 @@ object StanfordNerAnnotator {
|
|||||||
* a new classifier must be created. It will then replace the
|
* a new classifier must be created. It will then replace the
|
||||||
* previous one.
|
* previous one.
|
||||||
*/
|
*/
|
||||||
def nerAnnotate[F[_]: BracketThrow](
|
|
||||||
cacheKey: String,
|
|
||||||
cache: PipelineCache[F, StanfordCoreNLP]
|
|
||||||
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
|
|
||||||
cache
|
|
||||||
.obtain(cacheKey, settings)
|
|
||||||
.use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
|
|
||||||
|
|
||||||
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
||||||
val doc = new CoreDocument(text)
|
val doc = new CoreDocument(text)
|
||||||
nerClassifier.annotate(doc)
|
nerClassifier.annotate(doc)
|
||||||
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
|
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
|
||||||
}
|
}
|
||||||
|
|
||||||
def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = {
|
def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP =
|
||||||
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
|
settings match {
|
||||||
|
case s: StanfordNerSettings.Full =>
|
||||||
|
logger.info(s"Creating ${s.lang.name} Stanford NLP NER classifier...")
|
||||||
new StanfordCoreNLP(Properties.forSettings(settings))
|
new StanfordCoreNLP(Properties.forSettings(settings))
|
||||||
|
case StanfordNerSettings.RegexOnly(path) =>
|
||||||
|
logger.info(s"Creating regexNer-only Stanford NLP NER classifier...")
|
||||||
|
regexNerPipeline(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def regexNerPipeline(regexNerFile: Path): StanfordCoreNLP =
|
||||||
|
new StanfordCoreNLP(Properties.regexNerOnly(regexNerFile))
|
||||||
|
|
||||||
def clearPipelineCaches[F[_]: Sync]: F[Unit] =
|
def clearPipelineCaches[F[_]: Sync]: F[Unit] =
|
||||||
Sync[F].delay {
|
Sync[F].delay {
|
||||||
// turns out that everything is cached in a static map
|
// turns out that everything is cached in a static map
|
||||||
|
@ -2,9 +2,14 @@ package docspell.analysis.nlp
|
|||||||
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.analysis.NlpSettings
|
||||||
|
import docspell.common.Language.NLPLanguage
|
||||||
|
|
||||||
/** Settings for configuring the stanford NER pipeline.
|
sealed trait StanfordNerSettings
|
||||||
|
|
||||||
|
object StanfordNerSettings {
|
||||||
|
|
||||||
|
/** Settings for configuring the stanford NER pipeline.
|
||||||
*
|
*
|
||||||
* The language is mandatory, only the provided ones are supported.
|
* The language is mandatory, only the provided ones are supported.
|
||||||
* The `highRecall` only applies for non-English languages. For
|
* The `highRecall` only applies for non-English languages. For
|
||||||
@ -19,8 +24,19 @@ import docspell.common._
|
|||||||
* as a last step to tag untagged tokens using the provided list of
|
* as a last step to tag untagged tokens using the provided list of
|
||||||
* regexps.
|
* regexps.
|
||||||
*/
|
*/
|
||||||
case class StanfordNerSettings(
|
case class Full(
|
||||||
lang: Language,
|
lang: NLPLanguage,
|
||||||
highRecall: Boolean,
|
highRecall: Boolean,
|
||||||
regexNer: Option[Path]
|
regexNer: Option[Path]
|
||||||
)
|
) extends StanfordNerSettings
|
||||||
|
|
||||||
|
/** Not all languages are supported with predefined statistical models. This allows providing regexps only.
|
||||||
|
*/
|
||||||
|
case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings
|
||||||
|
|
||||||
|
def fromNlpSettings(ns: NlpSettings): Option[StanfordNerSettings] =
|
||||||
|
NLPLanguage.all
|
||||||
|
.find(nl => nl == ns.lang)
|
||||||
|
.map(nl => Full(nl, ns.highRecall, ns.regexNer))
|
||||||
|
.orElse(ns.regexNer.map(nrf => RegexOnly(nrf)))
|
||||||
|
}
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
package docspell.analysis.nlp
|
package docspell.analysis.nlp
|
||||||
|
|
||||||
|
import docspell.common.Language.NLPLanguage
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
import docspell.files.TestFiles
|
import docspell.files.TestFiles
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
object BaseCRFAnnotatorSuite extends SimpleTestSuite {
|
object BaseCRFAnnotatorSuite extends SimpleTestSuite {
|
||||||
|
|
||||||
def annotate(language: Language): String => Vector[NerLabel] =
|
def annotate(language: NLPLanguage): String => Vector[NerLabel] =
|
||||||
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
|
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
|
||||||
|
|
||||||
test("find english ner labels") {
|
test("find english ner labels") {
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
package docspell.analysis.nlp
|
package docspell.analysis.nlp
|
||||||
|
|
||||||
|
import java.nio.file.Paths
|
||||||
|
|
||||||
|
import cats.effect.IO
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
import docspell.files.TestFiles
|
import docspell.files.TestFiles
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.common.syntax.FileSyntax._
|
||||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||||
|
|
||||||
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||||
@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
|||||||
assertEquals(labels, expect)
|
assertEquals(labels, expect)
|
||||||
StanfordCoreNLP.clearAnnotatorPool()
|
StanfordCoreNLP.clearAnnotatorPool()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("regexner-only annotator") {
|
||||||
|
val regexNerContent =
|
||||||
|
s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|
||||||
|
|(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|
||||||
|
|(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|
||||||
|
|(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|
||||||
|
|(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|
||||||
|
|(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|
||||||
|
|""".stripMargin
|
||||||
|
|
||||||
|
File
|
||||||
|
.withTempDir[IO](Paths.get("target"), "test-regex-ner")
|
||||||
|
.use { dir =>
|
||||||
|
for {
|
||||||
|
out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
|
||||||
|
ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
|
||||||
|
labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
|
||||||
|
_ <- IO(
|
||||||
|
assertEquals(
|
||||||
|
labels,
|
||||||
|
Vector(
|
||||||
|
NerLabel("Andrea", NerTag.Person, 6, 12),
|
||||||
|
NerLabel("Rossi", NerTag.Person, 13, 18)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
} yield ()
|
||||||
|
}
|
||||||
|
.unsafeRunSync()
|
||||||
|
StanfordCoreNLP.clearAnnotatorPool()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package docspell.common
|
package docspell.common
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
|
||||||
import io.circe.{Decoder, Encoder}
|
import io.circe.{Decoder, Encoder}
|
||||||
|
|
||||||
sealed trait Language { self: Product =>
|
sealed trait Language { self: Product =>
|
||||||
@ -11,28 +13,41 @@ sealed trait Language { self: Product =>
|
|||||||
|
|
||||||
def iso3: String
|
def iso3: String
|
||||||
|
|
||||||
|
val allowsNLP: Boolean = false
|
||||||
|
|
||||||
private[common] def allNames =
|
private[common] def allNames =
|
||||||
Set(name, iso3, iso2)
|
Set(name, iso3, iso2)
|
||||||
}
|
}
|
||||||
|
|
||||||
object Language {
|
object Language {
|
||||||
|
sealed trait NLPLanguage extends Language with Product {
|
||||||
|
override val allowsNLP = true
|
||||||
|
}
|
||||||
|
object NLPLanguage {
|
||||||
|
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
|
||||||
|
}
|
||||||
|
|
||||||
case object German extends Language {
|
case object German extends NLPLanguage {
|
||||||
val iso2 = "de"
|
val iso2 = "de"
|
||||||
val iso3 = "deu"
|
val iso3 = "deu"
|
||||||
}
|
}
|
||||||
|
|
||||||
case object English extends Language {
|
case object English extends NLPLanguage {
|
||||||
val iso2 = "en"
|
val iso2 = "en"
|
||||||
val iso3 = "eng"
|
val iso3 = "eng"
|
||||||
}
|
}
|
||||||
|
|
||||||
case object French extends Language {
|
case object French extends NLPLanguage {
|
||||||
val iso2 = "fr"
|
val iso2 = "fr"
|
||||||
val iso3 = "fra"
|
val iso3 = "fra"
|
||||||
}
|
}
|
||||||
|
|
||||||
val all: List[Language] = List(German, English, French)
|
case object Italian extends Language {
|
||||||
|
val iso2 = "it"
|
||||||
|
val iso3 = "ita"
|
||||||
|
}
|
||||||
|
|
||||||
|
val all: List[Language] = List(German, English, French, Italian)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
val lang = str.toLowerCase
|
val lang = str.toLowerCase
|
||||||
|
@ -8,12 +8,14 @@ sealed trait NlpMode { self: Product =>
|
|||||||
object NlpMode {
|
object NlpMode {
|
||||||
case object Full extends NlpMode
|
case object Full extends NlpMode
|
||||||
case object Basic extends NlpMode
|
case object Basic extends NlpMode
|
||||||
|
case object RegexOnly extends NlpMode
|
||||||
case object Disabled extends NlpMode
|
case object Disabled extends NlpMode
|
||||||
|
|
||||||
def fromString(name: String): Either[String, NlpMode] =
|
def fromString(name: String): Either[String, NlpMode] =
|
||||||
name.toLowerCase match {
|
name.toLowerCase match {
|
||||||
case "full" => Right(Full)
|
case "full" => Right(Full)
|
||||||
case "basic" => Right(Basic)
|
case "basic" => Right(Basic)
|
||||||
|
case "regexonly" => Right(RegexOnly)
|
||||||
case "disabled" => Right(Disabled)
|
case "disabled" => Right(Disabled)
|
||||||
case _ => Left(s"Unknown nlp-mode: $name")
|
case _ => Left(s"Unknown nlp-mode: $name")
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,20 @@
|
|||||||
|
package docspell.common.syntax
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
trait FileSyntax {
|
||||||
|
|
||||||
|
implicit final class PathOps(p: Path) {
|
||||||
|
|
||||||
|
def absolutePath: Path =
|
||||||
|
p.normalize().toAbsolutePath
|
||||||
|
|
||||||
|
def absolutePathAsString: String =
|
||||||
|
absolutePath.toString
|
||||||
|
|
||||||
|
def /(next: String): Path =
|
||||||
|
p.resolve(next)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
object FileSyntax extends FileSyntax
|
@ -2,6 +2,11 @@ package docspell.common
|
|||||||
|
|
||||||
package object syntax {
|
package object syntax {
|
||||||
|
|
||||||
object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
|
object all
|
||||||
|
extends EitherSyntax
|
||||||
|
with StreamSyntax
|
||||||
|
with StringSyntax
|
||||||
|
with LoggerSyntax
|
||||||
|
with FileSyntax
|
||||||
|
|
||||||
}
|
}
|
||||||
|
13
modules/files/src/test/resources/examples/letter-ita.txt
Normal file
13
modules/files/src/test/resources/examples/letter-ita.txt
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
Pontremoli, 9 aprile 2013
|
||||||
|
|
||||||
|
Spettabile Villa Albicocca
|
||||||
|
Via Francigena, 9
|
||||||
|
55100 Pontetetto (LU)
|
||||||
|
|
||||||
|
Oggetto: Prenotazione
|
||||||
|
|
||||||
|
Gentile Direttore,
|
||||||
|
|
||||||
|
Vorrei prenotare una camera matrimoniale …….
|
||||||
|
|
||||||
|
In attesa di una Sua pronta risposta, La saluto cordialmente
|
@ -24,6 +24,7 @@ object Field {
|
|||||||
val content_de = Field("content_de")
|
val content_de = Field("content_de")
|
||||||
val content_en = Field("content_en")
|
val content_en = Field("content_en")
|
||||||
val content_fr = Field("content_fr")
|
val content_fr = Field("content_fr")
|
||||||
|
val content_it = Field("content_it")
|
||||||
val itemName = Field("itemName")
|
val itemName = Field("itemName")
|
||||||
val itemNotes = Field("itemNotes")
|
val itemNotes = Field("itemNotes")
|
||||||
val folderId = Field("folder")
|
val folderId = Field("folder")
|
||||||
@ -36,6 +37,8 @@ object Field {
|
|||||||
Field.content_en
|
Field.content_en
|
||||||
case Language.French =>
|
case Language.French =>
|
||||||
Field.content_fr
|
Field.content_fr
|
||||||
|
case Language.Italian =>
|
||||||
|
Field.content_it
|
||||||
}
|
}
|
||||||
|
|
||||||
implicit val jsonEncoder: Encoder[Field] =
|
implicit val jsonEncoder: Encoder[Field] =
|
||||||
|
@ -40,6 +40,7 @@ object SolrQuery {
|
|||||||
Field.content_de,
|
Field.content_de,
|
||||||
Field.content_en,
|
Field.content_en,
|
||||||
Field.content_fr,
|
Field.content_fr,
|
||||||
|
Field.content_it,
|
||||||
Field.itemName,
|
Field.itemName,
|
||||||
Field.itemNotes,
|
Field.itemNotes,
|
||||||
Field.attachmentName
|
Field.attachmentName
|
||||||
|
@ -63,6 +63,12 @@ object SolrSetup {
|
|||||||
solrEngine,
|
solrEngine,
|
||||||
"Index all from database",
|
"Index all from database",
|
||||||
FtsMigration.Result.indexAll.pure[F]
|
FtsMigration.Result.indexAll.pure[F]
|
||||||
|
),
|
||||||
|
FtsMigration[F](
|
||||||
|
7,
|
||||||
|
solrEngine,
|
||||||
|
"Add content_it field",
|
||||||
|
addContentItField.map(_ => FtsMigration.Result.reIndexAll)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -72,6 +78,9 @@ object SolrSetup {
|
|||||||
def addContentFrField: F[Unit] =
|
def addContentFrField: F[Unit] =
|
||||||
addTextField(Some(Language.French))(Field.content_fr)
|
addTextField(Some(Language.French))(Field.content_fr)
|
||||||
|
|
||||||
|
def addContentItField: F[Unit] =
|
||||||
|
addTextField(Some(Language.Italian))(Field.content_it)
|
||||||
|
|
||||||
def setupCoreSchema: F[Unit] = {
|
def setupCoreSchema: F[Unit] = {
|
||||||
val cmds0 =
|
val cmds0 =
|
||||||
List(
|
List(
|
||||||
@ -90,13 +99,15 @@ object SolrSetup {
|
|||||||
)
|
)
|
||||||
.traverse(addTextField(None))
|
.traverse(addTextField(None))
|
||||||
|
|
||||||
val cntLang = Language.all.traverse {
|
val cntLang = List(Language.German, Language.English, Language.French).traverse {
|
||||||
case l @ Language.German =>
|
case l @ Language.German =>
|
||||||
addTextField(l.some)(Field.content_de)
|
addTextField(l.some)(Field.content_de)
|
||||||
case l @ Language.English =>
|
case l @ Language.English =>
|
||||||
addTextField(l.some)(Field.content_en)
|
addTextField(l.some)(Field.content_en)
|
||||||
case l @ Language.French =>
|
case l @ Language.French =>
|
||||||
addTextField(l.some)(Field.content_fr)
|
addTextField(l.some)(Field.content_fr)
|
||||||
|
case _ =>
|
||||||
|
().pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
cmds0 *> cmds1 *> cntLang *> ().pure[F]
|
cmds0 *> cmds1 *> cntLang *> ().pure[F]
|
||||||
@ -125,6 +136,9 @@ object SolrSetup {
|
|||||||
case Some(Language.French) =>
|
case Some(Language.French) =>
|
||||||
run(DeleteField.command(DeleteField(field))).attempt *>
|
run(DeleteField.command(DeleteField(field))).attempt *>
|
||||||
run(AddField.command(AddField.textFR(field)))
|
run(AddField.command(AddField.textFR(field)))
|
||||||
|
case Some(Language.Italian) =>
|
||||||
|
run(DeleteField.command(DeleteField(field))).attempt *>
|
||||||
|
run(AddField.command(AddField.textIT(field)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -161,6 +175,9 @@ object SolrSetup {
|
|||||||
|
|
||||||
def textFR(field: Field): AddField =
|
def textFR(field: Field): AddField =
|
||||||
AddField(field, "text_fr", true, true, false)
|
AddField(field, "text_fr", true, true, false)
|
||||||
|
|
||||||
|
def textIT(field: Field): AddField =
|
||||||
|
AddField(field, "text_it", true, true, false)
|
||||||
}
|
}
|
||||||
|
|
||||||
case class DeleteField(name: Field)
|
case class DeleteField(name: Field)
|
||||||
|
@ -277,7 +277,39 @@ docspell.joex {
|
|||||||
# files.
|
# files.
|
||||||
working-dir = ${java.io.tmpdir}"/docspell-analysis"
|
working-dir = ${java.io.tmpdir}"/docspell-analysis"
|
||||||
|
|
||||||
nlp-config {
|
nlp {
|
||||||
|
# The mode for configuring NLP models:
|
||||||
|
#
|
||||||
|
# 1. full – builds the complete pipeline
|
||||||
|
# 2. basic - builds only the ner annotator
|
||||||
|
# 3. regexonly - matches each entry in your address book via regexps
|
||||||
|
# 4. disabled - doesn't use any stanford-nlp feature
|
||||||
|
#
|
||||||
|
# The full and basic variants rely on pre-built language models
|
||||||
|
# that are available for only 3 languages at the moment: German,
|
||||||
|
# English and French.
|
||||||
|
#
|
||||||
|
# Memory usage varies greatly among the languages. German has
|
||||||
|
# quite large models, that require about 1G heap. So joex should
|
||||||
|
# run with -Xmx1500M at least when using mode=full.
|
||||||
|
#
|
||||||
|
# The basic variant does a quite good job for German and
|
||||||
|
# English. It might be worse for French, always depending on the
|
||||||
|
# type of text that is analysed. Joex should run with about 600M
|
||||||
|
# heap, here again language German uses the most.
|
||||||
|
#
|
||||||
|
# The regexonly variant doesn't depend on a language. It roughly
|
||||||
|
# works by converting all entries in your addressbook into
|
||||||
|
# regexps and matches each one against the text. This can get
|
||||||
|
# memory intensive, too, when the addressbook grows large. This
|
||||||
|
# is included in the full and basic by default, but can be used
|
||||||
|
# independently by setting mode=regexonly.
|
||||||
|
#
|
||||||
|
# When mode=disabled, then the whole nlp pipeline is disabled,
|
||||||
|
# and you won't get any suggestions. Only what the classifier
|
||||||
|
# returns (if enabled).
|
||||||
|
mode = full
|
||||||
|
|
||||||
# The StanfordCoreNLP library caches language models which
|
# The StanfordCoreNLP library caches language models which
|
||||||
# requires quite some amount of memory. Setting this interval to a
|
# requires quite some amount of memory. Setting this interval to a
|
||||||
# positive duration, the cache is cleared after this amount of
|
# positive duration, the cache is cleared after this amount of
|
||||||
@ -287,38 +319,29 @@ docspell.joex {
|
|||||||
# This has only any effect, if mode != disabled.
|
# This has only any effect, if mode != disabled.
|
||||||
clear-interval = "15 minutes"
|
clear-interval = "15 minutes"
|
||||||
|
|
||||||
# The mode for configuring NLP models. Currently 3 are available:
|
|
||||||
#
|
|
||||||
# 1. full – builds the complete pipeline, run with -Xmx1500M or more
|
|
||||||
# 2. basic - builds only the ner annotator, run with -Xmx600M or more
|
|
||||||
# 3. disabled - doesn't use any stanford-nlp feature
|
|
||||||
#
|
|
||||||
# The basic variant does a quite good job for German and
|
|
||||||
# English. It might be worse for French, always depending on the
|
|
||||||
# type of text that is analysed.
|
|
||||||
mode = full
|
|
||||||
}
|
|
||||||
|
|
||||||
regex-ner {
|
regex-ner {
|
||||||
# Whether to enable custom NER annotation. This uses the address
|
# Whether to enable custom NER annotation. This uses the
|
||||||
# book of a collective as input for NER tagging (to automatically
|
# address book of a collective as input for NER tagging (to
|
||||||
# find correspondent and concerned entities). If the address book
|
# automatically find correspondent and concerned entities). If
|
||||||
# is large, this can be quite memory intensive and also makes text
|
# the address book is large, this can be quite memory
|
||||||
# analysis slower. But it greatly improves accuracy. If this is
|
# intensive and also makes text analysis much slower. But it
|
||||||
# false, NER tagging uses only statistical models (that also work
|
# improves accuracy and can be used independent of the
|
||||||
# quite well).
|
# language. If this is set to 0, it is effectively disabled
|
||||||
|
# and NER tagging uses only statistical models (that also work
|
||||||
|
# quite well, but are restricted to the languages mentioned
|
||||||
|
# above).
|
||||||
#
|
#
|
||||||
# This setting might be moved to the collective settings in the
|
# Note, this is only relevant if nlp.mode is not
|
||||||
# future.
|
# "disabled".
|
||||||
#
|
max-entries = 1000
|
||||||
# Note, this is only relevant if nlp-config.mode = full.
|
|
||||||
enabled = true
|
|
||||||
|
|
||||||
# The NER annotation uses a file of patterns that is derived from
|
# The NER annotation uses a file of patterns that is derived
|
||||||
# a collective's address book. This is is the time how long this
|
# from a collective's address book. This is the time how
|
||||||
# file will be kept until a check for a state change is done.
|
# long this data will be kept until a check for a state change
|
||||||
|
# is done.
|
||||||
file-cache-time = "1 minute"
|
file-cache-time = "1 minute"
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# Settings for doing document classification.
|
# Settings for doing document classification.
|
||||||
#
|
#
|
||||||
|
@ -60,15 +60,14 @@ object Config {
|
|||||||
case class TextAnalysis(
|
case class TextAnalysis(
|
||||||
maxLength: Int,
|
maxLength: Int,
|
||||||
workingDir: Path,
|
workingDir: Path,
|
||||||
nlpConfig: TextAnalysisConfig.NlpConfig,
|
nlp: NlpConfig,
|
||||||
regexNer: RegexNer,
|
|
||||||
classification: Classification
|
classification: Classification
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def textAnalysisConfig: TextAnalysisConfig =
|
def textAnalysisConfig: TextAnalysisConfig =
|
||||||
TextAnalysisConfig(
|
TextAnalysisConfig(
|
||||||
maxLength,
|
maxLength,
|
||||||
nlpConfig,
|
TextAnalysisConfig.NlpConfig(nlp.clearInterval, nlp.mode),
|
||||||
TextClassifierConfig(
|
TextClassifierConfig(
|
||||||
workingDir,
|
workingDir,
|
||||||
NonEmptyList
|
NonEmptyList
|
||||||
@ -78,10 +77,16 @@ object Config {
|
|||||||
)
|
)
|
||||||
|
|
||||||
def regexNerFileConfig: RegexNerFile.Config =
|
def regexNerFileConfig: RegexNerFile.Config =
|
||||||
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
|
RegexNerFile.Config(
|
||||||
|
nlp.regexNer.maxEntries,
|
||||||
|
workingDir,
|
||||||
|
nlp.regexNer.fileCacheTime
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
|
case class NlpConfig(mode: NlpMode, clearInterval: Duration, regexNer: RegexNer)
|
||||||
|
|
||||||
|
case class RegexNer(maxEntries: Int, fileCacheTime: Duration)
|
||||||
|
|
||||||
case class Classification(
|
case class Classification(
|
||||||
enabled: Boolean,
|
enabled: Boolean,
|
||||||
|
@ -29,7 +29,7 @@ trait RegexNerFile[F[_]] {
|
|||||||
object RegexNerFile {
|
object RegexNerFile {
|
||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
case class Config(enabled: Boolean, directory: Path, minTime: Duration)
|
case class Config(maxEntries: Int, directory: Path, minTime: Duration)
|
||||||
|
|
||||||
def apply[F[_]: Concurrent: ContextShift](
|
def apply[F[_]: Concurrent: ContextShift](
|
||||||
cfg: Config,
|
cfg: Config,
|
||||||
@ -49,7 +49,7 @@ object RegexNerFile {
|
|||||||
) extends RegexNerFile[F] {
|
) extends RegexNerFile[F] {
|
||||||
|
|
||||||
def makeFile(collective: Ident): F[Option[Path]] =
|
def makeFile(collective: Ident): F[Option[Path]] =
|
||||||
if (cfg.enabled) doMakeFile(collective)
|
if (cfg.maxEntries > 0) doMakeFile(collective)
|
||||||
else (None: Option[Path]).pure[F]
|
else (None: Option[Path]).pure[F]
|
||||||
|
|
||||||
def doMakeFile(collective: Ident): F[Option[Path]] =
|
def doMakeFile(collective: Ident): F[Option[Path]] =
|
||||||
@ -127,7 +127,7 @@ object RegexNerFile {
|
|||||||
|
|
||||||
for {
|
for {
|
||||||
_ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
|
_ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
|
||||||
names <- store.transact(QCollective.allNames(collective))
|
names <- store.transact(QCollective.allNames(collective, cfg.maxEntries))
|
||||||
nerFile = NerFile(collective, lastUpdate, now)
|
nerFile = NerFile(collective, lastUpdate, now)
|
||||||
_ <- update(nerFile, NerFile.mkNerConfig(names))
|
_ <- update(nerFile, NerFile.mkNerConfig(names))
|
||||||
} yield nerFile
|
} yield nerFile
|
||||||
|
@ -4,9 +4,8 @@ import cats.data.OptionT
|
|||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.analysis.TextAnalyser
|
|
||||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||||
import docspell.analysis.nlp.StanfordNerSettings
|
import docspell.analysis.{NlpSettings, TextAnalyser}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
@ -54,7 +53,7 @@ object TextAnalysis {
|
|||||||
analyser: TextAnalyser[F],
|
analyser: TextAnalyser[F],
|
||||||
nerFile: RegexNerFile[F]
|
nerFile: RegexNerFile[F]
|
||||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
||||||
val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
|
val settings = NlpSettings(ctx.args.meta.language, false, None)
|
||||||
for {
|
for {
|
||||||
customNer <- nerFile.makeFile(ctx.args.meta.collective)
|
customNer <- nerFile.makeFile(ctx.args.meta.collective)
|
||||||
sett = settings.copy(regexNer = customNer)
|
sett = settings.copy(regexNer = customNer)
|
||||||
|
@ -1,10 +1,8 @@
|
|||||||
package docspell.store.queries
|
package docspell.store.queries
|
||||||
|
|
||||||
import cats.data.OptionT
|
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
import docspell.common.ContactKind
|
import docspell.common._
|
||||||
import docspell.common.{Direction, Ident}
|
|
||||||
import docspell.store.qb.DSL._
|
import docspell.store.qb.DSL._
|
||||||
import docspell.store.qb._
|
import docspell.store.qb._
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
@ -17,6 +15,7 @@ object QCollective {
|
|||||||
private val t = RTag.as("t")
|
private val t = RTag.as("t")
|
||||||
private val ro = ROrganization.as("o")
|
private val ro = ROrganization.as("o")
|
||||||
private val rp = RPerson.as("p")
|
private val rp = RPerson.as("p")
|
||||||
|
private val re = REquipment.as("e")
|
||||||
private val rc = RContact.as("c")
|
private val rc = RContact.as("c")
|
||||||
private val i = RItem.as("i")
|
private val i = RItem.as("i")
|
||||||
|
|
||||||
@ -25,13 +24,37 @@ object QCollective {
|
|||||||
val empty = Names(Vector.empty, Vector.empty, Vector.empty)
|
val empty = Names(Vector.empty, Vector.empty, Vector.empty)
|
||||||
}
|
}
|
||||||
|
|
||||||
def allNames(collective: Ident): ConnectionIO[Names] =
|
def allNames(collective: Ident, maxEntries: Int): ConnectionIO[Names] = {
|
||||||
(for {
|
val created = Column[Timestamp]("created", TableDef(""))
|
||||||
orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
|
union(
|
||||||
pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
|
Select(
|
||||||
equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
|
select(ro.name.s, lit(1).as("kind"), ro.created.as(created)),
|
||||||
} yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
|
from(ro),
|
||||||
.getOrElse(Names.empty)
|
ro.cid === collective
|
||||||
|
),
|
||||||
|
Select(
|
||||||
|
select(rp.name.s, lit(2).as("kind"), rp.created.as(created)),
|
||||||
|
from(rp),
|
||||||
|
rp.cid === collective
|
||||||
|
),
|
||||||
|
Select(
|
||||||
|
select(re.name.s, lit(3).as("kind"), re.created.as(created)),
|
||||||
|
from(re),
|
||||||
|
re.cid === collective
|
||||||
|
)
|
||||||
|
).orderBy(created.desc)
|
||||||
|
.limit(Batch.limit(maxEntries))
|
||||||
|
.build
|
||||||
|
.query[(String, Int)]
|
||||||
|
.streamWithChunkSize(maxEntries)
|
||||||
|
.fold(Names.empty) { case (names, (name, kind)) =>
|
||||||
|
if (kind == 1) names.copy(org = names.org :+ name)
|
||||||
|
else if (kind == 2) names.copy(pers = names.pers :+ name)
|
||||||
|
else names.copy(equip = names.equip :+ name)
|
||||||
|
}
|
||||||
|
.compile
|
||||||
|
.lastOrError
|
||||||
|
}
|
||||||
|
|
||||||
case class InsightData(
|
case class InsightData(
|
||||||
incoming: Int,
|
incoming: Int,
|
||||||
|
@ -11,6 +11,7 @@ type Language
|
|||||||
= German
|
= German
|
||||||
| English
|
| English
|
||||||
| French
|
| French
|
||||||
|
| Italian
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -24,6 +25,8 @@ fromString str =
|
|||||||
else if str == "fra" || str == "fr" || str == "french" then
|
else if str == "fra" || str == "fr" || str == "french" then
|
||||||
Just French
|
Just French
|
||||||
|
|
||||||
|
else if str == "ita" || str == "it" || str == "italian" then
|
||||||
|
Just Italian
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -40,6 +43,9 @@ toIso3 lang =
|
|||||||
French ->
|
French ->
|
||||||
"fra"
|
"fra"
|
||||||
|
|
||||||
|
Italian ->
|
||||||
|
"ita"
|
||||||
|
|
||||||
|
|
||||||
toName : Language -> String
|
toName : Language -> String
|
||||||
toName lang =
|
toName lang =
|
||||||
@ -53,7 +59,10 @@ toName lang =
|
|||||||
French ->
|
French ->
|
||||||
"French"
|
"French"
|
||||||
|
|
||||||
|
Italian ->
|
||||||
|
"Italian"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
[ German, English, French ]
|
[ German, English, French, Italian ]
|
||||||
|
@ -98,10 +98,14 @@ let
|
|||||||
};
|
};
|
||||||
text-analysis = {
|
text-analysis = {
|
||||||
max-length = 10000;
|
max-length = 10000;
|
||||||
|
nlp = {
|
||||||
|
mode = "full";
|
||||||
|
clear-interval = "15 minutes";
|
||||||
regex-ner = {
|
regex-ner = {
|
||||||
enabled = true;
|
max-entries = 1000;
|
||||||
file-cache-time = "1 minute";
|
file-cache-time = "1 minute";
|
||||||
};
|
};
|
||||||
|
};
|
||||||
classification = {
|
classification = {
|
||||||
enabled = true;
|
enabled = true;
|
||||||
item-count = 0;
|
item-count = 0;
|
||||||
@ -118,7 +122,6 @@ let
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
working-dir = "/tmp/docspell-analysis";
|
working-dir = "/tmp/docspell-analysis";
|
||||||
clear-stanford-nlp-interval = "15 minutes";
|
|
||||||
};
|
};
|
||||||
processing = {
|
processing = {
|
||||||
max-due-date-years = 10;
|
max-due-date-years = 10;
|
||||||
@ -772,9 +775,50 @@ in {
|
|||||||
files.
|
files.
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
clear-stanford-nlp-interval = mkOption {
|
|
||||||
|
nlp = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
mode = mkOption {
|
||||||
type = types.str;
|
type = types.str;
|
||||||
default = defaults.text-analysis.clear-stanford-nlp-interval;
|
default = defaults.text-analysis.nlp.mode;
|
||||||
|
description = ''
|
||||||
|
The mode for configuring NLP models:
|
||||||
|
|
||||||
|
1. full – builds the complete pipeline
|
||||||
|
2. basic - builds only the ner annotator
|
||||||
|
3. regexonly - matches each entry in your address book via regexps
|
||||||
|
4. disabled - doesn't use any stanford-nlp feature
|
||||||
|
|
||||||
|
The full and basic variants rely on pre-build language models
|
||||||
|
that are available for only 3 languages at the moment: German,
|
||||||
|
English and French.
|
||||||
|
|
||||||
|
Memory usage varies greatly among the languages. German has
|
||||||
|
quite large models, that require about 1G heap. So joex should
|
||||||
|
run with -Xmx1500M at least when using mode=full.
|
||||||
|
|
||||||
|
The basic variant does a quite good job for German and
|
||||||
|
English. It might be worse for French, always depending on the
|
||||||
|
type of text that is analysed. Joex should run with about 600M
|
||||||
|
heap, here again language German uses the most.
|
||||||
|
|
||||||
|
The regexonly variant doesn't depend on a language. It roughly
|
||||||
|
works by converting all entries in your addressbook into
|
||||||
|
regexps and matches each one against the text. This can get
|
||||||
|
memory intensive, too, when the addressbook grows large. This
|
||||||
|
is included in the full and basic by default, but can be used
|
||||||
|
independently by setting mode=regexonly.
|
||||||
|
|
||||||
|
When mode=disabled, then the whole nlp pipeline is disabled,
|
||||||
|
and you won't get any suggestions. Only what the classifier
|
||||||
|
returns (if enabled).
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
clear-interval = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.text-analysis.nlp.clear-interval;
|
||||||
description = ''
|
description = ''
|
||||||
Idle time after which the NLP caches are cleared to free
|
Idle time after which the NLP caches are cleared to free
|
||||||
memory. If <= 0 clearing the cache is disabled.
|
memory. If <= 0 clearing the cache is disabled.
|
||||||
@ -785,19 +829,22 @@ in {
|
|||||||
type = types.submodule({
|
type = types.submodule({
|
||||||
options = {
|
options = {
|
||||||
enabled = mkOption {
|
enabled = mkOption {
|
||||||
type = types.bool;
|
type = types.int;
|
||||||
default = defaults.text-analysis.regex-ner.enabled;
|
default = defaults.text-analysis.regex-ner.max-entries;
|
||||||
description = ''
|
description = ''
|
||||||
Whether to enable custom NER annotation. This uses the address
|
Whether to enable custom NER annotation. This uses the
|
||||||
book of a collective as input for NER tagging (to automatically
|
address book of a collective as input for NER tagging (to
|
||||||
find correspondent and concerned entities). If the address book
|
automatically find correspondent and concerned entities). If
|
||||||
is large, this can be quite memory intensive and also makes text
|
the address book is large, this can be quite memory
|
||||||
analysis slower. But it greatly improves accuracy. If this is
|
intensive and also makes text analysis much slower. But it
|
||||||
false, NER tagging uses only statistical models (that also work
|
improves accuracy and can be used independent of the
|
||||||
quite well).
|
language. If this is set to 0, it is effectively disabled
|
||||||
|
and NER tagging uses only statistical models (that also work
|
||||||
|
quite well, but are restricted to the languages mentioned
|
||||||
|
above).
|
||||||
|
|
||||||
This setting might be moved to the collective settings in the
|
Note, this is only relevant if nlp-config.mode is not
|
||||||
future.
|
"disabled".
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
file-cache-time = mkOption {
|
file-cache-time = mkOption {
|
||||||
@ -811,9 +858,14 @@ in {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
default = defaults.text-analysis.regex-ner;
|
default = defaults.text-analysis.nlp.regex-ner;
|
||||||
description = "";
|
description = "";
|
||||||
};
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.text-analysis.nlp;
|
||||||
|
description = "Configure NLP";
|
||||||
|
};
|
||||||
|
|
||||||
classification = mkOption {
|
classification = mkOption {
|
||||||
type = types.submodule({
|
type = types.submodule({
|
||||||
|
Reference in New Issue
Block a user