diff --git a/.travis.yml b/.travis.yml index 4d750d05..d78ff4b0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,4 +24,4 @@ before_script: - export TZ=Europe/Berlin script: - - sbt ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test" + - sbt -J-XX:+UseG1GC ++$TRAVIS_SCALA_VERSION ";project root ;scalafmtCheckAll ;make ;test" diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index 0baa1973..8ebad224 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -15,6 +15,7 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr \ tesseract-ocr-data-deu \ tesseract-ocr-data-fra \ + tesseract-ocr-data-ita \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala new file mode 100644 index 00000000..a1b426e5 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/NlpSettings.scala @@ -0,0 +1,7 @@ +package docspell.analysis + +import java.nio.file.Path + +import docspell.common._ + +case class NlpSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index a9234027..c2deafce 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -10,13 +10,13 @@ import docspell.analysis.date.DateFind import docspell.analysis.nlp._ import docspell.common._ -import edu.stanford.nlp.pipeline.StanfordCoreNLP +import org.log4s.getLogger trait TextAnalyser[F[_]] { def annotate( logger: Logger[F], - settings: StanfordNerSettings, + settings: NlpSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] @@ -24,6 +24,7 @@ trait TextAnalyser[F[_]] { def classifier: TextClassifier[F] } object TextAnalyser { + private[this] val logger = getLogger case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) { @@ -41,13 +42,13 @@ object TextAnalyser { new TextAnalyser[F] { def annotate( logger: Logger[F], - settings: StanfordNerSettings, + settings: NlpSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] = for { input <- textLimit(logger, text) - tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input)) + tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input)) tags1 <- contactNer(input) dates <- dateNer(settings.lang, input) list = tags0 ++ tags1 @@ -77,31 +78,36 @@ object TextAnalyser { } ) + /** Provides the nlp pipeline based on the configuration. 
*/ private object Nlp { - def apply[F[_]: Concurrent: Timer: BracketThrow]( cfg: TextAnalysisConfig.NlpConfig - ): F[Input => F[Vector[NerLabel]]] = + ): F[Input[F] => F[Vector[NerLabel]]] = cfg.mode match { - case NlpMode.Full => - PipelineCache.full(cfg.clearInterval).map(cache => full(cache)) - case NlpMode.Basic => - PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache)) case NlpMode.Disabled => - Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + Logger.log4s(logger).info("NLP is disabled as defined in config.") *> + Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + case _ => + PipelineCache(cfg.clearInterval)( + Annotator[F](cfg.mode), + Annotator.clearCaches[F] + ) + .map(annotate[F]) } - final case class Input(key: Ident, settings: StanfordNerSettings, text: String) + final case class Input[F[_]]( + key: Ident, + settings: NlpSettings, + logger: Logger[F], + text: String + ) - def full[F[_]: BracketThrow]( - cache: PipelineCache[F, StanfordCoreNLP] - )(input: Input): F[Vector[NerLabel]] = - StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) - - def basic[F[_]: BracketThrow]( - cache: PipelineCache[F, BasicCRFAnnotator.Annotator] - )(input: Input): F[Vector[NerLabel]] = - BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + def annotate[F[_]: BracketThrow]( + cache: PipelineCache[F] + )(input: Input[F]): F[Vector[NerLabel]] = + cache + .obtain(input.key.id, input.settings) + .use(ann => ann.nerAnnotate(input.logger)(input.text)) } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 90fcd8cd..5feb8b57 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -41,23 +41,30 @@ object DateFind { } object SimpleDate { - val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) => - List(SimpleDate(y, m, d)) + def pattern0(lang: Language) = (readYear >> readMonth(lang) >> readDay).map { + case ((y, m), d) => + List(SimpleDate(y, m, d)) } - val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) => - List(SimpleDate(y, m, d)) + def pattern1(lang: Language) = (readDay >> readMonth(lang) >> readYear).map { + case ((d, m), y) => + List(SimpleDate(y, m, d)) } - val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) => - List(SimpleDate(y, m, d)) + def pattern2(lang: Language) = (readMonth(lang) >> readDay >> readYear).map { + case ((m, d), y) => + List(SimpleDate(y, m, d)) } // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔ def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { + val p0 = pattern0(lang) + val p1 = pattern1(lang) + val p2 = pattern2(lang) val p = lang match { case Language.English => p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1) - case Language.German => p1.or(p0).or(p2) - case Language.French => p1.or(p0).or(p2) + case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) + case Language.Italian => p1.or(p0).or(p2) } p.read(parts) match { case Result.Success(sds, _) => @@ -76,9 +83,11 @@ object DateFind { } ) - def readMonth: Reader[Int] = + def readMonth(lang: Language): Reader[Int] = Reader.readFirst(w => - Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1) + Some(MonthName.getAll(lang).indexWhere(_.contains(w.value))) + .filter(_ >= 0) + .map(_ + 1) ) def readDay: Reader[Int] = @@ -150,20 
+159,5 @@ object DateFind { Failure } } - - private val months = List( - List("jan", "january", "januar", "01"), - List("feb", "february", "februar", "02"), - List("mar", "march", "märz", "marz", "03"), - List("apr", "april", "04"), - List("may", "mai", "05"), - List("jun", "june", "juni", "06"), - List("jul", "july", "juli", "07"), - List("aug", "august", "08"), - List("sep", "september", "09"), - List("oct", "october", "oktober", "10"), - List("nov", "november", "11"), - List("dec", "december", "dezember", "12") - ) } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala new file mode 100644 index 00000000..cf61cd72 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -0,0 +1,101 @@ +package docspell.analysis.date + +import docspell.common.Language + +object MonthName { + + def getAll(lang: Language): List[List[String]] = + merge(numbers, forLang(lang)) + + private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] = + ns.foldLeft(n0) { (res, el) => + res.zip(el).map({ case (a, b) => a ++ b }) + } + + private def forLang(lang: Language): List[List[String]] = + lang match { + case Language.English => + english + case Language.German => + german + case Language.French => + french + case Language.Italian => + italian + } + + private val numbers = List( + List("01"), + List("02"), + List("03"), + List("04"), + List("05"), + List("06"), + List("07"), + List("08"), + List("09"), + List("10"), + List("11"), + List("12") + ) + + private val english = List( + List("jan", "january"), + List("feb", "february"), + List("mar", "march"), + List("apr", "april"), + List("may"), + List("jun", "june"), + List("jul", "july"), + List("aug", "august"), + List("sept", "september"), + List("oct", "october"), + List("nov", "november"), + List("dec", "december") + ) + + private val german = List( + List("jan", "januar"), + List("feb", "februar"), + List("märz"), + List("apr", "april"), + List("mai"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("dez", "dezember") + ) + + private val french = List( + List("janv", "janvier"), + List("févr", "fevr", "février", "fevrier"), + List("mars"), + List("avril"), + List("mai"), + List("juin"), + List("juil", "juillet"), + List("aout", "août"), + List("sept", "septembre"), + List("oct", "octobre"), + List("nov", "novembre"), + List("dec", "déc", "décembre", "decembre") + ) + + private val italian = List( + List("genn", "gennaio"), + List("febbr", "febbraio"), + List("mar", "marzo"), + List("apr", "aprile"), + List("magg", "maggio"), + List("giugno"), + List("luglio"), + List("ag", "agosto"), + List("sett", "settembre"), + List("ott", "ottobre"), + List("nov", "novembre"), + List("dic", "dicembre") + ) +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala new file mode 100644 index 00000000..d509805a --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala @@ -0,0 +1,98 @@ +package docspell.analysis.nlp + +import cats.effect.Sync +import cats.implicits._ +import cats.{Applicative, FlatMap} + +import docspell.analysis.NlpSettings +import docspell.common._ + +import edu.stanford.nlp.pipeline.StanfordCoreNLP + +/** Analyses a text to mark certain parts with a `NerLabel`. 
  */
+trait Annotator[F[_]] { self =>
+  def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]]
+
+  def ++(next: Annotator[F])(implicit F: FlatMap[F]): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        for {
+          n0 <- self.nerAnnotate(logger)(text)
+          n1 <- next.nerAnnotate(logger)(text)
+        } yield (n0 ++ n1).distinct
+    }
+}
+
+object Annotator {
+
+  /** Creates an annotator according to the given `mode` and `settings`.
+    *
+    * There are the following ways:
+    *
+    * - disabled: it returns a no-op annotator that always gives an empty list
+    * - full: the complete stanford pipeline is used
+    * - basic: only the ner classifier is used
+    * - regexonly: only the regexner annotator is run (a regexNer file is required)
+    *
+    * Additionally, if there is a regexNer-file specified, the regexner annotator is
+    * also run. In case the full pipeline is used, this is already included.
+    */
+  def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
+    mode match {
+      case NlpMode.Disabled =>
+        Annotator.none[F]
+      case NlpMode.Full =>
+        StanfordNerSettings.fromNlpSettings(settings) match {
+          case Some(ss) =>
+            Annotator.pipeline(StanfordNerAnnotator.makePipeline(ss))
+          case None =>
+            Annotator.none[F]
+        }
+      case NlpMode.Basic =>
+        StanfordNerSettings.fromNlpSettings(settings) match {
+          case Some(StanfordNerSettings.Full(lang, _, Some(file))) =>
+            Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang)) ++
+              Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case Some(StanfordNerSettings.Full(lang, _, None)) =>
+            Annotator.basic(BasicCRFAnnotator.Cache.getAnnotator(lang))
+          case Some(StanfordNerSettings.RegexOnly(file)) =>
+            Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case None =>
+            Annotator.none[F]
+        }
+      case NlpMode.RegexOnly =>
+        settings.regexNer match {
+          case Some(file) =>
+            Annotator.pipeline(StanfordNerAnnotator.regexNerPipeline(file))
+          case None =>
+            Annotator.none[F]
+        }
+    }
+
+  def none[F[_]: Applicative]: Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        logger.debug("Running empty annotator. NLP not supported.") *>
+          Vector.empty[NerLabel].pure[F]
+    }
+
+  def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(
+          BasicCRFAnnotator.nerAnnotate(ann)(text)
+        )
+    }
+
+  def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] =
+    new Annotator[F] {
+      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(StanfordNerAnnotator.nerAnnotate(cp, text))
+    }
+
+  def clearCaches[F[_]: Sync]: F[Unit] =
+    Sync[F].delay {
+      StanfordCoreNLP.clearAnnotatorPool()
+      BasicCRFAnnotator.Cache.clearCache()
+    }
+}
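The `++` combinator above is what the Basic mode relies on: CRF labels and regexner labels are simply concatenated and de-duplicated. A minimal sketch of that composition, assuming a hypothetical `regexFile` in Stanford's regexner TSV format and that the bundled CRF models are on the classpath:

```scala
import java.nio.file.Paths
import cats.effect.IO
import docspell.analysis.nlp.{Annotator, BasicCRFAnnotator, StanfordNerAnnotator}
import docspell.common.Language

// Hypothetical mapping file; in practice its content is derived from an address book.
val regexFile = Paths.get("/tmp/regexner.txt")

// Mirrors the NlpMode.Basic branch of Annotator.apply: both annotators run,
// and `++` merges their results with .distinct.
val combined: Annotator[IO] =
  Annotator.basic[IO](BasicCRFAnnotator.Cache.getAnnotator(Language.German)) ++
    Annotator.pipeline[IO](StanfordNerAnnotator.regexNerPipeline(regexFile))
```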
NLP not supported.") *> + Vector.empty[NerLabel].pure[F] + } + + def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] = + new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + Sync[F].delay( + BasicCRFAnnotator.nerAnnotate(ann)(text) + ) + } + + def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] = + new Annotator[F] { + def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] = + Sync[F].delay(StanfordNerAnnotator.nerAnnotate(cp, text)) + + } + + def clearCaches[F[_]: Sync]: F[Unit] = + Sync[F].delay { + StanfordCoreNLP.clearAnnotatorPool() + BasicCRFAnnotator.Cache.clearCache() + } +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index a6fb6af0..76ffe7c6 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -7,9 +7,7 @@ import java.util.zip.GZIPInputStream import scala.jdk.CollectionConverters._ import scala.util.Using -import cats.Applicative -import cats.effect.BracketThrow - +import docspell.common.Language.NLPLanguage import docspell.common._ import edu.stanford.nlp.ie.AbstractSequenceClassifier @@ -30,14 +28,6 @@ object BasicCRFAnnotator { type Annotator = AbstractSequenceClassifier[CoreLabel] - def nerAnnotate[F[_]: BracketThrow]( - cacheKey: String, - cache: PipelineCache[F, Annotator] - )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = - cache - .obtain(cacheKey, settings) - .use(crf => Applicative[F].pure(nerAnnotate(crf)(text))) - def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] = nerClassifier .classify(text) @@ -52,7 +42,7 @@ object BasicCRFAnnotator { }) .toVector - private def makeClassifier(lang: Language): Annotator = { + def makeAnnotator(lang: NLPLanguage): Annotator = { logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...") val ner = classifierResource(lang) Using(new GZIPInputStream(ner.openStream())) { in => @@ -60,7 +50,7 @@ object BasicCRFAnnotator { }.fold(throw _, identity) } - private def classifierResource(lang: Language): URL = { + private def classifierResource(lang: NLPLanguage): URL = { def check(name: String): URL = Option(getClass.getResource(name)) match { case None => @@ -79,11 +69,11 @@ object BasicCRFAnnotator { } final class Cache { - private[this] lazy val germanNerClassifier = makeClassifier(Language.German) - private[this] lazy val englishNerClassifier = makeClassifier(Language.English) - private[this] lazy val frenchNerClassifier = makeClassifier(Language.French) + private[this] lazy val germanNerClassifier = makeAnnotator(Language.German) + private[this] lazy val englishNerClassifier = makeAnnotator(Language.English) + private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French) - def forLang(language: Language): Annotator = + def forLang(language: NLPLanguage): Annotator = language match { case Language.French => frenchNerClassifier case Language.German => germanNerClassifier @@ -95,7 +85,7 @@ object BasicCRFAnnotator { private[this] val cacheRef = new AtomicReference[Cache](new Cache) - def getAnnotator(language: Language): Annotator = + def getAnnotator(language: NLPLanguage): Annotator = cacheRef.get().forLang(language) def clearCache(): Unit = diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala 
b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 2b567548..3b38da22 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -3,14 +3,13 @@ package docspell.analysis.nlp import scala.concurrent.duration.{Duration => _, _} import cats.Applicative -import cats.data.Kleisli import cats.effect._ import cats.effect.concurrent.Ref import cats.implicits._ +import docspell.analysis.NlpSettings import docspell.common._ -import edu.stanford.nlp.pipeline.StanfordCoreNLP import org.log4s.getLogger /** Creating the StanfordCoreNLP pipeline is quite expensive as it @@ -20,58 +19,32 @@ import org.log4s.getLogger * * **This is an internal API** */ -trait PipelineCache[F[_], A] { +trait PipelineCache[F[_]] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] + def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] } object PipelineCache { private[this] val logger = getLogger - def none[F[_]: Applicative, A]( - creator: Kleisli[F, StanfordNerSettings, A] - ): PipelineCache[F, A] = - new PipelineCache[F, A] { - def obtain( - ignored: String, - settings: StanfordNerSettings - ): Resource[F, A] = - Resource.liftF(creator.run(settings)) - } - - def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)( - creator: StanfordNerSettings => A, + def apply[F[_]: Concurrent: Timer](clearInterval: Duration)( + creator: NlpSettings => Annotator[F], release: F[Unit] - ): F[PipelineCache[F, A]] = + ): F[PipelineCache[F]] = for { - data <- Ref.of(Map.empty[String, Entry[A]]) + data <- Ref.of(Map.empty[String, Entry[Annotator[F]]]) cacheClear <- CacheClearing.create(data, clearInterval, release) - } yield new Impl[F, A](data, creator, cacheClear) + _ <- Logger.log4s(logger).info("Creating nlp pipeline cache") + } yield new Impl[F](data, creator, cacheClear) - def full[F[_]: Concurrent: Timer]( - clearInterval: Duration - ): F[PipelineCache[F, StanfordCoreNLP]] = - apply(clearInterval)( - StanfordNerAnnotator.makePipeline, - StanfordNerAnnotator.clearPipelineCaches - ) - - def basic[F[_]: Concurrent: Timer]( - clearInterval: Duration - ): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] = - apply(clearInterval)( - settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang), - Sync[F].delay(BasicCRFAnnotator.Cache.clearCache()) - ) - - final private class Impl[F[_]: Sync, A]( - data: Ref[F, Map[String, Entry[A]]], - creator: StanfordNerSettings => A, + final private class Impl[F[_]: Sync]( + data: Ref[F, Map[String, Entry[Annotator[F]]]], + creator: NlpSettings => Annotator[F], cacheClear: CacheClearing[F] - ) extends PipelineCache[F, A] { + ) extends PipelineCache[F] { - def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] = + def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] = for { _ <- cacheClear.withCache id <- Resource.liftF(makeSettingsId(settings)) @@ -83,10 +56,10 @@ object PipelineCache { private def getOrCreate( key: String, id: String, - cache: Map[String, Entry[A]], - settings: StanfordNerSettings, - creator: StanfordNerSettings => A - ): (Map[String, Entry[A]], A) = + cache: Map[String, Entry[Annotator[F]]], + settings: NlpSettings, + creator: NlpSettings => Annotator[F] + ): (Map[String, Entry[Annotator[F]]], Annotator[F]) = cache.get(key) match { case Some(entry) => if (entry.id == id) (cache, entry.value) @@ -105,7 +78,7 @@ object PipelineCache { (cache.updated(key, e), nlp) 
} - private def makeSettingsId(settings: StanfordNerSettings): F[String] = { + private def makeSettingsId(settings: NlpSettings): F[String] = { val base = settings.copy(regexNer = None).toString val size: F[Long] = settings.regexNer match { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 46a614d1..75fe9d36 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -1,9 +1,11 @@ package docspell.analysis.nlp +import java.nio.file.Path import java.util.{Properties => JProps} import docspell.analysis.nlp.Properties.Implicits._ import docspell.common._ +import docspell.common.syntax.FileSyntax._ object Properties { @@ -17,18 +19,21 @@ object Properties { p } - def forSettings(settings: StanfordNerSettings): JProps = { - val regexNerFile = settings.regexNer - .map(p => p.normalize().toAbsolutePath().toString()) - settings.lang match { - case Language.German => - Properties.nerGerman(regexNerFile, settings.highRecall) - case Language.English => - Properties.nerEnglish(regexNerFile) - case Language.French => - Properties.nerFrench(regexNerFile, settings.highRecall) + def forSettings(settings: StanfordNerSettings): JProps = + settings match { + case StanfordNerSettings.Full(lang, highRecall, regexNer) => + val regexNerFile = regexNer.map(p => p.absolutePathAsString) + lang match { + case Language.German => + Properties.nerGerman(regexNerFile, highRecall) + case Language.English => + Properties.nerEnglish(regexNerFile) + case Language.French => + Properties.nerFrench(regexNerFile, highRecall) + } + case StanfordNerSettings.RegexOnly(path) => + Properties.regexNerOnly(path) } - } def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = Properties( @@ -76,6 +81,11 @@ object Properties { "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def regexNerOnly(regexNerMappingFile: Path): JProps = + Properties( + "annotators" -> "tokenize,ssplit" + ).withRegexNer(Some(regexNerMappingFile.absolutePathAsString)) + object Implicits { implicit final class JPropsOps(val p: JProps) extends AnyVal { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala index 37b54b40..2ec4e802 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala @@ -1,8 +1,9 @@ package docspell.analysis.nlp +import java.nio.file.Path + import scala.jdk.CollectionConverters._ -import cats.Applicative import cats.effect._ import docspell.common._ @@ -24,24 +25,24 @@ object StanfordNerAnnotator { * a new classifier must be created. It will then replace the * previous one. 
   */
-  def nerAnnotate[F[_]: BracketThrow](
-      cacheKey: String,
-      cache: PipelineCache[F, StanfordCoreNLP]
-  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
-    cache
-      .obtain(cacheKey, settings)
-      .use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
-
   def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
   }
 
-  def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = {
-    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
-    new StanfordCoreNLP(Properties.forSettings(settings))
-  }
+  def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP =
+    settings match {
+      case s: StanfordNerSettings.Full =>
+        logger.info(s"Creating ${s.lang.name} Stanford NLP NER classifier...")
+        new StanfordCoreNLP(Properties.forSettings(settings))
+      case StanfordNerSettings.RegexOnly(path) =>
+        logger.info(s"Creating regexNer-only Stanford NLP NER classifier...")
+        regexNerPipeline(path)
+    }
+
+  def regexNerPipeline(regexNerFile: Path): StanfordCoreNLP =
+    new StanfordCoreNLP(Properties.regexNerOnly(regexNerFile))
 
   def clearPipelineCaches[F[_]: Sync]: F[Unit] =
     Sync[F].delay {
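`makePipeline` now dispatches on the settings variant, so a pipeline without any statistical model becomes possible. A sketch of the RegexOnly path (the file path is hypothetical; the file format is Stanford's regexner TSV, as in the test further down):

```scala
import java.nio.file.Paths
import docspell.analysis.nlp.{StanfordNerAnnotator, StanfordNerSettings}

// Hypothetical mapping file; only tokenize/ssplit plus the regexner
// annotator run, per Properties.regexNerOnly.
val mappings = Paths.get("/tmp/names.txt")
val pipeline = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(mappings))
val labels   = StanfordNerAnnotator.nerAnnotate(pipeline, "Hello Andrea Rossi, can you.")
```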
+ */ + case class Full( + lang: NLPLanguage, + highRecall: Boolean, + regexNer: Option[Path] + ) extends StanfordNerSettings + + /** Not all languages are supported with predefined statistical models. This allows to provide regexps only. + */ + case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings + + def fromNlpSettings(ns: NlpSettings): Option[StanfordNerSettings] = + NLPLanguage.all + .find(nl => nl == ns.lang) + .map(nl => Full(nl, ns.highRecall, ns.regexNer)) + .orElse(ns.regexNer.map(nrf => RegexOnly(nrf))) +} diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala index 0abab7e9..2f0cab57 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala @@ -1,12 +1,13 @@ package docspell.analysis.nlp +import docspell.common.Language.NLPLanguage import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ object BaseCRFAnnotatorSuite extends SimpleTestSuite { - def annotate(language: Language): String => Vector[NerLabel] = + def annotate(language: NLPLanguage): String => Vector[NerLabel] = BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language)) test("find english ner labels") { diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala index 1704ef1b..416cdff7 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala @@ -1,8 +1,12 @@ package docspell.analysis.nlp +import java.nio.file.Paths + +import cats.effect.IO import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ +import docspell.common.syntax.FileSyntax._ import edu.stanford.nlp.pipeline.StanfordCoreNLP object StanfordNerAnnotatorSuite extends SimpleTestSuite { @@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite { assertEquals(labels, expect) StanfordCoreNLP.clearAnnotatorPool() } + + test("regexner-only annotator") { + val regexNerContent = + s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3 + |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2 + |""".stripMargin + + File + .withTempDir[IO](Paths.get("target"), "test-regex-ner") + .use { dir => + for { + out <- File.writeString[IO](dir / "regex.txt", regexNerContent) + ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out)) + labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.") + _ <- IO( + assertEquals( + labels, + Vector( + NerLabel("Andrea", NerTag.Person, 6, 12), + NerLabel("Rossi", NerTag.Person, 13, 18) + ) + ) + ) + } yield () + } + .unsafeRunSync() + StanfordCoreNLP.clearAnnotatorPool() + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 92c32f4b..f18d4adf 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ 
b/modules/common/src/main/scala/docspell/common/Language.scala @@ -1,5 +1,7 @@ package docspell.common +import cats.data.NonEmptyList + import io.circe.{Decoder, Encoder} sealed trait Language { self: Product => @@ -11,28 +13,41 @@ sealed trait Language { self: Product => def iso3: String + val allowsNLP: Boolean = false + private[common] def allNames = Set(name, iso3, iso2) } object Language { + sealed trait NLPLanguage extends Language with Product { + override val allowsNLP = true + } + object NLPLanguage { + val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French) + } - case object German extends Language { + case object German extends NLPLanguage { val iso2 = "de" val iso3 = "deu" } - case object English extends Language { + case object English extends NLPLanguage { val iso2 = "en" val iso3 = "eng" } - case object French extends Language { + case object French extends NLPLanguage { val iso2 = "fr" val iso3 = "fra" } - val all: List[Language] = List(German, English, French) + case object Italian extends Language { + val iso2 = "it" + val iso3 = "ita" + } + + val all: List[Language] = List(German, English, French, Italian) def fromString(str: String): Either[String, Language] = { val lang = str.toLowerCase diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala index 36ebf7db..013b2275 100644 --- a/modules/common/src/main/scala/docspell/common/NlpMode.scala +++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala @@ -6,16 +6,18 @@ sealed trait NlpMode { self: Product => self.productPrefix } object NlpMode { - case object Full extends NlpMode - case object Basic extends NlpMode - case object Disabled extends NlpMode + case object Full extends NlpMode + case object Basic extends NlpMode + case object RegexOnly extends NlpMode + case object Disabled extends NlpMode def fromString(name: String): Either[String, NlpMode] = name.toLowerCase match { - case "full" => Right(Full) - case "basic" => Right(Basic) - case "disabled" => Right(Disabled) - case _ => Left(s"Unknown nlp-mode: $name") + case "full" => Right(Full) + case "basic" => Right(Basic) + case "regexonly" => Right(RegexOnly) + case "disabled" => Right(Disabled) + case _ => Left(s"Unknown nlp-mode: $name") } def unsafeFromString(name: String): NlpMode = diff --git a/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala new file mode 100644 index 00000000..6eef143b --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/syntax/FileSyntax.scala @@ -0,0 +1,20 @@ +package docspell.common.syntax + +import java.nio.file.Path + +trait FileSyntax { + + implicit final class PathOps(p: Path) { + + def absolutePath: Path = + p.normalize().toAbsolutePath + + def absolutePathAsString: String = + absolutePath.toString + + def /(next: String): Path = + p.resolve(next) + } +} + +object FileSyntax extends FileSyntax diff --git a/modules/common/src/main/scala/docspell/common/syntax/package.scala b/modules/common/src/main/scala/docspell/common/syntax/package.scala index 77e17039..8d512741 100644 --- a/modules/common/src/main/scala/docspell/common/syntax/package.scala +++ b/modules/common/src/main/scala/docspell/common/syntax/package.scala @@ -2,6 +2,11 @@ package docspell.common package object syntax { - object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax + object all + extends EitherSyntax + with StreamSyntax + with 
StringSyntax + with LoggerSyntax + with FileSyntax } diff --git a/modules/files/src/test/resources/examples/letter-ita.txt b/modules/files/src/test/resources/examples/letter-ita.txt new file mode 100644 index 00000000..cca09122 --- /dev/null +++ b/modules/files/src/test/resources/examples/letter-ita.txt @@ -0,0 +1,13 @@ +Pontremoli, 9 aprile 2013 + +Spettabile Villa Albicocca +Via Francigena, 9 +55100 Pontetetto (LU) + +Oggetto: Prenotazione + +Gentile Direttore, + +Vorrei prenotare una camera matrimoniale ……. + +In attesa di una Sua pronta risposta, La saluto cordialmente diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 2306a44d..345f4665 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -24,6 +24,7 @@ object Field { val content_de = Field("content_de") val content_en = Field("content_en") val content_fr = Field("content_fr") + val content_it = Field("content_it") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -36,6 +37,8 @@ object Field { Field.content_en case Language.French => Field.content_fr + case Language.Italian => + Field.content_it } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index 1e3b09b3..0b7e6e31 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -40,6 +40,7 @@ object SolrQuery { Field.content_de, Field.content_en, Field.content_fr, + Field.content_it, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 3deba577..769919bd 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -63,6 +63,12 @@ object SolrSetup { solrEngine, "Index all from database", FtsMigration.Result.indexAll.pure[F] + ), + FtsMigration[F]( + 7, + solrEngine, + "Add content_it field", + addContentItField.map(_ => FtsMigration.Result.reIndexAll) ) ) @@ -72,6 +78,9 @@ object SolrSetup { def addContentFrField: F[Unit] = addTextField(Some(Language.French))(Field.content_fr) + def addContentItField: F[Unit] = + addTextField(Some(Language.Italian))(Field.content_it) + def setupCoreSchema: F[Unit] = { val cmds0 = List( @@ -90,13 +99,15 @@ object SolrSetup { ) .traverse(addTextField(None)) - val cntLang = Language.all.traverse { + val cntLang = List(Language.German, Language.English, Language.French).traverse { case l @ Language.German => addTextField(l.some)(Field.content_de) case l @ Language.English => addTextField(l.some)(Field.content_en) case l @ Language.French => addTextField(l.some)(Field.content_fr) + case _ => + ().pure[F] } cmds0 *> cmds1 *> cntLang *> ().pure[F] @@ -125,6 +136,9 @@ object SolrSetup { case Some(Language.French) => run(DeleteField.command(DeleteField(field))).attempt *> run(AddField.command(AddField.textFR(field))) + case Some(Language.Italian) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textIT(field))) } } } @@ -161,6 +175,9 @@ object SolrSetup { def textFR(field: Field): AddField = AddField(field, "text_fr", 
true, true, false)
+
+    def textIT(field: Field): AddField =
+      AddField(field, "text_it", true, true, false)
   }
 
   case class DeleteField(name: Field)
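The reference.conf section below documents the new `nlp.mode` setting. The accepted strings map one-to-one onto `NlpMode.fromString` from this diff; a quick sketch:

```scala
import docspell.common.NlpMode

// The four values accepted below, per NlpMode.fromString.
List("full", "basic", "regexonly", "disabled").map(NlpMode.fromString)
// => List(Right(Full), Right(Basic), Right(RegexOnly), Right(Disabled))

NlpMode.fromString("regexner") // => Left("Unknown nlp-mode: regexner")
```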
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 583b40b1..a495ea5a 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -277,7 +277,39 @@ docspell.joex {
       # files.
       working-dir = ${java.io.tmpdir}"/docspell-analysis"
 
-    nlp-config {
+    nlp {
+      # The mode for configuring NLP models:
+      #
+      # 1. full – builds the complete pipeline
+      # 2. basic - builds only the ner annotator
+      # 3. regexonly - matches each entry in your address book via regexps
+      # 4. disabled - doesn't use any stanford-nlp feature
+      #
+      # The full and basic variants rely on pre-built language models
+      # that are available for only three languages at the moment:
+      # German, English and French.
+      #
+      # Memory usage varies greatly among the languages. German has
+      # quite large models that require about 1G heap. So joex should
+      # run with -Xmx1500M at least when using mode=full.
+      #
+      # The basic variant does a quite good job for German and
+      # English. It might be worse for French, always depending on the
+      # type of text that is analysed. Joex should run with about 600M
+      # heap; here again the German language uses the most.
+      #
+      # The regexonly variant doesn't depend on a language. It roughly
+      # works by converting all entries in your address book into
+      # regexps and matches each one against the text. This can get
+      # memory intensive, too, when the address book grows large. This
+      # is included in the full and basic modes by default, but can be
+      # used independently by setting mode=regexonly.
+      #
+      # When mode=disabled, the whole nlp pipeline is disabled and you
+      # won't get any suggestions. Only what the classifier returns
+      # (if enabled).
+      mode = full
+
       # The StanfordCoreNLP library caches language models which
       # requires quite some amount of memory. Setting this interval to a
       # positive duration, the cache is cleared after this amount of
@@ -287,37 +319,28 @@ docspell.joex {
       # This has only any effect, if mode != disabled.
       clear-interval = "15 minutes"
 
-      # The mode for configuring NLP models. Currently 3 are available:
-      #
-      # 1. full – builds the complete pipeline, run with -Xmx1500M or more
-      # 2. basic - builds only the ner annotator, run with -Xmx600M or more
-      # 3. disabled - doesn't use any stanford-nlp feature
-      #
-      # The basic variant does a quite good job for German and
-      # English. It might be worse for French, always depending on the
-      # type of text that is analysed.
-      mode = full
-    }
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the
+        # address book of a collective as input for NER tagging (to
+        # automatically find correspondent and concerned entities). If
+        # the address book is large, this can be quite memory
+        # intensive and also makes text analysis much slower. But it
+        # improves accuracy and can be used independently of the
+        # language. If this is set to 0, it is effectively disabled
+        # and NER tagging uses only statistical models (that also work
+        # quite well, but are restricted to the languages mentioned
+        # above).
+        #
+        # Note, this is only relevant if nlp.mode is not "disabled".
+        max-entries = 1000
 
-    regex-ner {
-      # Whether to enable custom NER annotation. This uses the address
-      # book of a collective as input for NER tagging (to automatically
-      # find correspondent and concerned entities). If the address book
-      # is large, this can be quite memory intensive and also makes text
-      # analysis slower. But it greatly improves accuracy. If this is
-      # false, NER tagging uses only statistical models (that also work
-      # quite well).
-      #
-      # This setting might be moved to the collective settings in the
-      # future.
-      #
-      # Note, this is only relevant if nlp-config.mode = full.
-      enabled = true
-
-      # The NER annotation uses a file of patterns that is derived from
-      # a collective's address book. This is is the time how long this
-      # file will be kept until a check for a state change is done.
-      file-cache-time = "1 minute"
+        # The NER annotation uses a file of patterns that is derived
+        # from a collective's address book. This determines how long
+        # this data will be kept until a check for a state change
+        # is done.
+        file-cache-time = "1 minute"
+      }
     }
 
     # Settings for doing document classification.
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 5b2bccc5..4ad72d7c 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -60,15 +60,14 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      nlpConfig: TextAnalysisConfig.NlpConfig,
-      regexNer: RegexNer,
+      nlp: NlpConfig,
       classification: Classification
   ) {
 
     def textAnalysisConfig: TextAnalysisConfig =
       TextAnalysisConfig(
         maxLength,
-        nlpConfig,
+        TextAnalysisConfig.NlpConfig(nlp.clearInterval, nlp.mode),
         TextClassifierConfig(
           workingDir,
           NonEmptyList
@@ -78,10 +77,16 @@ object Config {
       )
 
     def regexNerFileConfig: RegexNerFile.Config =
-      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+      RegexNerFile.Config(
+        nlp.regexNer.maxEntries,
+        workingDir,
+        nlp.regexNer.fileCacheTime
+      )
   }
 
-  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
+  case class NlpConfig(mode: NlpMode, clearInterval: Duration, regexNer: RegexNer)
+
+  case class RegexNer(maxEntries: Int, fileCacheTime: Duration)
 
   case class Classification(
       enabled: Boolean,
diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
index 24e7f6ae..56e48012 100644
--- a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
+++ b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala
@@ -29,7 +29,7 @@ trait RegexNerFile[F[_]] {
 object RegexNerFile {
   private[this] val logger = getLogger
 
-  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+  case class Config(maxEntries: Int, directory: Path, minTime: Duration)
 
   def apply[F[_]: Concurrent: ContextShift](
       cfg: Config,
@@ -49,7 +49,7 @@ object RegexNerFile {
   ) extends RegexNerFile[F] {
 
     def makeFile(collective: Ident): F[Option[Path]] =
-      if (cfg.enabled) doMakeFile(collective)
+      if (cfg.maxEntries > 0) doMakeFile(collective)
       else (None: Option[Path]).pure[F]
 
     def doMakeFile(collective: Ident): F[Option[Path]] =
@@ -127,7 +127,7 @@ object RegexNerFile {
       for {
         _ <-
           logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
-        names <- store.transact(QCollective.allNames(collective))
+        names <- store.transact(QCollective.allNames(collective, cfg.maxEntries))
         nerFile = NerFile(collective, lastUpdate, now)
         _ <- update(nerFile, NerFile.mkNerConfig(names))
       } yield nerFile
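In the processing step below, these settings come together: the item's language seeds the `NlpSettings`, and `RegexNerFile.makeFile` contributes the address-book regexNer file (or `None` when `max-entries` is 0). A condensed sketch of that flow, with `text` standing in for the extracted attachment text:

```scala
// Condensed from TextAnalysis.annotateAttachment below.
val settings = NlpSettings(ctx.args.meta.language, false, None)
for {
  customNer <- nerFile.makeFile(ctx.args.meta.collective) // Option[Path]
  sett       = settings.copy(regexNer = customNer)
  result    <- analyser.annotate(ctx.logger, sett, ctx.args.meta.collective, text)
} yield result
```

diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala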
b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 1fd2401a..f336132d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -4,9 +4,8 @@ import cats.data.OptionT import cats.effect._ import cats.implicits._ -import docspell.analysis.TextAnalyser import docspell.analysis.classifier.{ClassifierModel, TextClassifier} -import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile @@ -54,7 +53,7 @@ object TextAnalysis { analyser: TextAnalyser[F], nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordNerSettings(ctx.args.meta.language, false, None) + val settings = NlpSettings(ctx.args.meta.language, false, None) for { customNer <- nerFile.makeFile(ctx.args.meta.collective) sett = settings.copy(regexNer = customNer) diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index b9fe40c7..84caa840 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -1,10 +1,8 @@ package docspell.store.queries -import cats.data.OptionT import fs2.Stream -import docspell.common.ContactKind -import docspell.common.{Direction, Ident} +import docspell.common._ import docspell.store.qb.DSL._ import docspell.store.qb._ import docspell.store.records._ @@ -17,6 +15,7 @@ object QCollective { private val t = RTag.as("t") private val ro = ROrganization.as("o") private val rp = RPerson.as("p") + private val re = REquipment.as("e") private val rc = RContact.as("c") private val i = RItem.as("i") @@ -25,13 +24,37 @@ object QCollective { val empty = Names(Vector.empty, Vector.empty, Vector.empty) } - def allNames(collective: Ident): ConnectionIO[Names] = - (for { - orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name)) - pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name)) - equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name)) - } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))) - .getOrElse(Names.empty) + def allNames(collective: Ident, maxEntries: Int): ConnectionIO[Names] = { + val created = Column[Timestamp]("created", TableDef("")) + union( + Select( + select(ro.name.s, lit(1).as("kind"), ro.created.as(created)), + from(ro), + ro.cid === collective + ), + Select( + select(rp.name.s, lit(2).as("kind"), rp.created.as(created)), + from(rp), + rp.cid === collective + ), + Select( + select(re.name.s, lit(3).as("kind"), re.created.as(created)), + from(re), + re.cid === collective + ) + ).orderBy(created.desc) + .limit(Batch.limit(maxEntries)) + .build + .query[(String, Int)] + .streamWithChunkSize(maxEntries) + .fold(Names.empty) { case (names, (name, kind)) => + if (kind == 1) names.copy(org = names.org :+ name) + else if (kind == 2) names.copy(pers = names.pers :+ name) + else names.copy(equip = names.equip :+ name) + } + .compile + .lastOrError + } case class InsightData( incoming: Int, diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 40fe5eb2..c7e04b7b 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -11,6 
+11,7 @@ type Language
     = German
     | English
    | French
+    | Italian
 
 
 fromString : String -> Maybe Language
@@ -24,6 +25,8 @@ fromString str =
     else if str == "fra" || str == "fr" || str == "french" then
         Just French
 
+    else if str == "ita" || str == "it" || str == "italian" then
+        Just Italian
+
     else
         Nothing
 
@@ -40,6 +43,9 @@ toIso3 lang =
         French ->
             "fra"
 
+        Italian ->
+            "ita"
+
 
 toName : Language -> String
 toName lang =
@@ -53,7 +59,10 @@ toName lang =
         French ->
             "French"
 
+        Italian ->
+            "Italian"
+
 
 all : List Language
 all =
-    [ German, English, French ]
+    [ German, English, French, Italian ]
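The Elm decoder must stay in sync with the Scala side, which matches against `allNames` (name, iso3 and iso2). A quick sketch against `Language.fromString` from this diff, assuming `name` is the lowercased constructor name:

```scala
import docspell.common.Language

// Same inputs the Elm fromString accepts; all should resolve server-side too.
List("ita", "it", "italian").map(Language.fromString)
// => List(Right(Italian), Right(Italian), Right(Italian))
```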
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index 373a6aed..aae8d835 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -98,9 +98,13 @@ let
       };
       text-analysis = {
         max-length = 10000;
-        regex-ner = {
-          enabled = true;
-          file-cache-time = "1 minute";
+        nlp = {
+          mode = "full";
+          clear-interval = "15 minutes";
+          regex-ner = {
+            max-entries = 1000;
+            file-cache-time = "1 minute";
+          };
         };
         classification = {
           enabled = true;
@@ -118,7 +122,6 @@ let
           ];
         };
         working-dir = "/tmp/docspell-analysis";
-        clear-stanford-nlp-interval = "15 minutes";
       };
       processing = {
         max-due-date-years = 10;
@@ -772,47 +775,96 @@ in {
                   files.
                 '';
               };
-              clear-stanford-nlp-interval = mkOption {
-                type = types.str;
-                default = defaults.text-analysis.clear-stanford-nlp-interval;
-                description = ''
-                  Idle time after which the NLP caches are cleared to free
-                  memory. If <= 0 clearing the cache is disabled.
-                '';
-              };
-              regex-ner = mkOption {
+              nlp = mkOption {
                 type = types.submodule({
                   options = {
-                    enabled = mkOption {
-                      type = types.bool;
-                      default = defaults.text-analysis.regex-ner.enabled;
+                    mode = mkOption {
+                      type = types.str;
+                      default = defaults.text-analysis.nlp.mode;
                       description = ''
-                        Whether to enable custom NER annotation. This uses the address
-                        book of a collective as input for NER tagging (to automatically
-                        find correspondent and concerned entities). If the address book
-                        is large, this can be quite memory intensive and also makes text
-                        analysis slower. But it greatly improves accuracy. If this is
-                        false, NER tagging uses only statistical models (that also work
-                        quite well).
-
-                        This setting might be moved to the collective settings in the
-                        future.
+                        The mode for configuring NLP models:
+
+                        1. full – builds the complete pipeline
+                        2. basic - builds only the ner annotator
+                        3. regexonly - matches each entry in your address book via regexps
+                        4. disabled - doesn't use any stanford-nlp feature
+
+                        The full and basic variants rely on pre-built language models
+                        that are available for only three languages at the moment:
+                        German, English and French.
+
+                        Memory usage varies greatly among the languages. German has
+                        quite large models that require about 1G heap. So joex should
+                        run with -Xmx1500M at least when using mode=full.
+
+                        The basic variant does a quite good job for German and
+                        English. It might be worse for French, always depending on the
+                        type of text that is analysed. Joex should run with about 600M
+                        heap; here again the German language uses the most.
+
+                        The regexonly variant doesn't depend on a language. It roughly
+                        works by converting all entries in your address book into
+                        regexps and matches each one against the text. This can get
+                        memory intensive, too, when the address book grows large. This
+                        is included in the full and basic modes by default, but can be
+                        used independently by setting mode=regexonly.
+
+                        When mode=disabled, the whole nlp pipeline is disabled and you
+                        won't get any suggestions. Only what the classifier returns
+                        (if enabled).
                       '';
                     };
-                    file-cache-time = mkOption {
+
+                    clear-interval = mkOption {
                       type = types.str;
-                      default = defaults.text-analysis.ner-file-cache-time;
+                      default = defaults.text-analysis.nlp.clear-interval;
                       description = ''
-                        The NER annotation uses a file of patterns that is derived from
-                        a collective's address book. This is is the time how long this
-                        file will be kept until a check for a state change is done.
+                        Idle time after which the NLP caches are cleared to free
+                        memory. If <= 0 clearing the cache is disabled.
                       '';
                     };
+
+                    regex-ner = mkOption {
+                      type = types.submodule({
+                        options = {
+                          max-entries = mkOption {
+                            type = types.int;
+                            default = defaults.text-analysis.nlp.regex-ner.max-entries;
+                            description = ''
+                              Whether to enable custom NER annotation. This uses the
+                              address book of a collective as input for NER tagging (to
+                              automatically find correspondent and concerned entities). If
+                              the address book is large, this can be quite memory
+                              intensive and also makes text analysis much slower. But it
+                              improves accuracy and can be used independently of the
+                              language. If this is set to 0, it is effectively disabled
+                              and NER tagging uses only statistical models (that also work
+                              quite well, but are restricted to the languages mentioned
+                              above).
+
+                              Note, this is only relevant if nlp.mode is not
+                              "disabled".
+                            '';
+                          };
+                          file-cache-time = mkOption {
+                            type = types.str;
+                            default = defaults.text-analysis.nlp.regex-ner.file-cache-time;
+                            description = ''
+                              The NER annotation uses a file of patterns that is derived
+                              from a collective's address book. This determines how long
+                              this file will be kept until a check for a state change is
+                              done.
+                            '';
+                          };
+                        };
+                      });
+                      default = defaults.text-analysis.nlp.regex-ner;
+                      description = "";
+                    };
                   };
                 });
-                default = defaults.text-analysis.regex-ner;
-                description = "";
+                default = defaults.text-analysis.nlp;
+                description = "Configure NLP";
               };
 
               classification = mkOption {