From aa937797bed2411d8bea6a6f8fa80fa0e30a866b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Thu, 14 Jan 2021 00:55:19 +0100 Subject: [PATCH] Choose nlp mode in config file --- .../docspell/analysis/TextAnalyser.scala | 42 +++++++++++++++---- .../analysis/TextAnalysisConfig.scala | 8 +++- .../main/scala/docspell/common/NlpMode.scala | 23 ++++++++++ .../docspell/common/config/Implicits.scala | 3 ++ .../joex/src/main/resources/reference.conf | 29 ++++++++++--- .../src/main/scala/docspell/joex/Config.scala | 9 ++-- 6 files changed, 95 insertions(+), 19 deletions(-) create mode 100644 modules/common/src/main/scala/docspell/common/NlpMode.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 38491c3a..a9234027 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -1,14 +1,17 @@ package docspell.analysis +import cats.Applicative import cats.effect._ import cats.implicits._ import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier} import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind -import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings} +import docspell.analysis.nlp._ import docspell.common._ +import edu.stanford.nlp.pipeline.StanfordCoreNLP + trait TextAnalyser[F[_]] { def annotate( @@ -33,8 +36,8 @@ object TextAnalyser { blocker: Blocker ): Resource[F, TextAnalyser[F]] = Resource - .liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval)) - .map(cache => + .liftF(Nlp(cfg.nlpConfig)) + .map(stanfordNer => new TextAnalyser[F] { def annotate( logger: Logger[F], @@ -44,7 +47,7 @@ object TextAnalyser { ): F[TextAnalyser.Result] = for { input <- textLimit(logger, text) - tags0 <- stanfordNer(cacheKey, settings, input) + tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input)) 
tags1 <- contactNer(input) dates <- dateNer(settings.lang, input) list = tags0 ++ tags1 @@ -62,10 +65,6 @@ object TextAnalyser { s" Analysing only first ${cfg.maxLength} characters." ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) - : F[Vector[NerLabel]] = - StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text) - private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay { Contact.annotate(text) @@ -78,4 +77,31 @@ object TextAnalyser { } ) + private object Nlp { + + def apply[F[_]: Concurrent: Timer: BracketThrow]( + cfg: TextAnalysisConfig.NlpConfig + ): F[Input => F[Vector[NerLabel]]] = + cfg.mode match { + case NlpMode.Full => + PipelineCache.full(cfg.clearInterval).map(cache => full(cache)) + case NlpMode.Basic => + PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache)) + case NlpMode.Disabled => + Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F]) + } + + final case class Input(key: Ident, settings: StanfordNerSettings, text: String) + + def full[F[_]: BracketThrow]( + cache: PipelineCache[F, StanfordCoreNLP] + )(input: Input): F[Vector[NerLabel]] = + StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + + def basic[F[_]: BracketThrow]( + cache: PipelineCache[F, BasicCRFAnnotator.Annotator] + )(input: Input): F[Vector[NerLabel]] = + BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text) + + } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index 2dbfbfc4..abc92043 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,10 +1,16 @@ package docspell.analysis +import docspell.analysis.TextAnalysisConfig.NlpConfig import 
docspell.analysis.classifier.TextClassifierConfig import docspell.common._ case class TextAnalysisConfig( maxLength: Int, - clearStanfordPipelineInterval: Duration, + nlpConfig: NlpConfig, classifier: TextClassifierConfig ) + +object TextAnalysisConfig { + + case class NlpConfig(clearInterval: Duration, mode: NlpMode) +} diff --git a/modules/common/src/main/scala/docspell/common/NlpMode.scala b/modules/common/src/main/scala/docspell/common/NlpMode.scala new file mode 100644 index 00000000..36ebf7db --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/NlpMode.scala @@ -0,0 +1,23 @@ +package docspell.common + +sealed trait NlpMode { self: Product => + + def name: String = + self.productPrefix +} +object NlpMode { + case object Full extends NlpMode + case object Basic extends NlpMode + case object Disabled extends NlpMode + + def fromString(name: String): Either[String, NlpMode] = + name.toLowerCase match { + case "full" => Right(Full) + case "basic" => Right(Basic) + case "disabled" => Right(Disabled) + case _ => Left(s"Unknown nlp-mode: $name") + } + + def unsafeFromString(name: String): NlpMode = + fromString(name).fold(sys.error, identity) +} diff --git a/modules/common/src/main/scala/docspell/common/config/Implicits.scala b/modules/common/src/main/scala/docspell/common/config/Implicits.scala index c99c430a..9dab40dc 100644 --- a/modules/common/src/main/scala/docspell/common/config/Implicits.scala +++ b/modules/common/src/main/scala/docspell/common/config/Implicits.scala @@ -44,6 +44,9 @@ object Implicits { implicit val priorityReader: ConfigReader[Priority] = ConfigReader[String].emap(reason(Priority.fromString)) + implicit val nlpModeReader: ConfigReader[NlpMode] = + ConfigReader[String].emap(reason(NlpMode.fromString)) + def reason[A: ClassTag]( f: String => Either[String, A] ): String => Either[FailureReason, A] = diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 4aeb5a1b..583b40b1 
100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -277,12 +277,27 @@ docspell.joex { # files. working-dir = ${java.io.tmpdir}"/docspell-analysis" - # The StanfordCoreNLP library caches language models which - # requires quite some amount of memory. Setting this interval to a - # positive duration, the cache is cleared after this amount of - # idle time. Set it to 0 to disable it if you have enough memory, - # processing will be faster. - clear-stanford-nlp-interval = "15 minutes" + nlp-config { + # The StanfordCoreNLP library caches language models which + # requires quite some amount of memory. Setting this interval to a + # positive duration, the cache is cleared after this amount of + # idle time. Set it to 0 to disable it if you have enough memory, + # processing will be faster. + # + # This only has an effect if mode != disabled. + clear-interval = "15 minutes" + + # The mode for configuring NLP models. Currently 3 are available: + # + # 1. full - builds the complete pipeline, run with -Xmx1500M or more + # 2. basic - builds only the NER annotator, run with -Xmx600M or more + # 3. disabled - doesn't use any stanford-nlp feature + # + # The basic variant does quite a good job for German and + # English. It might be worse for French, always depending on the + # type of text that is analysed. + mode = full + } regex-ner { # Whether to enable custom NER annotation. This uses the address @@ -295,6 +310,8 @@ docspell.joex { # # This setting might be moved to the collective settings in the # future. + # + # Note: this is only relevant if nlp-config.mode = full. 
enabled = true # The NER annotation uses a file of patterns that is derived from diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 8fba3582..5b2bccc5 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -4,7 +4,8 @@ import java.nio.file.Path import cats.data.NonEmptyList -import docspell.analysis.{TextAnalysisConfig, classifier} +import docspell.analysis.TextAnalysisConfig +import docspell.analysis.classifier.TextClassifierConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -59,7 +60,7 @@ object Config { case class TextAnalysis( maxLength: Int, workingDir: Path, - clearStanfordNlpInterval: Duration, + nlpConfig: TextAnalysisConfig.NlpConfig, regexNer: RegexNer, classification: Classification ) { @@ -67,8 +68,8 @@ object Config { def textAnalysisConfig: TextAnalysisConfig = TextAnalysisConfig( maxLength, - clearStanfordNlpInterval, - classifier.TextClassifierConfig( + nlpConfig, + TextClassifierConfig( workingDir, NonEmptyList .fromList(classification.classifiers)