From 6a1297fc956f5278a5fc98cc5d92e75d078d32af Mon Sep 17 00:00:00 2001
From: Eike Kettner
Date: Fri, 27 Mar 2020 22:54:49 +0100
Subject: [PATCH] Add a limit for text analysis

Text analysis now looks at only the first `max-length` characters of an
attachment's extracted text. Longer text is cut to that length and an
info message is logged.

* analysis: add `TextAnalyser`, which applies the limit and then runs
  `StanfordNerClassifier.nerAnnotate`, `Contact.annotate` and
  `DateFind.findDates` on the (possibly shortened) text. Its only
  setting lives in the new `TextAnalysisConfig(maxLength)`.
* joex: add the `text-analysis.max-length` setting (default 10000) and
  hand it via `Config.textAnalysis` to `ProcessItem.analysisOnly` and
  `TextAnalysis.apply`, which both gain a leading config parameter list.
* joex: `TextAnalysis` drops its own NER/contact/date helpers and
  delegates to `TextAnalyser`; `annotateAttachment` now also takes the
  logger and the analyser. An attachment without content is analysed as
  the empty string.
* nix: expose `text-analysis.max-length` as a module option with the
  same default.
---
Review notes (not part of the commit message):
* `textLimit` has no special case for a non-positive `max-length`; with
  such a value every non-empty text is cut to the empty string by `take`.
  NOTE(review): confirm whether "0 disables the limit" is wanted.

 .../docspell/analysis/TextAnalyser.scala      | 62 +++++++++++++++++
 .../analysis/TextAnalysisConfig.scala         |  5 ++
 .../joex/src/main/resources/reference.conf    | 12 ++++
 .../src/main/scala/docspell/joex/Config.scala |  2 +
 .../docspell/joex/process/ProcessItem.scala   |  9 ++-
 .../docspell/joex/process/TextAnalysis.scala  | 66 +++++++------------
 nix/module-joex.nix                           | 26 ++++++++
 7 files changed, 137 insertions(+), 45 deletions(-)
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala

diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
new file mode 100644
index 00000000..881dbe23
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -0,0 +1,62 @@
+package docspell.analysis
+
+import cats.effect._
+import cats.implicits._
+import docspell.analysis.contact.Contact
+import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.common._
+
+trait TextAnalyser[F[_]] {
+
+  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+
+}
+object TextAnalyser {
+
+  case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
+
+    def all: Vector[NerLabel] =
+      labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
+  }
+
+  def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
+    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
+      def annotate(
+          logger: Logger[F],
+          lang: Language,
+          text: String
+      ): F[TextAnalyser.Result] =
+        for {
+          input <- textLimit(logger, text)
+          tags0 <- stanfordNer(lang, input)
+          tags1 <- contactNer(input)
+          dates <- dateNer(lang, input)
+          list = tags0 ++ tags1
+          spans = NerLabelSpan.build(list)
+        } yield Result(spans ++ list, dates)
+
+      private def textLimit(logger: Logger[F], text: String): F[String] =
+        if (text.length <= cfg.maxLength) text.pure[F]
+        else
+          logger.info(
+            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+              s" Analysing only first ${cfg.maxLength} characters."
+          ) *> text.take(cfg.maxLength).pure[F]
+
+      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
+        Sync[F].delay {
+          StanfordNerClassifier.nerAnnotate(lang)(text)
+        }
+
+      private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay {
+        Contact.annotate(text)
+      }
+
+      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+        Sync[F].delay {
+          DateFind.findDates(text, lang).toVector
+        }
+    })
+
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
new file mode 100644
index 00000000..577f6753
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
@@ -0,0 +1,5 @@
+package docspell.analysis
+
+case class TextAnalysisConfig(
+    maxLength: Int
+)
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index c33d727c..b05685a2 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -193,6 +193,18 @@ docspell.joex {
     }
   }
 
+  # Settings for text analysis
+  text-analysis {
+    # Maximum length of text to be analysed.
+    #
+    # All text to analyse must fit into RAM. A large document may take
+    # too much heap. Also, most important information is at the
+    # beginning of a document, so in most cases the first two pages
+    # should suffice. Default is 10000, which are about 2-3 pages
+    # (just a rough guess, of course).
+    max-length = 10000
+  }
+
   # Configuration for converting files into PDFs.
   #
   # Most of it is delegated to external tools, which can be configured
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 1d678766..d72abcee 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -1,5 +1,6 @@
 package docspell.joex
 
+import docspell.analysis.TextAnalysisConfig
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -16,6 +17,7 @@ case class Config(
     periodicScheduler: PeriodicSchedulerConfig,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
+    textAnalysis: TextAnalysisConfig,
     convert: ConvertConfig
 )
 
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index 048d4ac2..66d1fafa 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -2,6 +2,7 @@ package docspell.joex.process
 
 import cats.effect._
 import docspell.common.ProcessItemArgs
+import docspell.analysis.TextAnalysisConfig
 import docspell.joex.scheduler.Task
 import docspell.joex.Config
 
@@ -14,13 +15,15 @@ object ProcessItem {
       .flatMap(ConvertPdf(cfg.convert, _))
       .flatMap(TextExtraction(cfg.extraction, _))
       .flatMap(Task.setProgress(50))
-      .flatMap(analysisOnly[F])
+      .flatMap(analysisOnly[F](cfg.textAnalysis))
       .flatMap(Task.setProgress(75))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))
 
-  def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](item)
+  def analysisOnly[F[_]: Sync](
+      cfg: TextAnalysisConfig
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    TextAnalysis[F](cfg)(item)
       .flatMap(FindProposal[F])
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index a1c16e07..554d1f40 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -2,9 +2,7 @@ package docspell.joex.process
 
 import cats.implicits._
 import cats.effect.Sync
-import docspell.analysis.nlp._
-import docspell.analysis.contact._
-import docspell.analysis.date._
+import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
@@ -12,50 +10,34 @@ import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
 
-  def apply[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  def apply[F[_]: Sync](
+      cfg: TextAnalysisConfig
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      for {
-        _ <- ctx.logger.info("Starting text analysis")
-        s <- Duration.stopTime[F]
-        t <- item.metas.toList.traverse(annotateAttachment[F](ctx.args.meta.language))
-        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-        _ <- t.traverse(m =>
-          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-        )
-        e <- s
-        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-        v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+      TextAnalyser.create[F](cfg).use { analyser =>
+        for {
+          _ <- ctx.logger.info("Starting text analysis")
+          s <- Duration.stopTime[F]
+          t <- item.metas.toList
+            .traverse(annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser))
+          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
+          _ <- t.traverse(m =>
+            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+          )
+          e <- s
+          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+          v = t.toVector
+        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+      }
     }
 
   def annotateAttachment[F[_]: Sync](
-      lang: Language
+      lang: Language,
+      logger: Logger[F],
+      analyser: TextAnalyser[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
     for {
-      list0 <- stanfordNer[F](lang, rm)
-      list1 <- contactNer[F](rm)
-      list = list0 ++ list1
-      spans = NerLabelSpan.build(list.toSeq)
-      dates <- dateNer[F](rm, lang)
-    } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
-
-  def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
-    Sync[F].delay {
-      rm.content.map(StanfordNerClassifier.nerAnnotate(lang)).getOrElse(Vector.empty)
-    }
-
-  def contactNer[F[_]: Sync](rm: RAttachmentMeta): F[Vector[NerLabel]] = Sync[F].delay {
-    rm.content.map(Contact.annotate).getOrElse(Vector.empty)
-  }
-
-  def dateNer[F[_]: Sync](rm: RAttachmentMeta, lang: Language): F[AttachmentDates] =
-    Sync[F].delay {
-      AttachmentDates(
-        rm,
-        rm.content
-          .map(txt => DateFind.findDates(txt, lang).toVector)
-          .getOrElse(Vector.empty)
-      )
-    }
+      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+    } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
 
 }
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index 1a79427f..1c81addc 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -78,6 +78,9 @@ let
         };
       };
     };
+    text-analysis = {
+      max-length = 10000;
+    };
     convert = {
       chunk-size = 524288;
       max-image-size = 14000000;
@@ -530,6 +533,29 @@ in {
         '';
       };
 
+      text-analysis = mkOption {
+        type = types.submodule({
+          options = {
+            max-length = mkOption {
+              type = types.int;
+              default = defaults.text-analysis.max-length;
+              description = ''
+                Maximum length of text to be analysed.
+
+                All text to analyse must fit into RAM. A large document may take
+                too much heap. Also, most important information is at the
+                beginning of a document, so in most cases the first two pages
+                should suffice. Default is 10000, which are about 2-3 pages
+                (a rough guess).
+              '';
+            };
+
+          };
+        });
+        default = defaults.text-analysis;
+        description = "Settings for text analysis";
+      };
+
       convert = mkOption {
         type = types.submodule({
           options = {