From 6a1297fc956f5278a5fc98cc5d92e75d078d32af Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Fri, 27 Mar 2020 22:54:49 +0100
Subject: [PATCH] Add a limit for text analysis

---
 .../docspell/analysis/TextAnalyser.scala      | 62 +++++++++++++++++
 .../analysis/TextAnalysisConfig.scala         |  5 ++
 .../joex/src/main/resources/reference.conf    | 12 ++++
 .../src/main/scala/docspell/joex/Config.scala |  2 +
 .../docspell/joex/process/ProcessItem.scala   |  9 ++-
 .../docspell/joex/process/TextAnalysis.scala  | 66 +++++++------------
 nix/module-joex.nix                           | 26 ++++++++
 7 files changed, 137 insertions(+), 45 deletions(-)
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala

diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
new file mode 100644
index 00000000..881dbe23
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -0,0 +1,62 @@
+package docspell.analysis
+
+import cats.effect._
+import cats.implicits._
+import docspell.analysis.contact.Contact
+import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.common._
+
+trait TextAnalyser[F[_]] {
+  /** Analyses `text` in the given language and returns the labels and dates found in it. */
+  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+
+}
+object TextAnalyser {
+  /** Outcome of analysing one text: the labels and the dates found in it. */
+  case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
+    /** `labels` followed by one label per date whose `label` is the date as string. */
+    def all: Vector[NerLabel] =
+      labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
+  }
+
+  /** Creates an analyser that looks only at the first `cfg.maxLength` characters of a text.
+    * A non-positive `cfg.maxLength` disables this limit, the whole text is analysed then.
+    */
+  def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
+    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
+      def annotate(
+          logger: Logger[F],
+          lang: Language,
+          text: String
+      ): F[TextAnalyser.Result] =
+        for {
+          input <- textLimit(logger, text)
+          tags0 <- stanfordNer(lang, input)
+          tags1 <- contactNer(input)
+          dates <- dateNer(lang, input)
+          list  = tags0 ++ tags1
+          spans = NerLabelSpan.build(list)
+        } yield Result(spans ++ list, dates)
+
+      // Cuts over-long text to the limit and logs it. A non-positive limit means "no limit":
+      // `take` with such a count returns "", which would silently discard the whole text.
+      private def textLimit(logger: Logger[F], text: String): F[String] =
+        if (cfg.maxLength <= 0 || text.length <= cfg.maxLength) text.pure[F]
+        else
+          logger.info(
+            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+              s" Analysing only first ${cfg.maxLength} characters."
+          ) *> text.take(cfg.maxLength).pure[F]
+
+      // Each annotator call is suspended with `Sync[F].delay`.
+      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(StanfordNerClassifier.nerAnnotate(lang)(text))
+
+      private def contactNer(text: String): F[Vector[NerLabel]] =
+        Sync[F].delay(Contact.annotate(text))
+
+      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+        Sync[F].delay(DateFind.findDates(text, lang).toVector)
+    })
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
new file mode 100644
index 00000000..577f6753
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
@@ -0,0 +1,5 @@
+package docspell.analysis
+
+/** Settings for the text analysis.
+  * @param maxLength maximum number of characters of a text that get analysed */
+case class TextAnalysisConfig(maxLength: Int)
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index c33d727c..b05685a2 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -193,6 +193,18 @@ docspell.joex {
     }
   }
 
+  # Settings for text analysis
+  text-analysis {
+    # Maximum length of text to be analysed.
+    #
+    # All text to analyse must fit into RAM. A large document may take
+    # too much heap. Also, the most important information is usually at
+    # the beginning of a document, so in most cases the first two pages
+    # should suffice. The default is 10000 characters, which is about
+    # 2-3 pages (just a rough guess, of course).
+    max-length = 10000
+  }
+
   # Configuration for converting files into PDFs.
   #
   # Most of it is delegated to external tools, which can be configured
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 1d678766..d72abcee 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -1,5 +1,6 @@
 package docspell.joex
 
+import docspell.analysis.TextAnalysisConfig
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -16,6 +17,7 @@ case class Config(
     periodicScheduler: PeriodicSchedulerConfig,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
+    textAnalysis: TextAnalysisConfig,
     convert: ConvertConfig
 )
 
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index 048d4ac2..66d1fafa 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -2,6 +2,7 @@ package docspell.joex.process
 
 import cats.effect._
 import docspell.common.ProcessItemArgs
+import docspell.analysis.TextAnalysisConfig
 import docspell.joex.scheduler.Task
 import docspell.joex.Config
 
@@ -14,13 +15,15 @@ object ProcessItem {
       .flatMap(ConvertPdf(cfg.convert, _))
       .flatMap(TextExtraction(cfg.extraction, _))
       .flatMap(Task.setProgress(50))
-      .flatMap(analysisOnly[F])
+      .flatMap(analysisOnly[F](cfg.textAnalysis))
       .flatMap(Task.setProgress(75))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))
 
-  def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](item)
+  def analysisOnly[F[_]: Sync](
+      cfg: TextAnalysisConfig
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    TextAnalysis[F](cfg)(item)
       .flatMap(FindProposal[F])
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index a1c16e07..554d1f40 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -2,9 +2,7 @@ package docspell.joex.process
 
 import cats.implicits._
 import cats.effect.Sync
-import docspell.analysis.nlp._
-import docspell.analysis.contact._
-import docspell.analysis.date._
+import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
@@ -12,50 +10,34 @@ import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
 
-  def apply[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  def apply[F[_]: Sync](
+      cfg: TextAnalysisConfig // handed to TextAnalyser.create below
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      for {
-        _ <- ctx.logger.info("Starting text analysis")
-        s <- Duration.stopTime[F]
-        t <- item.metas.toList.traverse(annotateAttachment[F](ctx.args.meta.language))
-        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-        _ <- t.traverse(m =>
-          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-        )
-        e <- s
-        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-        v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+      TextAnalyser.create[F](cfg).use { analyser => // analyser is only used inside this block
+        for {
+          _ <- ctx.logger.info("Starting text analysis")
+          s <- Duration.stopTime[F] // s is run again below (e <- s) to get the logged duration
+          t <- item.metas.toList
+            .traverse(annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser))
+          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") // content is blanked for the log
+          _ <- t.traverse(m =>
+            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+          )
+          e <- s
+          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+          v = t.toVector
+        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) // _1: annotated metas, _2: their dates
+      }
     }
 
   def annotateAttachment[F[_]: Sync](
-      lang: Language
+      lang: Language,
+      logger: Logger[F],
+      analyser: TextAnalyser[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
     for {
-      list0 <- stanfordNer[F](lang, rm)
-      list1 <- contactNer[F](rm)
-      list  = list0 ++ list1
-      spans = NerLabelSpan.build(list.toSeq)
-      dates <- dateNer[F](rm, lang)
-    } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
-
-  def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
-    Sync[F].delay {
-      rm.content.map(StanfordNerClassifier.nerAnnotate(lang)).getOrElse(Vector.empty)
-    }
-
-  def contactNer[F[_]: Sync](rm: RAttachmentMeta): F[Vector[NerLabel]] = Sync[F].delay {
-    rm.content.map(Contact.annotate).getOrElse(Vector.empty)
-  }
-
-  def dateNer[F[_]: Sync](rm: RAttachmentMeta, lang: Language): F[AttachmentDates] =
-    Sync[F].delay {
-      AttachmentDates(
-        rm,
-        rm.content
-          .map(txt => DateFind.findDates(txt, lang).toVector)
-          .getOrElse(Vector.empty)
-      )
-    }
+      labels <- analyser.annotate(logger, lang, rm.content.getOrElse("")) // missing content is analysed as empty text
+    } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) // dates go into the labels and are returned separately
 
 
 }
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index 1a79427f..1c81addc 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -78,6 +78,9 @@ let
         };
       };
     };
+    text-analysis = {
+      max-length = 10000;
+    };
     convert = {
       chunk-size = 524288;
       max-image-size = 14000000;
@@ -530,6 +533,29 @@ in {
         '';
       };
 
+      text-analysis = mkOption {
+        type = types.submodule({
+          options = {
+            max-length = mkOption {
+              type = types.int; # NOTE(review): this type also admits zero and negative values - confirm intended
+              default = defaults.text-analysis.max-length; # presumably the 10000 set in this module's defaults - confirm
+              description = ''
+                Maximum length of text to be analysed.
+
+                All text to analyse must fit into RAM. A large document may take
+                too much heap. Also, most important information is at the
+                beginning of a document, so in most cases the first two pages
+                should suffice. Default is 10000, which are about 2-3 pages
+                (a rough guess).
+              '';
+            };
+
+          };
+        });
+        default = defaults.text-analysis;
+        description = "Settings for text analysis";
+      };
+
       convert = mkOption {
         type = types.submodule({
           options = {