Make the text length limit optional

2025-08-09 03:04:52 +00:00 · 2021-01-22 22:56:51 +01:00
parent 8dd1672c8c
commit c7e850116f
5 changed files with 35 additions and 17 deletions
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@ -59,7 +59,9 @@ object TextAnalyser {
            new StanfordTextClassifier[F](cfg.classifier, blocker)

          private def textLimit(logger: Logger[F], text: String): F[String] =
-            if (text.length <= cfg.maxLength) text.pure[F]
+            if (cfg.maxLength <= 0)
+              logger.debug("Max text length limit disabled.") *> text.pure[F]
+            else if (text.length <= cfg.maxLength || cfg.maxLength <= 0) text.pure[F]
            else
              logger.info(
                s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -269,9 +269,13 @@ docspell.joex {
    # All text to analyse must fit into RAM. A large document may take
    # too much heap. Also, most important information is at the
    # beginning of a document, so in most cases the first two pages
-    # should suffice. Default is 8000, which are about 2-3 pages (just
-    # a rough guess, of course).
-    max-length = 8000
+    # should suffice. Default is 5000, which is about 2 pages (just a
+    # rough guess, of course). For my data, more than 80% of the
+    # documents are shorter than 5000 characters.
+    #
+    # This value applies to NLP and the classifier. If this value is
+    # <= 0, the limit is disabled.
+    max-length = 5000

    # A working directory for the analyser to store temporary/working
    # files.
@ -363,6 +367,10 @@ docspell.joex {
      # If concerned with memory consumption, this restricts the
      # number of items to consider. More are better for training. A
      # negative value or zero means to train on all items.
+      #
+      # This limit and `text-analysis.max-length` together determine
+      # how much memory is required. On weaker hardware, it is
+      # advisable to experiment with these values.
      item-count = 600

      # These settings are used to configure the classifier. If
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@ -578,7 +578,7 @@ object QItem {
        )
      )(
        Select(
-          select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
+          select(contentMax(maxLen), tagsTid.s, tagsName.s),
          from(i)
            .innerJoin(a, a.itemId === i.id)
            .innerJoin(m, a.id === m.id)
@ -597,7 +597,7 @@ object QItem {
  ): ConnectionIO[TextAndTag] =
    readTextAndTag(collective, itemId, pageSep) {
      Select(
-        select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
+        select(contentMax(maxLen), org.oid.s, org.name.s),
        from(i)
          .innerJoin(a, a.itemId === i.id)
          .innerJoin(m, m.id === a.id)
@ -614,7 +614,7 @@ object QItem {
  ): ConnectionIO[TextAndTag] =
    readTextAndTag(collective, itemId, pageSep) {
      Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
        from(i)
          .innerJoin(a, a.itemId === i.id)
          .innerJoin(m, m.id === a.id)
@ -631,7 +631,7 @@ object QItem {
  ): ConnectionIO[TextAndTag] =
    readTextAndTag(collective, itemId, pageSep) {
      Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
        from(i)
          .innerJoin(a, a.itemId === i.id)
          .innerJoin(m, m.id === a.id)
@ -648,7 +648,7 @@ object QItem {
  ): ConnectionIO[TextAndTag] =
    readTextAndTag(collective, itemId, pageSep) {
      Select(
-        select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
+        select(contentMax(maxLen), equip.eid.s, equip.name.s),
        from(i)
          .innerJoin(a, a.itemId === i.id)
          .innerJoin(m, m.id === a.id)
@ -657,6 +657,12 @@ object QItem {
      )
    }

+  /** Builds the select expression for the `m.content` column,
+    * optionally limited in length.
+    *
+    * A positive `maxLen` wraps the column in `substring` with start
+    * `0` and length `maxLen`. A `maxLen` of zero or less disables the
+    * limit: a debug message is logged and `m.content.s` is selected
+    * as is.
+    *
+    * NOTE(review): the start index handed to `substring` is 0, while
+    * SQL's SUBSTRING is commonly 1-based -- confirm how the query
+    * DSL renders this.
+    */
+  private def contentMax(maxLen: Int): SelectExpr =
+    if (maxLen > 0) substring(m.content.s, 0, maxLen).s
+    else {
+      logger.debug("Max text length limit disabled")
+      m.content.s
+    }
+
  private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
      q: Select
  ): ConnectionIO[TextAndTag] =