From c7e850116f14a5287ecd1ec2ecafcea1afa6b38b Mon Sep 17 00:00:00 2001
From: Eike Kettner
Date: Fri, 22 Jan 2021 22:56:51 +0100
Subject: [PATCH] Make the text length limit optional

---
 .../scala/docspell/analysis/TextAnalyser.scala |  4 +++-
 modules/joex/src/main/resources/reference.conf | 14 +++++++++++---
 .../scala/docspell/store/queries/QItem.scala   | 16 +++++++++++-----
 website/site/content/docs/configure/_index.md  |  4 ++--
 .../site/content/docs/joex/file-processing.md  | 14 ++++++++------
 5 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
index c2deafce..bf3bd8ff 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -59,7 +59,9 @@ object TextAnalyser {
       new StanfordTextClassifier[F](cfg.classifier, blocker)
 
     private def textLimit(logger: Logger[F], text: String): F[String] =
-      if (text.length <= cfg.maxLength) text.pure[F]
+      if (cfg.maxLength <= 0)
+        logger.debug("Max text length limit disabled.") *> text.pure[F]
+      else if (text.length <= cfg.maxLength) text.pure[F]
       else
         logger.info(
           s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 7f2ee7d0..44274014 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -269,9 +269,13 @@ docspell.joex {
       # All text to analyse must fit into RAM. A large document may take
       # too much heap. Also, most important information is at the
       # beginning of a document, so in most cases the first two pages
-      # should suffice. Default is 8000, which are about 2-3 pages (just
-      # a rough guess, of course).
-      max-length = 8000
+      # should suffice. Default is 5000, which is about 2 pages (just a
+      # rough guess, of course). For my data, more than 80% of the
+      # documents are less than 5000 characters.
+      #
+      # This value applies to NLP and the classifier. If this value is
+      # <= 0, the limit is disabled.
+      max-length = 5000
 
       # A working directory for the analyser to store temporary/working
       # files.
@@ -363,6 +367,10 @@ docspell.joex {
         # If concerned with memory consumption, this restricts the
         # number of items to consider. More are better for training. A
         # negative value or zero means to train on all items.
+        #
+        # This limit and `text-analysis.max-length` define how much
+        # memory is required. On weaker hardware, it is advisable to
+        # tune these values.
         item-count = 600
 
         # These settings are used to configure the classifier. If
diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
index b8ee49e2..cae01000 100644
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@@ -578,7 +578,7 @@ object QItem {
       )
     )(
       Select(
-        select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
+        select(contentMax(maxLen), tagsTid.s, tagsName.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, a.id === m.id)
@@ -597,7 +597,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
+        select(contentMax(maxLen), org.oid.s, org.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -614,7 +614,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -631,7 +631,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -648,7 +648,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
      Select(
-        select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
+        select(contentMax(maxLen), equip.eid.s, equip.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -657,6 +657,12 @@ object QItem {
       )
     }
 
+  private def contentMax(maxLen: Int): SelectExpr =
+    if (maxLen <= 0) {
+      logger.debug("Max text length limit disabled")
+      m.content.s
+    } else substring(m.content.s, 0, maxLen).s
+
   private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
       q: Select
   ): ConnectionIO[TextAndTag] =
diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md
index 81e697a6..a7bf0765 100644
--- a/website/site/content/docs/configure/_index.md
+++ b/website/site/content/docs/configure/_index.md
@@ -312,9 +312,9 @@
 most should be used for learning. The default settings should work
 well for most cases. However, it always depends on the amount of data
 and the machine that runs joex. For example, by default the documents
 to learn from are limited to 600 (`classification.item-count`) and
-every text is cut after 8000 characters (`text-analysis.max-length`).
+every text is cut after 5000 characters (`text-analysis.max-length`).
 This is fine if *most* of your documents are small and only a few are
-near 8000 characters). But if *all* your documents are very large, you
+near 5000 characters. But if *all* your documents are very large, you
 probably need to either assign more heap memory or go down with the
 limits.
diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md
index 506dd8e0..f1343dea 100644
--- a/website/site/content/docs/joex/file-processing.md
+++ b/website/site/content/docs/joex/file-processing.md
@@ -367,13 +367,15 @@ Training the model is a rather resource intensive process.
 How much memory is needed, depends on the number of documents to
 learn from and the size of text to consider. Both can be limited in
 the config file. The default values might require a heap of 1.4G if you have many and
-large documents. The maximum text length is about 8000 characters, if
+large documents. The maximum text length is set to 5000 characters. If
 *all* your documents would be that large, adjusting these values might
-be necessary. But using an existing model is quite cheap and fast. A
-model is trained periodically, the schedule can be defined in your
-collective settings. For tags, you can define the tag categories that
-should be trained (or that should not be trained). Docspell assigns
-one tag from all tags in a category to a new document.
+be necessary. A model is trained periodically; the schedule can be
+defined in your collective settings. Although learning is resource
+intensive, using an existing model is quite cheap and fast.
+
+For tags, you can define the tag categories that should be trained (or
+that should not be trained). Docspell assigns one tag (or none) from
+all tags in a category to a new document.
 
 Note that tags that can not be derived from the text only, should
 probably be excluded from learning. For example, if you tag all your
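
The patch above makes a value of `max-length <= 0` mean "no limit". As a hypothetical usage sketch (the key paths are taken from the `reference.conf` hunk above; the concrete values are illustrative, not recommendations), a local `docspell.conf` could disable the limit like this:

```
docspell.joex {
  text-analysis {
    # A value <= 0 disables the text length limit, so NLP and the
    # classifier see the full text of each document. The heap must then
    # be large enough to hold the text of the biggest documents.
    max-length = -1

    classification {
      # Together with max-length, this bounds the memory needed for
      # training; on weaker hardware, lower both values instead.
      item-count = 200
    }
  }
}
```

For a rough sense of scale under the defaults: 600 items times 5000 characters is about 3 million characters, only a few megabytes of raw text, which suggests that most of the 1.4G heap estimate above goes to the classifier's training structures rather than to the text itself.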