Merge pull request #586 from eikek/optional-text-limit

Make the text length limit optional
mergify[bot] 2021-01-22 23:47:38 +00:00 committed by GitHub
commit e6d67c368b
5 changed files with 35 additions and 17 deletions

@@ -59,7 +59,9 @@ object TextAnalyser {
       new StanfordTextClassifier[F](cfg.classifier, blocker)
 
     private def textLimit(logger: Logger[F], text: String): F[String] =
-      if (text.length <= cfg.maxLength) text.pure[F]
+      if (cfg.maxLength <= 0)
+        logger.debug("Max text length limit disabled.") *> text.pure[F]
+      else if (text.length <= cfg.maxLength || cfg.maxLength <= 0) text.pure[F]
       else
         logger.info(
           s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +

@@ -269,9 +269,13 @@ docspell.joex {
     # All text to analyse must fit into RAM. A large document may take
     # too much heap. Also, most important information is at the
     # beginning of a document, so in most cases the first two pages
-    # should suffice. Default is 8000, which are about 2-3 pages (just
-    # a rough guess, of course).
-    max-length = 8000
+    # should suffice. Default is 5000, which is about 2 pages (just a
+    # rough guess, of course). For my data, more than 80% of the
+    # documents are less than 5000 characters.
+    #
+    # This value applies to nlp and the classifier. If this value is
+    # <= 0, the limit is disabled.
+    max-length = 5000
 
     # A working directory for the analyser to store temporary/working
     # files.
@@ -363,6 +367,10 @@ docspell.joex {
       # If concerned with memory consumption, this restricts the
       # number of items to consider. More are better for training. A
       # negative value or zero means to train on all items.
+      #
+      # This limit and `text-analysis.max-length` define how much
+      # memory is required. On weaker hardware, it is advised to play
+      # with these values.
       item-count = 600
 
       # These settings are used to configure the classifier. If
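The added comment points out that `classification.item-count` and `text-analysis.max-length` together bound how much raw training text is held in memory: at most item-count times max-length characters, at roughly 1 to 2 bytes per character on the JVM, with the NLP and classifier models needing additional heap on top. A back-of-envelope Scala sketch, purely for illustration and not part of docspell:

    // Rough upper bound on raw training text kept in memory with the
    // default settings; model and feature overhead is not included.
    object TrainingTextEstimate {
      def approxTextBytes(itemCount: Int, maxLength: Int, bytesPerChar: Int = 2): Long =
        itemCount.toLong * maxLength.toLong * bytesPerChar

      def main(args: Array[String]): Unit = {
        val bytes = approxTextBytes(600, 5000)
        println(f"at most ~${bytes / 1024.0 / 1024.0}%.1f MiB of raw text") // ~5.7 MiB
      }
    }

The heap needed during training is considerably larger than this raw text, since the classifier builds feature data structures from it, which is why the documentation below mentions a heap of around 1.4G with the defaults.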

@@ -578,7 +578,7 @@ object QItem {
         )
       )(
         Select(
-          select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
+          select(contentMax(maxLen), tagsTid.s, tagsName.s),
           from(i)
             .innerJoin(a, a.itemId === i.id)
             .innerJoin(m, a.id === m.id)
@@ -597,7 +597,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
+        select(contentMax(maxLen), org.oid.s, org.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -614,7 +614,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -631,7 +631,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
+        select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -648,7 +648,7 @@ object QItem {
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
+        select(contentMax(maxLen), equip.eid.s, equip.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -657,6 +657,12 @@ object QItem {
       )
   }
 
+  private def contentMax(maxLen: Int): SelectExpr =
+    if (maxLen <= 0) {
+      logger.debug("Max text length limit disabled")
+      m.content.s
+    } else substring(m.content.s, 0, maxLen).s
+
   private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
     q: Select
   ): ConnectionIO[TextAndTag] =
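The new `contentMax` helper chooses between selecting the full `content` column and a `substring` of it, depending on `maxLen`, so a non-positive limit also disables truncation at the SQL level. As a plain illustration of what this amounts to (a sketch only, independent of docspell's query DSL; `contentColumn` is a hypothetical helper that just assembles the SQL fragment as a string, with arguments mirroring the diff above):

    // Sketch: pick the column expression conditionally, as contentMax does.
    // maxLen <= 0 selects the whole content column, i.e. no truncation.
    object ContentMaxSketch {
      def contentColumn(maxLen: Int): String =
        if (maxLen <= 0) "m.content"
        else s"substring(m.content, 0, $maxLen)"

      def main(args: Array[String]): Unit = {
        println(contentColumn(-1))   // m.content
        println(contentColumn(5000)) // substring(m.content, 0, 5000)
      }
    }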

@@ -312,9 +312,9 @@ most should be used for learning. The default settings should work
 well for most cases. However, it always depends on the amount of data
 and the machine that runs joex. For example, by default the documents
 to learn from are limited to 600 (`classification.item-count`) and
-every text is cut after 8000 characters (`text-analysis.max-length`).
+every text is cut after 5000 characters (`text-analysis.max-length`).
 This is fine if *most* of your documents are small and only a few are
-near 8000 characters). But if *all* your documents are very large, you
+near 5000 characters). But if *all* your documents are very large, you
 probably need to either assign more heap memory or go down with the
 limits.

@@ -367,13 +367,15 @@ Training the model is a rather resource intensive process. How much
 memory is needed, depends on the number of documents to learn from and
 the size of text to consider. Both can be limited in the config file.
 The default values might require a heap of 1.4G if you have many and
-large documents. The maximum text length is about 8000 characters, if
+large documents. The maximum text length is set to 5000 characters. If
 *all* your documents would be that large, adjusting these values might
-be necessary. But using an existing model is quite cheap and fast. A
-model is trained periodically, the schedule can be defined in your
-collective settings. For tags, you can define the tag categories that
-should be trained (or that should not be trained). Docspell assigns
-one tag from all tags in a category to a new document.
+be necessary. A model is trained periodically, the schedule can be
+defined in your collective settings. Although learning is resource
+intensive, using an existing model is quite cheap and fast.
+
+For tags, you can define the tag categories that should be trained (or
+that should not be trained). Docspell assigns one tag (or none) from
+all tags in a category to a new document.
 
 Note that tags that can not be derived from the text only, should
 probably be excluded from learning. For example, if you tag all your