mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Make the text length limit optional
This commit is contained in:
parent
8dd1672c8c
commit
c7e850116f
@ -59,7 +59,9 @@ object TextAnalyser {
|
||||
new StanfordTextClassifier[F](cfg.classifier, blocker)
|
||||
|
||||
private def textLimit(logger: Logger[F], text: String): F[String] =
|
||||
if (text.length <= cfg.maxLength) text.pure[F]
|
||||
if (cfg.maxLength <= 0)
|
||||
logger.debug("Max text length limit disabled.") *> text.pure[F]
|
||||
else if (text.length <= cfg.maxLength || cfg.maxLength <= 0) text.pure[F]
|
||||
else
|
||||
logger.info(
|
||||
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
|
||||
|
@ -269,9 +269,13 @@ docspell.joex {
|
||||
# All text to analyse must fit into RAM. A large document may take
|
||||
# too much heap. Also, most important information is at the
|
||||
# beginning of a document, so in most cases the first two pages
|
||||
# should suffice. Default is 8000, which are about 2-3 pages (just
|
||||
# a rough guess, of course).
|
||||
max-length = 8000
|
||||
# should suffice. Default is 5000, which is about 2 pages (just a
|
||||
# rough guess, of course). For my data, more than 80% of the
|
||||
# documents are less than 5000 characters.
|
||||
#
|
||||
# This value applies to NLP and the classifier. If this value is
|
||||
# <= 0, the limit is disabled.
|
||||
max-length = 5000
|
||||
|
||||
# A working directory for the analyser to store temporary/working
|
||||
# files.
|
||||
@ -363,6 +367,10 @@ docspell.joex {
|
||||
# If concerned with memory consumption, this restricts the
|
||||
# number of items to consider. More are better for training. A
|
||||
# negative value or zero means to train on all items.
|
||||
#
|
||||
# This limit and `text-analysis.max-length` define how much
|
||||
# memory is required. On weaker hardware, it is advised to play
|
||||
# with these values.
|
||||
item-count = 600
|
||||
|
||||
# These settings are used to configure the classifier. If
|
||||
|
@ -578,7 +578,7 @@ object QItem {
|
||||
)
|
||||
)(
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
|
||||
select(contentMax(maxLen), tagsTid.s, tagsName.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, a.id === m.id)
|
||||
@ -597,7 +597,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
|
||||
select(contentMax(maxLen), org.oid.s, org.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -614,7 +614,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
||||
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -631,7 +631,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
||||
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -648,7 +648,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
|
||||
select(contentMax(maxLen), equip.eid.s, equip.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -657,6 +657,12 @@ object QItem {
|
||||
)
|
||||
}
|
||||
|
||||
/** Builds the select expression for the text-content column, with an
  * optional length limit.
  *
  * When `maxLen` is zero or negative the limit is treated as disabled: a
  * debug message is logged and the `m.content` column is selected as is.
  * Otherwise the column is wrapped in `substring(m.content.s, 0, maxLen)`.
  *
  * @param maxLen maximum text length to select; values `<= 0` disable the limit
  * @return the select expression for the (possibly truncated) content column
  */
private def contentMax(maxLen: Int): SelectExpr =
|
||||
if (maxLen <= 0) {
|
||||
logger.debug("Max text length limit disabled")
|
||||
m.content.s
|
||||
} else substring(m.content.s, 0, maxLen).s
|
||||
|
||||
private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
|
||||
q: Select
|
||||
): ConnectionIO[TextAndTag] =
|
||||
|
@ -312,9 +312,9 @@ most should be used for learning. The default settings should work
|
||||
well for most cases. However, it always depends on the amount of data
|
||||
and the machine that runs joex. For example, by default the documents
|
||||
to learn from are limited to 600 (`classification.item-count`) and
|
||||
every text is cut after 8000 characters (`text-analysis.max-length`).
|
||||
every text is cut after 5000 characters (`text-analysis.max-length`).
|
||||
This is fine if *most* of your documents are small and only a few are
|
||||
near 8000 characters). But if *all* your documents are very large, you
|
||||
near 5000 characters. But if *all* your documents are very large, you
|
||||
probably need to either assign more heap memory or go down with the
|
||||
limits.
|
||||
|
||||
|
@ -367,13 +367,15 @@ Training the model is a rather resource intensive process. How much
|
||||
memory is needed, depends on the number of documents to learn from and
|
||||
the size of text to consider. Both can be limited in the config file.
|
||||
The default values might require a heap of 1.4G if you have many and
|
||||
large documents. The maximum text length is about 8000 characters, if
|
||||
large documents. The maximum text length is set to 5000 characters. If
|
||||
*all* your documents would be that large, adjusting these values might
|
||||
be necessary. But using an existing model is quite cheap and fast. A
|
||||
model is trained periodically, the schedule can be defined in your
|
||||
collective settings. For tags, you can define the tag categories that
|
||||
should be trained (or that should not be trained). Docspell assigns
|
||||
one tag from all tags in a category to a new document.
|
||||
be necessary. A model is trained periodically, the schedule can be
|
||||
defined in your collective settings. Although learning is resource
|
||||
intensive, using an existing model is quite cheap and fast.
|
||||
|
||||
For tags, you can define the tag categories that should be trained (or
|
||||
that should not be trained). Docspell assigns one tag (or none) from
|
||||
all tags in a category to a new document.
|
||||
|
||||
Note that tags that cannot be derived from the text alone should
|
||||
probably be excluded from learning. For example, if you tag all your
|
||||
|
Loading…
x
Reference in New Issue
Block a user