mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 14:45:59 +00:00
Merge pull request #586 from eikek/optional-text-limit
Make the text length limit optional
This commit is contained in:
commit
e6d67c368b
@ -59,7 +59,9 @@ object TextAnalyser {
|
|||||||
new StanfordTextClassifier[F](cfg.classifier, blocker)
|
new StanfordTextClassifier[F](cfg.classifier, blocker)
|
||||||
|
|
||||||
private def textLimit(logger: Logger[F], text: String): F[String] =
|
private def textLimit(logger: Logger[F], text: String): F[String] =
|
||||||
if (text.length <= cfg.maxLength) text.pure[F]
|
if (cfg.maxLength <= 0)
|
||||||
|
logger.debug("Max text length limit disabled.") *> text.pure[F]
|
||||||
|
else if (text.length <= cfg.maxLength || cfg.maxLength <= 0) text.pure[F]
|
||||||
else
|
else
|
||||||
logger.info(
|
logger.info(
|
||||||
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
|
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
|
||||||
|
@ -269,9 +269,13 @@ docspell.joex {
|
|||||||
# All text to analyse must fit into RAM. A large document may take
|
# All text to analyse must fit into RAM. A large document may take
|
||||||
# too much heap. Also, most important information is at the
|
# too much heap. Also, most important information is at the
|
||||||
# beginning of a document, so in most cases the first two pages
|
# beginning of a document, so in most cases the first two pages
|
||||||
# should suffice. Default is 8000, which are about 2-3 pages (just
|
# should suffice. Default is 5000, which is about 2 pages (just a
|
||||||
# a rough guess, of course).
|
# rough guess, of course). For my data, more than 80% of the
|
||||||
max-length = 8000
|
# documents are less than 5000 characters.
|
||||||
|
#
|
||||||
|
# This value applies to NLP and the classifier. If this value is
|
||||||
|
# <= 0, the limit is disabled.
|
||||||
|
max-length = 5000
|
||||||
|
|
||||||
# A working directory for the analyser to store temporary/working
|
# A working directory for the analyser to store temporary/working
|
||||||
# files.
|
# files.
|
||||||
@ -363,6 +367,10 @@ docspell.joex {
|
|||||||
# If concerned with memory consumption, this restricts the
|
# If concerned with memory consumption, this restricts the
|
||||||
# number of items to consider. More are better for training. A
|
# number of items to consider. More are better for training. A
|
||||||
# negative value or zero means to train on all items.
|
# negative value or zero means to train on all items.
|
||||||
|
#
|
||||||
|
# This limit and `text-analysis.max-length` define how much
|
||||||
|
# memory is required. On weaker hardware, it is advised to play
|
||||||
|
# with these values.
|
||||||
item-count = 600
|
item-count = 600
|
||||||
|
|
||||||
# These settings are used to configure the classifier. If
|
# These settings are used to configure the classifier. If
|
||||||
|
@ -578,7 +578,7 @@ object QItem {
|
|||||||
)
|
)
|
||||||
)(
|
)(
|
||||||
Select(
|
Select(
|
||||||
select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
|
select(contentMax(maxLen), tagsTid.s, tagsName.s),
|
||||||
from(i)
|
from(i)
|
||||||
.innerJoin(a, a.itemId === i.id)
|
.innerJoin(a, a.itemId === i.id)
|
||||||
.innerJoin(m, a.id === m.id)
|
.innerJoin(m, a.id === m.id)
|
||||||
@ -597,7 +597,7 @@ object QItem {
|
|||||||
): ConnectionIO[TextAndTag] =
|
): ConnectionIO[TextAndTag] =
|
||||||
readTextAndTag(collective, itemId, pageSep) {
|
readTextAndTag(collective, itemId, pageSep) {
|
||||||
Select(
|
Select(
|
||||||
select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
|
select(contentMax(maxLen), org.oid.s, org.name.s),
|
||||||
from(i)
|
from(i)
|
||||||
.innerJoin(a, a.itemId === i.id)
|
.innerJoin(a, a.itemId === i.id)
|
||||||
.innerJoin(m, m.id === a.id)
|
.innerJoin(m, m.id === a.id)
|
||||||
@ -614,7 +614,7 @@ object QItem {
|
|||||||
): ConnectionIO[TextAndTag] =
|
): ConnectionIO[TextAndTag] =
|
||||||
readTextAndTag(collective, itemId, pageSep) {
|
readTextAndTag(collective, itemId, pageSep) {
|
||||||
Select(
|
Select(
|
||||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||||
from(i)
|
from(i)
|
||||||
.innerJoin(a, a.itemId === i.id)
|
.innerJoin(a, a.itemId === i.id)
|
||||||
.innerJoin(m, m.id === a.id)
|
.innerJoin(m, m.id === a.id)
|
||||||
@ -631,7 +631,7 @@ object QItem {
|
|||||||
): ConnectionIO[TextAndTag] =
|
): ConnectionIO[TextAndTag] =
|
||||||
readTextAndTag(collective, itemId, pageSep) {
|
readTextAndTag(collective, itemId, pageSep) {
|
||||||
Select(
|
Select(
|
||||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||||
from(i)
|
from(i)
|
||||||
.innerJoin(a, a.itemId === i.id)
|
.innerJoin(a, a.itemId === i.id)
|
||||||
.innerJoin(m, m.id === a.id)
|
.innerJoin(m, m.id === a.id)
|
||||||
@ -648,7 +648,7 @@ object QItem {
|
|||||||
): ConnectionIO[TextAndTag] =
|
): ConnectionIO[TextAndTag] =
|
||||||
readTextAndTag(collective, itemId, pageSep) {
|
readTextAndTag(collective, itemId, pageSep) {
|
||||||
Select(
|
Select(
|
||||||
select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
|
select(contentMax(maxLen), equip.eid.s, equip.name.s),
|
||||||
from(i)
|
from(i)
|
||||||
.innerJoin(a, a.itemId === i.id)
|
.innerJoin(a, a.itemId === i.id)
|
||||||
.innerJoin(m, m.id === a.id)
|
.innerJoin(m, m.id === a.id)
|
||||||
@ -657,6 +657,12 @@ object QItem {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private def contentMax(maxLen: Int): SelectExpr =
|
||||||
|
if (maxLen <= 0) {
|
||||||
|
logger.debug("Max text length limit disabled")
|
||||||
|
m.content.s
|
||||||
|
} else substring(m.content.s, 0, maxLen).s
|
||||||
|
|
||||||
private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
|
private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
|
||||||
q: Select
|
q: Select
|
||||||
): ConnectionIO[TextAndTag] =
|
): ConnectionIO[TextAndTag] =
|
||||||
|
@ -312,9 +312,9 @@ most should be used for learning. The default settings should work
|
|||||||
well for most cases. However, it always depends on the amount of data
|
well for most cases. However, it always depends on the amount of data
|
||||||
and the machine that runs joex. For example, by default the documents
|
and the machine that runs joex. For example, by default the documents
|
||||||
to learn from are limited to 600 (`classification.item-count`) and
|
to learn from are limited to 600 (`classification.item-count`) and
|
||||||
every text is cut after 8000 characters (`text-analysis.max-length`).
|
every text is cut after 5000 characters (`text-analysis.max-length`).
|
||||||
This is fine if *most* of your documents are small and only a few are
|
This is fine if *most* of your documents are small and only a few are
|
||||||
near 8000 characters). But if *all* your documents are very large, you
|
near 5000 characters. But if *all* your documents are very large, you
|
||||||
probably need to either assign more heap memory or go down with the
|
probably need to either assign more heap memory or go down with the
|
||||||
limits.
|
limits.
|
||||||
|
|
||||||
|
@ -367,13 +367,15 @@ Training the model is a rather resource intensive process. How much
|
|||||||
memory is needed, depends on the number of documents to learn from and
|
memory is needed, depends on the number of documents to learn from and
|
||||||
the size of text to consider. Both can be limited in the config file.
|
the size of text to consider. Both can be limited in the config file.
|
||||||
The default values might require a heap of 1.4G if you have many and
|
The default values might require a heap of 1.4G if you have many and
|
||||||
large documents. The maximum text length is about 8000 characters, if
|
large documents. The maximum text length is set to 5000 characters. If
|
||||||
*all* your documents would be that large, adjusting these values might
|
*all* your documents would be that large, adjusting these values might
|
||||||
be necessary. But using an existing model is quite cheap and fast. A
|
be necessary. A model is trained periodically, the schedule can be
|
||||||
model is trained periodically, the schedule can be defined in your
|
defined in your collective settings. Although learning is resource
|
||||||
collective settings. For tags, you can define the tag categories that
|
intensive, using an existing model is quite cheap and fast.
|
||||||
should be trained (or that should not be trained). Docspell assigns
|
|
||||||
one tag from all tags in a category to a new document.
|
For tags, you can define the tag categories that should be trained (or
|
||||||
|
that should not be trained). Docspell assigns one tag (or none) from
|
||||||
|
all tags in a category to a new document.
|
||||||
|
|
||||||
Note that tags that can not be derived from the text only, should
|
Note that tags that can not be derived from the text only, should
|
||||||
probably be excluded from learning. For example, if you tag all your
|
probably be excluded from learning. For example, if you tag all your
|
||||||
|
Loading…
x
Reference in New Issue
Block a user