mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Make the text length limit optional
This commit is contained in:
@ -59,7 +59,9 @@ object TextAnalyser {
|
||||
new StanfordTextClassifier[F](cfg.classifier, blocker)
|
||||
|
||||
private def textLimit(logger: Logger[F], text: String): F[String] =
|
||||
if (text.length <= cfg.maxLength) text.pure[F]
|
||||
if (cfg.maxLength <= 0)
|
||||
logger.debug("Max text length limit disabled.") *> text.pure[F]
|
||||
else if (text.length <= cfg.maxLength || cfg.maxLength <= 0) text.pure[F]
|
||||
else
|
||||
logger.info(
|
||||
s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
|
||||
|
@ -269,9 +269,13 @@ docspell.joex {
|
||||
# All text to analyse must fit into RAM. A large document may take
|
||||
# too much heap. Also, most important information is at the
|
||||
# beginning of a document, so in most cases the first two pages
|
||||
# should suffice. Default is 8000, which are about 2-3 pages (just
|
||||
# a rough guess, of course).
|
||||
max-length = 8000
|
||||
# should suffice. Default is 5000, which is about 2 pages (just a
|
||||
# rough guess, of course). For my data, more than 80% of the
|
||||
# documents are less than 5000 characters.
|
||||
#
|
||||
# This value applies to nlp and the classifier. If this value is
|
||||
# <= 0, the limit is disabled.
|
||||
max-length = 5000
|
||||
|
||||
# A working directory for the analyser to store temporary/working
|
||||
# files.
|
||||
@ -363,6 +367,10 @@ docspell.joex {
|
||||
# If concerned with memory consumption, this restricts the
|
||||
# number of items to consider. More are better for training. A
|
||||
# negative value or zero means to train on all items.
|
||||
#
|
||||
# This limit and `text-analysis.max-length` define how much
|
||||
# memory is required. On weaker hardware, it is advised to play
|
||||
# with these values.
|
||||
item-count = 600
|
||||
|
||||
# These settings are used to configure the classifier. If
|
||||
|
@ -578,7 +578,7 @@ object QItem {
|
||||
)
|
||||
)(
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
|
||||
select(contentMax(maxLen), tagsTid.s, tagsName.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, a.id === m.id)
|
||||
@ -597,7 +597,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
|
||||
select(contentMax(maxLen), org.oid.s, org.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -614,7 +614,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
||||
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -631,7 +631,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
|
||||
select(contentMax(maxLen), pers0.pid.s, pers0.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -648,7 +648,7 @@ object QItem {
|
||||
): ConnectionIO[TextAndTag] =
|
||||
readTextAndTag(collective, itemId, pageSep) {
|
||||
Select(
|
||||
select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
|
||||
select(contentMax(maxLen), equip.eid.s, equip.name.s),
|
||||
from(i)
|
||||
.innerJoin(a, a.itemId === i.id)
|
||||
.innerJoin(m, m.id === a.id)
|
||||
@ -657,6 +657,12 @@ object QItem {
|
||||
)
|
||||
}
|
||||
|
||||
private def contentMax(maxLen: Int): SelectExpr =
|
||||
if (maxLen <= 0) {
|
||||
logger.debug("Max text length limit disabled")
|
||||
m.content.s
|
||||
} else substring(m.content.s, 0, maxLen).s
|
||||
|
||||
private def readTextAndTag(collective: Ident, itemId: Ident, pageSep: String)(
|
||||
q: Select
|
||||
): ConnectionIO[TextAndTag] =
|
||||
|
Reference in New Issue
Block a user