Add constraints from config to classifier training
For large and/or many documents, training the classifier can lead to OOM errors. Default limits are now set for the amount of text (max-length) and the number of items (item-count) used for training.
parent 363cf5aef0
commit 9957c3267e
@@ -269,9 +269,9 @@ docspell.joex {
         # All text to analyse must fit into RAM. A large document may take
         # too much heap. Also, most important information is at the
         # beginning of a document, so in most cases the first two pages
-        # should suffice. Default is 10000, which are about 2-3 pages
-        # (just a rough guess, of course).
-        max-length = 10000
+        # should suffice. Default is 8000, which are about 2-3 pages (just
+        # a rough guess, of course).
+        max-length = 8000
 
         # A working directory for the analyser to store temporary/working
         # files.
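Note: max-length bounds the number of characters of extracted text that is handed to the classifier. In this commit the bound is enforced inside the SQL queries (see the QItem hunks further down); the following self-contained Scala sketch only illustrates the effect, with made-up values:

// Illustration only: not code from this commit.
object MaxLengthDemo extends App {
  val maxLength = 8000 // new default from the joex config above

  def boundText(text: String): String =
    text.take(maxLength)

  val doc = "lorem ipsum " * 2000 // roughly 24000 characters
  println(boundText(doc).length)  // prints 8000
}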
@@ -363,7 +363,7 @@ docspell.joex {
         # If concerned with memory consumption, this restricts the
         # number of items to consider. More are better for training. A
         # negative value or zero means to train on all items.
-        item-count = 0
+        item-count = 600
 
         # These settings are used to configure the classifier. If
         # multiple are given, they are all tried and the "best" is
@@ -94,5 +94,10 @@ object Config {
       enabled: Boolean,
       itemCount: Int,
       classifiers: List[Map[String, String]]
-    )
+    ) {
+
+      def itemCountOrWhenLower(other: Int): Int =
+        if (itemCount <= 0 || (itemCount > other && other > 0)) other
+        else itemCount
+    }
   }
 }
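The new itemCountOrWhenLower helper picks the effective training size from the joex config and the collective's classifier settings, treating zero or a negative value as "no limit". A plain math.min would not work here, because a collective value of 0 ("all items") would win over a positive cap from the config. A self-contained sketch of the same logic (illustration only; the real method lives on the classification config case class shown above):

object ItemCountDemo extends App {
  def itemCountOrWhenLower(itemCount: Int)(other: Int): Int =
    if (itemCount <= 0 || (itemCount > other && other > 0)) other
    else itemCount

  println(itemCountOrWhenLower(600)(0))   // 600: config cap applies, 0 means "all"
  println(itemCountOrWhenLower(600)(200)) // 200: the lower positive value wins
  println(itemCountOrWhenLower(0)(1000))  // 1000: config says "all", settings cap at 1000
}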
@@ -37,7 +37,8 @@ object LearnClassifierTask {
           .learnAll(
             analyser,
             ctx.args.collective,
-            cfg.classification.itemCount
+            cfg.classification.itemCount,
+            cfg.maxLength
           )
           .run(ctx)
       else ().pure[F]
@@ -51,10 +52,14 @@ object LearnClassifierTask {
     val learnTags =
       for {
         sett <- findActiveSettings[F](ctx, cfg)
-        maxItems = math.min(cfg.classification.itemCount, sett.itemCount)
+        maxItems = cfg.classification.itemCountOrWhenLower(sett.itemCount)
         _ <- OptionT.liftF(
           LearnTags
-            .learnAllTagCategories(analyser)(ctx.args.collective, maxItems)
+            .learnAllTagCategories(analyser)(
+              ctx.args.collective,
+              maxItems,
+              cfg.maxLength
+            )
             .run(ctx)
         )
       } yield ()
@@ -14,51 +14,56 @@ object LearnItemEntities {
   def learnAll[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
-    learnCorrOrg(analyser, collective, maxItems)
-      .flatMap(_ => learnCorrPerson[F, A](analyser, collective, maxItems))
-      .flatMap(_ => learnConcPerson(analyser, collective, maxItems))
-      .flatMap(_ => learnConcEquip(analyser, collective, maxItems))
+    learnCorrOrg(analyser, collective, maxItems, maxTextLen)
+      .flatMap(_ => learnCorrPerson[F, A](analyser, collective, maxItems, maxTextLen))
+      .flatMap(_ => learnConcPerson(analyser, collective, maxItems, maxTextLen))
+      .flatMap(_ => learnConcEquip(analyser, collective, maxItems, maxTextLen))
 
   def learnCorrOrg[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
     learn(analyser, collective)(
       ClassifierName.correspondentOrg,
-      ctx => SelectItems.forCorrOrg(ctx.store, collective, maxItems)
+      ctx => SelectItems.forCorrOrg(ctx.store, collective, maxItems, maxTextLen)
     )
 
   def learnCorrPerson[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
     learn(analyser, collective)(
       ClassifierName.correspondentPerson,
-      ctx => SelectItems.forCorrPerson(ctx.store, collective, maxItems)
+      ctx => SelectItems.forCorrPerson(ctx.store, collective, maxItems, maxTextLen)
     )
 
   def learnConcPerson[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
     learn(analyser, collective)(
       ClassifierName.concernedPerson,
-      ctx => SelectItems.forConcPerson(ctx.store, collective, maxItems)
+      ctx => SelectItems.forConcPerson(ctx.store, collective, maxItems, maxTextLen)
     )
 
   def learnConcEquip[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
     learn(analyser, collective)(
       ClassifierName.concernedEquip,
-      ctx => SelectItems.forConcEquip(ctx.store, collective, maxItems)
+      ctx => SelectItems.forConcEquip(ctx.store, collective, maxItems, maxTextLen)
     )
 
   private def learn[F[_]: Sync: ContextShift, A](
@@ -14,12 +14,13 @@ object LearnTags {
   def learnTagCategory[F[_]: Sync: ContextShift, A](
       analyser: TextAnalyser[F],
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   )(
       category: String
   ): Task[F, A, Unit] =
     Task { ctx =>
-      val data = SelectItems.forCategory(ctx, collective)(maxItems, category)
+      val data = SelectItems.forCategory(ctx, collective)(maxItems, category, maxTextLen)
       ctx.logger.info(s"Learn classifier for tag category: $category") *>
         analyser.classifier.trainClassifier(ctx.logger, data)(
           Kleisli(
@@ -34,12 +35,13 @@ object LearnTags {
 
   def learnAllTagCategories[F[_]: Sync: ContextShift, A](analyser: TextAnalyser[F])(
       collective: Ident,
-      maxItems: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Task[F, A, Unit] =
     Task { ctx =>
       for {
         cats <- ctx.store.transact(RClassifierSetting.getActiveCategories(collective))
-        task = learnTagCategory[F, A](analyser, collective, maxItems) _
+        task = learnTagCategory[F, A](analyser, collective, maxItems, maxTextLen) _
         _ <- cats.map(task).traverse(_.run(ctx))
       } yield ()
     }
@@ -16,20 +16,24 @@ object SelectItems {
   val noClass = LearnClassifierTask.noClass
 
   def forCategory[F[_]](ctx: Context[F, _], collective: Ident)(
-      max: Int,
-      category: String
+      maxItems: Int,
+      category: String,
+      maxTextLen: Int
   ): Stream[F, Data] =
-    forCategory(ctx.store, collective, max, category)
+    forCategory(ctx.store, collective, maxItems, category, maxTextLen)
 
   def forCategory[F[_]](
       store: Store[F],
       collective: Ident,
-      max: Int,
-      category: String
+      maxItems: Int,
+      category: String,
+      maxTextLen: Int
   ): Stream[F, Data] = {
     val connStream =
-      allItems(collective, max)
-        .evalMap(item => QItem.resolveTextAndTag(collective, item, category, pageSep))
+      allItems(collective, maxItems)
+        .evalMap(item =>
+          QItem.resolveTextAndTag(collective, item, category, maxTextLen, pageSep)
+        )
         .through(mkData)
     store.transact(connStream)
   }
@@ -37,11 +41,14 @@ object SelectItems {
   def forCorrOrg[F[_]](
       store: Store[F],
       collective: Ident,
-      max: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Stream[F, Data] = {
     val connStream =
-      allItems(collective, max)
-        .evalMap(item => QItem.resolveTextAndCorrOrg(collective, item, pageSep))
+      allItems(collective, maxItems)
+        .evalMap(item =>
+          QItem.resolveTextAndCorrOrg(collective, item, maxTextLen, pageSep)
+        )
         .through(mkData)
     store.transact(connStream)
   }
@@ -49,11 +56,14 @@ object SelectItems {
   def forCorrPerson[F[_]](
       store: Store[F],
       collective: Ident,
-      max: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Stream[F, Data] = {
     val connStream =
-      allItems(collective, max)
-        .evalMap(item => QItem.resolveTextAndCorrPerson(collective, item, pageSep))
+      allItems(collective, maxItems)
+        .evalMap(item =>
+          QItem.resolveTextAndCorrPerson(collective, item, maxTextLen, pageSep)
+        )
         .through(mkData)
     store.transact(connStream)
  }
@@ -61,11 +71,14 @@ object SelectItems {
   def forConcPerson[F[_]](
       store: Store[F],
       collective: Ident,
-      max: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Stream[F, Data] = {
     val connStream =
-      allItems(collective, max)
-        .evalMap(item => QItem.resolveTextAndConcPerson(collective, item, pageSep))
+      allItems(collective, maxItems)
+        .evalMap(item =>
+          QItem.resolveTextAndConcPerson(collective, item, maxTextLen, pageSep)
+        )
         .through(mkData)
     store.transact(connStream)
   }
@@ -73,11 +86,14 @@ object SelectItems {
   def forConcEquip[F[_]](
       store: Store[F],
       collective: Ident,
-      max: Int
+      maxItems: Int,
+      maxTextLen: Int
   ): Stream[F, Data] = {
     val connStream =
-      allItems(collective, max)
-        .evalMap(item => QItem.resolveTextAndConcEquip(collective, item, pageSep))
+      allItems(collective, maxItems)
+        .evalMap(item =>
+          QItem.resolveTextAndConcEquip(collective, item, maxTextLen, pageSep)
+        )
         .through(mkData)
     store.transact(connStream)
   }
@@ -547,7 +547,6 @@ object QItem {
       chunkSize: Int,
       limit: Batch
   ): Stream[ConnectionIO, Ident] = {
-
     val i = RItem.as("i")
     Select(i.id.s, from(i), i.cid === collective && i.state === ItemState.confirmed)
       .orderBy(i.created.desc)
@@ -561,6 +560,7 @@ object QItem {
       collective: Ident,
       itemId: Ident,
       tagCategory: String,
+      maxLen: Int,
       pageSep: String
   ): ConnectionIO[TextAndTag] = {
     val tags = TableDef("tags").as("tt")
@@ -578,7 +578,7 @@ object QItem {
         )
       )(
         Select(
-          select(m.content, tagsTid, tagsName),
+          select(substring(m.content.s, 0, maxLen).s, tagsTid.s, tagsName.s),
           from(i)
             .innerJoin(a, a.itemId === i.id)
             .innerJoin(m, a.id === m.id)
@@ -592,11 +592,12 @@ object QItem {
   def resolveTextAndCorrOrg(
       collective: Ident,
       itemId: Ident,
+      maxLen: Int,
       pageSep: String
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(m.content, org.oid, org.name),
+        select(substring(m.content.s, 0, maxLen).s, org.oid.s, org.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -608,11 +609,12 @@ object QItem {
   def resolveTextAndCorrPerson(
       collective: Ident,
       itemId: Ident,
+      maxLen: Int,
       pageSep: String
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(m.content, pers0.pid, pers0.name),
+        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -624,11 +626,12 @@ object QItem {
   def resolveTextAndConcPerson(
       collective: Ident,
       itemId: Ident,
+      maxLen: Int,
       pageSep: String
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(m.content, pers0.pid, pers0.name),
+        select(substring(m.content.s, 0, maxLen).s, pers0.pid.s, pers0.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
@@ -640,11 +643,12 @@ object QItem {
   def resolveTextAndConcEquip(
       collective: Ident,
       itemId: Ident,
+      maxLen: Int,
       pageSep: String
   ): ConnectionIO[TextAndTag] =
     readTextAndTag(collective, itemId, pageSep) {
       Select(
-        select(m.content, equip.eid, equip.name),
+        select(substring(m.content.s, 0, maxLen).s, equip.eid.s, equip.name.s),
         from(i)
           .innerJoin(a, a.itemId === i.id)
           .innerJoin(m, m.id === a.id)
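A note on the substring calls above: the truncation is applied per attachment row (m.content), and an item's attachment texts are then combined using pageSep, so the limit bounds each attachment's text rather than the item's combined text. A rough, self-contained sketch of that behaviour (the pageSep value below is a stand-in, not the one docspell uses):

// Illustration only: mirrors the per-row substring plus concatenation.
object PerAttachmentBoundDemo extends App {
  val maxLen  = 8000
  val pageSep = "\n\n" // stand-in separator

  val attachmentTexts = List("a" * 12000, "b" * 3000)
  val combined        = attachmentTexts.map(_.take(maxLen)).mkString(pageSep)

  println(combined.length) // 8000 + pageSep.length + 3000
}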