Some cleanup

This commit is contained in:
Eike Kettner 2020-09-01 23:57:27 +02:00
parent f9fcee81a5
commit 4309bd8dfd
4 changed files with 13 additions and 9 deletions

View File

@ -299,9 +299,8 @@ docspell.joex {
# multiple are given, they are all tried and the "best" is
# chosen at the end. See
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
# for more info about these settings. The settings are almost
# identical to them, as they yielded best results with *my*
# dataset.
# for more info about these settings. The settings here yielded
# good results with *my* dataset.
#
# Enclose regexps in triple quotes.
classifiers = [
@ -312,6 +311,7 @@ docspell.joex {
"maxNGramLeng" = "4"
"minNGramLeng" = "1"
"splitWordShape" = "chris4"
"intern" = "true" # interns feature names: slower training, but saves memory
}
]
}

View File

@ -20,6 +20,7 @@ import bitpeace.MimetypeHint
object LearnClassifierTask {
val noClass = "__NONE__"
val pageSep = " --n-- "
type Args = LearnClassifierArgs
@ -80,7 +81,9 @@ object LearnClassifierTask {
val connStream =
for {
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
tt <- Stream.eval(
QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
)
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
ctx.store.transact(connStream.filter(_.text.nonEmpty))
}

View File

@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.learn.LearnClassifierTask
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
@ -76,7 +77,7 @@ object TextAnalysis {
for {
model <- findActiveModel(ctx, cfg)
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
text = metas.flatMap(_.content).mkString(" ------ ")
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
modelData =
ctx.store.bitpeace
.get(model.id)
@ -89,8 +90,7 @@ object TextAnalysis {
.compile
.drain
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
})
}).filter(_ != LearnClassifierTask.noClass)
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
} yield cls

View File

@ -634,7 +634,8 @@ object QItem {
def resolveTextAndTag(
collective: Ident,
itemId: Ident,
tagCategory: String
tagCategory: String,
pageSep: String
): ConnectionIO[TextAndTag] = {
val aId = RAttachment.Columns.id.prefix("a")
val aItem = RAttachment.Columns.itemId.prefix("a")
@ -682,7 +683,7 @@ object QItem {
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
)
tag = texts.headOption.flatMap(_._2)
txt = texts.map(_._1).mkString(" --n-- ")
txt = texts.map(_._1).mkString(pageSep)
} yield TextAndTag(itemId, txt, tag)
}