mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Some cleanup
This commit is contained in:
parent
f9fcee81a5
commit
4309bd8dfd
@ -299,9 +299,8 @@ docspell.joex {
|
||||
# multiple are given, they are all tried and the "best" is
|
||||
# chosen at the end. See
|
||||
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
|
||||
# for more info about these settings. The settings are almost
|
||||
# identical to them, as they yielded best results with *my*
|
||||
# dataset.
|
||||
# for more info about these settings. The settings here yielded
|
||||
# good results with *my* dataset.
|
||||
#
|
||||
# Enclose regexps in triple quotes.
|
||||
classifiers = [
|
||||
@ -312,6 +311,7 @@ docspell.joex {
|
||||
"maxNGramLeng" = "4"
|
||||
"minNGramLeng" = "1"
|
||||
"splitWordShape" = "chris4"
|
||||
"intern" = "true" # interns feature-name strings: slows down training but saves memory
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ import bitpeace.MimetypeHint
|
||||
|
||||
object LearnClassifierTask {
|
||||
val noClass = "__NONE__"
|
||||
val pageSep = " --n-- "
|
||||
|
||||
type Args = LearnClassifierArgs
|
||||
|
||||
@ -80,7 +81,9 @@ object LearnClassifierTask {
|
||||
val connStream =
|
||||
for {
|
||||
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
|
||||
tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
|
||||
tt <- Stream.eval(
|
||||
QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
|
||||
)
|
||||
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
|
||||
ctx.store.transact(connStream.filter(_.text.nonEmpty))
|
||||
}
|
||||
|
@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.learn.LearnClassifierTask
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
@ -76,7 +77,7 @@ object TextAnalysis {
|
||||
for {
|
||||
model <- findActiveModel(ctx, cfg)
|
||||
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
|
||||
text = metas.flatMap(_.content).mkString(" ------ ")
|
||||
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||
modelData =
|
||||
ctx.store.bitpeace
|
||||
.get(model.id)
|
||||
@ -89,8 +90,7 @@ object TextAnalysis {
|
||||
.compile
|
||||
.drain
|
||||
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
||||
|
||||
})
|
||||
}).filter(_ != LearnClassifierTask.noClass)
|
||||
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
||||
} yield cls
|
||||
|
||||
|
@ -634,7 +634,8 @@ object QItem {
|
||||
def resolveTextAndTag(
|
||||
collective: Ident,
|
||||
itemId: Ident,
|
||||
tagCategory: String
|
||||
tagCategory: String,
|
||||
pageSep: String
|
||||
): ConnectionIO[TextAndTag] = {
|
||||
val aId = RAttachment.Columns.id.prefix("a")
|
||||
val aItem = RAttachment.Columns.itemId.prefix("a")
|
||||
@ -682,7 +683,7 @@ object QItem {
|
||||
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
|
||||
)
|
||||
tag = texts.headOption.flatMap(_._2)
|
||||
txt = texts.map(_._1).mkString(" --n-- ")
|
||||
txt = texts.map(_._1).mkString(pageSep)
|
||||
} yield TextAndTag(itemId, txt, tag)
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user