mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 15:15:58 +00:00
Some cleanup
This commit is contained in:
parent
f9fcee81a5
commit
4309bd8dfd
@ -299,9 +299,8 @@ docspell.joex {
|
|||||||
# multiple are given, they are all tried and the "best" is
|
# multiple are given, they are all tried and the "best" is
|
||||||
# chosen at the end. See
|
# chosen at the end. See
|
||||||
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
|
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
|
||||||
# for more info about these settings. The settings are almost
|
# for more info about these settings. The settings here yielded
|
||||||
# identical to them, as they yielded best results with *my*
|
# good results with *my* dataset.
|
||||||
# dataset.
|
|
||||||
#
|
#
|
||||||
# Enclose regexps in triple quotes.
|
# Enclose regexps in triple quotes.
|
||||||
classifiers = [
|
classifiers = [
|
||||||
@ -312,6 +311,7 @@ docspell.joex {
|
|||||||
"maxNGramLeng" = "4"
|
"maxNGramLeng" = "4"
|
||||||
"minNGramLeng" = "1"
|
"minNGramLeng" = "1"
|
||||||
"splitWordShape" = "chris4"
|
"splitWordShape" = "chris4"
|
||||||
|
"intern" = "true" # makes it slower but saves memory
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ import bitpeace.MimetypeHint
|
|||||||
|
|
||||||
object LearnClassifierTask {
|
object LearnClassifierTask {
|
||||||
val noClass = "__NONE__"
|
val noClass = "__NONE__"
|
||||||
|
val pageSep = " --n-- "
|
||||||
|
|
||||||
type Args = LearnClassifierArgs
|
type Args = LearnClassifierArgs
|
||||||
|
|
||||||
@ -80,7 +81,9 @@ object LearnClassifierTask {
|
|||||||
val connStream =
|
val connStream =
|
||||||
for {
|
for {
|
||||||
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
|
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
|
||||||
tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
|
tt <- Stream.eval(
|
||||||
|
QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
|
||||||
|
)
|
||||||
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
|
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
|
||||||
ctx.store.transact(connStream.filter(_.text.nonEmpty))
|
ctx.store.transact(connStream.filter(_.text.nonEmpty))
|
||||||
}
|
}
|
||||||
|
@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
|
import docspell.joex.learn.LearnClassifierTask
|
||||||
import docspell.joex.process.ItemData.AttachmentDates
|
import docspell.joex.process.ItemData.AttachmentDates
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
@ -76,7 +77,7 @@ object TextAnalysis {
|
|||||||
for {
|
for {
|
||||||
model <- findActiveModel(ctx, cfg)
|
model <- findActiveModel(ctx, cfg)
|
||||||
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
|
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
|
||||||
text = metas.flatMap(_.content).mkString(" ------ ")
|
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||||
modelData =
|
modelData =
|
||||||
ctx.store.bitpeace
|
ctx.store.bitpeace
|
||||||
.get(model.id)
|
.get(model.id)
|
||||||
@ -89,8 +90,7 @@ object TextAnalysis {
|
|||||||
.compile
|
.compile
|
||||||
.drain
|
.drain
|
||||||
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
|
||||||
|
}).filter(_ != LearnClassifierTask.noClass)
|
||||||
})
|
|
||||||
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
|
||||||
} yield cls
|
} yield cls
|
||||||
|
|
||||||
|
@ -634,7 +634,8 @@ object QItem {
|
|||||||
def resolveTextAndTag(
|
def resolveTextAndTag(
|
||||||
collective: Ident,
|
collective: Ident,
|
||||||
itemId: Ident,
|
itemId: Ident,
|
||||||
tagCategory: String
|
tagCategory: String,
|
||||||
|
pageSep: String
|
||||||
): ConnectionIO[TextAndTag] = {
|
): ConnectionIO[TextAndTag] = {
|
||||||
val aId = RAttachment.Columns.id.prefix("a")
|
val aId = RAttachment.Columns.id.prefix("a")
|
||||||
val aItem = RAttachment.Columns.itemId.prefix("a")
|
val aItem = RAttachment.Columns.itemId.prefix("a")
|
||||||
@ -682,7 +683,7 @@ object QItem {
|
|||||||
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
|
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
|
||||||
)
|
)
|
||||||
tag = texts.headOption.flatMap(_._2)
|
tag = texts.headOption.flatMap(_._2)
|
||||||
txt = texts.map(_._1).mkString(" --n-- ")
|
txt = texts.map(_._1).mkString(pageSep)
|
||||||
} yield TextAndTag(itemId, txt, tag)
|
} yield TextAndTag(itemId, txt, tag)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user