diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index e09bfd3b..23ec5b47 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -299,9 +299,8 @@ docspell.joex { # multiple are given, they are all tried and the "best" is # chosen at the end. See # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html - # for more info about these settings. The settings are almost - # identical to them, as they yielded best results with *my* - # dataset. + # for more info about these settings. The settings here yielded + # good results with *my* dataset. # # Enclose regexps in triple quotes. classifiers = [ @@ -312,6 +311,7 @@ docspell.joex { "maxNGramLeng" = "4" "minNGramLeng" = "1" "splitWordShape" = "chris4" + "intern" = "true" # makes it slower but saves memory } ] } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 013cd215..c3d6e3f9 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -20,6 +20,7 @@ import bitpeace.MimetypeHint object LearnClassifierTask { val noClass = "__NONE__" + val pageSep = " --n-- " type Args = LearnClassifierArgs @@ -80,7 +81,9 @@ object LearnClassifierTask { val connStream = for { item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max)) - tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category)) + tt <- Stream.eval( + QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep) + ) } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) ctx.store.transact(connStream.filter(_.text.nonEmpty)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 039f52e7..ebb0894a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile +import docspell.joex.learn.LearnClassifierTask import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task @@ -76,7 +77,7 @@ object TextAnalysis { for { model <- findActiveModel(ctx, cfg) _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …")) - text = metas.flatMap(_.content).mkString(" ------ ") + text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) modelData = ctx.store.bitpeace .get(model.id) @@ -89,8 +90,7 @@ object TextAnalysis { .compile .drain .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) - - }) + }).filter(_ != LearnClassifierTask.noClass) _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) } yield cls diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 312523ce..d3d2653e 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -634,7 +634,8 @@ object QItem { def resolveTextAndTag( collective: Ident, itemId: Ident, - tagCategory: String + tagCategory: String, + pageSep: String ): ConnectionIO[TextAndTag] = { val aId = RAttachment.Columns.id.prefix("a") val aItem = RAttachment.Columns.itemId.prefix("a") @@ -682,7 +683,7 @@ object QItem { s"Got ${texts.size} text and tag entries for item ${itemId.id}" ) tag = texts.headOption.flatMap(_._2) - txt = texts.map(_._1).mkString(" --n-- ") + txt = texts.map(_._1).mkString(pageSep) } yield TextAndTag(itemId, txt, tag) }