Some cleanup

2025-08-01 13:04:52 +00:00 · 2020-09-01 23:57:27 +02:00
parent f9fcee81a5
commit 4309bd8dfd
4 changed files with 13 additions and 9 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -299,9 +299,8 @@ docspell.joex {
      # multiple are given, they are all tried and the "best" is
      # chosen at the end. See
      # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
-      # for more info about these settings. The settings are almost
+      # for more info about these settings. The settings here yielded
-      # identical to them, as they yielded best results with *my*
+      # good results with *my* dataset.
-      # dataset.
      #
      # Enclose regexps in triple quotes.
      classifiers = [
@ -312,6 +311,7 @@ docspell.joex {
          "maxNGramLeng" = "4"
          "minNGramLeng" = "1"
          "splitWordShape" = "chris4"
          "intern" = "true" # makes it slower but saves memory
        }
      ]
    }
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@ -20,6 +20,7 @@ import bitpeace.MimetypeHint
 object LearnClassifierTask {
  val noClass = "__NONE__"
  val pageSep = " --n-- "
  type Args = LearnClassifierArgs
@ -80,7 +81,9 @@ object LearnClassifierTask {
    val connStream =
      for {
        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
-        tt   <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
+        tt <- Stream.eval(
          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
        )
      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
    ctx.store.transact(connStream.filter(_.text.nonEmpty))
  }
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
@ -76,7 +77,7 @@ object TextAnalysis {
    for {
      model <- findActiveModel(ctx, cfg)
      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
-      text = metas.flatMap(_.content).mkString("   ------   ")
+      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
      modelData =
        ctx.store.bitpeace
          .get(model.id)
@ -89,8 +90,7 @@ object TextAnalysis {
          .compile
          .drain
          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
-
+      }).filter(_ != LearnClassifierTask.noClass)
      })
      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
    } yield cls
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@ -634,7 +634,8 @@ object QItem {
  def resolveTextAndTag(
      collective: Ident,
      itemId: Ident,
-      tagCategory: String
+      tagCategory: String,
      pageSep: String
  ): ConnectionIO[TextAndTag] = {
    val aId    = RAttachment.Columns.id.prefix("a")
    val aItem  = RAttachment.Columns.itemId.prefix("a")
@ -682,7 +683,7 @@ object QItem {
        s"Got ${texts.size} text and tag entries for item ${itemId.id}"
      )
      tag = texts.headOption.flatMap(_._2)
-      txt = texts.map(_._1).mkString(" --n-- ")
+      txt = texts.map(_._1).mkString(pageSep)
    } yield TextAndTag(itemId, txt, tag)
  }