Initial impl of a text classifier based on stanford-nlp

2025-08-05 02:24:52 +00:00 · 2020-08-31 22:35:27 +02:00
parent 8c4f2e702b
commit 0c97b4ef76
16 changed files with 376 additions and 18 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -298,7 +298,7 @@ docspell.joex {
      # These settings are used to configure the classifier. If
      # multiple are given, they are all tried and the "best" is
      # chosen at the end. See
-      # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
+      # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
      # for more info about these settings. The settings are almost
      # identical to them, as they yielded best results with *my*
      # dataset.
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@ -2,7 +2,10 @@ package docspell.joex

 import java.nio.file.Path

+import cats.data.NonEmptyList
+
 import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.nlp.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@ -62,7 +65,15 @@ object Config {
  ) {

    def textAnalysisConfig: TextAnalysisConfig =
-      TextAnalysisConfig(maxLength)
+      TextAnalysisConfig(
+        maxLength,
+        TextClassifierConfig(
+          workingDir,
+          NonEmptyList
+            .fromList(classification.classifiers)
+            .getOrElse(NonEmptyList.of(Map.empty))
+        )
+      )

    def regexNerFileConfig: RegexNerFile.Config =
      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@ -0,0 +1,64 @@
+package docspell.joex.learn
+
+import cats.data.Kleisli
+import cats.data.OptionT
+import cats.effect._
+import fs2.Stream
+
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.backend.ops.OCollective
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler._
+
+object LearnClassifierTask { // scheduler task that trains a text classifier for one collective
+  // Argument type carried by the task context (read below via ctx.args).
+  type Args = LearnClassifierArgs
+  // Builds the task: find settings, select training data, train, hand the model to handleModel.
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
+      blocker: Blocker,
+      analyser: TextAnalyser[F]
+  ): Task[F, Args, Unit] =
+    Task { ctx =>
+      (for {
+        sett <- findActiveSettings[F](ctx.args.collective, cfg)
+        data = selectItems(
+          ctx,
+          math.min(cfg.classification.itemCount, sett.itemCount), // the smaller limit wins
+          sett.category.getOrElse("") // falls back to the empty string if no category is set
+        )
+        _ <- OptionT.liftF( // run the training; the trained model goes to handleModel
+          analyser
+            .classifier(blocker)
+            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx)))
+        )
+      } yield ())
+        .getOrElseF(logInactiveWarning(ctx.logger)) // no settings found: only log a warning
+    }
+  // Callback that receives the trained model from trainClassifier.
+  private def handleModel[F[_]](
+      ctx: Context[F, Args]
+  )(trainedModel: ClassifierModel): F[Unit] =
+    ??? // TODO: not implemented yet; throws NotImplementedError when evaluated
+  // Supplies the stream of training data that apply passes to trainClassifier.
+  private def selectItems[F[_]](
+      ctx: Context[F, Args],
+      max: Int,
+      category: String
+  ): Stream[F, Data] =
+    ??? // TODO: not implemented yet; throws NotImplementedError when evaluated
+  // Yields the classifier settings for a collective; an empty result makes apply log a warning.
+  private def findActiveSettings[F[_]: Sync](
+      coll: Ident,
+      cfg: Config.TextAnalysis
+  ): OptionT[F, OCollective.Classifier] =
+    ??? // TODO: not implemented yet; throws NotImplementedError when evaluated
+  // Warning logged by apply when findActiveSettings produced no value.
+  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
+    logger.warn(
+      "Classification is disabled. Check joex config and the collective settings."
+    )
+}
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@ -4,7 +4,7 @@ import cats.effect._
 import cats.implicits._

 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
 import docspell.common._
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
@ -42,7 +42,7 @@ object TextAnalysis {
      analyser: TextAnalyser[F],
      nerFile: RegexNerFile[F]
  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(ctx.args.meta.language, false, None)
+    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
    for {
      customNer <- nerFile.makeFile(ctx.args.meta.collective)
      sett = settings.copy(regexNer = customNer)