Separate ner from classification
This commit is contained in:
parent f02f15e5bd
commit a699e87304
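
In short, the text-classification code moves from docspell.analysis.nlp into a new docspell.analysis.classifier package, and the NER entry point is renamed from StanfordNerClassifier to StanfordNerAnnotator. As a hedged sketch of what callers import after this commit (collected from the new-side lines of the hunks below, not an authoritative list):

    import docspell.analysis.classifier.{ClassifierModel, StanfordTextClassifier, TextClassifier, TextClassifierConfig}
    import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
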
@@ -3,13 +3,10 @@ package docspell.analysis
 import cats.effect._
 import cats.implicits._
 
+import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
-import docspell.analysis.nlp.PipelineCache
-import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordNerSettings
-import docspell.analysis.nlp.StanfordTextClassifier
-import docspell.analysis.nlp.TextClassifier
+import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
 import docspell.common._
 
 trait TextAnalyser[F[_]] {
@@ -67,7 +64,7 @@ object TextAnalyser {
 
           private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
               : F[Vector[NerLabel]] =
-            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
+            StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
 
           private def contactNer(text: String): F[Vector[NerLabel]] =
             Sync[F].delay {
@@ -1,6 +1,6 @@
 package docspell.analysis
 
-import docspell.analysis.nlp.TextClassifierConfig
+import docspell.analysis.classifier.TextClassifierConfig
 import docspell.common._
 
 case class TextAnalysisConfig(
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream
 
-import docspell.analysis.nlp.TextClassifier._
+import docspell.analysis.classifier
+import docspell.analysis.classifier.TextClassifier._
+import docspell.analysis.nlp.Properties
 import docspell.common._
 
 import edu.stanford.nlp.classify.ColumnDataClassifier
@@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       case Some(text) =>
         Sync[F].delay {
           val cls = ColumnDataClassifier.getClassifier(
-            model.model.normalize().toAbsolutePath().toString()
+            model.model.normalize().toAbsolutePath.toString
           )
           val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
           Option(cat)
@@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
         val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
         cdc.trainClassifier(in.train.toString())
         val score = cdc.testClassifier(in.test.toString())
-        TrainResult(score.first(), ClassifierModel(in.modelFile))
+        TrainResult(score.first(), classifier.ClassifierModel(in.modelFile))
       }
       _ <- logger.debug(s"Trained with result $res")
     } yield res
@@ -1,9 +1,9 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import cats.data.Kleisli
 import fs2.Stream
 
-import docspell.analysis.nlp.TextClassifier.Data
+import docspell.analysis.classifier.TextClassifier.Data
 import docspell.common._
 
 trait TextClassifier[F[_]] {
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -9,7 +9,7 @@ import docspell.common._
 
 import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
 
-object StanfordNerClassifier {
+object StanfordNerAnnotator {
 
   /** Runs named entity recognition on the given `text`.
    *
@@ -28,9 +28,9 @@ object StanfordNerClassifier {
   )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
-      .use(crf => Applicative[F].pure(runClassifier(crf, text)))
+      .use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
 
-  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
+  def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
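
For reference, a minimal sketch of calling the renamed annotator directly, mirroring the test suite further down. Only the nerAnnotate signature comes from the hunk above; the StanfordCoreNLP pipeline setup, the annotator list, and the sample text are assumptions (docspell normally obtains pipelines through PipelineCache):

    import java.util.Properties
    import edu.stanford.nlp.pipeline.StanfordCoreNLP
    import docspell.analysis.nlp.StanfordNerAnnotator
    import docspell.common.NerLabel

    // Assumed pipeline configuration for a plain NER run.
    val props = new Properties()
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner")
    val pipeline = new StanfordCoreNLP(props)

    // nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel]
    val labels: Vector[NerLabel] =
      StanfordNerAnnotator.nerAnnotate(pipeline, "Derek Jeter visited Berlin.")
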
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import minitest._
 import cats.effect._
@@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
+      StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
+      StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
@@ -4,8 +4,7 @@ import java.nio.file.Path
 
 import cats.data.NonEmptyList
 
-import docspell.analysis.TextAnalysisConfig
-import docspell.analysis.nlp.TextClassifierConfig
+import docspell.analysis.{TextAnalysisConfig, classifier}
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -69,7 +68,7 @@ object Config {
       TextAnalysisConfig(
         maxLength,
         clearStanfordNlpInterval,
-        TextClassifierConfig(
+        classifier.TextClassifierConfig(
           workingDir,
           NonEmptyList
             .fromList(classification.classifiers)
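
A hedged sketch of what the qualified reference above amounts to after the move. The working directory path and the classifier property map are hypothetical, and the parameter shape is inferred from the surrounding hunk (a Path plus a NonEmptyList of classifier settings), not from the full definition:

    import java.nio.file.Paths
    import cats.data.NonEmptyList
    import docspell.analysis.classifier

    // TextClassifierConfig now lives in docspell.analysis.classifier, hence the
    // qualified classifier.TextClassifierConfig(...) call in Config.scala.
    val textClassifierCfg =
      classifier.TextClassifierConfig(
        Paths.get("/tmp/docspell/classifier"),          // hypothetical working directory
        NonEmptyList.of(Map("useSplitWords" -> "true")) // assumed classifier settings map
      )
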
@@ -7,8 +7,8 @@ import cats.implicits._
 import fs2.{Pipe, Stream}
 
 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.ClassifierModel
-import docspell.analysis.nlp.TextClassifier.Data
+import docspell.analysis.classifier.ClassifierModel
+import docspell.analysis.classifier.TextClassifier.Data
 import docspell.backend.ops.OCollective
 import docspell.common._
 import docspell.joex.Config
@@ -5,9 +5,8 @@ import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
 import docspell.analysis.nlp.StanfordNerSettings
-import docspell.analysis.nlp.TextClassifier
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile