Reorganize nlp pipeline and add nlp-unsupported language Italian

Improves and reorganizes how nlp pipelines are set up. Users can now choose from many options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. For such languages, support is then limited to text extraction and simple regex-ner processing.
2025-09-28 23:58:21 +00:00 · 2021-01-16 23:43:24 +01:00
parent a70e9ab614
commit f01646aeb5
29 changed files with 676 additions and 255 deletions
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
@@ -1,12 +1,13 @@
 package docspell.analysis.nlp

+import docspell.common.Language.NLPLanguage
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._

 object BaseCRFAnnotatorSuite extends SimpleTestSuite {

-  def annotate(language: Language): String => Vector[NerLabel] =
+  def annotate(language: NLPLanguage): String => Vector[NerLabel] =
    BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))

  test("find english ner labels") {
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
@@ -1,8 +1,12 @@
 package docspell.analysis.nlp

+import java.nio.file.Paths
+
+import cats.effect.IO
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import docspell.common.syntax.FileSyntax._
 import edu.stanford.nlp.pipeline.StanfordCoreNLP

 object StanfordNerAnnotatorSuite extends SimpleTestSuite {
@@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
    assertEquals(labels, expect)
    StanfordCoreNLP.clearAnnotatorPool()
  }
+
+  test("regexner-only annotator") {
+    val regexNerContent =
+      s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+      |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+      |""".stripMargin
+
+    File
+      .withTempDir[IO](Paths.get("target"), "test-regex-ner")
+      .use { dir =>
+        for {
+          out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
+          ann    = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
+          labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
+          _ <- IO(
+            assertEquals(
+              labels,
+              Vector(
+                NerLabel("Andrea", NerTag.Person, 6, 12),
+                NerLabel("Rossi", NerTag.Person, 13, 18)
+              )
+            )
+          )
+        } yield ()
+      }
+      .unsafeRunSync()
+    StanfordCoreNLP.clearAnnotatorPool()
+  }
 }