Reorganize nlp pipeline and add nlp-unsupported language italian

Improves and reorganizes how nlp pipelines are setup. Now users can
choose from many options, depending on their hardware and usage
scenario.

This is the base to use more languages without depending on what
stanford-nlp supports. Support then is involves to text extraction and
simple regex-ner processing.
This commit is contained in:
Eike Kettner
2021-01-16 23:43:24 +01:00
parent a70e9ab614
commit f01646aeb5
29 changed files with 676 additions and 255 deletions

View File

@ -1,12 +1,13 @@
package docspell.analysis.nlp
import docspell.common.Language.NLPLanguage
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
object BaseCRFAnnotatorSuite extends SimpleTestSuite {
def annotate(language: Language): String => Vector[NerLabel] =
def annotate(language: NLPLanguage): String => Vector[NerLabel] =
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
test("find english ner labels") {

View File

@ -1,8 +1,12 @@
package docspell.analysis.nlp
import java.nio.file.Paths
import cats.effect.IO
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
import docspell.common.syntax.FileSyntax._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
assertEquals(labels, expect)
StanfordCoreNLP.clearAnnotatorPool()
}
test("regexner-only annotator") {
val regexNerContent =
s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|""".stripMargin
File
.withTempDir[IO](Paths.get("target"), "test-regex-ner")
.use { dir =>
for {
out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
_ <- IO(
assertEquals(
labels,
Vector(
NerLabel("Andrea", NerTag.Person, 6, 12),
NerLabel("Rossi", NerTag.Person, 13, 18)
)
)
)
} yield ()
}
.unsafeRunSync()
StanfordCoreNLP.clearAnnotatorPool()
}
}