mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 18:38:26 +00:00
Reorganize NLP pipeline and add NLP-unsupported language Italian
Improves and reorganizes how NLP pipelines are set up. Users can now choose from many options, depending on their hardware and usage scenario. This is the basis for using more languages without depending on what stanford-nlp supports. Support for such languages is then limited to text extraction and simple regex-NER processing.
This commit is contained in:
@ -1,12 +1,13 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import docspell.common.Language.NLPLanguage
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
|
||||
object BaseCRFAnnotatorSuite extends SimpleTestSuite {
|
||||
|
||||
def annotate(language: Language): String => Vector[NerLabel] =
|
||||
def annotate(language: NLPLanguage): String => Vector[NerLabel] =
|
||||
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
|
||||
|
||||
test("find english ner labels") {
|
||||
|
@ -1,8 +1,12 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
import cats.effect.IO
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
import docspell.common.syntax.FileSyntax._
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
|
||||
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||
@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||
assertEquals(labels, expect)
|
||||
StanfordCoreNLP.clearAnnotatorPool()
|
||||
}
|
||||
|
||||
// Checks that a pipeline built from `StanfordNerSettings.RegexOnly` tags
// "Andrea" and "Rossi" as persons, using only the rules written to a
// temporary `regex.txt` file.
test("regexner-only annotator") {
  // One rule per line with four tab-separated columns.
  // NOTE(review): the columns presumably follow the Stanford RegexNER mapping
  // format (pattern, tag, overwritable tags, priority) -- confirm.
  val regexNerContent =
    s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |""".stripMargin

  try {
    File
      .withTempDir[IO](Paths.get("target"), "test-regex-ner")
      .use { dir =>
        for {
          out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
          ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
          labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
          _ <- IO(
            assertEquals(
              labels,
              Vector(
                NerLabel("Andrea", NerTag.Person, 6, 12),
                NerLabel("Rossi", NerTag.Person, 13, 18)
              )
            )
          )
        } yield ()
      }
      .unsafeRunSync()
  } finally {
    // Clear the annotator pool even when the assertion above fails, so a
    // failing run does not leave pooled annotators behind for later tests.
    StanfordCoreNLP.clearAnnotatorPool()
  }
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user