From 4462ebae0fb1abafdfc6ec5f7dca56da81cc4014 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 22:29:53 +0100 Subject: [PATCH] Resurrect the basic ner classifier --- .../analysis/nlp/BasicCRFAnnotator.scala | 75 +++++++++++++++++++ .../analysis/nlp/BaseCRFAnnotatorSuite.scala | 63 ++++++++++++++++ ....scala => StanfordNerAnnotatorSuite.scala} | 4 +- 3 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala create mode 100644 modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala rename modules/analysis/src/test/scala/docspell/analysis/nlp/{TextAnalyserSuite.scala => StanfordNerAnnotatorSuite.scala} (95%) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala new file mode 100644 index 00000000..5823fba2 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -0,0 +1,75 @@ +package docspell.analysis.nlp + +import docspell.common._ +import edu.stanford.nlp.ie.AbstractSequenceClassifier +import edu.stanford.nlp.ie.crf.CRFClassifier +import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger + +import java.net.URL +import java.util.zip.GZIPInputStream + +import scala.jdk.CollectionConverters._ +import scala.util.Using + +/** This is only using the CRFClassifier without building an analysis + * pipeline. The ner-classifier cannot use results from POS-tagging + * etc. and is therefore not as good as the [[StanfordNerAnnotator]]. + * But it uses less memory, while still being not bad. 
+  */
+object BasicCRFAnnotator {
+  private[this] val logger = getLogger
+
+  lazy val germanNerClassifier = makeClassifier(Language.German)
+  lazy val englishNerClassifier = makeClassifier(Language.English)
+  lazy val frenchNerClassifier = makeClassifier(Language.French)
+
+  /** Labels every token of `text` for which the classifier of `lang` gives an
+    * answer that `NerTag.fromString` accepts; all other tokens are dropped.
+    */
+  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
+    val nerClassifier = lang match {
+      case Language.English => englishNerClassifier
+      case Language.German => germanNerClassifier
+      case Language.French => frenchNerClassifier
+    }
+    nerClassifier
+      .classify(text)
+      .asScala
+      .flatMap(_.asScala)
+      .flatMap { label =>
+        val tag = Option(label.get(classOf[CoreAnnotations.AnswerAnnotation]))
+        NerTag
+          .fromString(tag.getOrElse(""))
+          .toOption
+          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
+      }
+      .toVector
+  }
+
+  /** Loads the gzipped CRF model of `lang`. Both streams are handed to the
+    * manager, so the raw stream is closed even if wrapping it in gzip fails.
+    */
+  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
+    logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
+    val ner = classifierResource(lang)
+    Using.Manager { use =>
+      val in = use(new GZIPInputStream(use(ner.openStream())))
+      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
+    }.fold(throw _, identity)
+  }
+
+  /** Classpath URL of the NER model of `lang`; fails if it is missing. */
+  private def classifierResource(lang: Language): URL = {
+    val path = lang match {
+      case Language.French =>
+        "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
+      case Language.German =>
+        "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
+      case Language.English =>
+        "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+    }
+    Option(getClass.getResource(path))
+      .getOrElse(sys.error(s"NER model url not found for language ${lang.name}"))
+  }
+}
diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
new file
mode 100644
index 00000000..bffc6744
--- /dev/null
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/BaseCRFAnnotatorSuite.scala
@@ -0,0 +1,63 @@
+package docspell.analysis.nlp
+
+import minitest.SimpleTestSuite
+import docspell.files.TestFiles
+import docspell.common._
+
+object BaseCRFAnnotatorSuite extends SimpleTestSuite {
+  test("find english ner labels") {
+    // Expected labels for TestFiles.letterENText; Vector equality makes order significant.
+    val expected = Vector(
+      NerLabel("Derek", NerTag.Person, 0, 5),
+      NerLabel("Jeter", NerTag.Person, 6, 11),
+      NerLabel("Elm", NerTag.Misc, 17, 20),
+      NerLabel("Ave.", NerTag.Misc, 21, 25),
+      NerLabel("Treesville", NerTag.Misc, 27, 37),
+      NerLabel("Derek", NerTag.Person, 68, 73),
+      NerLabel("Jeter", NerTag.Person, 74, 79),
+      NerLabel("Elm", NerTag.Misc, 85, 88),
+      NerLabel("Ave.", NerTag.Misc, 89, 93),
+      NerLabel("Treesville", NerTag.Person, 95, 105),
+      NerLabel("Leaf", NerTag.Organization, 144, 148),
+      NerLabel("Chief", NerTag.Organization, 150, 155),
+      NerLabel("of", NerTag.Organization, 156, 158),
+      NerLabel("Syrup", NerTag.Organization, 159, 164),
+      NerLabel("Production", NerTag.Organization, 165, 175),
+      NerLabel("Old", NerTag.Organization, 176, 179),
+      NerLabel("Sticky", NerTag.Organization, 180, 186),
+      NerLabel("Pancake", NerTag.Organization, 187, 194),
+      NerLabel("Company", NerTag.Organization, 195, 202),
+      NerLabel("Maple", NerTag.Organization, 207, 212),
+      NerLabel("Lane", NerTag.Organization, 213, 217),
+      NerLabel("Forest", NerTag.Organization, 219, 225),
+      NerLabel("Hemptown", NerTag.Location, 239, 247),
+      NerLabel("Leaf", NerTag.Person, 276, 280),
+      NerLabel("Little", NerTag.Misc, 347, 353),
+      NerLabel("League", NerTag.Misc, 354, 360),
+      NerLabel("Derek", NerTag.Person, 1117, 1122),
+      NerLabel("Jeter", NerTag.Person, 1123, 1128)
+    )
+    val found = BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)
+    assertEquals(found, expected)
+  }
+
+  test("find german ner labels") {
+    // Expected labels for TestFiles.letterDEText; Vector equality makes order significant.
+    val expected = Vector(
+      NerLabel("Max", NerTag.Person, 0, 3),
+      NerLabel("Mustermann", NerTag.Person, 4, 14),
+      NerLabel("Lilienweg", NerTag.Person, 16, 25),
+      NerLabel("Max", NerTag.Person, 77, 80),
+      NerLabel("Mustermann", NerTag.Person, 81, 91),
+      NerLabel("Lilienweg", NerTag.Location, 93, 102),
+      NerLabel("EasyCare", NerTag.Organization, 124, 132),
+      NerLabel("AG", NerTag.Organization, 133, 135),
+      NerLabel("Ackerweg", NerTag.Location, 158, 166),
+      NerLabel("Nebendorf", NerTag.Location, 184, 193),
+      NerLabel("Max", NerTag.Person, 505, 508),
+      NerLabel("Mustermann", NerTag.Person, 509, 519)
+    )
+    val found = BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)
+    assertEquals(found, expected)
+  }
+}
diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
similarity index 95%
rename from modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
rename to modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
index e0dfc4a0..1704ef1b 100644
--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordNerAnnotatorSuite.scala
@@ -5,7 +5,7 @@ import docspell.files.TestFiles
 import docspell.common._
 import edu.stanford.nlp.pipeline.StanfordCoreNLP
 
-object TextAnalyserSuite extends SimpleTestSuite {
+object StanfordNerAnnotatorSuite extends SimpleTestSuite {
   lazy val germanClassifier =
     new StanfordCoreNLP(Properties.nerGerman(None, false))
   lazy val englishClassifier =
@@ -45,6 +45,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
       NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )
     assertEquals(labels, expect)
+    StanfordCoreNLP.clearAnnotatorPool()
   }
 
   test("find german ner labels") {
@@ -65,5 +66,6 @@ object TextAnalyserSuite extends SimpleTestSuite {
       NerLabel("Mustermann", NerTag.Person, 509, 519)
     )
     assertEquals(labels, expect)
+    StanfordCoreNLP.clearAnnotatorPool()
   }
 }