mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
Resurrect the basic ner classifier
This commit is contained in:
parent
a699e87304
commit
4462ebae0f
@ -0,0 +1,75 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import docspell.common._
|
||||
import edu.stanford.nlp.ie.AbstractSequenceClassifier
|
||||
import edu.stanford.nlp.ie.crf.CRFClassifier
|
||||
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
|
||||
import org.log4s.getLogger
|
||||
|
||||
import java.net.URL
|
||||
import java.util.zip.GZIPInputStream
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
import scala.util.Using
|
||||
|
||||
/** Named-entity recognition that uses only the Stanford `CRFClassifier`,
  * without building a full analysis pipeline.
  *
  * The ner-classifier cannot use results from POS-tagging etc. and is
  * therefore not as good as the [[StanfordNerAnnotator]]. In exchange it
  * uses less memory while still giving reasonable results.
  *
  * One classifier per language is created lazily on first access and is
  * then reused for every subsequent call.
  */
object BasicCRFAnnotator {
  private[this] val logger = getLogger

  lazy val germanNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.German)

  lazy val englishNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.English)

  lazy val frenchNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.French)

  /** Runs the NER classifier for `lang` over `text`.
    *
    * @param lang selects which of the lazily created classifiers is used
    * @param text the text to classify
    * @return one [[NerLabel]] per token whose answer annotation is accepted
    *         by `NerTag.fromString`, in the order the classifier emitted
    *         them; tokens with a missing or unrecognised tag are dropped
    */
  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
    // NOTE(review): only these three languages are handled here; if
    // `Language` has further values this match fails at runtime - confirm.
    val nerClassifier = lang match {
      case Language.English => englishNerClassifier
      case Language.German  => germanNerClassifier
      case Language.French  => frenchNerClassifier
    }

    nerClassifier
      .classify(text)
      .asScala
      .iterator
      .flatMap(_.asScala) // flatten the per-sentence lists into one token stream
      .flatMap(toNerLabel)
      .toVector
  }

  /** Converts a classified token into a [[NerLabel]], if its tag is known. */
  private def toNerLabel(label: CoreLabel): Option[NerLabel] = {
    // The annotation may be absent (null); an empty string is handed to
    // `NerTag.fromString` in that case, exactly as for any other raw tag.
    val tag = Option(label.get(classOf[CoreAnnotations.AnswerAnnotation])).getOrElse("")
    NerTag
      .fromString(tag)
      .toOption
      .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
  }

  /** Loads the gzip-compressed CRF model for `lang` from the classpath.
    *
    * Any failure while locating, opening or reading the model is thrown
    * to the caller.
    */
  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
    logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
    val ner = classifierResource(lang)
    // The raw stream is managed on its own so that it is closed even when
    // the GZIPInputStream constructor fails while reading the gzip header.
    Using.resource(ner.openStream()) { raw =>
      Using.resource(new GZIPInputStream(raw)) { in =>
        CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
      }
    }
  }

  /** Resolves the classpath URL of the NER model for `lang`.
    *
    * Fails via `sys.error` when the model resource is not on the classpath.
    */
  private def classifierResource(lang: Language): URL = {
    val path = lang match {
      case Language.French =>
        "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
      case Language.German =>
        "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
      case Language.English =>
        "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    }

    // getResource returns null for a missing resource; surface that as an error.
    Option(getClass.getResource(path))
      .getOrElse(sys.error(s"NER model url not found for language ${lang.name}"))
  }
}
|
@ -0,0 +1,63 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
|
||||
/** Checks the labels produced by [[BasicCRFAnnotator]] for the English
  * and German sample letters against fixed expectations.
  */
object BaseCRFAnnotatorSuite extends SimpleTestSuite {

  // Row constructors: each expectation below reads as (word, begin, end).
  private def person(word: String, begin: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Person, begin, end)

  private def misc(word: String, begin: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Misc, begin, end)

  private def org(word: String, begin: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Organization, begin, end)

  private def loc(word: String, begin: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Location, begin, end)

  test("find english ner labels") {
    val found = BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)

    val wanted = Vector(
      person("Derek", 0, 5),
      person("Jeter", 6, 11),
      misc("Elm", 17, 20),
      misc("Ave.", 21, 25),
      misc("Treesville", 27, 37),
      person("Derek", 68, 73),
      person("Jeter", 74, 79),
      misc("Elm", 85, 88),
      misc("Ave.", 89, 93),
      person("Treesville", 95, 105),
      org("Leaf", 144, 148),
      org("Chief", 150, 155),
      org("of", 156, 158),
      org("Syrup", 159, 164),
      org("Production", 165, 175),
      org("Old", 176, 179),
      org("Sticky", 180, 186),
      org("Pancake", 187, 194),
      org("Company", 195, 202),
      org("Maple", 207, 212),
      org("Lane", 213, 217),
      org("Forest", 219, 225),
      loc("Hemptown", 239, 247),
      person("Leaf", 276, 280),
      misc("Little", 347, 353),
      misc("League", 354, 360),
      person("Derek", 1117, 1122),
      person("Jeter", 1123, 1128)
    )

    assertEquals(found, wanted)
  }

  test("find german ner labels") {
    val found = BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)

    val wanted = Vector(
      person("Max", 0, 3),
      person("Mustermann", 4, 14),
      person("Lilienweg", 16, 25),
      person("Max", 77, 80),
      person("Mustermann", 81, 91),
      loc("Lilienweg", 93, 102),
      org("EasyCare", 124, 132),
      org("AG", 133, 135),
      loc("Ackerweg", 158, 166),
      loc("Nebendorf", 184, 193),
      person("Max", 505, 508),
      person("Mustermann", 509, 519)
    )

    assertEquals(found, wanted)
  }
}
|
@ -5,7 +5,7 @@ import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
|
||||
object TextAnalyserSuite extends SimpleTestSuite {
|
||||
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||
lazy val germanClassifier =
|
||||
new StanfordCoreNLP(Properties.nerGerman(None, false))
|
||||
lazy val englishClassifier =
|
||||
@ -45,6 +45,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
NerLabel("Jeter", NerTag.Person, 1123, 1128)
|
||||
)
|
||||
assertEquals(labels, expect)
|
||||
StanfordCoreNLP.clearAnnotatorPool()
|
||||
}
|
||||
|
||||
test("find german ner labels") {
|
||||
@ -65,5 +66,6 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
NerLabel("Mustermann", NerTag.Person, 509, 519)
|
||||
)
|
||||
assertEquals(labels, expect)
|
||||
StanfordCoreNLP.clearAnnotatorPool()
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user