mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 09:30:12 +00:00 
			
		
		
		
	Resurrect the basic ner classifier
This commit is contained in:
		| @@ -0,0 +1,75 @@ | |||||||
|  | package docspell.analysis.nlp | ||||||
|  |  | ||||||
|  | import docspell.common._ | ||||||
|  | import edu.stanford.nlp.ie.AbstractSequenceClassifier | ||||||
|  | import edu.stanford.nlp.ie.crf.CRFClassifier | ||||||
|  | import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} | ||||||
|  | import org.log4s.getLogger | ||||||
|  |  | ||||||
|  | import java.net.URL | ||||||
|  | import java.util.zip.GZIPInputStream | ||||||
|  |  | ||||||
|  | import scala.jdk.CollectionConverters._ | ||||||
|  | import scala.util.Using | ||||||
|  |  | ||||||
/** Named-entity recognition that runs a bare Stanford `CRFClassifier`
  * without building an analysis pipeline around it. The ner-classifier
  * cannot use results from POS-tagging etc. and is therefore not as
  * good as the [[StanfordNerAnnotator]]. But it uses less memory,
  * while still being not bad.
  *
  * Each language's classifier is a `lazy val`: it is loaded on first
  * access and then reused for all subsequent calls.
  */
object BasicCRFAnnotator {
  private[this] val logger = getLogger

  /** NER classifier for German, loaded on first access. */
  lazy val germanNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.German)

  /** NER classifier for English, loaded on first access. */
  lazy val englishNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.English)

  /** NER classifier for French, loaded on first access. */
  lazy val frenchNerClassifier: AbstractSequenceClassifier[CoreLabel] =
    makeClassifier(Language.French)

  /** Classifies `text` with the classifier for `lang` and returns one
    * [[NerLabel]] per token whose answer annotation maps to a known
    * [[NerTag]]. Tokens with a missing or unrecognised annotation are
    * dropped.
    *
    * @param lang selects which language's classifier is used
    * @param text the text to analyse
    * @return the recognised labels, in the order the classifier emits them
    */
  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
    val nerClassifier = lang match {
      case Language.English => englishNerClassifier
      case Language.German  => germanNerClassifier
      case Language.French  => frenchNerClassifier
    }
    nerClassifier
      .classify(text)
      .asScala
      .iterator
      .flatMap(_.asScala)
      .flatMap { label =>
        // The annotation may be absent (null); an empty string is then
        // handed to NerTag.fromString so the token is filtered out when
        // that lookup fails.
        val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
        NerTag
          .fromString(Option(tag).getOrElse(""))
          .toOption
          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
      }
      .toVector
  }

  /** Loads the gzip-compressed, serialized CRF model for `lang` from
    * the classpath. Any failure while opening or reading the model is
    * rethrown to the caller.
    */
  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
    logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
    val ner = classifierResource(lang)
    // Manage the raw stream separately: if the GZIPInputStream
    // constructor fails (e.g. the resource is not valid gzip), the
    // underlying stream must still be closed.
    Using.resource(ner.openStream()) { raw =>
      Using.resource(new GZIPInputStream(raw)) { in =>
        CRFClassifier.getClassifier[CoreLabel](in)
      }
    }
  }

  /** Resolves the classpath URL of the NER model for `lang`.
    *
    * Fails via `sys.error` when the model resource is not on the
    * classpath.
    */
  private def classifierResource(lang: Language): URL = {
    val path = lang match {
      case Language.French =>
        "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
      case Language.German =>
        "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
      case Language.English =>
        "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    }
    // getResource returns null when the resource is missing.
    Option(getClass.getResource(path))
      .getOrElse(sys.error(s"NER model url not found for language ${lang.name}"))
  }
}
| @@ -0,0 +1,63 @@ | |||||||
|  | package docspell.analysis.nlp | ||||||
|  |  | ||||||
|  | import minitest.SimpleTestSuite | ||||||
|  | import docspell.files.TestFiles | ||||||
|  | import docspell.common._ | ||||||
|  |  | ||||||
/** Checks the labels produced by [[BasicCRFAnnotator]] for the English
  * and German sample letters against fixed expectations.
  */
object BaseCRFAnnotatorSuite extends SimpleTestSuite {

  // Shorthand constructors, one per tag, to keep the expectation tables compact.
  private def person(word: String, start: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Person, start, end)

  private def misc(word: String, start: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Misc, start, end)

  private def org(word: String, start: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Organization, start, end)

  private def loc(word: String, start: Int, end: Int): NerLabel =
    NerLabel(word, NerTag.Location, start, end)

  test("find english ner labels") {
    val expected = Vector(
      person("Derek", 0, 5),
      person("Jeter", 6, 11),
      misc("Elm", 17, 20),
      misc("Ave.", 21, 25),
      misc("Treesville", 27, 37),
      person("Derek", 68, 73),
      person("Jeter", 74, 79),
      misc("Elm", 85, 88),
      misc("Ave.", 89, 93),
      person("Treesville", 95, 105),
      org("Leaf", 144, 148),
      org("Chief", 150, 155),
      org("of", 156, 158),
      org("Syrup", 159, 164),
      org("Production", 165, 175),
      org("Old", 176, 179),
      org("Sticky", 180, 186),
      org("Pancake", 187, 194),
      org("Company", 195, 202),
      org("Maple", 207, 212),
      org("Lane", 213, 217),
      org("Forest", 219, 225),
      loc("Hemptown", 239, 247),
      person("Leaf", 276, 280),
      misc("Little", 347, 353),
      misc("League", 354, 360),
      person("Derek", 1117, 1122),
      person("Jeter", 1123, 1128)
    )
    val actual = BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)
    assertEquals(actual, expected)
  }

  test("find german ner labels") {
    val expected = Vector(
      person("Max", 0, 3),
      person("Mustermann", 4, 14),
      person("Lilienweg", 16, 25),
      person("Max", 77, 80),
      person("Mustermann", 81, 91),
      loc("Lilienweg", 93, 102),
      org("EasyCare", 124, 132),
      org("AG", 133, 135),
      loc("Ackerweg", 158, 166),
      loc("Nebendorf", 184, 193),
      person("Max", 505, 508),
      person("Mustermann", 509, 519)
    )
    val actual = BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)
    assertEquals(actual, expected)
  }
}
| @@ -5,7 +5,7 @@ import docspell.files.TestFiles | |||||||
| import docspell.common._ | import docspell.common._ | ||||||
| import edu.stanford.nlp.pipeline.StanfordCoreNLP | import edu.stanford.nlp.pipeline.StanfordCoreNLP | ||||||
| 
 | 
 | ||||||
| object TextAnalyserSuite extends SimpleTestSuite { | object StanfordNerAnnotatorSuite extends SimpleTestSuite { | ||||||
|   lazy val germanClassifier = |   lazy val germanClassifier = | ||||||
|     new StanfordCoreNLP(Properties.nerGerman(None, false)) |     new StanfordCoreNLP(Properties.nerGerman(None, false)) | ||||||
|   lazy val englishClassifier = |   lazy val englishClassifier = | ||||||
| @@ -45,6 +45,7 @@ object TextAnalyserSuite extends SimpleTestSuite { | |||||||
|       NerLabel("Jeter", NerTag.Person, 1123, 1128) |       NerLabel("Jeter", NerTag.Person, 1123, 1128) | ||||||
|     ) |     ) | ||||||
|     assertEquals(labels, expect) |     assertEquals(labels, expect) | ||||||
|  |     StanfordCoreNLP.clearAnnotatorPool() | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   test("find german ner labels") { |   test("find german ner labels") { | ||||||
| @@ -65,5 +66,6 @@ object TextAnalyserSuite extends SimpleTestSuite { | |||||||
|       NerLabel("Mustermann", NerTag.Person, 509, 519) |       NerLabel("Mustermann", NerTag.Person, 509, 519) | ||||||
|     ) |     ) | ||||||
|     assertEquals(labels, expect) |     assertEquals(labels, expect) | ||||||
|  |     StanfordCoreNLP.clearAnnotatorPool() | ||||||
|   } |   } | ||||||
| } | } | ||||||
		Reference in New Issue
	
	Block a user