mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Resurrect the basic ner classifier
This commit is contained in:
		@@ -0,0 +1,75 @@
 | 
				
			|||||||
 | 
					package docspell.analysis.nlp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import docspell.common._
 | 
				
			||||||
 | 
					import edu.stanford.nlp.ie.AbstractSequenceClassifier
 | 
				
			||||||
 | 
					import edu.stanford.nlp.ie.crf.CRFClassifier
 | 
				
			||||||
 | 
					import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
 | 
				
			||||||
 | 
					import org.log4s.getLogger
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import java.net.URL
 | 
				
			||||||
 | 
					import java.util.zip.GZIPInputStream
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import scala.jdk.CollectionConverters._
 | 
				
			||||||
 | 
					import scala.util.Using
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Named-entity recognition that uses only the Stanford `CRFClassifier`,
					  * without building an analysis pipeline.
					  *
					  * The ner-classifier cannot use results from POS-tagging etc. and is
					  * therefore not as good as the [[StanfordNerAnnotator]]. But it uses
					  * less memory, while still being not bad.
					  */
					object BasicCRFAnnotator {
					  private[this] val logger = getLogger

					  // One classifier per language. Each is a `lazy val`, so the model is
					  // only loaded the first time the corresponding language is requested.
					  lazy val germanNerClassifier: AbstractSequenceClassifier[CoreLabel] =
					    makeClassifier(Language.German)
					  lazy val englishNerClassifier: AbstractSequenceClassifier[CoreLabel] =
					    makeClassifier(Language.English)
					  lazy val frenchNerClassifier: AbstractSequenceClassifier[CoreLabel] =
					    makeClassifier(Language.French)

					  /** Runs the NER classifier of the given language over `text`.
					    *
					    * @param lang selects which of the language specific classifiers is used
					    * @param text the text to classify
					    * @return one [[NerLabel]] per classified token whose answer annotation
					    *   can be converted by `NerTag.fromString`; tokens with a missing or
					    *   unknown annotation are dropped. Each label carries the word and
					    *   the begin/end position reported by the classifier.
					    */
					  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
					    val nerClassifier = lang match {
					      case Language.English => englishNerClassifier
					      case Language.German  => germanNerClassifier
					      case Language.French  => frenchNerClassifier
					    }
					    nerClassifier
					      .classify(text)
					      .asScala
					      .flatMap(_.asScala)
					      .flatMap { label =>
					        // The annotation may be null (Java API); treat that like an unknown tag.
					        val tag = Option(label.get(classOf[CoreAnnotations.AnswerAnnotation])).getOrElse("")
					        NerTag
					          .fromString(tag)
					          .toOption
					          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
					      }
					      .toVector
					  }

					  /** Loads the gzipped, serialized CRF model for `lang` from the classpath.
					    *
					    * Any exception raised while opening, unzipping or deserializing the
					    * model is propagated to the caller.
					    */
					  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
					    logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
					    val ner = classifierResource(lang)
					    // The raw stream is managed on its own: the GZIPInputStream constructor
					    // reads the gzip header and may throw, in which case the underlying
					    // stream would otherwise never be closed.
					    Using.resource(ner.openStream()) { raw =>
					      Using.resource(new GZIPInputStream(raw)) { in =>
					        CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
					      }
					    }
					  }

					  /** Resolves the classpath URL of the NER model for `lang`.
					    *
					    * Fails via `sys.error` if the resource cannot be found on the classpath.
					    */
					  private def classifierResource(lang: Language): URL = {
					    val path = lang match {
					      case Language.French =>
					        "/edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz"
					      case Language.German =>
					        "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
					      case Language.English =>
					        "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
					    }
					    // getResource returns null when the resource is missing.
					    Option(getClass.getResource(path))
					      .getOrElse(sys.error(s"NER model url not found for language ${lang.name}"))
					  }
					}
 | 
				
			||||||
@@ -0,0 +1,63 @@
 | 
				
			|||||||
 | 
					package docspell.analysis.nlp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import minitest.SimpleTestSuite
 | 
				
			||||||
 | 
					import docspell.files.TestFiles
 | 
				
			||||||
 | 
					import docspell.common._
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Checks the labels produced by [[BasicCRFAnnotator]] for the english
					  * and german test letters against a fixed list of expected labels.
					  */
					object BaseCRFAnnotatorSuite extends SimpleTestSuite {

					  /** Turns `(word, tag, begin, end)` tuples into a vector of labels. */
					  private def labelsOf(entries: (String, NerTag, Int, Int)*): Vector[NerLabel] =
					    entries.map { case (word, tag, begin, end) =>
					      NerLabel(word, tag, begin, end)
					    }.toVector

					  test("find english ner labels") {
					    val found =
					      BasicCRFAnnotator.nerAnnotate(Language.English)(TestFiles.letterENText)
					    // Note: the same word may be expected with different tags at
					    // different positions (e.g. "Treesville").
					    val wanted = labelsOf(
					      ("Derek", NerTag.Person, 0, 5),
					      ("Jeter", NerTag.Person, 6, 11),
					      ("Elm", NerTag.Misc, 17, 20),
					      ("Ave.", NerTag.Misc, 21, 25),
					      ("Treesville", NerTag.Misc, 27, 37),
					      ("Derek", NerTag.Person, 68, 73),
					      ("Jeter", NerTag.Person, 74, 79),
					      ("Elm", NerTag.Misc, 85, 88),
					      ("Ave.", NerTag.Misc, 89, 93),
					      ("Treesville", NerTag.Person, 95, 105),
					      ("Leaf", NerTag.Organization, 144, 148),
					      ("Chief", NerTag.Organization, 150, 155),
					      ("of", NerTag.Organization, 156, 158),
					      ("Syrup", NerTag.Organization, 159, 164),
					      ("Production", NerTag.Organization, 165, 175),
					      ("Old", NerTag.Organization, 176, 179),
					      ("Sticky", NerTag.Organization, 180, 186),
					      ("Pancake", NerTag.Organization, 187, 194),
					      ("Company", NerTag.Organization, 195, 202),
					      ("Maple", NerTag.Organization, 207, 212),
					      ("Lane", NerTag.Organization, 213, 217),
					      ("Forest", NerTag.Organization, 219, 225),
					      ("Hemptown", NerTag.Location, 239, 247),
					      ("Leaf", NerTag.Person, 276, 280),
					      ("Little", NerTag.Misc, 347, 353),
					      ("League", NerTag.Misc, 354, 360),
					      ("Derek", NerTag.Person, 1117, 1122),
					      ("Jeter", NerTag.Person, 1123, 1128)
					    )
					    assertEquals(found, wanted)
					  }

					  test("find german ner labels") {
					    val found =
					      BasicCRFAnnotator.nerAnnotate(Language.German)(TestFiles.letterDEText)
					    val wanted = labelsOf(
					      ("Max", NerTag.Person, 0, 3),
					      ("Mustermann", NerTag.Person, 4, 14),
					      ("Lilienweg", NerTag.Person, 16, 25),
					      ("Max", NerTag.Person, 77, 80),
					      ("Mustermann", NerTag.Person, 81, 91),
					      ("Lilienweg", NerTag.Location, 93, 102),
					      ("EasyCare", NerTag.Organization, 124, 132),
					      ("AG", NerTag.Organization, 133, 135),
					      ("Ackerweg", NerTag.Location, 158, 166),
					      ("Nebendorf", NerTag.Location, 184, 193),
					      ("Max", NerTag.Person, 505, 508),
					      ("Mustermann", NerTag.Person, 509, 519)
					    )
					    assertEquals(found, wanted)
					  }
					}
 | 
				
			||||||
@@ -5,7 +5,7 @@ import docspell.files.TestFiles
 | 
				
			|||||||
import docspell.common._
 | 
					import docspell.common._
 | 
				
			||||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
 | 
					import edu.stanford.nlp.pipeline.StanfordCoreNLP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
object TextAnalyserSuite extends SimpleTestSuite {
 | 
					object StanfordNerAnnotatorSuite extends SimpleTestSuite {
 | 
				
			||||||
  lazy val germanClassifier =
 | 
					  lazy val germanClassifier =
 | 
				
			||||||
    new StanfordCoreNLP(Properties.nerGerman(None, false))
 | 
					    new StanfordCoreNLP(Properties.nerGerman(None, false))
 | 
				
			||||||
  lazy val englishClassifier =
 | 
					  lazy val englishClassifier =
 | 
				
			||||||
@@ -45,6 +45,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 | 
				
			|||||||
      NerLabel("Jeter", NerTag.Person, 1123, 1128)
 | 
					      NerLabel("Jeter", NerTag.Person, 1123, 1128)
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assertEquals(labels, expect)
 | 
					    assertEquals(labels, expect)
 | 
				
			||||||
 | 
					    StanfordCoreNLP.clearAnnotatorPool()
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  test("find german ner labels") {
 | 
					  test("find german ner labels") {
 | 
				
			||||||
@@ -65,5 +66,6 @@ object TextAnalyserSuite extends SimpleTestSuite {
 | 
				
			|||||||
      NerLabel("Mustermann", NerTag.Person, 509, 519)
 | 
					      NerLabel("Mustermann", NerTag.Person, 509, 519)
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    assertEquals(labels, expect)
 | 
					    assertEquals(labels, expect)
 | 
				
			||||||
 | 
					    StanfordCoreNLP.clearAnnotatorPool()
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
		Reference in New Issue
	
	Block a user