Add French language and upgrade stanford-nlp to 4.0.0

This commit is contained in:
Eike Kettner 2020-04-21 23:33:15 +02:00
parent 9945b43266
commit fdb46da26d
13 changed files with 208 additions and 54 deletions

View File

@ -1,6 +1,6 @@
<img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>
[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell)
[![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=)](https://scala-steward.org)
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
[![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell)

View File

@ -54,6 +54,7 @@ object DateFind {
val p = lang match {
case Language.English => p2.or(p0).or(p1)
case Language.German => p1.or(p0).or(p2)
case Language.French => p1.or(p0).or(p2)
}
p.read(parts).toOption
}

View File

@ -0,0 +1,25 @@
package docspell.analysis.nlp
import docspell.common.{NerLabel, NerTag}
import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}
/** Helpers for turning Stanford CoreNLP token labels into docspell's
  * [[NerLabel]] values.
  */
object LabelConverter {

  /** Reads the string annotation `annot` from `label` and parses it into a
    * [[NerTag]].
    *
    * Yields `None` when the annotation is not present on the label (the
    * lookup returns `null`) or when `NerTag.fromString` does not produce a tag.
    */
  private def tagFromLabel[A <: CoreAnnotation[String]](
      label: CoreLabel,
      annot: Class[A]
  ): Option[NerTag] = {
    val raw = label.get(annot)
    for {
      value  <- Option(raw)
      nerTag <- NerTag.fromString(value).toOption
    } yield nerTag
  }

  /** Finds the NER tag of a token: the `AnswerAnnotation` is consulted first,
    * the `NamedEntityTagAnnotation` only if the former yields nothing.
    */
  def findTag(label: CoreLabel): Option[NerTag] = {
    val fromAnswer = tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
    // `orElse` takes its argument by name, so the second lookup stays lazy.
    fromAnswer.orElse(
      tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation])
    )
  }

  /** Builds a [[NerLabel]] from the token's word, its tag and its begin/end
    * positions; `None` if no tag could be found for the token.
    */
  def toNerLabel(label: CoreLabel): Option[NerLabel] =
    for (nerTag <- findTag(label))
      yield NerLabel(label.word(), nerTag, label.beginPosition(), label.endPosition())
}

View File

@ -0,0 +1,97 @@
package docspell.analysis.nlp
import java.util.{Properties => JProps}
import docspell.analysis.nlp.Properties.Implicits._
/** Factory for the `java.util.Properties` objects that configure the Stanford
  * CoreNLP pipeline for each supported language.
  */
object Properties {

  /** Creates a new properties object holding the given key/value pairs.
    *
    * @param ps the entries to set; entries are applied in order, so a later
    *           entry with the same key replaces an earlier one
    * @return a fresh, mutable properties object
    */
  def apply(ps: (String, String)*): JProps = {
    val props = new JProps()
    ps.foreach { case (key, value) => props.setProperty(key, value) }
    props
  }

  /** Pipeline settings for German texts.
    *
    * @param regexNerMappingFile when defined, it is set as `regexner.mapping`
    *                            and `regexner` is appended to the annotators
    * @param highRecall          `true` selects the `HIGH_RECALL`
    *                            `ner.combinationMode`, `false` selects `NORMAL`
    */
  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators"                  -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language"           -> "de",
      "mwt.mappingFile"             -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
      "ner.statisticalOnly"         -> "true",
      "ner.rulesOnly"               -> "false",
      "ner.applyFineGrained"        -> "false",
      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
      "ner.useSUTime"               -> "false", //only english, unused in docspell
      "ner.language"                -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  /** Pipeline settings for English texts.
    *
    * No `ner.combinationMode` is set here, since only a single NER model is
    * configured.
    *
    * @param regexNerMappingFile when defined, it is set as `regexner.mapping`
    *                            and `regexner` is appended to the annotators
    */
  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
    Properties(
      "annotators"                  -> "tokenize,ssplit,pos,lemma,ner",
      "tokenize.language"           -> "en",
      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
      "ner.statisticalOnly"         -> "true",
      "ner.rulesOnly"               -> "false",
      "ner.applyFineGrained"        -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime"               -> "false",
      "ner.language"                -> "en",
      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile)

  /** Pipeline settings for French texts.
    *
    * @param regexNerMappingFile when defined, it is set as `regexner.mapping`
    *                            and `regexner` is appended to the annotators
    * @param highRecall          `true` selects the `HIGH_RECALL`
    *                            `ner.combinationMode`, `false` selects `NORMAL`
    */
  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators"                  -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language"           -> "fr",
      "mwt.mappingFile"             -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
      "mwt.pos.model"               -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
      "mwt.statisticalMappingFile"  -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
      "ner.statisticalOnly"         -> "true",
      "ner.rulesOnly"               -> "false",
      "ner.applyFineGrained"        -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime"               -> "false",
      // Must agree with `tokenize.language`; the German code does not belong
      // in the French configuration.
      "ner.language"                -> "fr",
      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  /** Extension methods for `java.util.Properties`. */
  object Implicits {

    /** Fluent helpers that mutate the wrapped properties object in place and
      * return that same object so calls can be chained.
      */
    implicit final class JPropsOps(val p: JProps) extends AnyVal {

      /** Sets `name` to the contained value; does nothing for `None`. */
      def set(name: String, value: Option[String]): JProps = {
        value.foreach(v => p.setProperty(name, v))
        p
      }

      /** Replaces the value of `name` with `f(current)`; does nothing when the
        * property is not set.
        */
      def change(name: String, f: String => String): JProps = {
        Option(p.getProperty(name)).foreach(current => p.setProperty(name, f(current)))
        p
      }

      /** Registers the mapping file under `regexner.mapping` and appends the
        * `regexner` annotator, but only when a mapping file is given.
        */
      def withRegexNer(mappingFile: Option[String]): JProps =
        set("regexner.mapping", mappingFile)
          .change(
            "annotators",
            v => if (mappingFile.isDefined) v + ",regexner" else v
          )

      /** Sets `ner.combinationMode` to `HIGH_RECALL` if `flag` is true and to
        * `NORMAL` otherwise.
        */
      def withHighRecall(flag: Boolean): JProps = {
        val mode = if (flag) "HIGH_RECALL" else "NORMAL"
        p.setProperty("ner.combinationMode", mode)
        p
      }
    }
  }
}

View File

@ -1,16 +1,12 @@
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import java.util.{Properties => JProps}
import scala.jdk.CollectionConverters._
import scala.util.Using
import docspell.common._
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
import org.log4s.getLogger
object StanfordNerClassifier {
@ -18,48 +14,32 @@ object StanfordNerClassifier {
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
lazy val frenchNerClassifier = makeClassifier(Language.French)
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
val nerClassifier = lang match {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
case Language.French => frenchNerClassifier
}
nerClassifier
.classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag
.fromString(Option(tag).getOrElse(""))
.toOption
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
val doc = new CoreDocument(text)
nerClassifier.annotate(doc)
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
private def makeClassifier(lang: Language): StanfordCoreNLP = {
logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
val ner = classifierResource(lang)
Using(new GZIPInputStream(ner.openStream())) { in =>
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
}.fold(throw _, identity)
new StanfordCoreNLP(classifierProperties(lang))
}
private def classifierResource(lang: Language): URL = {
def check(u: URL): URL =
if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
else u
check(lang match {
private def classifierProperties(lang: Language): JProps =
lang match {
case Language.German =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
Properties.nerGerman(None, false)
case Language.English =>
getClass.getResource(
"/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
)
})
}
Properties.nerEnglish(None)
case Language.French =>
Properties.nerFrench(None, false)
}
}

View File

@ -12,22 +12,30 @@ object TextAnalyserSuite extends SimpleTestSuite {
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
NerLabel("Elm", NerTag.Misc, 17, 20),
NerLabel("Ave.", NerTag.Misc, 21, 25),
NerLabel("Treesville", NerTag.Misc, 27, 37),
NerLabel("Derek", NerTag.Person, 68, 73),
NerLabel("Jeter", NerTag.Person, 74, 79),
NerLabel("Treesville", NerTag.Location, 95, 105),
NerLabel("Elm", NerTag.Misc, 85, 88),
NerLabel("Ave.", NerTag.Misc, 89, 93),
NerLabel("Treesville", NerTag.Person, 95, 105),
NerLabel("Leaf", NerTag.Organization, 144, 148),
NerLabel("Chief", NerTag.Organization, 150, 155),
NerLabel("of", NerTag.Organization, 156, 158),
NerLabel("Syrup", NerTag.Organization, 159, 164),
NerLabel("Production", NerTag.Organization, 165, 175),
NerLabel("Old", NerTag.Organization, 176, 179),
NerLabel("Sticky", NerTag.Organization, 180, 186),
NerLabel("Pancake", NerTag.Organization, 187, 194),
NerLabel("Company", NerTag.Organization, 195, 202),
NerLabel("Maple", NerTag.Location, 207, 212),
NerLabel("Lane", NerTag.Location, 213, 217),
NerLabel("Forest", NerTag.Location, 219, 225),
NerLabel("Maple", NerTag.Organization, 207, 212),
NerLabel("Lane", NerTag.Organization, 213, 217),
NerLabel("Forest", NerTag.Organization, 219, 225),
NerLabel("Hemptown", NerTag.Location, 239, 247),
NerLabel("Little", NerTag.Organization, 347, 353),
NerLabel("League", NerTag.Organization, 354, 360),
NerLabel("Leaf", NerTag.Person, 276, 280),
NerLabel("Little", NerTag.Misc, 347, 353),
NerLabel("League", NerTag.Misc, 354, 360),
NerLabel("Derek", NerTag.Person, 1117, 1122),
NerLabel("Jeter", NerTag.Person, 1123, 1128)
)
@ -40,7 +48,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),
NerLabel("Lilienweg", NerTag.Location, 16, 25),
NerLabel("Lilienweg", NerTag.Person, 16, 25),
NerLabel("Max", NerTag.Person, 77, 80),
NerLabel("Mustermann", NerTag.Person, 81, 91),
NerLabel("Lilienweg", NerTag.Location, 93, 102),

View File

@ -27,7 +27,12 @@ object Language {
val iso3 = "eng"
}
val all: List[Language] = List(German, English)
case object French extends Language {
val iso2 = "fr"
val iso3 = "fra"
}
val all: List[Language] = List(German, English, French)
def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase

View File

@ -23,6 +23,7 @@ object Field {
val content = Field("content")
val content_de = Field("content_de")
val content_en = Field("content_en")
val content_fr = Field("content_fr")
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")
val folderId = Field("folder")
@ -33,6 +34,8 @@ object Field {
Field.content_de
case Language.English =>
Field.content_en
case Language.French =>
Field.content_fr
}
implicit val jsonEncoder: Encoder[Field] =

View File

@ -39,6 +39,7 @@ object SolrQuery {
Field.content,
Field.content_de,
Field.content_en,
Field.content_fr,
Field.itemName,
Field.itemNotes,
Field.attachmentName

View File

@ -80,6 +80,8 @@ object SolrSetup {
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
case l @ Language.French =>
addTextField(l.some)(Field.content_fr)
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
@ -105,6 +107,9 @@ object SolrSetup {
case Some(Language.English) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textEN(field)))
case Some(Language.French) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textFR(field)))
}
}
}
@ -138,6 +143,9 @@ object SolrSetup {
def textEN(field: Field): AddField =
AddField(field, "text_en", true, true, false)
def textFR(field: Field): AddField =
AddField(field, "text_fr", true, true, false)
}
case class DeleteField(name: Field)

View File

@ -10,6 +10,7 @@ module Data.Language exposing
type Language
= German
| English
| French
fromString : String -> Maybe Language
@ -20,6 +21,9 @@ fromString str =
else if str == "eng" || str == "en" || str == "english" then
Just English
else if str == "fra" || str == "fr" || str == "french" then
Just French
else
Nothing
@ -33,6 +37,9 @@ toIso3 lang =
English ->
"eng"
French ->
"fra"
toName : Language -> String
toName lang =
@ -43,7 +50,10 @@ toName lang =
English ->
"English"
French ->
"French"
all : List Language
all =
[ German, English ]
[ German, English, French ]

View File

@ -31,7 +31,7 @@ object Dependencies {
val PostgresVersion = "42.2.16"
val PureConfigVersion = "0.13.0"
val Slf4jVersion = "1.7.30"
val StanfordNlpVersion = "3.9.2"
val StanfordNlpVersion = "4.0.0"
val TikaVersion = "1.24.1"
val YamuscaVersion = "0.6.2"
val SwaggerUIVersion = "3.32.3"
@ -135,11 +135,16 @@ object Dependencies {
)
val stanfordNlpModels = Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
"models-english"
)
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-french"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier(
"models-english"
)
)
val tika = Seq(

View File

@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
}
private val nerModels = List(
"german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
"english.all.3class.distsim.crf.ser.gz"
"german.distsim.crf.ser.gz",
"english.conll.4class.distsim.crf.ser.gz",
"french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv",
"french-mwt.tagger",
"french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger",
"french-ud.tagger.props",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props"
)
}