From fdb46da26d7be6ce457a5b8dbe4f104b198d034f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 21 Apr 2020 23:33:15 +0200 Subject: [PATCH] Add french language and upgrade stanford-nlp to 4.0.0 --- README.md | 2 +- .../docspell/analysis/date/DateFind.scala | 1 + .../analysis/nlp/LabelConverter.scala | 25 +++++ .../docspell/analysis/nlp/Properties.scala | 97 +++++++++++++++++++ .../analysis/nlp/StanfordNerClassifier.scala | 54 ++++------- .../analysis/nlp/TextAnalyserSuite.scala | 24 +++-- .../main/scala/docspell/common/Language.scala | 7 +- .../main/scala/docspell/ftssolr/Field.scala | 3 + .../scala/docspell/ftssolr/SolrQuery.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 8 ++ modules/webapp/src/main/elm/Data/Language.elm | 12 ++- project/Dependencies.scala | 13 ++- project/NerModelsPlugin.scala | 15 ++- 13 files changed, 208 insertions(+), 54 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala diff --git a/README.md b/README.md index 88928bef..6ad5e9e1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![Build Status](https://img.shields.io/travis/eikek/docspell?style=flat-square)](https://travis-ci.org/eikek/docspell) +[![Build Status](https://img.shields.io/travis/eikek/docspell/master?style=flat-square)](https://travis-ci.org/eikek/docspell) [![Scala Steward badge](https://img.shields.io/badge/Scala_Steward-helping-blue.svg?style=flat-square&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAQCAMAAAARSr4IAAAAVFBMVEUAAACHjojlOy5NWlrKzcYRKjGFjIbp293YycuLa3pYY2LSqql4f3pCUFTgSjNodYRmcXUsPD/NTTbjRS+2jomhgnzNc223cGvZS0HaSD0XLjbaSjElhIr+AAAAAXRSTlMAQObYZgAAAHlJREFUCNdNyosOwyAIhWHAQS1Vt7a77/3fcxxdmv0xwmckutAR1nkm4ggbyEcg/wWmlGLDAA3oL50xi6fk5ffZ3E2E3QfZDCcCN2YtbEWZt+Drc6u6rlqv7Uk0LdKqqr5rk2UCRXOk0vmQKGfc94nOJyQjouF9H/wCc9gECEYfONoAAAAASUVORK5CYII=)](https://scala-steward.org) 
[![License](https://img.shields.io/github/license/eikek/docspell.svg?style=flat-square&color=steelblue)](https://github.com/eikek/docspell/blob/master/LICENSE.txt) [![Docker Pulls](https://img.shields.io/docker/pulls/eikek0/docspell?color=steelblue)](https://hub.docker.com/r/eikek0/docspell) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 86fea719..f2170d31 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -54,6 +54,7 @@ object DateFind { val p = lang match { case Language.English => p2.or(p0).or(p1) case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) } p.read(parts).toOption } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala new file mode 100644 index 00000000..c32a532d --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/LabelConverter.scala @@ -0,0 +1,25 @@ +package docspell.analysis.nlp + +import docspell.common.{NerLabel, NerTag} + +import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel} + +object LabelConverter { + + private def tagFromLabel[A <: CoreAnnotation[String]]( + label: CoreLabel, + annot: Class[A] + ): Option[NerTag] = { + val tag = label.get(annot) + Option(tag).flatMap(s => NerTag.fromString(s).toOption) + } + + def findTag(label: CoreLabel): Option[NerTag] = + tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation]) + .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation])) + + def toNerLabel(label: CoreLabel): Option[NerLabel] = + findTag(label).map(t => + NerLabel(label.word(), t, label.beginPosition(), label.endPosition()) + ) +} diff --git 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala new file mode 100644 index 00000000..75ee7040 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -0,0 +1,97 @@ +package docspell.analysis.nlp + +import java.util.{Properties => JProps} + +import docspell.analysis.nlp.Properties.Implicits._ + +object Properties { + + def apply(ps: (String, String)*): JProps = { + val p = new JProps() + for ((k, v) <- ps) + p.setProperty(k, v) + p + } + + def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "de", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently + "ner.useSUTime" -> "false", //only english, unused in docspell + "ner.language" -> "de", + "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + def nerEnglish(regexNerMappingFile: Option[String]): JProps = + Properties( + "annotators" -> "tokenize,ssplit,pos,lemma,ner", + "tokenize.language" -> "en", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "en", + "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile) + + def 
nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner", + "tokenize.language" -> "fr", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv", + "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger", + "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger", + "ner.statisticalOnly" -> "true", + "ner.rulesOnly" -> "false", + "ner.applyFineGrained" -> "false", + "ner.applyNumericClassifiers" -> "false", + "ner.useSUTime" -> "false", + "ner.language" -> "fr", + "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + + object Implicits { + implicit final class JPropsOps(val p: JProps) extends AnyVal { + + def set(name: String, value: Option[String]): JProps = + value match { + case Some(v) => + p.setProperty(name, v) + p + case None => + p + } + + def change(name: String, f: String => String): JProps = + Option(p.getProperty(name)) match { + case Some(current) => + p.setProperty(name, f(current)) + p + case None => + p + } + + def withRegexNer(mappingFile: Option[String]): JProps = + set("regexner.mapping", mappingFile) + .change( + "annotators", + v => if (mappingFile.isDefined) v + ",regexner" else v + ) + + def withHighRecall(flag: Boolean): JProps = { + if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL") + else p.setProperty("ner.combinationMode", "NORMAL") + p + } + } + } +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 094abcca..32c165f5 100644 --- 
a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,16 +1,12 @@ package docspell.analysis.nlp -import java.net.URL -import java.util.zip.GZIPInputStream +import java.util.{Properties => JProps} import scala.jdk.CollectionConverters._ -import scala.util.Using import docspell.common._ -import edu.stanford.nlp.ie.AbstractSequenceClassifier -import edu.stanford.nlp.ie.crf.CRFClassifier -import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} import org.log4s.getLogger object StanfordNerClassifier { @@ -18,48 +14,32 @@ object StanfordNerClassifier { lazy val germanNerClassifier = makeClassifier(Language.German) lazy val englishNerClassifier = makeClassifier(Language.English) + lazy val frenchNerClassifier = makeClassifier(Language.French) def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { val nerClassifier = lang match { case Language.English => englishNerClassifier case Language.German => germanNerClassifier + case Language.French => frenchNerClassifier } - nerClassifier - .classify(text) - .asScala - .flatMap(a => a.asScala) - .collect(Function.unlift { label => - val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) - NerTag - .fromString(Option(tag).getOrElse("")) - .toOption - .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) - }) - .toVector + val doc = new CoreDocument(text) + nerClassifier.annotate(doc) + + doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector } - private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { + private def makeClassifier(lang: Language): StanfordCoreNLP = { logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...") - val ner = classifierResource(lang) - Using(new GZIPInputStream(ner.openStream())) { in => - 
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]] - }.fold(throw _, identity) + new StanfordCoreNLP(classifierProperties(lang)) } - private def classifierResource(lang: Language): URL = { - def check(u: URL): URL = - if (u == null) sys.error(s"NER model url not found for language ${lang.name}") - else u - - check(lang match { + private def classifierProperties(lang: Language): JProps = + lang match { case Language.German => - getClass.getResource( - "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz" - ) + Properties.nerGerman(None, false) case Language.English => - getClass.getResource( - "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz" - ) - }) - } + Properties.nerEnglish(None) + case Language.French => + Properties.nerFrench(None, false) + } } diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index c851edce..b7c083a1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -12,22 +12,30 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), - NerLabel("Treesville", NerTag.Person, 27, 37), + NerLabel("Elm", NerTag.Misc, 17, 20), + NerLabel("Ave.", NerTag.Misc, 21, 25), + NerLabel("Treesville", NerTag.Misc, 27, 37), NerLabel("Derek", NerTag.Person, 68, 73), NerLabel("Jeter", NerTag.Person, 74, 79), - NerLabel("Treesville", NerTag.Location, 95, 105), + NerLabel("Elm", NerTag.Misc, 85, 88), + NerLabel("Ave.", NerTag.Misc, 89, 93), + NerLabel("Treesville", NerTag.Person, 95, 105), + NerLabel("Leaf", NerTag.Organization, 144, 148), + NerLabel("Chief", NerTag.Organization, 150, 155), + NerLabel("of", NerTag.Organization, 156, 158), NerLabel("Syrup", 
NerTag.Organization, 159, 164), NerLabel("Production", NerTag.Organization, 165, 175), NerLabel("Old", NerTag.Organization, 176, 179), NerLabel("Sticky", NerTag.Organization, 180, 186), NerLabel("Pancake", NerTag.Organization, 187, 194), NerLabel("Company", NerTag.Organization, 195, 202), - NerLabel("Maple", NerTag.Location, 207, 212), - NerLabel("Lane", NerTag.Location, 213, 217), - NerLabel("Forest", NerTag.Location, 219, 225), + NerLabel("Maple", NerTag.Organization, 207, 212), + NerLabel("Lane", NerTag.Organization, 213, 217), + NerLabel("Forest", NerTag.Organization, 219, 225), NerLabel("Hemptown", NerTag.Location, 239, 247), - NerLabel("Little", NerTag.Organization, 347, 353), - NerLabel("League", NerTag.Organization, 354, 360), + NerLabel("Leaf", NerTag.Person, 276, 280), + NerLabel("Little", NerTag.Misc, 347, 353), + NerLabel("League", NerTag.Misc, 354, 360), NerLabel("Derek", NerTag.Person, 1117, 1122), NerLabel("Jeter", NerTag.Person, 1123, 1128) ) @@ -40,7 +48,7 @@ object TextAnalyserSuite extends SimpleTestSuite { val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), - NerLabel("Lilienweg", NerTag.Location, 16, 25), + NerLabel("Lilienweg", NerTag.Person, 16, 25), NerLabel("Max", NerTag.Person, 77, 80), NerLabel("Mustermann", NerTag.Person, 81, 91), NerLabel("Lilienweg", NerTag.Location, 93, 102), diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 7d836347..92c32f4b 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -27,7 +27,12 @@ object Language { val iso3 = "eng" } - val all: List[Language] = List(German, English) + case object French extends Language { + val iso2 = "fr" + val iso3 = "fra" + } + + val all: List[Language] = List(German, English, French) def fromString(str: String): Either[String, Language] = { val lang = 
str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 6031cd61..2306a44d 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -23,6 +23,7 @@ object Field { val content = Field("content") val content_de = Field("content_de") val content_en = Field("content_en") + val content_fr = Field("content_fr") val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") @@ -33,6 +34,8 @@ object Field { Field.content_de case Language.English => Field.content_en + case Language.French => + Field.content_fr } implicit val jsonEncoder: Encoder[Field] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index e07e9c36..1e3b09b3 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -39,6 +39,7 @@ object SolrQuery { Field.content, Field.content_de, Field.content_en, + Field.content_fr, Field.itemName, Field.itemNotes, Field.attachmentName diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 932519c8..efb94a09 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -80,6 +80,8 @@ object SolrSetup { addTextField(l.some)(Field.content_de) case l @ Language.English => addTextField(l.some)(Field.content_en) + case l @ Language.French => + addTextField(l.some)(Field.content_fr) } cmds0 *> cmds1 *> cntLang *> ().pure[F] @@ -105,6 +107,9 @@ object SolrSetup { case Some(Language.English) => run(DeleteField.command(DeleteField(field))).attempt *> 
run(AddField.command(AddField.textEN(field))) + case Some(Language.French) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textFR(field))) } } } @@ -138,6 +143,9 @@ object SolrSetup { def textEN(field: Field): AddField = AddField(field, "text_en", true, true, false) + + def textFR(field: Field): AddField = + AddField(field, "text_fr", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 6704ec3e..40fe5eb2 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -10,6 +10,7 @@ module Data.Language exposing type Language = German | English + | French fromString : String -> Maybe Language @@ -20,6 +21,9 @@ fromString str = else if str == "eng" || str == "en" || str == "english" then Just English + else if str == "fra" || str == "fr" || str == "french" then + Just French + else Nothing @@ -33,6 +37,9 @@ toIso3 lang = English -> "eng" + French -> + "fra" + toName : Language -> String toName lang = @@ -43,7 +50,10 @@ toName lang = English -> "English" + French -> + "French" + all : List Language all = - [ German, English ] + [ German, English, French ] diff --git a/project/Dependencies.scala b/project/Dependencies.scala index ddcfa155..7ab0e4ad 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -31,7 +31,7 @@ object Dependencies { val PostgresVersion = "42.2.16" val PureConfigVersion = "0.13.0" val Slf4jVersion = "1.7.30" - val StanfordNlpVersion = "3.9.2" + val StanfordNlpVersion = "4.0.0" val TikaVersion = "1.24.1" val YamuscaVersion = "0.6.2" val SwaggerUIVersion = "3.32.3" @@ -135,11 +135,16 @@ object Dependencies { ) val stanfordNlpModels = Seq( + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models"), ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) .classifier("models-german"), - 
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier( - "models-english" - ) + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models-french"), + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier( + "models-english" + ) ) val tika = Seq( diff --git a/project/NerModelsPlugin.scala b/project/NerModelsPlugin.scala index cb658615..8d8fbb2c 100644 --- a/project/NerModelsPlugin.scala +++ b/project/NerModelsPlugin.scala @@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin { } private val nerModels = List( - "german.conll.germeval2014.hgc_175m_600.crf.ser.gz", - "english.all.3class.distsim.crf.ser.gz" + "german.distsim.crf.ser.gz", + "english.conll.4class.distsim.crf.ser.gz", + "french-wikiner-4class.crf.ser.gz", + "french-mwt-statistical.tsv", + "french-mwt.tagger", + "french-mwt.tsv", + "german-mwt.tsv", + "german-ud.tagger", + "german-ud.tagger.props", + "french-ud.tagger", + "french-ud.tagger.props", + "english-left3words-distsim.tagger", + "english-left3words-distsim.tagger.props" ) }