Add french language and upgrade stanford-nlp to 4.0.0
This commit is contained in:
parent 9945b43266
commit fdb46da26d
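For orientation before the diff: the commit replaces the per-language CRF classifiers with full StanfordCoreNLP pipelines that are configured through the new Properties presets and read back into docspell's NerLabel values via LabelConverter. A minimal usage sketch of that flow, assuming the Properties.nerFrench and LabelConverter.toNerLabel helpers introduced below (the sample sentence is made up):

    import scala.jdk.CollectionConverters._

    import docspell.analysis.nlp.{LabelConverter, Properties}
    import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}

    // Build a French pipeline from the new property preset
    // (no regexner mapping file, normal recall) and annotate a sample sentence.
    val pipeline = new StanfordCoreNLP(Properties.nerFrench(None, false))
    val doc = new CoreDocument("Jean Dupont travaille chez Acme à Paris.")
    pipeline.annotate(doc)

    // Convert the annotated tokens to NerLabel values, as nerAnnotate does in the diff.
    val labels =
      doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector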
@@ -1,6 +1,6 @@
<img align="right" src="./artwork/logo-only.svg" height="150px" style="padding-left: 20px"/>

[](https://travis-ci.org/eikek/docspell)
[](https://travis-ci.org/eikek/docspell)
[](https://scala-steward.org)
[](https://github.com/eikek/docspell/blob/master/LICENSE.txt)
[](https://hub.docker.com/r/eikek0/docspell)
@@ -54,6 +54,7 @@ object DateFind {
    val p = lang match {
      case Language.English => p2.or(p0).or(p1)
      case Language.German => p1.or(p0).or(p2)
      case Language.French => p1.or(p0).or(p2)
    }
    p.read(parts).toOption
  }
@@ -0,0 +1,25 @@
package docspell.analysis.nlp

import docspell.common.{NerLabel, NerTag}

import edu.stanford.nlp.ling.{CoreAnnotation, CoreAnnotations, CoreLabel}

object LabelConverter {

  private def tagFromLabel[A <: CoreAnnotation[String]](
      label: CoreLabel,
      annot: Class[A]
  ): Option[NerTag] = {
    val tag = label.get(annot)
    Option(tag).flatMap(s => NerTag.fromString(s).toOption)
  }

  def findTag(label: CoreLabel): Option[NerTag] =
    tagFromLabel(label, classOf[CoreAnnotations.AnswerAnnotation])
      .orElse(tagFromLabel(label, classOf[CoreAnnotations.NamedEntityTagAnnotation]))

  def toNerLabel(label: CoreLabel): Option[NerLabel] =
    findTag(label).map(t =>
      NerLabel(label.word(), t, label.beginPosition(), label.endPosition())
    )
}
@@ -0,0 +1,97 @@
package docspell.analysis.nlp

import java.util.{Properties => JProps}

import docspell.analysis.nlp.Properties.Implicits._

object Properties {

  def apply(ps: (String, String)*): JProps = {
    val p = new JProps()
    for ((k, v) <- ps)
      p.setProperty(k, v)
    p
  }

  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "de",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
      "ner.useSUTime" -> "false", //only english, unused in docspell
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
      "tokenize.language" -> "en",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "en",
      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile)

  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
      "tokenize.language" -> "fr",
      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
      "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
      "ner.statisticalOnly" -> "true",
      "ner.rulesOnly" -> "false",
      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "de",
      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  object Implicits {
    implicit final class JPropsOps(val p: JProps) extends AnyVal {

      def set(name: String, value: Option[String]): JProps =
        value match {
          case Some(v) =>
            p.setProperty(name, v)
            p
          case None =>
            p
        }

      def change(name: String, f: String => String): JProps =
        Option(p.getProperty(name)) match {
          case Some(current) =>
            p.setProperty(name, f(current))
            p
          case None =>
            p
        }

      def withRegexNer(mappingFile: Option[String]): JProps =
        set("regexner.mapping", mappingFile)
          .change(
            "annotators",
            v => if (mappingFile.isDefined) v + ",regexner" else v
          )

      def withHighRecall(flag: Boolean): JProps = {
        if (flag) p.setProperty("ner.combinationMode", "HIGH_RECALL")
        else p.setProperty("ner.combinationMode", "NORMAL")
        p
      }
    }
  }
}
@@ -1,16 +1,12 @@
package docspell.analysis.nlp

import java.net.URL
import java.util.zip.GZIPInputStream
import java.util.{Properties => JProps}

import scala.jdk.CollectionConverters._
import scala.util.Using

import docspell.common._

import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
import org.log4s.getLogger

object StanfordNerClassifier {
@@ -18,48 +14,32 @@ object StanfordNerClassifier {

  lazy val germanNerClassifier = makeClassifier(Language.German)
  lazy val englishNerClassifier = makeClassifier(Language.English)
  lazy val frenchNerClassifier = makeClassifier(Language.French)

  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
    val nerClassifier = lang match {
      case Language.English => englishNerClassifier
      case Language.German => germanNerClassifier
      case Language.French => frenchNerClassifier
    }
    nerClassifier
      .classify(text)
      .asScala
      .flatMap(a => a.asScala)
      .collect(Function.unlift { label =>
        val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
        NerTag
          .fromString(Option(tag).getOrElse(""))
          .toOption
          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
      })
      .toVector
    val doc = new CoreDocument(text)
    nerClassifier.annotate(doc)

    doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
  }

  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
  private def makeClassifier(lang: Language): StanfordCoreNLP = {
    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
    val ner = classifierResource(lang)
    Using(new GZIPInputStream(ner.openStream())) { in =>
      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
    }.fold(throw _, identity)
    new StanfordCoreNLP(classifierProperties(lang))
  }

  private def classifierResource(lang: Language): URL = {
    def check(u: URL): URL =
      if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
      else u

    check(lang match {
  private def classifierProperties(lang: Language): JProps =
    lang match {
      case Language.German =>
        getClass.getResource(
          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
        )
        Properties.nerGerman(None, false)
      case Language.English =>
        getClass.getResource(
          "/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"
        )
      })
  }
        Properties.nerEnglish(None)
      case Language.French =>
        Properties.nerFrench(None, false)
    }
}
@@ -12,22 +12,30 @@ object TextAnalyserSuite extends SimpleTestSuite {
    val expect = Vector(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Elm", NerTag.Misc, 17, 20),
      NerLabel("Ave.", NerTag.Misc, 21, 25),
      NerLabel("Treesville", NerTag.Misc, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Elm", NerTag.Misc, 85, 88),
      NerLabel("Ave.", NerTag.Misc, 89, 93),
      NerLabel("Treesville", NerTag.Person, 95, 105),
      NerLabel("Leaf", NerTag.Organization, 144, 148),
      NerLabel("Chief", NerTag.Organization, 150, 155),
      NerLabel("of", NerTag.Organization, 156, 158),
      NerLabel("Syrup", NerTag.Organization, 159, 164),
      NerLabel("Production", NerTag.Organization, 165, 175),
      NerLabel("Old", NerTag.Organization, 176, 179),
      NerLabel("Sticky", NerTag.Organization, 180, 186),
      NerLabel("Pancake", NerTag.Organization, 187, 194),
      NerLabel("Company", NerTag.Organization, 195, 202),
      NerLabel("Maple", NerTag.Location, 207, 212),
      NerLabel("Lane", NerTag.Location, 213, 217),
      NerLabel("Forest", NerTag.Location, 219, 225),
      NerLabel("Maple", NerTag.Organization, 207, 212),
      NerLabel("Lane", NerTag.Organization, 213, 217),
      NerLabel("Forest", NerTag.Organization, 219, 225),
      NerLabel("Hemptown", NerTag.Location, 239, 247),
      NerLabel("Little", NerTag.Organization, 347, 353),
      NerLabel("League", NerTag.Organization, 354, 360),
      NerLabel("Leaf", NerTag.Person, 276, 280),
      NerLabel("Little", NerTag.Misc, 347, 353),
      NerLabel("League", NerTag.Misc, 354, 360),
      NerLabel("Derek", NerTag.Person, 1117, 1122),
      NerLabel("Jeter", NerTag.Person, 1123, 1128)
    )
@@ -40,7 +48,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
    val expect = Vector(
      NerLabel("Max", NerTag.Person, 0, 3),
      NerLabel("Mustermann", NerTag.Person, 4, 14),
      NerLabel("Lilienweg", NerTag.Location, 16, 25),
      NerLabel("Lilienweg", NerTag.Person, 16, 25),
      NerLabel("Max", NerTag.Person, 77, 80),
      NerLabel("Mustermann", NerTag.Person, 81, 91),
      NerLabel("Lilienweg", NerTag.Location, 93, 102),
@@ -27,7 +27,12 @@ object Language {
    val iso3 = "eng"
  }

  val all: List[Language] = List(German, English)
  case object French extends Language {
    val iso2 = "fr"
    val iso3 = "fra"
  }

  val all: List[Language] = List(German, English, French)

  def fromString(str: String): Either[String, Language] = {
    val lang = str.toLowerCase
@@ -23,6 +23,7 @@ object Field {
  val content = Field("content")
  val content_de = Field("content_de")
  val content_en = Field("content_en")
  val content_fr = Field("content_fr")
  val itemName = Field("itemName")
  val itemNotes = Field("itemNotes")
  val folderId = Field("folder")
@@ -33,6 +34,8 @@ object Field {
        Field.content_de
      case Language.English =>
        Field.content_en
      case Language.French =>
        Field.content_fr
    }

  implicit val jsonEncoder: Encoder[Field] =
@@ -39,6 +39,7 @@ object SolrQuery {
      Field.content,
      Field.content_de,
      Field.content_en,
      Field.content_fr,
      Field.itemName,
      Field.itemNotes,
      Field.attachmentName
@@ -80,6 +80,8 @@ object SolrSetup {
          addTextField(l.some)(Field.content_de)
        case l @ Language.English =>
          addTextField(l.some)(Field.content_en)
        case l @ Language.French =>
          addTextField(l.some)(Field.content_fr)
      }

    cmds0 *> cmds1 *> cntLang *> ().pure[F]
@@ -105,6 +107,9 @@ object SolrSetup {
        case Some(Language.English) =>
          run(DeleteField.command(DeleteField(field))).attempt *>
            run(AddField.command(AddField.textEN(field)))
        case Some(Language.French) =>
          run(DeleteField.command(DeleteField(field))).attempt *>
            run(AddField.command(AddField.textFR(field)))
      }
    }
  }
@@ -138,6 +143,9 @@ object SolrSetup {

    def textEN(field: Field): AddField =
      AddField(field, "text_en", true, true, false)

    def textFR(field: Field): AddField =
      AddField(field, "text_fr", true, true, false)
  }

  case class DeleteField(name: Field)
@@ -10,6 +10,7 @@ module Data.Language exposing
type Language
    = German
    | English
    | French


fromString : String -> Maybe Language
@@ -20,6 +21,9 @@ fromString str =
    else if str == "eng" || str == "en" || str == "english" then
        Just English

    else if str == "fra" || str == "fr" || str == "french" then
        Just French

    else
        Nothing
@@ -33,6 +37,9 @@ toIso3 lang =
        English ->
            "eng"

        French ->
            "fra"


toName : Language -> String
toName lang =
@@ -43,7 +50,10 @@ toName lang =
        English ->
            "English"

        French ->
            "French"


all : List Language
all =
    [ German, English ]
    [ German, English, French ]
@@ -31,7 +31,7 @@ object Dependencies {
  val PostgresVersion = "42.2.16"
  val PureConfigVersion = "0.13.0"
  val Slf4jVersion = "1.7.30"
  val StanfordNlpVersion = "3.9.2"
  val StanfordNlpVersion = "4.0.0"
  val TikaVersion = "1.24.1"
  val YamuscaVersion = "0.6.2"
  val SwaggerUIVersion = "3.32.3"
@@ -135,11 +135,16 @@ object Dependencies {
  )

  val stanfordNlpModels = Seq(
    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
      .classifier("models"),
    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
      .classifier("models-german"),
    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
      "models-english"
    )
    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
      .classifier("models-french"),
    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
      .classifier(
        "models-english"
      )
  )

  val tika = Seq(
@@ -68,7 +68,18 @@ object NerModelsPlugin extends AutoPlugin {
  }

  private val nerModels = List(
    "german.conll.germeval2014.hgc_175m_600.crf.ser.gz",
    "english.all.3class.distsim.crf.ser.gz"
    "german.distsim.crf.ser.gz",
    "english.conll.4class.distsim.crf.ser.gz",
    "french-wikiner-4class.crf.ser.gz",
    "french-mwt-statistical.tsv",
    "french-mwt.tagger",
    "french-mwt.tsv",
    "german-mwt.tsv",
    "german-ud.tagger",
    "german-ud.tagger.props",
    "french-ud.tagger",
    "french-ud.tagger.props",
    "english-left3words-distsim.tagger",
    "english-left3words-distsim.tagger.props"
  )
}