Update Stanford CoreNLP to 4.3.2; add more languages

Models for Spanish are available and have now been added. Hungarian
has also been added to the list of supported languages (mainly for
Tesseract; there are no NLP models for it).
This commit is contained in:
eikek
2021-11-20 14:31:39 +01:00
parent 20fc9955ba
commit 501c6f2988
18 changed files with 162 additions and 40 deletions

View File

@ -40,7 +40,7 @@ object Dependencies {
// Dependency version constants; e.g. StanfordNlpVersion and TikaVersion are
// referenced by the module definitions further down.
val ScalaJavaTimeVersion = "2.3.0"
val ScodecBitsVersion = "1.1.29"
val Slf4jVersion = "1.7.32"
val StanfordNlpVersion = "4.2.2"
val StanfordNlpVersion = "4.3.2"
val TikaVersion = "2.1.0"
val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "4.1.0"
@ -185,18 +185,16 @@ object Dependencies {
)
)
// Stanford CoreNLP model artifacts: the base "models" classifier plus the
// German, French and English model classifiers, all at StanfordNlpVersion.
val stanfordNlpModels = Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-french"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier(
"models-english"
)
)
// Stanford CoreNLP model artifacts: the base "models" classifier plus the
// German, French, English and Spanish model classifiers. Every entry is the
// same module at StanfordNlpVersion and differs only in its classifier.
val stanfordNlpModels = {
  val coreNlp = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
  val classifierNames = Seq(
    "models",
    "models-german",
    "models-french",
    "models-english",
    "models-spanish"
  )
  classifierNames.map(name => coreNlp.classifier(name))
}
val tika = Seq(
"org.apache.tika" % "tika-core" % TikaVersion

View File

@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
}
// Names of the model resource files, grouped by language (see the
// per-language markers below).
// NOTE(review): the Spanish group lists "spanish-ud.tagger" but no
// "spanish-ud.tagger.props", unlike English, German and French — confirm
// this is intentional.
private val nerModels = List(
"german.distsim.crf.ser.gz",
// English
"english.conll.4class.distsim.crf.ser.gz",
"regexner_caseless.tab",
"regexner_cased.tab",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props",
// German
"german.distsim.crf.ser.gz",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
// French
"french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv",
"french-mwt.tagger",
"french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger",
"french-ud.tagger.props",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props"
// Spanish
"spanish.ancora.distsim.s512.crf.ser.gz",
"spanish-mwt.tsv",
"spanish-ud.tagger",
"kbp_regexner_number_sp.tag",
"kbp_regexner_mapping_sp.tag"
)
}