mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Update Stanford CoreNLP to 4.3.2; add more languages
Models for Spanish are available and have now been added. Hungarian has also been added to the list of supported languages (mainly for Tesseract; there are no NLP models for it).
This commit is contained in:
@ -40,7 +40,7 @@ object Dependencies {
|
||||
val ScalaJavaTimeVersion = "2.3.0"
|
||||
val ScodecBitsVersion = "1.1.29"
|
||||
val Slf4jVersion = "1.7.32"
|
||||
val StanfordNlpVersion = "4.2.2"
|
||||
val StanfordNlpVersion = "4.3.2"
|
||||
val TikaVersion = "2.1.0"
|
||||
val YamuscaVersion = "0.8.1"
|
||||
val SwaggerUIVersion = "4.1.0"
|
||||
@ -185,18 +185,16 @@ object Dependencies {
|
||||
)
|
||||
)
|
||||
|
||||
val stanfordNlpModels = Seq(
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models-german"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models-french"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier(
|
||||
"models-english"
|
||||
)
|
||||
)
|
||||
val stanfordNlpModels = {
|
||||
val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
|
||||
Seq(
|
||||
artifact.classifier("models"),
|
||||
artifact.classifier("models-german"),
|
||||
artifact.classifier("models-french"),
|
||||
artifact.classifier("models-english"),
|
||||
artifact.classifier("models-spanish")
|
||||
)
|
||||
}
|
||||
|
||||
val tika = Seq(
|
||||
"org.apache.tika" % "tika-core" % TikaVersion
|
||||
|
@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
|
||||
}
|
||||
|
||||
private val nerModels = List(
|
||||
"german.distsim.crf.ser.gz",
|
||||
// English
|
||||
"english.conll.4class.distsim.crf.ser.gz",
|
||||
"regexner_caseless.tab",
|
||||
"regexner_cased.tab",
|
||||
"english-left3words-distsim.tagger",
|
||||
"english-left3words-distsim.tagger.props",
|
||||
// German
|
||||
"german.distsim.crf.ser.gz",
|
||||
"german-mwt.tsv",
|
||||
"german-ud.tagger",
|
||||
"german-ud.tagger.props",
|
||||
// French
|
||||
"french-wikiner-4class.crf.ser.gz",
|
||||
"french-mwt-statistical.tsv",
|
||||
"french-mwt.tagger",
|
||||
"french-mwt.tsv",
|
||||
"german-mwt.tsv",
|
||||
"german-ud.tagger",
|
||||
"german-ud.tagger.props",
|
||||
"french-ud.tagger",
|
||||
"french-ud.tagger.props",
|
||||
"english-left3words-distsim.tagger",
|
||||
"english-left3words-distsim.tagger.props"
|
||||
// Spanish
|
||||
"spanish.ancora.distsim.s512.crf.ser.gz",
|
||||
"spanish-mwt.tsv",
|
||||
"spanish-ud.tagger",
|
||||
"kbp_regexner_number_sp.tag",
|
||||
"kbp_regexner_mapping_sp.tag"
|
||||
)
|
||||
}
|
||||
|
Reference in New Issue
Block a user