Updating stanford corenlp to 4.3.2; adding more languages

Models for Spanish have now been added. The Hungarian language has
also been added to the list of supported languages (mainly for
tesseract; there are no NLP models for it).
This commit is contained in:
eikek 2021-11-20 14:31:39 +01:00
parent 20fc9955ba
commit 501c6f2988
18 changed files with 162 additions and 40 deletions

View File

@ -45,15 +45,16 @@ object DateFind {
private[this] val jpnChars =
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
val stext =
if (lang == Language.Japanese) {
text.map(c => if (jpnChars.contains(c)) c else ' ')
} else text
TextSplitter
.splitToken(stext, " \t.,\n\r/年月日".toSet)
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de")
}
case class SimpleDate(year: Int, month: Int, day: Int) {
@ -91,6 +92,7 @@ object DateFind {
case Language.French => dmy.or(ymd).or(mdy)
case Language.Italian => dmy.or(ymd).or(mdy)
case Language.Spanish => dmy.or(ymd).or(mdy)
case Language.Hungarian => ymd
case Language.Czech => dmy.or(ymd).or(mdy)
case Language.Danish => dmy.or(ymd).or(mdy)
case Language.Finnish => dmy.or(ymd).or(mdy)

View File

@ -30,6 +30,8 @@ object MonthName {
italian
case Language.Spanish =>
spanish
case Language.Hungarian =>
hungarian
case Language.Swedish =>
swedish
case Language.Norwegian =>
@ -324,4 +326,19 @@ object MonthName {
List("11", "נובמבר"),
List("12", "דצמבר")
)
// Hungarian month names. Each entry lists the accepted spellings for one
// month: the Roman numeral (a common date notation in Hungarian), the
// abbreviation, and the full month name.
private val hungarian = List(
List("I", "jan", "január"),
List("II", "febr", "február"),
List("III", "márc", "március"),
List("IV", "ápr", "április"),
List("V", "máj", "május"),
List("VI", "jún", "június"),
List("VII", "júl", "július"),
List("VIII", "aug", "augusztus"),
List("IX", "szept", "szeptember"),
List("X", "okt", "október"),
List("XI", "nov", "november"),
List("XII", "dec", "december")
)
}

View File

@ -29,7 +29,7 @@ object BasicCRFAnnotator {
private[this] val logger = getLogger
// assert correct resource names
List(Language.French, Language.German, Language.English).foreach(classifierResource)
NLPLanguage.all.toList.foreach(classifierResource)
type Annotator = AbstractSequenceClassifier[CoreLabel]
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
case Language.English =>
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
case Language.Spanish =>
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
// case Language.Italian =>
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
// case Language.Hungarian =>
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
})
}
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
def forLang(language: NLPLanguage): Annotator =
language match {
case Language.French => frenchNerClassifier
case Language.German => germanNerClassifier
case Language.English => englishNerClassifier
case Language.Spanish => spanishNerClassifier
}
}

View File

@ -37,6 +37,8 @@ object Properties {
Properties.nerEnglish(regexNerFile)
case Language.French =>
Properties.nerFrench(regexNerFile, highRecall)
case Language.Spanish =>
Properties.nerSpanish(regexNerFile, highRecall)
}
case StanfordNerSettings.RegexOnly(path) =>
Properties.regexNerOnly(path)
@ -88,6 +90,18 @@ object Properties {
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner",
"tokenize.language" -> "es",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
"ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
"ner.applyNumericClassifiers" -> "true",
"ner.useSUTime" -> "false",
"ner.language" -> "es"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def regexNerOnly(regexNerMappingFile: Path): JProps =
Properties(
"annotators" -> "tokenize,ssplit"

View File

@ -13,7 +13,7 @@ import docspell.files.TestFiles
import munit._
class DateFindSpec extends FunSuite {
class DateFindTest extends FunSuite {
test("find simple dates") {
val expect = Vector(
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
)
}
test("find spanish dates") {
  // Spanish long-form dates use "<day> de <month> de <year>"; the "de"
  // filler word is filtered out by DateFind.splitWords for Spanish, so the
  // remaining tokens parse as day/month/year.
  assertEquals(
    DateFind
      .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2011, 12, 15),
        NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
      )
    )
  )
  // ISO-style dates separated by '-' must also be recognized, since '-' is
  // part of the token-split character set.
  // (A leftover debug println of DateFind.splitWords was removed here —
  // tests should not write to stdout.)
  assertEquals(
    DateFind
      .findDates("2021-11-19", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2021, 11, 19),
        NerLabel("2021-11-19", NerTag.Date, 0, 10)
      )
    )
  )
}
}

View File

@ -30,7 +30,7 @@ object Language {
override val allowsNLP = true
}
object NLPLanguage {
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
}
case object German extends NLPLanguage {
@ -53,11 +53,16 @@ object Language {
val iso3 = "ita"
}
case object Spanish extends Language {
case object Spanish extends NLPLanguage {
val iso2 = "es"
val iso3 = "spa"
}
case object Hungarian extends Language {
val iso2 = "hu"
val iso3 = "hun"
}
case object Portuguese extends Language {
val iso2 = "pt"
val iso3 = "por"
@ -125,6 +130,7 @@ object Language {
French,
Italian,
Spanish,
Hungarian,
Dutch,
Portuguese,
Czech,

View File

@ -127,7 +127,13 @@ object SolrSetup {
"Add hebrew content field",
addContentField(Language.Hebrew)
),
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
SolrMigration[F](
19,
"Add hungarian",
addContentField(Language.Hungarian)
),
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
)
def addFolderField: F[Unit] =

View File

@ -18,11 +18,11 @@ import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queries.QItem
import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentSource
import docspell.store.records.RCollective
import docspell.store.records.RItem
import docspell.store.queries.QItem
object ReProcessItem {
type Args = ReProcessItemArgs

View File

@ -0,0 +1,21 @@
-- Migration: discard all stored classifier models and schedule re-training.
-- Needed because the corenlp upgrade makes previously persisted models stale.

-- Collect the file ids of all persisted classifier models so their backing
-- file data can be deleted after the model rows are gone.
CREATE TEMPORARY TABLE "temp_file_ids" (
cid varchar(254) not null,
file_id varchar(254) not null
);
INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model";
-- Enqueue a "learn-classifier" job per collective that has classification
-- enabled, so new models get trained.
-- NOTE(review): column list is omitted on this INSERT, making it dependent on
-- the physical column order of "job" — verify against the job table DDL.
INSERT INTO "job"
SELECT md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}',
'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
FROM "classifier_setting";
-- Remove the stale model rows and their file metadata/chunks.
DELETE FROM "classifier_model";
DELETE FROM "filemeta"
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
DELETE FROM "filechunk"
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
DROP TABLE "temp_file_ids";

View File

@ -31,6 +31,7 @@ type Language
| Latvian
| Japanese
| Hebrew
| Hungarian
fromString : String -> Maybe Language
@ -86,6 +87,9 @@ fromString str =
else if str == "heb" || str == "he" || str == "hebrew" then
Just Hebrew
else if str == "hun" || str == "hu" || str == "hungarian" then
Just Hungarian
else
Nothing
@ -144,6 +148,9 @@ toIso3 lang =
Hebrew ->
"heb"
Hungarian ->
"hun"
all : List Language
all =
@ -164,4 +171,5 @@ all =
, Latvian
, Japanese
, Hebrew
, Hungarian
]

View File

@ -67,6 +67,9 @@ gb lang =
Hebrew ->
"Hebrew"
Hungarian ->
"Hungarian"
de : Language -> String
de lang =
@ -121,3 +124,6 @@ de lang =
Hebrew ->
"Hebräisch"
Hungarian ->
"Ungarisch"

View File

@ -914,7 +914,7 @@ in {
The full and basic variants rely on pre-built language models
that are available for only 4 languages at the moment: German,
English and French.
English, French and Spanish.
Memory usage varies greatly among the languages. German has
quite large models, that require about 1G heap. So joex should

View File

@ -40,7 +40,7 @@ object Dependencies {
val ScalaJavaTimeVersion = "2.3.0"
val ScodecBitsVersion = "1.1.29"
val Slf4jVersion = "1.7.32"
val StanfordNlpVersion = "4.2.2"
val StanfordNlpVersion = "4.3.2"
val TikaVersion = "2.1.0"
val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "4.1.0"
@ -185,18 +185,16 @@ object Dependencies {
)
)
val stanfordNlpModels = Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-french"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier(
"models-english"
)
)
// Stanford CoreNLP model dependencies: the base "models" jar plus one
// per-language model jar (German, French, English, Spanish), all resolved
// from the same corenlp artifact via classifiers.
val stanfordNlpModels = {
  val corenlp = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
  Seq("models", "models-german", "models-french", "models-english", "models-spanish")
    .map(name => corenlp.classifier(name))
}
val tika = Seq(
"org.apache.tika" % "tika-core" % TikaVersion

View File

@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
}
// Resource file names of the Stanford NLP model files that this plugin
// extracts and bundles, grouped by language.
// NOTE(review): this listing appears to contain diff residue — e.g.
// "german.distsim.crf.ser.gz" and the german-mwt/german-ud entries occur
// twice, and there is no comma after "english-left3words-distsim.tagger.props"
// before the "// Spanish" group, which would not compile. Verify this list
// against the actual source file before relying on it.
private val nerModels = List(
"german.distsim.crf.ser.gz",
// English
"english.conll.4class.distsim.crf.ser.gz",
"regexner_caseless.tab",
"regexner_cased.tab",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props",
// German
"german.distsim.crf.ser.gz",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
// French
"french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv",
"french-mwt.tagger",
"french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger",
"french-ud.tagger.props",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props"
// Spanish
"spanish.ancora.distsim.s512.crf.ser.gz",
"spanish-mwt.tsv",
"spanish-ud.tagger",
"kbp_regexner_number_sp.tag",
"kbp_regexner_mapping_sp.tag"
)
}

View File

@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
which requires more memory for certain languages (with the advantage
of better results). Other values are `basic`, `regexonly` and
`disabled`. The modes `full` and `basic` use pre-defined language
models for processing documents of the languages German, English and French.
These require some amount of memory (see below).
models for processing documents of the languages German, English, French and
Spanish. These require some amount of memory (see below).
The mode `basic` is like the "light" variant to `full`. It doesn't use
all NLP features, which makes memory consumption much lower, but comes

View File

@ -8,10 +8,10 @@ mktoc = true
+++
When uploading a file, it is only saved to the database together with
the given meta information. The file is not visible in the ui yet.
Then joex takes the next such file (or files in case you uploaded
many) and starts processing it. When processing finished, the item and
its files will show up in the ui.
the given meta information as a "job". The file is not visible in the
ui yet. Then joex takes the next such job and starts processing it.
When processing finished, the item and its files will show up in the
ui.
If an error occurs during processing, the item will be created
anyways, so you can see it. Depending on the error, some information
@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
whole language. These are also provided by [Stanford
NLP](https://nlp.stanford.edu/software/), but not for all languages.
So whether this can be used depends on the document language. Models
exist for German, English and French currently.
exist for German, English, French and Spanish currently.
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
run custom rules against a text. This can be used as a fallback for

View File

@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the
language. Docspell uses the [Stanford NLP
Library](https://nlp.stanford.edu/software/) for its great machine
learning algorithms. Some of them, like certain NLP features, are only
available for some languages namely German, English and French. The
reason is that the required statistical models are not available for
other languages. However, docspell can still run other algorithms for
the other languages, like classification and custom rules based on the
address book.
available for some languages, namely German, English, French and
Spanish. The reason is that the required statistical models are not
available for other languages. However, docspell can still run other
algorithms for the other languages, like classification and custom
rules based on the address book.
More information about file processing and text analysis can be found
[here](@/docs/joex/file-processing.md#text-analysis).