mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Update Stanford CoreNLP to 4.3.2; add more languages
Models for Spanish are available and have now been added. Hungarian has also been added to the list of supported languages (mainly for Tesseract; there are no NLP models for it).
This commit is contained in:
@ -45,15 +45,16 @@ object DateFind {
|
||||
private[this] val jpnChars =
|
||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||
|
||||
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||
val stext =
|
||||
if (lang == Language.Japanese) {
|
||||
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||
} else text
|
||||
|
||||
TextSplitter
|
||||
.splitToken(stext, " \t.,\n\r/年月日".toSet)
|
||||
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
|
||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||
}
|
||||
|
||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||
@ -91,6 +92,7 @@ object DateFind {
|
||||
case Language.French => dmy.or(ymd).or(mdy)
|
||||
case Language.Italian => dmy.or(ymd).or(mdy)
|
||||
case Language.Spanish => dmy.or(ymd).or(mdy)
|
||||
case Language.Hungarian => ymd
|
||||
case Language.Czech => dmy.or(ymd).or(mdy)
|
||||
case Language.Danish => dmy.or(ymd).or(mdy)
|
||||
case Language.Finnish => dmy.or(ymd).or(mdy)
|
||||
|
@ -30,6 +30,8 @@ object MonthName {
|
||||
italian
|
||||
case Language.Spanish =>
|
||||
spanish
|
||||
case Language.Hungarian =>
|
||||
hungarian
|
||||
case Language.Swedish =>
|
||||
swedish
|
||||
case Language.Norwegian =>
|
||||
@ -324,4 +326,19 @@ object MonthName {
|
||||
List("11", "נובמבר"),
|
||||
List("12", "דצמבר")
|
||||
)
|
||||
|
||||
// Hungarian month table with twelve entries in calendar order. Each entry
// lists the spellings accepted for that month: Roman numeral, abbreviation
// and full name.
private val hungarian = List(
  ("I", "jan", "január"),
  ("II", "febr", "február"),
  ("III", "márc", "március"),
  ("IV", "ápr", "április"),
  ("V", "máj", "május"),
  ("VI", "jún", "június"),
  ("VII", "júl", "július"),
  ("VIII", "aug", "augusztus"),
  ("IX", "szept", "szeptember"),
  ("X", "okt", "október"),
  ("XI", "nov", "november"),
  ("XII", "dec", "december")
).map { case (numeral, abbreviation, fullName) =>
  List(numeral, abbreviation, fullName)
}
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ object BasicCRFAnnotator {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
// assert correct resource names
|
||||
List(Language.French, Language.German, Language.English).foreach(classifierResource)
|
||||
NLPLanguage.all.toList.foreach(classifierResource)
|
||||
|
||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||
|
||||
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
|
||||
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
||||
case Language.English =>
|
||||
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
case Language.Spanish =>
|
||||
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
|
||||
// case Language.Italian =>
|
||||
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
|
||||
// case Language.Hungarian =>
|
||||
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
|
||||
})
|
||||
}
|
||||
|
||||
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
|
||||
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
|
||||
|
||||
def forLang(language: NLPLanguage): Annotator =
|
||||
language match {
|
||||
case Language.French => frenchNerClassifier
|
||||
case Language.German => germanNerClassifier
|
||||
case Language.English => englishNerClassifier
|
||||
case Language.Spanish => spanishNerClassifier
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -37,6 +37,8 @@ object Properties {
|
||||
Properties.nerEnglish(regexNerFile)
|
||||
case Language.French =>
|
||||
Properties.nerFrench(regexNerFile, highRecall)
|
||||
case Language.Spanish =>
|
||||
Properties.nerSpanish(regexNerFile, highRecall)
|
||||
}
|
||||
case StanfordNerSettings.RegexOnly(path) =>
|
||||
Properties.regexNerOnly(path)
|
||||
@ -88,6 +90,18 @@ object Properties {
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
/** Builds the Stanford CoreNLP pipeline properties for Spanish NER.
  *
  * The pipeline runs tokenize, ssplit, mwt, pos, lemma and ner, using the
  * Spanish model files named below. SUTime is switched off.
  *
  * @param regexNerMappingFile optional mapping file, handed to `withRegexNer`
  * @param highRecall flag handed to `withHighRecall`
  * @return the assembled properties
  */
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = {
  val spanishBase = Properties(
    "annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner",
    "tokenize.language" -> "es",
    "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
    "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
    "ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
    "ner.applyNumericClassifiers" -> "true",
    "ner.useSUTime" -> "false",
    "ner.language" -> "es"
  )

  spanishBase
    .withRegexNer(regexNerMappingFile)
    .withHighRecall(highRecall)
}
|
||||
|
||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit"
|
||||
|
Binary file not shown.
@ -13,7 +13,7 @@ import docspell.files.TestFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class DateFindSpec extends FunSuite {
|
||||
class DateFindTest extends FunSuite {
|
||||
|
||||
test("find simple dates") {
|
||||
val expect = Vector(
|
||||
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
|
||||
)
|
||||
}
|
||||
|
||||
// Checks date detection for Spanish text: a long form with a month name and
// "de" between the parts, and a numeric year-month-day form separated by '-'.
test("find spanish dates") {
  // Long form; the expected label spans offsets 27 to 50 of the input.
  assertEquals(
    DateFind
      .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2011, 12, 15),
        NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
      )
    )
  )

  // Numeric form; the whole input is expected to be labelled as one date.
  // (A leftover debug println of the split words was removed here: it
  // asserted nothing and only added noise to the test output.)
  assertEquals(
    DateFind
      .findDates("2021-11-19", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2021, 11, 19),
        NerLabel("2021-11-19", NerTag.Date, 0, 10)
      )
    )
  )
}
|
||||
}
|
@ -30,7 +30,7 @@ object Language {
|
||||
override val allowsNLP = true
|
||||
}
|
||||
object NLPLanguage {
|
||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
|
||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
|
||||
}
|
||||
|
||||
case object German extends NLPLanguage {
|
||||
@ -53,11 +53,16 @@ object Language {
|
||||
val iso3 = "ita"
|
||||
}
|
||||
|
||||
case object Spanish extends Language {
|
||||
case object Spanish extends NLPLanguage {
|
||||
val iso2 = "es"
|
||||
val iso3 = "spa"
|
||||
}
|
||||
|
||||
case object Hungarian extends Language {
|
||||
val iso2 = "hu"
|
||||
val iso3 = "hun"
|
||||
}
|
||||
|
||||
case object Portuguese extends Language {
|
||||
val iso2 = "pt"
|
||||
val iso3 = "por"
|
||||
@ -125,6 +130,7 @@ object Language {
|
||||
French,
|
||||
Italian,
|
||||
Spanish,
|
||||
Hungarian,
|
||||
Dutch,
|
||||
Portuguese,
|
||||
Czech,
|
||||
|
@ -127,7 +127,13 @@ object SolrSetup {
|
||||
"Add hebrew content field",
|
||||
addContentField(Language.Hebrew)
|
||||
),
|
||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
|
||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
|
||||
SolrMigration[F](
|
||||
19,
|
||||
"Add hungarian",
|
||||
addContentField(Language.Hungarian)
|
||||
),
|
||||
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
|
||||
)
|
||||
|
||||
def addFolderField: F[Unit] =
|
||||
|
@ -18,11 +18,11 @@ import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.queries.QItem
|
||||
import docspell.store.records.RAttachment
|
||||
import docspell.store.records.RAttachmentSource
|
||||
import docspell.store.records.RCollective
|
||||
import docspell.store.records.RItem
|
||||
import docspell.store.queries.QItem
|
||||
|
||||
object ReProcessItem {
|
||||
type Args = ReProcessItemArgs
|
||||
|
@ -0,0 +1,21 @@
|
||||
-- Removes all stored classifier models together with their file data and
-- inserts one 'learn-classifier' job per row of "classifier_setting".
-- NOTE(review): presumably needed because models trained before the CoreNLP
-- upgrade can no longer be used; confirm.

-- Remember the file ids of the models, since "classifier_model" is emptied
-- before the file tables are cleaned up.
CREATE TEMPORARY TABLE "temp_file_ids" (
  cid     varchar(254) NOT NULL,
  file_id varchar(254) NOT NULL
);

INSERT INTO "temp_file_ids"
  SELECT "cid", "file_id"
  FROM "classifier_model";

-- One waiting job per collective that has classifier settings; the job id is
-- a random md5 string.
INSERT INTO "job"
  SELECT
    md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}',
    'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
  FROM "classifier_setting";

DELETE FROM "classifier_model";

-- Remove the file metadata and chunks that belonged to the deleted models.
DELETE FROM "filemeta"
  WHERE "file_id" IN (SELECT "file_id" FROM "temp_file_ids");

DELETE FROM "filechunk"
  WHERE "file_id" IN (SELECT "file_id" FROM "temp_file_ids");

DROP TABLE "temp_file_ids";
|
@ -31,6 +31,7 @@ type Language
|
||||
| Latvian
|
||||
| Japanese
|
||||
| Hebrew
|
||||
| Hungarian
|
||||
|
||||
|
||||
fromString : String -> Maybe Language
|
||||
@ -86,6 +87,9 @@ fromString str =
|
||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||
Just Hebrew
|
||||
|
||||
else if str == "hun" || str == "hu" || str == "hungarian" then
|
||||
Just Hungarian
|
||||
|
||||
else
|
||||
Nothing
|
||||
|
||||
@ -144,6 +148,9 @@ toIso3 lang =
|
||||
Hebrew ->
|
||||
"heb"
|
||||
|
||||
Hungarian ->
|
||||
"hun"
|
||||
|
||||
|
||||
all : List Language
|
||||
all =
|
||||
@ -164,4 +171,5 @@ all =
|
||||
, Latvian
|
||||
, Japanese
|
||||
, Hebrew
|
||||
, Hungarian
|
||||
]
|
||||
|
@ -67,6 +67,9 @@ gb lang =
|
||||
Hebrew ->
|
||||
"Hebrew"
|
||||
|
||||
Hungarian ->
|
||||
"Hungarian"
|
||||
|
||||
|
||||
de : Language -> String
|
||||
de lang =
|
||||
@ -121,3 +124,6 @@ de lang =
|
||||
|
||||
Hebrew ->
|
||||
"Hebräisch"
|
||||
|
||||
Hungarian ->
|
||||
"Ungarisch"
|
||||
|
Reference in New Issue
Block a user