diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index dd5f3baf..fa3c5d1c 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -45,15 +45,16 @@ object DateFind { private[this] val jpnChars = ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet - private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = { val stext = if (lang == Language.Japanese) { text.map(c => if (jpnChars.contains(c)) c else ' ') } else text TextSplitter - .splitToken(stext, " \t.,\n\r/年月日".toSet) + .splitToken(stext, " -\t.,\n\r/年月日".toSet) .filter(w => lang != Language.Latvian || w.value != "gada") + .filter(w => lang != Language.Spanish || w.value != "de") } case class SimpleDate(year: Int, month: Int, day: Int) { @@ -91,6 +92,7 @@ object DateFind { case Language.French => dmy.or(ymd).or(mdy) case Language.Italian => dmy.or(ymd).or(mdy) case Language.Spanish => dmy.or(ymd).or(mdy) + case Language.Hungarian => ymd case Language.Czech => dmy.or(ymd).or(mdy) case Language.Danish => dmy.or(ymd).or(mdy) case Language.Finnish => dmy.or(ymd).or(mdy) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index a447eb0b..0679e1b3 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -30,6 +30,8 @@ object MonthName { italian case Language.Spanish => spanish + case Language.Hungarian => + hungarian case Language.Swedish => swedish case Language.Norwegian => @@ -324,4 +326,19 @@ object MonthName { List("11", "נובמבר"), List("12", "דצמבר") ) + + private val hungarian = List( + List("I", "jan", "január"), + List("II", "febr", "február"), + List("III", "márc", "március"), + List("IV", "ápr", "április"), + List("V", "máj", "május"), + List("VI", "jún", "június"), + List("VII", "júl", "július"), + List("VIII", "aug", "augusztus"), + List("IX", "szept", "szeptember"), + List("X", "okt", "október"), + List("XI", "nov", "november"), + List("XII", "dec", "december") + ) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index 04dc33cc..ae580992 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -29,7 +29,7 @@ object BasicCRFAnnotator { private[this] val logger = getLogger // assert correct resource names - List(Language.French, Language.German, Language.English).foreach(classifierResource) + NLPLanguage.all.toList.foreach(classifierResource) type Annotator = AbstractSequenceClassifier[CoreLabel] @@ -70,6 +70,12 @@ object BasicCRFAnnotator { "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" case Language.English => "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + case Language.Spanish => + "/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz" + // case Language.Italian => + // "/edu/stanford/nlp/models/ner/italian.crf.ser.gz" + // case Language.Hungarian => + // 
"/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz" }) } @@ -77,12 +83,14 @@ object BasicCRFAnnotator { private[this] lazy val germanNerClassifier = makeAnnotator(Language.German) private[this] lazy val englishNerClassifier = makeAnnotator(Language.English) private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French) + private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish) def forLang(language: NLPLanguage): Annotator = language match { case Language.French => frenchNerClassifier case Language.German => germanNerClassifier case Language.English => englishNerClassifier + case Language.Spanish => spanishNerClassifier } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index f5e903bd..cae02474 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -37,6 +37,8 @@ object Properties { Properties.nerEnglish(regexNerFile) case Language.French => Properties.nerFrench(regexNerFile, highRecall) + case Language.Spanish => + Properties.nerSpanish(regexNerFile, highRecall) } case StanfordNerSettings.RegexOnly(path) => Properties.regexNerOnly(path) @@ -88,6 +90,18 @@ object Properties { "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner", + "tokenize.language" -> "es", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger", + "ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz", + "ner.applyNumericClassifiers" -> "true", + "ner.useSUTime" -> "false", + "ner.language" -> "es" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def regexNerOnly(regexNerMappingFile: Path): JProps = Properties( "annotators" -> "tokenize,ssplit" diff --git a/modules/analysis/src/test/resources/test.ser.gz b/modules/analysis/src/test/resources/test.ser.gz index b6d0956b..bf409946 100644 Binary files a/modules/analysis/src/test/resources/test.ser.gz and b/modules/analysis/src/test/resources/test.ser.gz differ diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala similarity index 86% rename from modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index f63a90ef..70746b66 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -13,7 +13,7 @@ import docspell.files.TestFiles import munit._ -class DateFindSpec extends FunSuite { +class DateFindTest extends FunSuite { test("find simple dates") { val expect = Vector( @@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite { ) } + test("find spanish dates") { + assertEquals( + DateFind + .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2011, 12, 15), + NerLabel("15 de Diciembre 
de 2011", NerTag.Date, 27, 50)
+        )
+      )
+    )
+    assertEquals(
+      DateFind
+        .findDates("2021-11-19", Language.Spanish)
+        .toVector,
+      Vector(
+        NerDateLabel(
+          LocalDate.of(2021, 11, 19),
+          NerLabel("2021-11-19", NerTag.Date, 0, 10)
+        )
+      )
+    )
+  }
 }
diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala
index 35fde297..f8a3ff2b 100644
--- a/modules/common/src/main/scala/docspell/common/Language.scala
+++ b/modules/common/src/main/scala/docspell/common/Language.scala
@@ -30,7 +30,7 @@ object Language {
     override val allowsNLP = true
   }
   object NLPLanguage {
-    val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
+    val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
   }
 
   case object German extends NLPLanguage {
@@ -53,11 +53,16 @@ object Language {
     val iso3 = "ita"
   }
 
-  case object Spanish extends Language {
+  case object Spanish extends NLPLanguage {
     val iso2 = "es"
     val iso3 = "spa"
   }
 
+  case object Hungarian extends Language {
+    val iso2 = "hu"
+    val iso3 = "hun"
+  }
+
   case object Portuguese extends Language {
     val iso2 = "pt"
     val iso3 = "por"
@@ -125,6 +130,7 @@ object Language {
       French,
       Italian,
       Spanish,
+      Hungarian,
       Dutch,
       Portuguese,
       Czech,
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
index 1abf0cce..56ab9a75 100644
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
@@ -127,7 +127,13 @@ object SolrSetup {
         "Add hebrew content field",
         addContentField(Language.Hebrew)
       ),
-      SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
+      SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
+      SolrMigration[F](
+        19,
+        "Add hungarian content field",
+        addContentField(Language.Hungarian)
+      ),
+      SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
     )
 
   def addFolderField: F[Unit] =
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
index 813abf88..a2ea3c16 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
@@ -18,11 +18,11 @@ import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
+import docspell.store.queries.QItem
 import docspell.store.records.RAttachment
 import docspell.store.records.RAttachmentSource
 import docspell.store.records.RCollective
 import docspell.store.records.RItem
-import docspell.store.queries.QItem
 
 object ReProcessItem {
   type Args = ReProcessItemArgs
diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql
new file mode 100644
index 00000000..92acd861
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql
@@ -0,0 +1,21 @@
+CREATE TEMPORARY TABLE "temp_file_ids" (
+  cid varchar(254) not null,
+  file_id varchar(254) not null
+);
+
+INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model";
+
+INSERT INTO "job"
+  SELECT md5(random()::text), 
'learn-classifier', cid, '{"collective":"' || cid || '"}',
+    'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
+  FROM "classifier_setting";
+
+DELETE FROM "classifier_model";
+
+DELETE FROM "filemeta"
+WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
+
+DELETE FROM "filechunk"
+WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
+
+DROP TABLE "temp_file_ids";
diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm
index 74479cd7..a42dd803 100644
--- a/modules/webapp/src/main/elm/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Data/Language.elm
@@ -31,6 +31,7 @@ type Language
     | Latvian
     | Japanese
     | Hebrew
+    | Hungarian
 
 
 fromString : String -> Maybe Language
@@ -86,6 +87,9 @@ fromString str =
     else if str == "heb" || str == "he" || str == "hebrew" then
         Just Hebrew
 
+    else if str == "hun" || str == "hu" || str == "hungarian" then
+        Just Hungarian
+
     else
         Nothing
 
@@ -144,6 +148,9 @@ toIso3 lang =
         Hebrew ->
             "heb"
 
+        Hungarian ->
+            "hun"
+
 
 all : List Language
 all =
@@ -164,4 +171,5 @@ all =
     , Latvian
     , Japanese
     , Hebrew
+    , Hungarian
     ]
diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm
index 93bcfe9c..5da90b73 100644
--- a/modules/webapp/src/main/elm/Messages/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm
@@ -67,6 +67,9 @@ gb lang =
         Hebrew ->
             "Hebrew"
 
+        Hungarian ->
+            "Hungarian"
+
 
 de : Language -> String
 de lang =
@@ -121,3 +124,6 @@ de lang =
 
         Hebrew ->
             "Hebräisch"
+
+        Hungarian ->
+            "Ungarisch"
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index aefd6c4a..003ff7b7 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -914,7 +914,7 @@ in {
 
                 The full and basic variants rely on pre-build language models
-                that are available for only 3 lanugages at the moment: German,
-                English and French.
+                that are available for only 4 languages at the moment: German,
+                English, French and Spanish.
 
                 Memory usage varies greatly among the languages. German has
                 quite large models, that require about 1G heap. 
So joex should
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 5595f758..a487d08a 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -40,7 +40,7 @@ object Dependencies {
   val ScalaJavaTimeVersion = "2.3.0"
   val ScodecBitsVersion = "1.1.29"
   val Slf4jVersion = "1.7.32"
-  val StanfordNlpVersion = "4.2.2"
+  val StanfordNlpVersion = "4.3.2"
   val TikaVersion = "2.1.0"
   val YamuscaVersion = "0.8.1"
   val SwaggerUIVersion = "4.1.0"
@@ -185,18 +185,16 @@ object Dependencies {
     )
   )
 
-  val stanfordNlpModels = Seq(
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models-german"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models-french"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier(
-        "models-english"
-      )
-  )
+  val stanfordNlpModels = {
+    val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
+    Seq(
+      artifact.classifier("models"),
+      artifact.classifier("models-german"),
+      artifact.classifier("models-french"),
+      artifact.classifier("models-english"),
+      artifact.classifier("models-spanish")
+    )
+  }
 
   val tika = Seq(
     "org.apache.tika" % "tika-core" % TikaVersion
diff --git a/project/NerModelsPlugin.scala b/project/NerModelsPlugin.scala
index a2d60869..f2cae44a 100644
--- a/project/NerModelsPlugin.scala
+++ b/project/NerModelsPlugin.scala
@@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
   }
 
   private val nerModels = List(
-    "german.distsim.crf.ser.gz",
+    // English
     "english.conll.4class.distsim.crf.ser.gz",
+    "regexner_caseless.tab",
+    "regexner_cased.tab",
+    "english-left3words-distsim.tagger",
+    "english-left3words-distsim.tagger.props",
+    // German
+    "german.distsim.crf.ser.gz",
+    "german-mwt.tsv",
+    "german-ud.tagger",
+    "german-ud.tagger.props",
+    // French
     "french-wikiner-4class.crf.ser.gz",
     "french-mwt-statistical.tsv",
     "french-mwt.tagger",
     "french-mwt.tsv",
-    "german-mwt.tsv",
-    "german-ud.tagger",
-    "german-ud.tagger.props",
     "french-ud.tagger",
     "french-ud.tagger.props",
-    "english-left3words-distsim.tagger",
-    "english-left3words-distsim.tagger.props"
+    // Spanish
+    "spanish.ancora.distsim.s512.crf.ser.gz",
+    "spanish-mwt.tsv",
+    "spanish-ud.tagger",
+    "kbp_regexner_number_sp.tag",
+    "kbp_regexner_mapping_sp.tag"
   )
 }
diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md
index 5eef1bb5..83d27088 100644
--- a/website/site/content/docs/configure/_index.md
+++ b/website/site/content/docs/configure/_index.md
@@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
 which requires more memory for certain languages (with the advantage
 of better results). Other values are `basic`, `regexonly` and
 `disabled`. The modes `full` and `basic` use pre-defined lanugage
-models for procesing documents of languaes German, English and French.
-These require some amount of memory (see below).
+models for processing documents in the languages German, English,
+French and Spanish. These require some amount of memory (see below).
 
 The mode `basic` is like the "light" variant to `full`. 
It doesn't use all NLP features, which makes memory consumption much lower, but comes
diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md
index 5ab0e0b1..360412db 100644
--- a/website/site/content/docs/joex/file-processing.md
+++ b/website/site/content/docs/joex/file-processing.md
@@ -8,10 +8,10 @@ mktoc = true
 +++
 
 When uploading a file, it is only saved to the database together with
-the given meta information. The file is not visible in the ui yet.
-Then joex takes the next such file (or files in case you uploaded
-many) and starts processing it. When processing finished, the item and
-its files will show up in the ui.
+the given meta information as a "job". The file is not visible in the
+ui yet. Then joex takes the next such job and starts processing it.
+When processing is finished, the item and its files will show up in
+the ui.
 
 If an error occurs during processing, the item will be created
 anyways, so you can see it. Depending on the error, some information
@@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
 whole language. These are also provided by [Stanford
 NLP](https://nlp.stanford.edu/software/), but not for all languages.
 So whether this can be used depends on the document language. Models
-exist for German, English and French currently.
+exist for German, English, French and Spanish currently.
 
 Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
 run custom rules against a text. This can be used as a fallback for
diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md
index fb096641..3f97e29e 100644
--- a/website/site/content/docs/webapp/metadata.md
+++ b/website/site/content/docs/webapp/metadata.md
@@ -147,11 +147,11 @@ experience.
 
 The features of text analysis strongly depend on the language. Docspell
 uses the [Stanford NLP Library](https://nlp.stanford.edu/software/) for
 its great machine learning algorithms. Some of them, like certain NLP
 features, are only
-available for some languages – namely German, English and French. The
-reason is that the required statistical models are not available for
-other languages. However, docspell can still run other algorithms for
-the other languages, like classification and custom rules based on the
-address book.
+available for some languages – namely German, English, French and
+Spanish. The reason is that the required statistical models are not
+available for other languages. However, docspell can still run other
+algorithms for the other languages, like classification and custom
+rules based on the address book.
 
 More information about file processing and text analysis can be found
 [here](@/docs/joex/file-processing.md#text-analysis).
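
Note for reviewers: a minimal sketch of how the new language wiring can be exercised outside the test suite is below. The `SpanishDateCheck` wrapper object and the two sample sentences (including the Hungarian "Kelt: ..." line) are made up for illustration; `DateFind.findDates`, `Language.Spanish`, `Language.Hungarian` and `Language.NLPLanguage.all` are used exactly as in the changes above, so treat this as a sketch rather than shipped code.

import docspell.analysis.date.DateFind
import docspell.common._

object SpanishDateCheck {
  def main(args: Array[String]): Unit = {
    // Spanish now takes part in NER model selection alongside German, English and French.
    println(Language.NLPLanguage.all.toList.map(_.iso3))

    // "15 de Diciembre de 2011" is recognized because splitWords drops the Spanish
    // filler word "de" and the Spanish pattern list tries day-month-year first.
    val es = DateFind
      .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
      .toVector
    println(es) // expected: one NerDateLabel for 2011-12-15, as asserted in DateFindTest

    // Hungarian only uses the year-month-day pattern together with the new month
    // names, so a line like this should resolve to 2021-11-19 (sample text is hypothetical).
    val hu = DateFind
      .findDates("Kelt: 2021. november 19.", Language.Hungarian)
      .toVector
    println(hu)
  }
}

The second assertion in DateFindTest covers the same idea for the numeric form "2021-11-19", which is only split into year, month and day tokens now that "-" was added to the token separators in splitWords.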