From 3f75af0807627f0ec43b8120df1e5dfcdc1f905c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 17 Jan 2021 22:53:12 +0100 Subject: [PATCH] Add 9 more languages to the list of document languages --- docker/joex-base.dockerfile | 9 ++ .../docspell/analysis/date/DateFind.scala | 26 ++- .../docspell/analysis/date/MonthName.scala | 152 ++++++++++++++++++ .../main/scala/docspell/common/Language.scala | 63 +++++++- .../main/scala/docspell/ftssolr/Field.scala | 3 +- .../scala/docspell/ftssolr/SolrSetup.scala | 24 ++- modules/webapp/src/main/elm/Data/Language.elm | 106 +++++++++++- 7 files changed, 371 insertions(+), 12 deletions(-) diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index b9b160ed..87633eb0 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -17,6 +17,15 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr-data-fra \ tesseract-ocr-data-ita \ tesseract-ocr-data-spa \ + tesseract-ocr-data-por \ + tesseract-ocr-data-ces \ + tesseract-ocr-data-nld \ + tesseract-ocr-data-dan \ + tesseract-ocr-data-fin \ + tesseract-ocr-data-nor \ + tesseract-ocr-data-swe \ + tesseract-ocr-data-rus \ + tesseract-ocr-data-ron \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 438bff85..698606f0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -56,16 +56,26 @@ object DateFind { // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔ def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { - val p0 = pattern0(lang) - val p1 = pattern1(lang) - val p2 = pattern2(lang) + val ymd = pattern0(lang) + val dmy = pattern1(lang) + val mdy = pattern2(lang) + // mostly taken from Wikipedia… val p = lang match { case Language.English => - p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1) 
- case Language.German => p1.or(p0).or(p2) - case Language.French => p1.or(p0).or(p2) - case Language.Italian => p1.or(p0).or(p2) - case Language.Spanish => p1.or(p0).or(p2) + mdy.alt(dmy).map(t => t._1 ++ t._2).or(mdy).or(ymd).or(dmy) + case Language.German => dmy.or(ymd).or(mdy) + case Language.French => dmy.or(ymd).or(mdy) + case Language.Italian => dmy.or(ymd).or(mdy) + case Language.Spanish => dmy.or(ymd).or(mdy) + case Language.Czech => dmy.or(ymd).or(mdy) + case Language.Danish => dmy.or(ymd).or(mdy) + case Language.Finnish => dmy.or(ymd).or(mdy) + case Language.Norwegian => dmy.or(ymd).or(mdy) + case Language.Portuguese => dmy.or(ymd).or(mdy) + case Language.Romanian => dmy.or(ymd).or(mdy) + case Language.Russian => dmy.or(ymd).or(mdy) + case Language.Swedish => ymd.or(dmy).or(mdy) + case Language.Dutch => dmy.or(ymd).or(mdy) } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 503e15e4..333275a0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -24,6 +24,24 @@ object MonthName { italian case Language.Spanish => spanish + case Language.Swedish => + swedish + case Language.Norwegian => + norwegian + case Language.Dutch => + dutch + case Language.Czech => + czech + case Language.Danish => + danish + case Language.Portuguese => + portuguese + case Language.Romanian => + romanian + case Language.Finnish => + finnish + case Language.Russian => + russian } private val numbers = List( @@ -115,4 +133,138 @@ object MonthName { List("nov", "noviembre"), List("dic", "diciembre") ) + + private val swedish = List( + List("jan", "januari"), + List("febr", "februari"), + List("mars"), + List("april"), + List("maj"), + List("juni"), + List("juli"), + List("aug", "augusti"), + List("sept", 
"september"), + List("okt", "oktober"), + List("nov", "november"), + List("dec", "december") + ) + private val norwegian = List( + List("jan", "januar"), + List("febr", "februar"), + List("mars"), + List("april"), + List("mai"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("des", "desember") + ) + + private val czech = List( + List("led", "leden"), + List("un", "ún", "únor", "unor"), + List("brez", "březen", "brezen"), + List("dub", "duben"), + List("kvet", "květen"), + List("cerv", "červen"), + List("cerven", "červenec"), + List("srp", "srpen"), + List("zari", "září"), + List("ríj", "rij", "říjen"), + List("list", "listopad"), + List("pros", "prosinec") + ) + + private val romanian = List( + List("ian", "ianuarie"), + List("feb", "februarie"), + List("mar", "martie"), + List("apr", "aprilie"), + List("mai"), + List("iunie"), + List("iulie"), + List("aug", "august"), + List("sept", "septembrie"), + List("oct", "octombrie"), + List("noem", "nov", "noiembrie"), + List("dec", "decembrie") + ) + + private val danish = List( + List("jan", "januar"), + List("febr", "februar"), + List("marts"), + List("april"), + List("maj"), + List("juni"), + List("juli"), + List("aug", "august"), + List("sept", "september"), + List("okt", "oktober"), + List("nov", "november"), + List("dec", "december") + ) + + private val portuguese = List( + List("jan", "janeiro"), + List("fev", "fevereiro"), + List("março", "marco"), + List("abril"), + List("maio"), + List("junho"), + List("julho"), + List("agosto"), + List("set", "setembro"), + List("out", "outubro"), + List("nov", "novembro"), + List("dez", "dezembro") + ) + + private val finnish = List( + List("tammikuu"), + List("helmikuu"), + List("maaliskuu"), + List("huhtikuu"), + List("toukokuu"), + List("kesäkuu"), + List("heinäkuu"), + List("elokuu"), + List("syyskuu"), + List("lokakuu"), + List("marraskuu"), + List("joulukuu") + ) + + 
private val russian = List( + List("январь"), + List("февраль"), + List("март"), + List("апрель"), + List("май"), + List("июнь"), + List("июль"), + List("август"), + List("сентябрь"), + List("октябрь"), + List("ноябрь"), + List("декабрь") + ) + + private val dutch = List( + List("jan", "januari"), + List("feb", "februari"), + List("maart"), + List("apr", "april"), + List("mei"), + List("juni"), + List("juli"), + List("aug", "augustus"), + List("sept", "september"), + List("okt", "oct", "oktober"), + List("nov", "november"), + List("dec", "december") + ) } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 3a39dd11..72f5e0df 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -52,7 +52,68 @@ object Language { val iso3 = "spa" } - val all: List[Language] = List(German, English, French, Italian, Spanish) + case object Portuguese extends Language { + val iso2 = "pt" + val iso3 = "por" + } + + case object Czech extends Language { + val iso2 = "cs" + val iso3 = "ces" + } + + case object Danish extends Language { + val iso2 = "da" + val iso3 = "dan" + } + + case object Finnish extends Language { + val iso2 = "fi" + val iso3 = "fin" + } + + case object Norwegian extends Language { + val iso2 = "no" + val iso3 = "nor" + } + + case object Swedish extends Language { + val iso2 = "sv" + val iso3 = "swe" + } + + case object Russian extends Language { + val iso2 = "ru" + val iso3 = "rus" + } + + case object Romanian extends Language { + val iso2 = "ro" + val iso3 = "ron" + } + + case object Dutch extends Language { + val iso2 = "nl" + val iso3 = "nld" + } + + val all: List[Language] = + List( + German, + English, + French, + Italian, + Spanish, + Dutch, + Portuguese, + Czech, + Danish, + Finnish, + Norwegian, + Swedish, + Russian, + Romanian + ) def fromString(str: String): Either[String, Language] 
= { val lang = str.toLowerCase diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index a10ca0e8..ff55e5ae 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -32,7 +32,8 @@ object Field { .map(contentField) def contentField(lang: Language): Field = - Field(s"content_${lang.iso2}") + if (lang == Language.Czech) Field(s"content_cz") + else Field(s"content_${lang.iso2}") implicit val jsonEncoder: Encoder[Field] = Encoder.encodeString.contramap(_.name) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 63b90db9..e4a9df04 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -75,12 +75,33 @@ object SolrSetup { solrEngine, "Add content_es field", addContentField(Language.Spanish).map(_ => FtsMigration.Result.reIndexAll) + ), + FtsMigration[F]( + 9, + solrEngine, + "Add more content fields", + addMoreContentFields.map(_ => FtsMigration.Result.reIndexAll) ) ) def addFolderField: F[Unit] = addStringField(Field.folderId) + def addMoreContentFields: F[Unit] = { + val remain = List[Language]( + Language.Norwegian, + Language.Romanian, + Language.Swedish, + Language.Finnish, + Language.Danish, + Language.Czech, + Language.Dutch, + Language.Portuguese, + Language.Russian + ) + remain.traverse(addContentField).map(_ => ()) + } + def setupCoreSchema: F[Unit] = { val cmds0 = List( @@ -162,7 +183,8 @@ object SolrSetup { AddField(field, "text_general", true, true, false) def textLang(field: Field, lang: Language): AddField = - AddField(field, s"text_${lang.iso2}", true, true, false) + if (lang == Language.Czech) AddField(field, s"text_cz", true, true, false) + else AddField(field, s"text_${lang.iso2}", true, true, 
false) } case class DeleteField(name: Field) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index f6c1caee..9df00fa3 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -13,6 +13,15 @@ type Language | French | Italian | Spanish + | Portuguese + | Czech + | Danish + | Finnish + | Norwegian + | Swedish + | Russian + | Romanian + | Dutch fromString : String -> Maybe Language @@ -32,6 +41,33 @@ fromString str = else if str == "spa" || str == "es" || str == "spanish" then Just Spanish + else if str == "por" || str == "pt" || str == "portuguese" then + Just Portuguese + + else if str == "ces" || str == "cs" || str == "czech" then + Just Czech + + else if str == "dan" || str == "da" || str == "danish" then + Just Danish + + else if str == "nld" || str == "nl" || str == "dutch" then + Just Dutch + + else if str == "fin" || str == "fi" || str == "finnish" then + Just Finnish + + else if str == "nor" || str == "no" || str == "norwegian" then + Just Norwegian + + else if str == "swe" || str == "sv" || str == "swedish" then + Just Swedish + + else if str == "rus" || str == "ru" || str == "russian" then + Just Russian + + else if str == "ron" || str == "ro" || str == "romanian" then + Just Romanian + else Nothing @@ -54,6 +90,33 @@ toIso3 lang = Spanish -> "spa" + Portuguese -> + "por" + + Czech -> + "ces" + + Danish -> + "dan" + + Finnish -> + "fin" + + Norwegian -> + "nor" + + Swedish -> + "swe" + + Russian -> + "rus" + + Romanian -> + "ron" + + Dutch -> + "nld" + toName : Language -> String toName lang = @@ -73,7 +136,48 @@ toName lang = Spanish -> "Spanish" + Portuguese -> + "Portuguese" + + Czech -> + "Czech" + + Danish -> + "Danish" + + Finnish -> + "Finnish" + + Norwegian -> + "Norwegian" + + Swedish -> + "Swedish" + + Russian -> + "Russian" + + Romanian -> + "Romanian" + + Dutch -> + "Dutch" + all : List Language all = - [ German, English, French, 
Italian, Spanish ] + [ German + , English + , French + , Italian + , Spanish + , Portuguese + , Czech + , Dutch + , Danish + , Finnish + , Norwegian + , Swedish + , Russian + , Romanian + ]