From 9991ad5fcc43ccefe011a6cc4d01bdae4bcd4573 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 9 Mar 2021 00:19:33 +0100 Subject: [PATCH] Add latvian language --- docker/joex-base.dockerfile | 1 + .../docspell/analysis/date/DateFind.scala | 6 ++++ .../docspell/analysis/date/MonthName.scala | 17 ++++++++++ .../docspell/analysis/date/DateFindSpec.scala | 32 +++++++++++++++++++ .../main/scala/docspell/common/Language.scala | 8 ++++- modules/webapp/src/main/elm/Data/Language.elm | 11 +++++++ 6 files changed, 74 insertions(+), 1 deletion(-) diff --git a/docker/joex-base.dockerfile b/docker/joex-base.dockerfile index 87633eb0..40db9dc5 100644 --- a/docker/joex-base.dockerfile +++ b/docker/joex-base.dockerfile @@ -26,6 +26,7 @@ RUN apk add --no-cache openjdk11-jre \ tesseract-ocr-data-swe \ tesseract-ocr-data-rus \ tesseract-ocr-data-ron \ + tesseract-ocr-data-lav \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index f67c32f0..c517bc4a 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -17,6 +17,7 @@ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = TextSplitter .splitToken(text, " \t.,\n\r/".toSet) + .filter(w => lang != Language.Latvian || w.value != "gada") .sliding(3) .filter(_.length == 3) .flatMap(q => @@ -55,6 +56,10 @@ object DateFind { case ((m, d), y) => List(SimpleDate(y, m, d)) } + def lavLong = + (readYear >> readDay >> readMonth(Language.Latvian)).map { case ((y, d), m) => + List(SimpleDate(y, m, d)) + } // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔ def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { @@ -77,6 +82,7 @@ object DateFind { case Language.Russian => dmy.or(ymd).or(mdy) case Language.Swedish => ymd.or(dmy).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy) + case Language.Latvian => dmy.or(lavLong).or(ymd) } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 333275a0..bc117d95 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -42,6 +42,8 @@ object MonthName { finnish case Language.Russian => russian + case Language.Latvian => + latvian } private val numbers = List( @@ -267,4 +269,19 @@ object MonthName { List("nov", "november"), List("dec", "december") ) + + private val latvian = List( + List("janvāris", "janv."), + List("februāris", "febr."), + List("marts"), + List("aprīlis", "apr."), + List("maijs"), + List("jūnijs", "jūn."), + List("jūlijs", "jūl."), + List("augusts", "aug."), + List("septembris", "sept."), + List("oktobris", "okt."), + List("novembris", "nov."), + List("decembris", "dec.") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 800db6d1..705d0f43 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -103,4 +103,36 @@ object DateFindSpec extends SimpleTestSuite { ) } + test("find latvian dates") { + assertEquals( + DateFind.findDates("on 2020. gada 30. jūlijs there", Language.Latvian).toVector, + Vector( + NerDateLabel( + LocalDate.of(2020, 7, 30), + NerLabel("2020. gada 30. jūlijs", NerTag.Date, 3, 24) + ) + ) + ) + assertEquals( + DateFind.findDates("Lai gan 30.07.2020", Language.Latvian).toVector, + Vector( + NerDateLabel( + LocalDate.of(2020, 7, 30), + NerLabel("30.07.2020", NerTag.Date, 8, 18) + ) + ) + ) + assertEquals( + DateFind + .findDates("Es nevaru šodien 2020.gada 30.oktobris iet uz", Language.Latvian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2020, 10, 30), + NerLabel("2020.gada 30.oktobris", NerTag.Date, 17, 38) + ) + ) + ) + } + } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 72f5e0df..13ea7ac0 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -97,6 +97,11 @@ object Language { val iso3 = "nld" } + case object Latvian extends Language { + val iso2 = "lv" + val iso3 = "lav" + } + val all: List[Language] = List( German, @@ -112,7 +117,8 @@ object Language { Norwegian, Swedish, Russian, - Romanian + Romanian, + Latvian ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 9df00fa3..f0cdac3d 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -22,6 +22,7 @@ type Language | Russian | Romanian | Dutch + | Latvian fromString : String -> Maybe Language @@ -68,6 +69,9 @@ fromString str = else if str == "ron" || str == "ro" || str == "romanian" then Just Romanian + else if str == "lav" || str == "lv" || str == "latvian" then + Just Latvian + else Nothing @@ -117,6 +121,9 @@ toIso3 lang = Dutch -> "nld" + Latvian -> + "lav" + toName : Language -> String toName lang = @@ -163,6 +170,9 @@ toName lang = Dutch -> "Dutch" + Latvian -> + "Latvian" + all : List Language all = @@ -180,4 +190,5 @@ all = , Swedish , Russian , Romanian + , Latvian ]