From c0feb13f637fb05ddd63d9dcf363346ee5875011 Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 31 Oct 2022 23:38:41 +0100 Subject: [PATCH] Add Estonian language Closes: #1646 --- docker/dockerfiles/joex.dockerfile | 1 + .../docspell/analysis/date/DateFind.scala | 1 + .../docspell/analysis/date/MonthName.scala | 17 +++++++++++ .../docspell/analysis/date/DateFindTest.scala | 28 +++++++++++++++++++ .../main/scala/docspell/common/Language.scala | 8 +++++- .../docspell/ftspsql/FtsRepository.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 7 ++++- modules/webapp/src/main/elm/Data/Language.elm | 8 ++++++ .../src/main/elm/Messages/Data/Language.elm | 9 ++++++ 9 files changed, 78 insertions(+), 2 deletions(-) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index f7ba272b..ccb34cbd 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -33,6 +33,7 @@ RUN JDKPKG="openjdk11-jre"; \ tesseract-ocr-data-heb \ tesseract-ocr-data-lit \ tesseract-ocr-data-pol \ + tesseract-ocr-data-est \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 915c2d22..6dcb4f85 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -110,6 +110,7 @@ object DateFind { case Language.Hebrew => dmy case Language.Lithuanian => ymd case Language.Polish => dmy + case Language.Estonian => dmy } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 4b2e6295..bf2100fd 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -60,6 +60,8 @@ object MonthName { lithuanian case Language.Polish => polish + case Language.Estonian => + estonian } private val numbers = List( @@ -375,4 +377,19 @@ object MonthName { List("listopada", "lis"), List("grudnia", "gru") ) + + private val estonian = List( + List("jaanuar", "jaan"), + List("veebruar", "veebr"), + List("märts"), + List("aprill", "apr"), + List("mai"), + List("juuni"), + List("juuli"), + List("august", "aug"), + List("september", "sept"), + List("oktoober", "okt"), + List("november", "nov"), + List("detsember", "dets") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index 51533322..baa14cba 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -259,4 +259,32 @@ class DateFindTest extends FunSuite { ) ) } + + test("find estonian dates") { + assertEquals( + DateFind + .findDates( + "Some text in estonian 21 juuli 2022 and stuff", + Language.Estonian + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 7, 21), + NerLabel("21 juuli 2022", NerTag.Date, 22, 35) + ) + ) + ) + assertEquals( + DateFind + .findDates("19.11.21", Language.Estonian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("19.11.21", NerTag.Date, 0, 8) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index e6242f09..28053234 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -133,6 +133,11 @@ object Language { val iso3 = "pol" } + case object Estonian extends Language { + val iso2 = "et" + val iso3 = "est" + } + val all: List[Language] = List( German, @@ -154,7 +159,8 @@ object Language { Japanese, Hebrew, Lithuanian, - Polish + Polish, + Estonian ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index b8729760..85b9e835 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -204,5 +204,6 @@ object FtsRepository extends DoobieMeta { case Language.Hebrew => "simple" case Language.Lithuanian => "simple" case Language.Polish => "simple" + case Language.Estonian => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 0149b457..9af925e8 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -156,7 +156,12 @@ object SolrSetup { addContentField(Language.Polish) ), SolrMigration.reIndexAll(26, "Re-Index after adding polish content field"), - SolrMigration.reIndexAll(27, "Re-Index after collective-id change") + SolrMigration.reIndexAll(27, "Re-Index after collective-id change"), + SolrMigration[F]( + 28, + "Add Estonian", + addContentField(Language.Estonian) + ) ) def addFolderField: F[Unit] = diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 0d28df7b..ac711c14 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -34,6 +34,7 @@ type Language | Hungarian | Lithuanian | Polish + | Estonian fromString : String -> Maybe Language @@ -98,6 +99,9 @@ fromString str = else if str == "pol" || str == "pl" || str == "polish" then Just Polish + else if str == "est" || str == "et" || str == "estonian" then + Just Estonian + else Nothing @@ -165,6 +169,9 @@ toIso3 lang = Polish -> "pol" + Estonian -> + "est" + all : List Language all = @@ -188,4 +195,5 @@ all = , Hungarian , Lithuanian , Polish + , Estonian ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 7ff75568..a7e676e1 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -77,6 +77,9 @@ gb lang = Polish -> "Polish" + Estonian -> + "Estonian" + de : Language -> String de lang = @@ -141,6 +144,9 @@ de lang = Polish -> "Polnisch" + Estonian -> + "Estnisch" + fr : Language -> String fr lang = @@ -204,3 +210,6 @@ fr lang = Polish -> "Polonais" + + Estonian -> + "Estonien"