diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 5dc385b3..c346671a 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -32,6 +32,7 @@ RUN apk update && \ tesseract-ocr-data-lit \ tesseract-ocr-data-pol \ tesseract-ocr-data-est \ + tesseract-ocr-data-ukr \ unpaper \ weasyprint \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 6dcb4f85..b3ef4915 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -54,10 +54,28 @@ object DateFind { (sep + "md") -> text } else sep -> text + val ukrFlexion = List( + "р", + "рік", + "року", + "ого", + "го", + "ий", + "ій", + "й", + "ше", + "ге", + "тє", + "те", + "ме", + "е", + "є" + ) TextSplitter .splitToken(stext, separators.toSet) .filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Spanish || w.value != "de") + .filter(w => lang != Language.Ukrainian || !ukrFlexion.contains(w.value)) } case class SimpleDate(year: Int, month: Int, day: Int) { @@ -111,6 +129,7 @@ object DateFind { case Language.Lithuanian => ymd case Language.Polish => dmy case Language.Estonian => dmy + case Language.Ukrainian => dmy.or(ymd) } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index bf2100fd..776d36c5 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -62,6 +62,8 @@ object MonthName { polish case Language.Estonian => estonian + case Language.Ukrainian => + ukrainian } private val numbers = List( @@ -392,4 +394,19 @@ object MonthName 
{ List("november", "nov"), List("detsember", "dets") ) + + private val ukrainian = List( + List("січня", "січн", "січ"), + List("лютого", "лют"), + List("березня", "бер"), + List("квітня", "квіт", "кві"), + List("травня", "трав", "тра"), + List("червня", "черв", "чер"), + List("липня", "лип"), + List("серпня", "серп", "сер"), + List("вересня", "вер"), + List("жовтня", "жовт", "жов"), + List("листопада", "лист", "лис"), + List("грудня", "груд", "гру") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index baa14cba..a6a29960 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -287,4 +287,82 @@ class DateFindTest extends FunSuite { ) ) } + + test("find ukrainian dates") { + // officially used ones + assertEquals( + DateFind + .findDates( + "Цей текст був написаний 5 листопада 2022 року. Слава Україні!", + Language.Ukrainian + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 11, 5), + NerLabel("5 листопада 2022", NerTag.Date, 24, 40) + ) + ) + ) + assertEquals( + DateFind + .findDates("05.11.2022 — це субота", Language.Ukrainian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 11, 5), + NerLabel("05.11.2022", NerTag.Date, 0, 10) + ) + ) + ) + // less common but also used + assertEquals( + DateFind + .findDates( + "Сьогодні 5 лист. 2022 р. Слава Україні!", + Language.Ukrainian + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 11, 5), + NerLabel("5 лист. 
2022", NerTag.Date, 9, 21) + ) + ) + ) + assertEquals( + DateFind + .findDates("Дата: 2022.11.05", Language.Ukrainian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 11, 5), + NerLabel("2022.11.05", NerTag.Date, 6, 16) + ) + ) + ) + // vernacular variants + assertEquals( + DateFind + .findDates("Ілля Рєпін народився 5-го серпня 1844-го року.", Language.Ukrainian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(1844, 8, 5), + NerLabel("5-го серпня 1844", NerTag.Date, 21, 37) + ) + ) + ) + assertEquals( + DateFind + .findDates("3-тє жовт., 2022-й рік — це 33 дні тому", Language.Ukrainian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 10, 3), + NerLabel("3-тє жовт., 2022", NerTag.Date, 0, 16) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 28053234..fb857041 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -138,6 +138,11 @@ object Language { val iso3 = "est" } + case object Ukrainian extends Language { + val iso2 = "uk" + val iso3 = "ukr" + } + val all: List[Language] = List( German, @@ -160,7 +165,8 @@ object Language { Hebrew, Lithuanian, Polish, - Estonian + Estonian, + Ukrainian ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index 85b9e835..b2582dfb 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -205,5 +205,6 @@ object FtsRepository extends DoobieMeta { case Language.Lithuanian => "simple" case Language.Polish => "simple" case Language.Estonian => "simple" + case Language.Ukrainian => "simple" } } diff --git 
a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 9af925e8..90bfb349 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -161,7 +161,13 @@ object SolrSetup { 28, "Add Estonian", addContentField(Language.Estonian) - ) + ), + SolrMigration[F]( + 29, + "Add Ukrainian", + addContentField(Language.Ukrainian) + ), + SolrMigration.reIndexAll(30, "Re-Index after adding Estonian and Ukrainian") ) def addFolderField: F[Unit] = diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index ac711c14..e94a805f 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -35,6 +35,7 @@ type Language | Lithuanian | Polish | Estonian + | Ukrainian fromString : String -> Maybe Language @@ -102,6 +103,9 @@ fromString str = else if str == "est" || str == "et" || str == "estonian" then Just Estonian + else if str == "ukr" || str == "uk" || str == "ukrainian" then + Just Ukrainian + else Nothing @@ -172,6 +176,9 @@ toIso3 lang = Estonian -> "est" + Ukrainian -> + "ukr" + all : List Language all = @@ -196,4 +203,5 @@ all = , Lithuanian , Polish , Estonian + , Ukrainian ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index a7e676e1..f6e70488 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -80,6 +80,9 @@ gb lang = Estonian -> "Estonian" + Ukrainian -> + "Ukrainian" + de : Language -> String de lang = @@ -147,6 +150,9 @@ de lang = Estonian -> "Estnisch" + Ukrainian -> + "Ukrainisch" + fr : Language -> String fr lang = @@ -213,3 +219,6 @@ fr lang = Estonian -> "Estonien" + + Ukrainian -> + "Ukrainien" diff --git 
a/website/site/content/docs/dev/add-language.md b/website/site/content/docs/dev/add-language.md index 011f62ed..479111bc 100644 --- a/website/site/content/docs/dev/add-language.md +++ b/website/site/content/docs/dev/add-language.md @@ -21,7 +21,11 @@ relevant things to do. These are: the `all` list (then fix compile errors) - define a list of month names to support date recognition and update `DateFind.scala` to recognize date patterns for that language. Add - some tests to `DateFindTest`. + some tests to `DateFindTest`. While writing test-cases, you can check + them via `sbt`'s command prompt as follows: + ``` + testOnly docspell.analysis.date.DateFindTest + ``` - add it to joex' dockerfile to be available for tesseract - update the solr migration/field definitions in `SolrSetup`. Create a new solr migration that adds the content field for the new