diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 4a5f9f63..43eee525 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -33,6 +33,7 @@ RUN apk update && \ tesseract-ocr-data-pol \ tesseract-ocr-data-est \ tesseract-ocr-data-ukr \ + tesseract-ocr-data-slk \ unpaper \ weasyprint \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 3e4973ae..1260a3db 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -131,6 +131,7 @@ object DateFind { case Language.Estonian => dmy case Language.Khmer => dmy case Language.Ukrainian => dmy.or(ymd) + case Language.Slovak => dmy.or(ymd) } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index a97aa53d..b646b46b 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -66,6 +66,8 @@ object MonthName { ukrainian case Language.Khmer => khmer + case Language.Slovak => + slovak } private val numbers = List( @@ -426,4 +428,19 @@ object MonthName { List("листопада", "лист", "лис"), List("грудня", "груд", "гру") ) + + private val slovak = List( + List("jan", "január", "januára"), + List("feb", "február", "februára"), + List("mar", "marec", "marca"), + List("apr", "apríl", "apríla"), + List("maj", "máj", "mája"), + List("jun", "jún", "júna"), + List("jul", "júl", "júla"), + List("aug", "august", "augusta"), + List("sep", "september", "septembra"), + List("okt", "október", "októbra"), + List("nov", "november", "novembra"), + List("dec", "december", "decembra") + ) } diff 
--git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index a6a29960..71012e86 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -365,4 +365,57 @@ class DateFindTest extends FunSuite { ) ) } + + test("find slovak dates") { + assertEquals( + DateFind + .findDates( + "Do funkcie bola inaugurovaná 15. júna 2019 pred Národnou radou SR", + Language.Slovak + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2019, 6, 15), + NerLabel("15. júna 2019", NerTag.Date, 29, 42) + ) + ) + ) + assertEquals( + DateFind + .findDates( + "Dátum narodenia: 14. feb 2015", + Language.Slovak + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2015, 2, 14), + NerLabel("14. feb 2015", NerTag.Date, 17, 29) + ) + ) + ) + assertEquals( + DateFind + .findDates("19.11.2021", Language.Slovak) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("19.11.2021", NerTag.Date, 0, 10) + ) + ) + ) + assertEquals( + DateFind + .findDates("Dátum: 2022.11.05", Language.Slovak) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 11, 5), + NerLabel("2022.11.05", NerTag.Date, 7, 17) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index cb874eb0..7b7ef815 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -148,6 +148,11 @@ object Language { val iso3 = "ukr" } + case object Slovak extends Language { + val iso2 = "sk" + val iso3 = "slk" + } + val all: List[Language] = List( German, @@ -172,6 +177,7 @@ object Language { Polish, Estonian, Ukrainian, + Slovak, Khmer ) diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala 
b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index 0c15c899..eecacbc9 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -207,5 +207,6 @@ object FtsRepository extends DoobieMeta { case Language.Estonian => "simple" case Language.Ukrainian => "simple" case Language.Khmer => "simple" + case Language.Slovak => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 0e736bdf..cf4f7edf 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -183,7 +183,18 @@ object SolrSetup { "Add Khmer", addContentField(Language.Khmer) ), - SolrMigration.reIndexAll(34, "Re-Index after adding Khmer") + SolrMigration.reIndexAll(34, "Re-Index after adding Khmer"), + SolrMigration[F]( + 35, + "Add new field type for slovak content", + addFieldType(AddFieldType.textSvk) + ), + SolrMigration[F]( + 36, + "Add Slovak", + addContentField(Language.Slovak) + ), + SolrMigration.reIndexAll(37, "Re-Index after adding Slovak") ) def addFolderField: F[Unit] = @@ -368,6 +379,17 @@ object SolrSetup { ) ) + val textSvk = AddFieldType( + "text_sk", + "solr.TextField", + Analyzer( + Tokenizer("solr.StandardTokenizerFactory", Map.empty), + List( + Filter("solr.LowerCaseFilterFactory", Map.empty) + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 61144660..0d863f38 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -37,6 +37,7 @@ 
type Language | Estonian | Ukrainian | Khmer + | Slovak fromString : String -> Maybe Language @@ -110,6 +111,9 @@ fromString str = else if str == "khm" || str == "kh" || str == "khmer" then Just Khmer + else if str == "slk" || str == "sk" || str == "slovak" then + Just Slovak + else Nothing @@ -186,6 +190,9 @@ toIso3 lang = Khmer -> "khm" + Slovak -> + "slk" + all : List Language all = @@ -212,4 +219,5 @@ all = , Estonian , Ukrainian , Khmer + , Slovak ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 71912beb..2369a5a4 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -86,6 +86,9 @@ gb lang = Khmer -> "Khmer" + Slovak -> + "Slovak" + de : Language -> String de lang = @@ -159,6 +162,9 @@ de lang = Khmer -> "Khmer" + Slovak -> + "Slowakisch" + fr : Language -> String fr lang = @@ -231,3 +237,6 @@ fr lang = Khmer -> "Khmer" + + Slovak -> + "Slovaque"