diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 1180f4b5..318f3600 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -32,6 +32,7 @@ RUN JDKPKG="openjdk11-jre"; \ tesseract-ocr-data-jpn \ tesseract-ocr-data-heb \ tesseract-ocr-data-lit \ + tesseract-ocr-data-pol \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index e136e382..915c2d22 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -46,13 +46,16 @@ object DateFind { ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = { - val stext = + val sep = " -\t.,\n\r/" + val (separators, stext) = if (lang == Language.Japanese) { - text.map(c => if (jpnChars.contains(c)) c else ' ') - } else text + (sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ') + } else if (lang == Language.Lithuanian) { + (sep + "md") -> text + } else sep -> text TextSplitter - .splitToken(stext, " -\t.,\n\r/年月日md".toSet) + .splitToken(stext, separators.toSet) .filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Spanish || w.value != "de") } @@ -106,6 +109,7 @@ object DateFind { case Language.Japanese => ymd case Language.Hebrew => dmy case Language.Lithuanian => ymd + case Language.Polish => dmy } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 5b87ae85..4b2e6295 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -58,6 +58,8 @@ object MonthName { hebrew case Language.Lithuanian => lithuanian + case Language.Polish => + polish } private val numbers = List( @@ -358,4 +360,19 @@ object MonthName { List("lapkritis", "lapkričio", "lapkr"), List("gruodis", "gruodžio", "gruod") ) + + private val polish = List( + List("stycznia", "sty"), + List("lutego", "lut"), + List("marca", "mar"), + List("kwietnia", "kwi"), + List("maja", "maj"), + List("czerwca", "cze"), + List("lipca", "lip"), + List("sierpnia", "sie"), + List("września", "wrz"), + List("października", "paź"), + List("listopada", "lis"), + List("grudnia", "gru") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index 1cff23eb..51533322 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -231,4 +231,32 @@ class DateFindTest extends FunSuite { ) ) } + + test("find polish dates") { + assertEquals( + DateFind + .findDates( + "Some text in polish 21 maja 2022 and stuff", + Language.Polish + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 5, 21), + NerLabel("21 maja 2022", NerTag.Date, 20, 32) + ) + ) + ) + assertEquals( + DateFind + .findDates("19.11.2021", Language.Polish) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("19.11.2021", NerTag.Date, 0, 10) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 334b53e8..e6242f09 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -128,6 +128,11 @@ object Language { val iso3 = "lit" } + case object Polish extends Language { + val iso2 = "pl" + val iso3 = "pol" + } + val all: List[Language] = List( German, @@ -148,7 +153,8 @@ object Language { Latvian, Japanese, Hebrew, - Lithuanian + Lithuanian, + Polish ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index 3b90d534..cbc5053b 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -194,5 +194,6 @@ object FtsRepository extends DoobieMeta { case Language.Japanese => "simple" case Language.Hebrew => "simple" case Language.Lithuanian => "simple" + case Language.Polish => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index e4a57322..e0dde5a6 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -144,7 +144,18 @@ object SolrSetup { "Add lithuanian", addContentField(Language.Lithuanian) ), - SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field") + SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"), + SolrMigration[F]( + 24, + "Add new field type for polish content", + addFieldType(AddFieldType.textPol) + ), + SolrMigration[F]( + 25, + "Add polish", + addContentField(Language.Polish) + ), + SolrMigration.reIndexAll(26, "Re-Index after adding polish content field") ) def addFolderField: F[Unit] = @@ -297,6 +308,17 @@ object SolrSetup { ) ) + val textPol = AddFieldType( + "text_pl", + "solr.TextField", + Analyzer( + Tokenizer("solr.StandardTokenizerFactory", Map.empty), + List( + Filter("solr.LowerCaseFilterFactory", Map.empty) + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index adc8c0f1..0d28df7b 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -33,6 +33,7 @@ type Language | Hebrew | Hungarian | Lithuanian + | Polish fromString : String -> Maybe Language @@ -94,6 +95,9 @@ fromString str = else if str == "lit" || str == "lt" || str == "lithuanian" then Just Lithuanian + else if str == "pol" || str == "pl" || str == "polish" then + Just Polish + else Nothing @@ -158,6 +162,9 @@ toIso3 lang = Lithuanian -> "lit" + Polish -> + "pol" + all : List Language all = @@ -180,4 +187,5 @@ all = , Hebrew , Hungarian , Lithuanian + , Polish ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 257ba44f..7ff75568 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -74,6 +74,9 @@ gb lang = Lithuanian -> "Lithuanian" + Polish -> + "Polish" + de : Language -> String de lang = @@ -135,6 +138,9 @@ de lang = Lithuanian -> "Litauisch" + Polish -> + "Polnisch" + fr : Language -> String fr lang = @@ -195,3 +201,6 @@ fr lang = Lithuanian -> "Lituanien" + + Polish -> + "Polonais"