From 9d69401fea8ff07330c8a9116bd0d987827317c9 Mon Sep 17 00:00:00 2001 From: eikek Date: Sat, 21 May 2022 14:11:38 +0200 Subject: [PATCH] Add Lithuanian to processing languages SOLR doesn't support Lithuanian, maybe it can be added via plugins. A manual setup of solr is required then. It has been added with basic support. Closes: #1540 --- docker/dockerfiles/joex.dockerfile | 1 + .../docspell/analysis/date/DateFind.scala | 3 +- .../docspell/analysis/date/MonthName.scala | 17 +++++++++++ .../docspell/analysis/date/DateFindTest.scala | 29 ++++++++++++++++++- .../main/scala/docspell/common/Language.scala | 8 ++++- .../docspell/ftspsql/FtsRepository.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 24 ++++++++++++++- modules/webapp/src/main/elm/Data/Language.elm | 8 +++++ .../src/main/elm/Messages/Data/Language.elm | 9 ++++++ 9 files changed, 96 insertions(+), 4 deletions(-) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index cad70533..1180f4b5 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -31,6 +31,7 @@ RUN JDKPKG="openjdk11-jre"; \ tesseract-ocr-data-lav \ tesseract-ocr-data-jpn \ tesseract-ocr-data-heb \ + tesseract-ocr-data-lit \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index fa3c5d1c..e136e382 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -52,7 +52,7 @@ object DateFind { } else text TextSplitter - .splitToken(stext, " -\t.,\n\r/年月日".toSet) + .splitToken(stext, " -\t.,\n\r/年月日md".toSet) .filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Spanish || w.value != "de") } @@ -105,6 +105,7 @@ object DateFind { case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Japanese 
=> ymd case Language.Hebrew => dmy + case Language.Lithuanian => ymd } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 0679e1b3..5b87ae85 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -56,6 +56,8 @@ object MonthName { japanese case Language.Hebrew => hebrew + case Language.Lithuanian => + lithuanian } private val numbers = List( @@ -341,4 +343,19 @@ object MonthName { List("XI", "nov", "november"), List("XII", "dec", "december") ) + + private val lithuanian = List( + List("sausis", "sausio", "saus"), + List("vasaris", "vasario", "vas"), + List("kovas", "kovo", "kov"), + List("balandis", "balandžio", "bal"), + List("gegužė", "gegužės", "geg"), + List("birželis", "birželio", "birž"), + List("liepa", "liepos", "liep"), + List("rugpjūtis", "rugpjūčio", "rugp"), + List("rugsėjis", "rugsėjo", "rugs"), + List("spalis", "spalio", "spal"), + List("lapkritis", "lapkričio", "lapkr"), + List("gruodis", "gruodžio", "gruod") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index 70746b66..1cff23eb 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -191,7 +191,6 @@ class DateFindTest extends FunSuite { ) ) ) - println(DateFind.splitWords("2021-11-19", Language.Spanish).toList) assertEquals( DateFind .findDates("2021-11-19", Language.Spanish) @@ -204,4 +203,32 @@ class DateFindTest extends FunSuite { ) ) } + + test("find lithuanian dates") { + assertEquals( + DateFind + .findDates( + "Some text in lithuanian 2022 m. gegužės 21 d. 
and stuff", + Language.Lithuanian + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 5, 21), + NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42) + ) + ) + ) + assertEquals( + DateFind + .findDates("2021-11-19", Language.Lithuanian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("2021-11-19", NerTag.Date, 0, 10) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index f8a3ff2b..334b53e8 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -123,6 +123,11 @@ object Language { val iso3 = "heb" } + case object Lithuanian extends Language { + val iso2 = "lt" + val iso3 = "lit" + } + val all: List[Language] = List( German, @@ -142,7 +147,8 @@ object Language { Romanian, Latvian, Japanese, - Hebrew + Hebrew, + Lithuanian ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index ff32acbf..3b90d534 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -193,5 +193,6 @@ object FtsRepository extends DoobieMeta { case Language.Latvian => "simple" case Language.Japanese => "simple" case Language.Hebrew => "simple" + case Language.Lithuanian => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 56ab9a75..e4a57322 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -133,7 +133,18 @@ object SolrSetup { "Add hungarian", addContentField(Language.Hungarian) ), - 
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field") + SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"), + SolrMigration[F]( + 21, + "Add new field type for lithuanian content", + addFieldType(AddFieldType.textLit) + ), + SolrMigration[F]( + 22, + "Add lithuanian", + addContentField(Language.Lithuanian) + ), + SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field") ) def addFolderField: F[Unit] = @@ -275,6 +286,17 @@ object SolrSetup { ) ) + val textLit = AddFieldType( + "text_lt", + "solr.TextField", + Analyzer( + Tokenizer("solr.StandardTokenizerFactory", Map.empty), + List( + Filter("solr.LowerCaseFilterFactory", Map.empty) + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index a42dd803..adc8c0f1 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -32,6 +32,7 @@ type Language | Japanese | Hebrew | Hungarian + | Lithuanian fromString : String -> Maybe Language @@ -90,6 +91,9 @@ fromString str = else if str == "hun" || str == "hu" || str == "hungarian" then Just Hungarian + else if str == "lit" || str == "lt" || str == "lithuanian" then + Just Lithuanian + else Nothing @@ -151,6 +155,9 @@ toIso3 lang = Hungarian -> "hun" + Lithuanian -> + "lit" + all : List Language all = @@ -172,4 +179,5 @@ all = , Japanese , Hebrew , Hungarian + , Lithuanian ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 148be430..257ba44f 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -71,6 +71,9 @@ gb lang = 
Hungarian -> "Hungarian" + Lithuanian -> + "Lithuanian" + de : Language -> String de lang = @@ -129,6 +132,9 @@ de lang = Hungarian -> "Ungarisch" + Lithuanian -> + "Litauisch" + fr : Language -> String fr lang = @@ -186,3 +192,6 @@ fr lang = Hungarian -> "Hongrois" + + Lithuanian -> + "Lituanien"