diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index cad70533..318f3600 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -31,6 +31,8 @@ RUN JDKPKG="openjdk11-jre"; \ tesseract-ocr-data-lav \ tesseract-ocr-data-jpn \ tesseract-ocr-data-heb \ + tesseract-ocr-data-lit \ + tesseract-ocr-data-pol \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index fa3c5d1c..915c2d22 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -46,13 +46,16 @@ object DateFind { ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = { - val stext = + val sep = " -\t.,\n\r/" + val (separators, stext) = if (lang == Language.Japanese) { - text.map(c => if (jpnChars.contains(c)) c else ' ') - } else text + (sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ') + } else if (lang == Language.Lithuanian) { + (sep + "md") -> text + } else sep -> text TextSplitter - .splitToken(stext, " -\t.,\n\r/年月日".toSet) + .splitToken(stext, separators.toSet) .filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Spanish || w.value != "de") } @@ -105,6 +108,8 @@ object DateFind { case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Japanese => ymd case Language.Hebrew => dmy + case Language.Lithuanian => ymd + case Language.Polish => dmy } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 0679e1b3..4b2e6295 100644 --- 
a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -56,6 +56,10 @@ object MonthName { japanese case Language.Hebrew => hebrew + case Language.Lithuanian => + lithuanian + case Language.Polish => + polish } private val numbers = List( @@ -341,4 +345,34 @@ object MonthName { List("XI", "nov", "november"), List("XII", "dec", "december") ) + + private val lithuanian = List( + List("sausis", "sausio", "saus"), + List("vasaris", "vasario", "vas"), + List("kovas", "kovo", "kov"), + List("balandis", "balandžio", "bal"), + List("gegužis", "gegužės", "geg"), + List("birželis", "birželio", "birž"), + List("liepa", "liepos", "liep"), + List("rugpjūtis", "rugpjūčio", "rugp"), + List("rugsėjis", "rugsėjo", "rugs"), + List("spalis", "spalio", "spal"), + List("lapkritis", "lapkričio", "lapkr"), + List("gruodis", "gruodžio", "gruod") + ) + + private val polish = List( + List("stycznia", "sty"), + List("lutego", "lut"), + List("marca", "mar"), + List("kwietnia", "kwi"), + List("maja", "maj"), + List("czerwca", "cze"), + List("lipca", "lip"), + List("sierpnia", "sie"), + List("września", "wrz"), + List("października", "paź"), + List("listopada", "lis"), + List("grudnia", "gru") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index 70746b66..51533322 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -191,7 +191,6 @@ class DateFindTest extends FunSuite { ) ) ) - println(DateFind.splitWords("2021-11-19", Language.Spanish).toList) assertEquals( DateFind .findDates("2021-11-19", Language.Spanish) @@ -204,4 +203,60 @@ class DateFindTest extends FunSuite { ) ) } + + test("find lithuanian dates") { + assertEquals( + DateFind + .findDates( + "Some text in 
lithuanian 2022 m. gegužės 21 d. and stuff", + Language.Lithuanian + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 5, 21), + NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42) + ) + ) + ) + assertEquals( + DateFind + .findDates("2021-11-19", Language.Lithuanian) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("2021-11-19", NerTag.Date, 0, 10) + ) + ) + ) + } + + test("find polish dates") { + assertEquals( + DateFind + .findDates( + "Some text in polish 21 maja 2022 and stuff", + Language.Polish + ) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2022, 5, 21), + NerLabel("21 maja 2022", NerTag.Date, 20, 32) + ) + ) + ) + assertEquals( + DateFind + .findDates("19.11.2021", Language.Polish) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("19.11.2021", NerTag.Date, 0, 10) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index f8a3ff2b..e6242f09 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -123,6 +123,16 @@ object Language { val iso3 = "heb" } + case object Lithuanian extends Language { + val iso2 = "lt" + val iso3 = "lit" + } + + case object Polish extends Language { + val iso2 = "pl" + val iso3 = "pol" + } + val all: List[Language] = List( German, @@ -142,7 +152,9 @@ object Language { Romanian, Latvian, Japanese, - Hebrew + Hebrew, + Lithuanian, + Polish ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index ff32acbf..cbc5053b 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -193,5 +193,7 @@ object FtsRepository 
extends DoobieMeta { case Language.Latvian => "simple" case Language.Japanese => "simple" case Language.Hebrew => "simple" + case Language.Lithuanian => "simple" + case Language.Polish => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 56ab9a75..e0dde5a6 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -133,7 +133,29 @@ object SolrSetup { "Add hungarian", addContentField(Language.Hungarian) ), - SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field") + SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"), + SolrMigration[F]( + 21, + "Add new field type for lithuanian content", + addFieldType(AddFieldType.textLit) + ), + SolrMigration[F]( + 22, + "Add lithuanian", + addContentField(Language.Lithuanian) + ), + SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"), + SolrMigration[F]( + 24, + "Add new field type for polish content", + addFieldType(AddFieldType.textPol) + ), + SolrMigration[F]( + 25, + "Add polish", + addContentField(Language.Polish) + ), + SolrMigration.reIndexAll(26, "Re-Index after adding polish content field") ) def addFolderField: F[Unit] = @@ -275,6 +297,28 @@ object SolrSetup { ) ) + val textLit = AddFieldType( + "text_lt", + "solr.TextField", + Analyzer( + Tokenizer("solr.StandardTokenizerFactory", Map.empty), + List( + Filter("solr.LowerCaseFilterFactory", Map.empty) + ) + ) + ) + + val textPol = AddFieldType( + "text_pl", + "solr.TextField", + Analyzer( + Tokenizer("solr.StandardTokenizerFactory", Map.empty), + List( + Filter("solr.LowerCaseFilterFactory", Map.empty) + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: 
Tokenizer, filter: List[Filter]) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index a42dd803..0d28df7b 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -32,6 +32,8 @@ type Language | Japanese | Hebrew | Hungarian + | Lithuanian + | Polish fromString : String -> Maybe Language @@ -90,6 +92,12 @@ fromString str = else if str == "hun" || str == "hu" || str == "hungarian" then Just Hungarian + else if str == "lit" || str == "lt" || str == "lithuanian" then + Just Lithuanian + + else if str == "pol" || str == "pl" || str == "polish" then + Just Polish + else Nothing @@ -151,6 +159,12 @@ toIso3 lang = Hungarian -> "hun" + Lithuanian -> + "lit" + + Polish -> + "pol" + all : List Language all = @@ -172,4 +186,6 @@ all = , Japanese , Hebrew , Hungarian + , Lithuanian + , Polish ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 148be430..7ff75568 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -71,6 +71,12 @@ gb lang = Hungarian -> "Hungarian" + Lithuanian -> + "Lithuanian" + + Polish -> + "Polish" + de : Language -> String de lang = @@ -129,6 +135,12 @@ de lang = Hungarian -> "Ungarisch" + Lithuanian -> + "Litauisch" + + Polish -> + "Polnisch" + fr : Language -> String fr lang = @@ -186,3 +198,9 @@ fr lang = Hungarian -> "Hongrois" + + Lithuanian -> + "Lituanien" + + Polish -> + "Polonais" diff --git a/website/site/content/docs/dev/add-language.md b/website/site/content/docs/dev/add-language.md new file mode 100644 index 00000000..67a2b135 --- /dev/null +++ b/website/site/content/docs/dev/add-language.md @@ -0,0 +1,70 @@ ++++ +title = "Adding new language" +weight = 30 ++++ + +# Adding a new language for document processing + +Then there are other commits and issues to look at: + +- 
[Add Lithuanian](https://github.com/eikek/docspell/issues/1540) and [PR](https://github.com/eikek/docspell/pull/1559/commits/9d69401fea8ff07330c8a9116bd0d987827317c9) +- [Add Polish](https://github.com/eikek/docspell/issues/1345) and [PR](https://github.com/eikek/docspell/pull/1559/commits/1228937574ec52b36d5d77925c5fcdb1f536220c) +- [Add Spanish language](https://github.com/eikek/docspell/commit/26dff18ae0d32ce2b32b4d11ce381ada0e99314f) +- [Add Latvian language](https://github.com/eikek/docspell/issues/679) and [PR](https://github.com/eikek/docspell/pull/694/commits/9991ad5fcc43ccefe011a6cc4d01bdae4bcd4573) +- [Add Japanese language](https://github.com/eikek/docspell/issues/948) and [PR](https://github.com/eikek/docspell/pull/961/commits/f994d4b2488e64668ee064676f8c6469d9ccc1be), had some corrections: [1](https://github.com/eikek/docspell/commit/c59d4f8a6d021ec4b01a92320c211248503f16a5), [Issue](https://github.com/eikek/docspell/issues/973) +- [Add Hebrew language](https://github.com/eikek/docspell/pull/1027) + +Some older commits may be a bit out of date, but still show the +relevant things to do. These are: + +- add it to `Language.scala`, create a new `case object` and add it to + the `all` list (then fix compile errors) +- define a list of month names to support date recognition and update + `DateFind.scala` to recognize date patterns for that language. Add + some tests to `DateFindTest`. +- add it to joex' dockerfile to be available for tesseract +- update the solr migration/field definitions in `SolrSetup`. Create a + new solr migration that adds the content field for the new + language - it is a copy&paste from other similar changes. +- update `FtsRepository` for the PostgreSQL fulltext search variant: + if not sure, use `simple` here +- update the elm file so it shows up on the client. This also requires + adding translations in `Messages.Data.Language` + +## Test + +Check if everything is fine with `sbt Test/compile`. 
After the project +compiles without errors, run `sbt fix` to apply formatting fixes. + +It would be good to start up docspell and check the new language a bit, +including whether fulltext search is working. + +Sometimes, SOLR doesn't support a language. In this case the migration +needs to first add the new *field type*. There are examples for +Lithuanian and Hebrew in the code. + +For the docker image, you can run + +```bash +PLATFORMS=linux/amd64 ./build.sh 0.36.0-SNAPSHOT +``` + +in the `docker/dockerfiles` directory to build the docker image (just +choose some version, it doesn't matter). + +## Non-NLP only + +Note that this is without support for NLP. Including support for NLP +means that the [stanford nlp](https://github.com/stanfordnlp/CoreNLP) +library needs to provide models for it and these must be included in +the build and tested a bit. + +## Opening issues on GitHub + +You can also open an issue on GitHub requesting support for a language. +I kindly ask you to include all necessary information, like in +[this](https://github.com/eikek/docspell/issues/1540) issue. I know +that I can dig it out from websites, but it would be nice to have +everything ready. Also it is better to know from a local person some +details, like which date patterns are more likely to appear than +others. diff --git a/website/site/content/docs/dev/development.md b/website/site/content/docs/dev/development.md index 09d3df9a..3720403c 100644 --- a/website/site/content/docs/dev/development.md +++ b/website/site/content/docs/dev/development.md @@ -206,9 +206,3 @@ publishing the release. However, for the nightly releases, this doesn't matter - everything must be automated here obviously. I also wanted the docker images to be built from the exact same artifacts that have been released at github (in contrast to being built again). - - -# Background Info - -There is a list of [ADRs](@/docs/dev/adr/_index.md) containing -internal/background info for various topics.