Merge pull request #1559 from eikek/more-languages

More languages
2025-10-15 12:21:52 +00:00 · 2022-05-21 12:53:54 +00:00
parent 0f1c3abd6e 5ec311c331
commit 6b913873e6
11 changed files with 265 additions and 13 deletions
--- a/docker/dockerfiles/joex.dockerfile
+++ b/docker/dockerfiles/joex.dockerfile
@@ -31,6 +31,8 @@ RUN JDKPKG="openjdk11-jre"; \
    tesseract-ocr-data-lav \
    tesseract-ocr-data-jpn \
    tesseract-ocr-data-heb \
+    tesseract-ocr-data-lit \
+    tesseract-ocr-data-pol \
    unpaper \
    wkhtmltopdf \
    libreoffice \
--- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
@@ -46,13 +46,16 @@ object DateFind {
    ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet

  private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
-    val stext =
+    // Splits `text` into words for the date readers. The text may be
+    // pre-processed per language, but only by replacing one character with
+    // one space, so the begin/end offsets of the words still refer to `text`.
+
+    // Characters that separate words in every language.
+    val sep = " -\t.,\n\r/"
+
+    // Lithuanian marks the year with "m." and the day with "d.", as in
+    // "2022 m. gegužės 21 d.". Only a stand-alone 'm'/'d' (no letter directly
+    // before or after it) is blanked. Using 'm' and 'd' as separator
+    // characters would also cut month names like "balandžio" or "gruodžio"
+    // into pieces, so these months could never be recognised.
+    def blankLithuanianMarkers(s: String): String = {
+      def letterAt(i: Int): Boolean = i >= 0 && i < s.length && s.charAt(i).isLetter
+      s.indices.map { i =>
+        val c = s.charAt(i)
+        if ((c == 'm' || c == 'd') && !letterAt(i - 1) && !letterAt(i + 1)) ' ' else c
+      }.mkString
+    }
+
+    val (separators, stext) =
      if (lang == Language.Japanese) {
-        text.map(c => if (jpnChars.contains(c)) c else ' ')
-      } else text
+        // every character that is not in `jpnChars` is replaced by a space
+        (sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
+      } else if (lang == Language.Lithuanian) {
+        sep -> blankLithuanianMarkers(text)
+      } else sep -> text

    TextSplitter
-      .splitToken(stext, " -\t.,\n\r/年月日".toSet)
+      .splitToken(stext, separators.toSet)
      .filter(w => lang != Language.Latvian || w.value != "gada")
      .filter(w => lang != Language.Spanish || w.value != "de")
  }
@@ -105,6 +108,8 @@ object DateFind {
        case Language.Latvian    => dmy.or(lavLong).or(ymd)
        case Language.Japanese   => ymd
        case Language.Hebrew     => dmy
+        case Language.Lithuanian => ymd
+        case Language.Polish     => dmy
      }
      p.read(parts) match {
        case Result.Success(sds, _) =>
--- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
@@ -56,6 +56,10 @@ object MonthName {
        japanese
      case Language.Hebrew =>
        hebrew
+      case Language.Lithuanian =>
+        lithuanian
+      case Language.Polish =>
+        polish
    }

  private val numbers = List(
@@ -341,4 +345,34 @@ object MonthName {
    List("XI", "nov", "november"),
    List("XII", "dec", "december")
  )
+
+  // Lithuanian month names, January to December. Each entry lists the
+  // nominative form, the genitive form (the one used in written dates such
+  // as "2022 m. gegužės 21 d.") and an abbreviation.
+  private val lithuanian = List(
+    List("sausis", "sausio", "saus"),
+    List("vasaris", "vasario", "vas"),
+    List("kovas", "kovo", "kov"),
+    List("balandis", "balandžio", "bal"),
+    // the nominative of May is "gegužė" ("gegužis" is not a Lithuanian word)
+    List("gegužė", "gegužės", "geg"),
+    List("birželis", "birželio", "birž"),
+    List("liepa", "liepos", "liep"),
+    List("rugpjūtis", "rugpjūčio", "rugp"),
+    List("rugsėjis", "rugsėjo", "rugs"),
+    List("spalis", "spalio", "spal"),
+    List("lapkritis", "lapkričio", "lapkr"),
+    List("gruodis", "gruodžio", "gruod")
+  )
+
+  // Polish month names, January to December. Each entry lists the genitive
+  // form, which is the one written in dates such as "21 maja 2022", followed
+  // by a three-letter abbreviation.
+  private val polish = List(
+    List("stycznia", "sty"),
+    List("lutego", "lut"),
+    List("marca", "mar"),
+    List("kwietnia", "kwi"),
+    List("maja", "maj"),
+    List("czerwca", "cze"),
+    List("lipca", "lip"),
+    List("sierpnia", "sie"),
+    List("września", "wrz"),
+    List("października", "paź"),
+    List("listopada", "lis"),
+    List("grudnia", "gru")
+  )
 }
--- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala
@@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
        )
      )
    )
-    println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
    assertEquals(
      DateFind
        .findDates("2021-11-19", Language.Spanish)
@@ -204,4 +203,60 @@ class DateFindTest extends FunSuite {
      )
    )
  }
+
+  test("find lithuanian dates") {
+    // written form: year marker "m.", month name, day, day marker "d."
+    val fromSentence = DateFind
+      .findDates(
+        "Some text in lithuanian 2022 m. gegužės 21 d. and stuff",
+        Language.Lithuanian
+      )
+      .toVector
+    val expectedWritten = NerDateLabel(
+      LocalDate.of(2022, 5, 21),
+      NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
+    )
+    assertEquals(fromSentence, Vector(expectedWritten))
+
+    // purely numeric year-month-day form
+    val fromNumeric = DateFind.findDates("2021-11-19", Language.Lithuanian).toVector
+    val expectedNumeric = NerDateLabel(
+      LocalDate.of(2021, 11, 19),
+      NerLabel("2021-11-19", NerTag.Date, 0, 10)
+    )
+    assertEquals(fromNumeric, Vector(expectedNumeric))
+  }
+
+  test("find polish dates") {
+    // written form: day, month name, year
+    val fromSentence = DateFind
+      .findDates(
+        "Some text in polish 21 maja 2022 and stuff",
+        Language.Polish
+      )
+      .toVector
+    val expectedWritten = NerDateLabel(
+      LocalDate.of(2022, 5, 21),
+      NerLabel("21 maja 2022", NerTag.Date, 20, 32)
+    )
+    assertEquals(fromSentence, Vector(expectedWritten))
+
+    // purely numeric day.month.year form
+    val fromNumeric = DateFind.findDates("19.11.2021", Language.Polish).toVector
+    val expectedNumeric = NerDateLabel(
+      LocalDate.of(2021, 11, 19),
+      NerLabel("19.11.2021", NerTag.Date, 0, 10)
+    )
+    assertEquals(fromNumeric, Vector(expectedNumeric))
+  }
 }
--- a/modules/common/src/main/scala/docspell/common/Language.scala
+++ b/modules/common/src/main/scala/docspell/common/Language.scala
@@ -123,6 +123,16 @@ object Language {
    val iso3 = "heb"
  }

+  // Lithuanian, with its two-letter and three-letter language codes.
+  case object Lithuanian extends Language {
+    val iso2 = "lt"
+    val iso3 = "lit"
+  }
+
+  // Polish, with its two-letter and three-letter language codes.
+  case object Polish extends Language {
+    val iso2 = "pl"
+    val iso3 = "pol"
+  }
+
  val all: List[Language] =
    List(
      German,
@@ -142,7 +152,9 @@ object Language {
      Romanian,
      Latvian,
      Japanese,
-      Hebrew
+      Hebrew,
+      Lithuanian,
+      Polish
    )

  def fromString(str: String): Either[String, Language] = {
--- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala
+++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala
@@ -193,5 +193,7 @@ object FtsRepository extends DoobieMeta {
      case Language.Latvian    => "simple"
      case Language.Japanese   => "simple"
      case Language.Hebrew     => "simple"
+      case Language.Lithuanian => "simple"
+      case Language.Polish     => "simple"
    }
 }
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
@@ -133,7 +133,29 @@ object SolrSetup {
            "Add hungarian",
            addContentField(Language.Hungarian)
          ),
-          SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
+          SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
+          SolrMigration[F](
+            21,
+            "Add new field type for lithuanian content",
+            addFieldType(AddFieldType.textLit)
+          ),
+          SolrMigration[F](
+            22,
+            "Add lithuanian",
+            addContentField(Language.Lithuanian)
+          ),
+          SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"),
+          SolrMigration[F](
+            24,
+            "Add new field type for polish content",
+            addFieldType(AddFieldType.textPol)
+          ),
+          SolrMigration[F](
+            25,
+            "Add polish",
+            addContentField(Language.Polish)
+          ),
+          SolrMigration.reIndexAll(26, "Re-Index after adding polish content field")
        )

      def addFolderField: F[Unit] =
@@ -275,6 +297,28 @@ object SolrSetup {
      )
    )

+    // Field type "text_lt": standard tokenizer with lower-casing as the only filter.
+    val textLit = {
+      val tokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
+      val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
+      AddFieldType("text_lt", "solr.TextField", Analyzer(tokenizer, List(lowerCase)))
+    }
+
+    // Field type "text_pl": standard tokenizer with lower-casing as the only filter.
+    val textPol = {
+      val tokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
+      val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
+      AddFieldType("text_pl", "solr.TextField", Analyzer(tokenizer, List(lowerCase)))
+    }
+
    final case class Filter(`class`: String, attr: Map[String, String])
    final case class Tokenizer(`class`: String, attr: Map[String, String])
    final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
--- a/modules/webapp/src/main/elm/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Data/Language.elm
@@ -32,6 +32,8 @@ type Language
    | Japanese
    | Hebrew
    | Hungarian
+    | Lithuanian
+    | Polish


 fromString : String -> Maybe Language
@@ -90,6 +92,12 @@ fromString str =
    else if str == "hun" || str == "hu" || str == "hungarian" then
        Just Hungarian

+    else if str == "lit" || str == "lt" || str == "lithuanian" then
+        Just Lithuanian
+
+    else if str == "pol" || str == "pl" || str == "polish" then
+        Just Polish
+
    else
        Nothing

@@ -151,6 +159,12 @@ toIso3 lang =
        Hungarian ->
            "hun"

+        Lithuanian ->
+            "lit"
+
+        Polish ->
+            "pol"
+

 all : List Language
 all =
@@ -172,4 +186,6 @@ all =
    , Japanese
    , Hebrew
    , Hungarian
+    , Lithuanian
+    , Polish
    ]
--- a/modules/webapp/src/main/elm/Messages/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm
@@ -71,6 +71,12 @@ gb lang =
        Hungarian ->
            "Hungarian"

+        Lithuanian ->
+            "Lithuanian"
+
+        Polish ->
+            "Polish"
+

 de : Language -> String
 de lang =
@@ -129,6 +135,12 @@ de lang =
        Hungarian ->
            "Ungarisch"

+        Lithuanian ->
+            "Litauisch"
+
+        Polish ->
+            "Polnisch"
+

 fr : Language -> String
 fr lang =
@@ -186,3 +198,9 @@ fr lang =

        Hungarian ->
            "Hongrois"
+
+        Lithuanian ->
+            "Lituanien"
+
+        Polish ->
+            "Polonais"
--- a/website/site/content/docs/dev/add-language.md
+++ b/website/site/content/docs/dev/add-language.md
@@ -0,0 +1,70 @@
+++
+title = "Adding new language"
+weight = 30
+++
+
+# Adding a new language for document processing
+
+The following commits and issues show how other languages were added:
+
+- [Add Lithuanian](https://github.com/eikek/docspell/issues/1540) and [PR](https://github.com/eikek/docspell/pull/1559/commits/9d69401fea8ff07330c8a9116bd0d987827317c9)
+- [Add Polish](https://github.com/eikek/docspell/issues/1345) and [PR](https://github.com/eikek/docspell/pull/1559/commits/1228937574ec52b36d5d77925c5fcdb1f536220c)
+- [Add Spanish language](https://github.com/eikek/docspell/commit/26dff18ae0d32ce2b32b4d11ce381ada0e99314f)
+- [Add Latvian language](https://github.com/eikek/docspell/issues/679) and [PR](https://github.com/eikek/docspell/pull/694/commits/9991ad5fcc43ccefe011a6cc4d01bdae4bcd4573)
+- [Add Japanese language](https://github.com/eikek/docspell/issues/948) and [PR](https://github.com/eikek/docspell/pull/961/commits/f994d4b2488e64668ee064676f8c6469d9ccc1be), had some corrections: [1](https://github.com/eikek/docspell/commit/c59d4f8a6d021ec4b01a92320c211248503f16a5), [Issue](https://github.com/eikek/docspell/issues/973)
+- [Add Hebrew language](https://github.com/eikek/docspell/pull/1027)
+
+Some older commits may be a bit out of date, but they still show the
+relevant things to do. The steps are:
+
+- add it to `Language.scala`, create a new `case object` and add it to
+  the `all` list (then fix compile errors)
+- define a list of month names to support date recognition and update
+  `DateFind.scala` to recognize date patterns for that language. Add
+  some tests to `DateFindTest`.
+- add it to joex' dockerfile to be available for tesseract
+- update the solr migration/field definitions in `SolrSetup`. Create a
+  new solr migration that adds the content field for the new
+  language - it is a copy&paste from other similar changes.
+- update `FtsRepository` for the PostgreSQL fulltext search variant:
+  if not sure, use `simple` here
+- update the elm file `Data/Language.elm` so the language shows up on
+  the client. This also requires adding translations in
+  `Messages.Data.Language`
+
+## Test
+
+Check if everything is fine with `sbt Test/compile`. After the project
+compiles without errors, run `sbt fix` to apply formatting fixes.
+
+It would be good to start up docspell and check the new language a bit,
+including whether fulltext search is working.
+
+Sometimes, SOLR doesn't support a language. In this case the migration
+needs to first add the new *field type*. There are examples for
+Lithuanian and Hebrew in the code.
+
+For the docker image, you can run
+
+```bash
+PLATFORMS=linux/amd64 ./build.sh 0.36.0-SNAPSHOT
+```
+
+in the `docker/dockerfiles` directory to build the docker image (just
+choose some version, it doesn't matter).
+
+## Non-NLP only
+
+Note that this is without support for NLP. Including support for NLP
+means that the [stanford nlp](https://github.com/stanfordnlp/CoreNLP)
+library needs to provide models for it and these must be included in
+the build and tested a bit.
+
+## Opening issues on Github
+
+You can also open an issue on github requesting to support a language.
+I kindly ask to include all necessary information, like in
+[this](https://github.com/eikek/docspell/issues/1540) issue. I know
+that I can dig it out from websites, but it would be nice to have
+everything ready. It is also better to learn some details from a local
+person, like which date patterns are more likely to appear than
+others.
--- a/website/site/content/docs/dev/development.md
+++ b/website/site/content/docs/dev/development.md
@@ -206,9 +206,3 @@ publishing the release. However, for the nightly releases, this
 doesn't matter - everything must be automated here obviously. I also
 wanted the docker images to be built from the exact same artifacts
 that have been released at github (in contrast to being built again).
-
-
-# Background Info
-
-There is a list of [ADRs](@/docs/dev/adr/_index.md) containing
-internal/background info for various topics.