mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Add Ukrainian language
This commit is contained in:
		@@ -32,6 +32,7 @@ RUN apk update && \
 | 
			
		||||
    tesseract-ocr-data-lit \
 | 
			
		||||
    tesseract-ocr-data-pol \
 | 
			
		||||
    tesseract-ocr-data-est \
 | 
			
		||||
    tesseract-ocr-data-ukr \
 | 
			
		||||
    unpaper \
 | 
			
		||||
    weasyprint \
 | 
			
		||||
    libreoffice \
 | 
			
		||||
 
 | 
			
		||||
@@ -54,10 +54,28 @@ object DateFind {
 | 
			
		||||
        (sep + "md") -> text
 | 
			
		||||
      } else sep -> text
 | 
			
		||||
 | 
			
		||||
    // Tokens that are dropped from Ukrainian input by the filter below, in the
    // same order as before: words for "year", then ordinal endings as they
    // appear after a number in inputs like "5-го", "2022-й" or "3-тє"
    // (presumably split off as separate tokens by the separators -- confirm).
    val ukrFlexion =
      List("р", "рік", "року") ++
        List("ого", "го", "ий", "ій", "й") ++
        List("ше", "ге", "тє", "те", "ме", "е", "є")
 | 
			
		||||
    TextSplitter
 | 
			
		||||
      .splitToken(stext, separators.toSet)
 | 
			
		||||
      .filter(w => lang != Language.Latvian || w.value != "gada")
 | 
			
		||||
      .filter(w => lang != Language.Spanish || w.value != "de")
 | 
			
		||||
      .filter(w => lang != Language.Ukrainian || !ukrFlexion.contains(w.value))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  case class SimpleDate(year: Int, month: Int, day: Int) {
 | 
			
		||||
@@ -111,6 +129,7 @@ object DateFind {
 | 
			
		||||
        case Language.Lithuanian => ymd
 | 
			
		||||
        case Language.Polish     => dmy
 | 
			
		||||
        case Language.Estonian   => dmy
 | 
			
		||||
        case Language.Ukrainian  => dmy.or(ymd)
 | 
			
		||||
      }
 | 
			
		||||
      p.read(parts) match {
 | 
			
		||||
        case Result.Success(sds, _) =>
 | 
			
		||||
 
 | 
			
		||||
@@ -62,6 +62,8 @@ object MonthName {
 | 
			
		||||
        polish
 | 
			
		||||
      case Language.Estonian =>
 | 
			
		||||
        estonian
 | 
			
		||||
      case Language.Ukrainian =>
 | 
			
		||||
        ukrainian
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  private val numbers = List(
 | 
			
		||||
@@ -392,4 +394,19 @@ object MonthName {
 | 
			
		||||
    List("november", "nov"),
 | 
			
		||||
    List("detsember", "dets")
 | 
			
		||||
  )
 | 
			
		||||
 | 
			
		||||
  /** Ukrainian month names, one inner list per month from January to
    * December. Each inner list starts with the genitive form used in full
    * dates (e.g. "5 листопада 2022"), followed by abbreviations written
    * without a trailing dot.
    *
    * Besides the short variants, the longer abbreviations "берез", "верес"
    * and "листоп" are listed so that inputs like "5 листоп. 2022" can be
    * matched as well. NOTE(review): these are taken to be the standard
    * shortenings (DSTU 3582) -- confirm against the standard.
    */
  private val ukrainian = List(
    List("січня", "січн", "січ"),
    List("лютого", "лют"),
    List("березня", "берез", "бер"),
    List("квітня", "квіт", "кві"),
    List("травня", "трав", "тра"),
    List("червня", "черв", "чер"),
    List("липня", "лип"),
    List("серпня", "серп", "сер"),
    List("вересня", "верес", "вер"),
    List("жовтня", "жовт", "жов"),
    List("листопада", "листоп", "лист", "лис"),
    List("грудня", "груд", "гру")
  )
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -287,4 +287,82 @@ class DateFindTest extends FunSuite {
 | 
			
		||||
      )
 | 
			
		||||
    )
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  test("find ukrainian dates") {
    // Runs the date finder over `text` with the Ukrainian language setting.
    def datesIn(text: String) =
      DateFind.findDates(text, Language.Ukrainian).toVector

    // Expected result holding exactly one date label covering [from, until).
    def single(date: LocalDate, matched: String, from: Int, until: Int) =
      Vector(NerDateLabel(date, NerLabel(matched, NerTag.Date, from, until)))

    // Forms used in official writing: spelled-out month and dotted numeric date.
    assertEquals(
      datesIn("Цей текст був написаний 5 листопада 2022 року. Слава Україні!"),
      single(LocalDate.of(2022, 11, 5), "5 листопада 2022", 24, 40)
    )
    assertEquals(
      datesIn("05.11.2022 — це субота"),
      single(LocalDate.of(2022, 11, 5), "05.11.2022", 0, 10)
    )

    // Less common, but still in use: abbreviated month and year-first order.
    assertEquals(
      datesIn("Сьогодні 5 лист. 2022 р. Слава Україні!"),
      single(LocalDate.of(2022, 11, 5), "5 лист. 2022", 9, 21)
    )
    assertEquals(
      datesIn("Дата: 2022.11.05"),
      single(LocalDate.of(2022, 11, 5), "2022.11.05", 6, 16)
    )

    // Vernacular variants: ordinal endings attached to day and year.
    assertEquals(
      datesIn("Ілля Рєпін народився 5-го серпня 1844-го року."),
      single(LocalDate.of(1844, 8, 5), "5-го серпня 1844", 21, 37)
    )
    assertEquals(
      datesIn("3-тє жовт., 2022-й рік  — це 33 дні тому"),
      single(LocalDate.of(2022, 10, 3), "3-тє жовт., 2022", 0, 16)
    )
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -138,6 +138,11 @@ object Language {
 | 
			
		||||
    val iso3 = "est"
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /** The Ukrainian language, with the codes "uk" (iso2) and "ukr" (iso3). */
  case object Ukrainian extends Language {
    val iso2: String = "uk"
    val iso3: String = "ukr"
  }
 | 
			
		||||
 | 
			
		||||
  val all: List[Language] =
 | 
			
		||||
    List(
 | 
			
		||||
      German,
 | 
			
		||||
@@ -160,7 +165,8 @@ object Language {
 | 
			
		||||
      Hebrew,
 | 
			
		||||
      Lithuanian,
 | 
			
		||||
      Polish,
 | 
			
		||||
      Estonian
 | 
			
		||||
      Estonian,
 | 
			
		||||
      Ukrainian
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
  def fromString(str: String): Either[String, Language] = {
 | 
			
		||||
 
 | 
			
		||||
@@ -205,5 +205,6 @@ object FtsRepository extends DoobieMeta {
 | 
			
		||||
      case Language.Lithuanian => "simple"
 | 
			
		||||
      case Language.Polish     => "simple"
 | 
			
		||||
      case Language.Estonian   => "simple"
 | 
			
		||||
      case Language.Ukrainian  => "simple"
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -161,7 +161,13 @@ object SolrSetup {
 | 
			
		||||
            28,
 | 
			
		||||
            "Add Estonian",
 | 
			
		||||
            addContentField(Language.Estonian)
 | 
			
		||||
          )
 | 
			
		||||
          ),
 | 
			
		||||
          SolrMigration[F](
 | 
			
		||||
            29,
 | 
			
		||||
            "Add Ukrainian",
 | 
			
		||||
            addContentField(Language.Ukrainian)
 | 
			
		||||
          ),
 | 
			
		||||
          SolrMigration.reIndexAll(30, "Re-Index after adding Estonian and Ukrainian")
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
      def addFolderField: F[Unit] =
 | 
			
		||||
 
 | 
			
		||||
@@ -35,6 +35,7 @@ type Language
 | 
			
		||||
    | Lithuanian
 | 
			
		||||
    | Polish
 | 
			
		||||
    | Estonian
 | 
			
		||||
    | Ukrainian
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
fromString : String -> Maybe Language
 | 
			
		||||
@@ -102,6 +103,9 @@ fromString str =
 | 
			
		||||
    else if str == "est" || str == "et" || str == "estonian" then
 | 
			
		||||
        Just Estonian
 | 
			
		||||
 | 
			
		||||
    else if str == "ukr" || str == "uk" || str == "ukrainian" then
 | 
			
		||||
        Just Ukrainian
 | 
			
		||||
 | 
			
		||||
    else
 | 
			
		||||
        Nothing
 | 
			
		||||
 | 
			
		||||
@@ -172,6 +176,9 @@ toIso3 lang =
 | 
			
		||||
        Estonian ->
 | 
			
		||||
            "est"
 | 
			
		||||
 | 
			
		||||
        Ukrainian ->
 | 
			
		||||
            "ukr"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
all : List Language
 | 
			
		||||
all =
 | 
			
		||||
@@ -196,4 +203,5 @@ all =
 | 
			
		||||
    , Lithuanian
 | 
			
		||||
    , Polish
 | 
			
		||||
    , Estonian
 | 
			
		||||
    , Ukrainian
 | 
			
		||||
    ]
 | 
			
		||||
 
 | 
			
		||||
@@ -80,6 +80,9 @@ gb lang =
 | 
			
		||||
        Estonian ->
 | 
			
		||||
            "Estonian"
 | 
			
		||||
 | 
			
		||||
        Ukrainian ->
 | 
			
		||||
            "Ukrainian"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
de : Language -> String
 | 
			
		||||
de lang =
 | 
			
		||||
@@ -147,6 +150,9 @@ de lang =
 | 
			
		||||
        Estonian ->
 | 
			
		||||
            "Estnisch"
 | 
			
		||||
 | 
			
		||||
        Ukrainian ->
 | 
			
		||||
            "Ukrainisch"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
fr : Language -> String
 | 
			
		||||
fr lang =
 | 
			
		||||
@@ -213,3 +219,6 @@ fr lang =
 | 
			
		||||
 | 
			
		||||
        Estonian ->
 | 
			
		||||
            "Estonien"
 | 
			
		||||
 | 
			
		||||
        Ukrainian ->
 | 
			
		||||
            "Ukrainien"
 | 
			
		||||
 
 | 
			
		||||
@@ -21,7 +21,11 @@ relevant things to do. These are:
 | 
			
		||||
  the `all` list (then fix compile errors)
 | 
			
		||||
- define a list of month names to support date recognition and update
 | 
			
		||||
  `DateFind.scala` to recognize date patterns for that language. Add
 | 
			
		||||
  some tests to `DateFindTest`.
 | 
			
		||||
  some tests to `DateFindTest`. While writing test cases, you can check
 | 
			
		||||
  them via `sbt`'s command prompt as follows:
 | 
			
		||||
    ```
 | 
			
		||||
    testOnly docspell.analysis.date.DateFindTest
 | 
			
		||||
    ```
 | 
			
		||||
- add it to joex' dockerfile to be available for tesseract
 | 
			
		||||
- update the solr migration/field definitions in `SolrSetup`. Create a
 | 
			
		||||
  new solr migration that adds the content field for the new
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user