Add Ukrainian language

This commit is contained in:
GooRoo
2022-11-09 22:24:32 +01:00
parent 96e6b9fb91
commit 61d5585e68
10 changed files with 152 additions and 3 deletions

View File

@ -32,6 +32,7 @@ RUN apk update && \
tesseract-ocr-data-lit \
tesseract-ocr-data-pol \
tesseract-ocr-data-est \
tesseract-ocr-data-ukr \
unpaper \
weasyprint \
libreoffice \

View File

@ -54,10 +54,28 @@ object DateFind {
(sep + "md") -> text
} else sep -> text
// Tokens dropped from the token stream when the language is Ukrainian, so
// that they do not get in the way of matching the date parts around them.
val ukrFlexion = Set(
  // the word "year" and its abbreviation, e.g. "2022 року", "2022 р."
  "р", "рік", "року",
  // endings attached to numbers, e.g. "5-го", "2022-й", "3-тє"
  "ого", "го", "ий", "ій", "й",
  "ше", "ге", "тє", "те", "ме", "е", "є"
)
TextSplitter
.splitToken(stext, separators.toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de")
.filter(w => lang != Language.Ukrainian || !ukrFlexion.contains(w.value))
}
case class SimpleDate(year: Int, month: Int, day: Int) {
@ -111,6 +129,7 @@ object DateFind {
case Language.Lithuanian => ymd
case Language.Polish => dmy
case Language.Estonian => dmy
case Language.Ukrainian => dmy.or(ymd)
}
p.read(parts) match {
case Result.Success(sds, _) =>

View File

@ -62,6 +62,8 @@ object MonthName {
polish
case Language.Estonian =>
estonian
case Language.Ukrainian =>
ukrainian
}
private val numbers = List(
@ -392,4 +394,19 @@ object MonthName {
List("november", "nov"),
List("detsember", "dets")
)
// Ukrainian month names, one inner list per month in calendar order
// (January first). Each inner list starts with the full form as it is
// written in a date (e.g. "5 листопада 2022") followed by its abbreviations.
private val ukrainian = List(
List("січня", "січн", "січ"), // January
List("лютого", "лют"), // February
List("березня", "бер"), // March
List("квітня", "квіт", "кві"), // April
List("травня", "трав", "тра"), // May
List("червня", "черв", "чер"), // June
List("липня", "лип"), // July
List("серпня", "серп", "сер"), // August
List("вересня", "вер"), // September
List("жовтня", "жовт", "жов"), // October
List("листопада", "лист", "лис"), // November
List("грудня", "груд", "гру") // December
)
}

View File

@ -287,4 +287,82 @@ class DateFindTest extends FunSuite {
)
)
}
test("find ukrainian dates") {
  // Runs the date finder on `text` with the Ukrainian language and expects
  // exactly one hit: the date `expected`, reported for the substring
  // `matched` located at the character offsets [begin, end).
  def expectSingleDate(
      text: String,
      expected: LocalDate,
      matched: String,
      begin: Int,
      end: Int
  ): Unit =
    assertEquals(
      DateFind.findDates(text, Language.Ukrainian).toVector,
      Vector(
        NerDateLabel(expected, NerLabel(matched, NerTag.Date, begin, end))
      )
    )

  // officially used ones
  expectSingleDate(
    "Цей текст був написаний 5 листопада 2022 року. Слава Україні!",
    LocalDate.of(2022, 11, 5),
    "5 листопада 2022",
    24,
    40
  )
  expectSingleDate(
    "05.11.2022 — це субота",
    LocalDate.of(2022, 11, 5),
    "05.11.2022",
    0,
    10
  )

  // less common but also used
  expectSingleDate(
    "Сьогодні 5 лист. 2022 р. Слава Україні!",
    LocalDate.of(2022, 11, 5),
    "5 лист. 2022",
    9,
    21
  )
  expectSingleDate(
    "Дата: 2022.11.05",
    LocalDate.of(2022, 11, 5),
    "2022.11.05",
    6,
    16
  )

  // vernacular variants
  expectSingleDate(
    "Ілля Рєпін народився 5-го серпня 1844-го року.",
    LocalDate.of(1844, 8, 5),
    "5-го серпня 1844",
    21,
    37
  )
  expectSingleDate(
    "3-тє жовт., 2022-й рік — це 33 дні тому",
    LocalDate.of(2022, 10, 3),
    "3-тє жовт., 2022",
    0,
    16
  )
}
}

View File

@ -138,6 +138,11 @@ object Language {
val iso3 = "est"
}
/** The Ukrainian language, identified by its two-letter code "uk" and its
  * three-letter code "ukr".
  */
case object Ukrainian extends Language {
  val iso3 = "ukr"
  val iso2 = "uk"
}
val all: List[Language] =
List(
German,
@ -160,7 +165,8 @@ object Language {
Hebrew,
Lithuanian,
Polish,
Estonian
Estonian,
Ukrainian
)
def fromString(str: String): Either[String, Language] = {

View File

@ -205,5 +205,6 @@ object FtsRepository extends DoobieMeta {
case Language.Lithuanian => "simple"
case Language.Polish => "simple"
case Language.Estonian => "simple"
case Language.Ukrainian => "simple"
}
}

View File

@ -161,7 +161,13 @@ object SolrSetup {
28,
"Add Estonian",
addContentField(Language.Estonian)
)
),
SolrMigration[F](
29,
"Add Ukrainian",
addContentField(Language.Ukrainian)
),
SolrMigration.reIndexAll(30, "Re-Index after adding Estonian and Ukrainian")
)
def addFolderField: F[Unit] =

View File

@ -35,6 +35,7 @@ type Language
| Lithuanian
| Polish
| Estonian
| Ukrainian
fromString : String -> Maybe Language
@ -102,6 +103,9 @@ fromString str =
else if str == "est" || str == "et" || str == "estonian" then
Just Estonian
else if str == "ukr" || str == "uk" || str == "ukrainian" then
Just Ukrainian
else
Nothing
@ -172,6 +176,9 @@ toIso3 lang =
Estonian ->
"est"
Ukrainian ->
"ukr"
all : List Language
all =
@ -196,4 +203,5 @@ all =
, Lithuanian
, Polish
, Estonian
, Ukrainian
]

View File

@ -80,6 +80,9 @@ gb lang =
Estonian ->
"Estonian"
Ukrainian ->
"Ukrainian"
de : Language -> String
de lang =
@ -147,6 +150,9 @@ de lang =
Estonian ->
"Estnisch"
Ukrainian ->
"Ukrainisch"
fr : Language -> String
fr lang =
@ -213,3 +219,6 @@ fr lang =
Estonian ->
"Estonien"
Ukrainian ->
"Ukrainien"

View File

@ -21,7 +21,11 @@ relevant things to do. These are:
the `all` list (then fix compile errors)
- define a list of month names to support date recognition and update
`DateFind.scala` to recognize date patterns for that language. Add
some tests to `DateFindTest`.
some tests to `DateFindTest`. While writing test cases, you can run
them from the `sbt` command prompt as follows:
```
testOnly docspell.analysis.date.DateFindTest
```
- add it to joex' dockerfile to be available for tesseract
- update the solr migration/field definitions in `SolrSetup`. Create a
new solr migration that adds the content field for the new