mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Merge pull request #1835 from GooRoo/ukrainian-ocr
Add Ukrainian language
This commit is contained in:
@ -32,6 +32,7 @@ RUN apk update && \
|
|||||||
tesseract-ocr-data-lit \
|
tesseract-ocr-data-lit \
|
||||||
tesseract-ocr-data-pol \
|
tesseract-ocr-data-pol \
|
||||||
tesseract-ocr-data-est \
|
tesseract-ocr-data-est \
|
||||||
|
tesseract-ocr-data-ukr \
|
||||||
unpaper \
|
unpaper \
|
||||||
weasyprint \
|
weasyprint \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -54,10 +54,28 @@ object DateFind {
|
|||||||
(sep + "md") -> text
|
(sep + "md") -> text
|
||||||
} else sep -> text
|
} else sep -> text
|
||||||
|
|
||||||
|
val ukrFlexion = List(
|
||||||
|
"р",
|
||||||
|
"рік",
|
||||||
|
"року",
|
||||||
|
"ого",
|
||||||
|
"го",
|
||||||
|
"ий",
|
||||||
|
"ій",
|
||||||
|
"й",
|
||||||
|
"ше",
|
||||||
|
"ге",
|
||||||
|
"тє",
|
||||||
|
"те",
|
||||||
|
"ме",
|
||||||
|
"е",
|
||||||
|
"є"
|
||||||
|
)
|
||||||
TextSplitter
|
TextSplitter
|
||||||
.splitToken(stext, separators.toSet)
|
.splitToken(stext, separators.toSet)
|
||||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||||
|
.filter(w => lang != Language.Ukrainian || !ukrFlexion.contains(w.value))
|
||||||
}
|
}
|
||||||
|
|
||||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||||
@ -111,6 +129,7 @@ object DateFind {
|
|||||||
case Language.Lithuanian => ymd
|
case Language.Lithuanian => ymd
|
||||||
case Language.Polish => dmy
|
case Language.Polish => dmy
|
||||||
case Language.Estonian => dmy
|
case Language.Estonian => dmy
|
||||||
|
case Language.Ukrainian => dmy.or(ymd)
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
|
@ -62,6 +62,8 @@ object MonthName {
|
|||||||
polish
|
polish
|
||||||
case Language.Estonian =>
|
case Language.Estonian =>
|
||||||
estonian
|
estonian
|
||||||
|
case Language.Ukrainian =>
|
||||||
|
ukrainian
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -392,4 +394,19 @@ object MonthName {
|
|||||||
List("november", "nov"),
|
List("november", "nov"),
|
||||||
List("detsember", "dets")
|
List("detsember", "dets")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private val ukrainian = List(
|
||||||
|
List("січня", "січн", "січ"),
|
||||||
|
List("лютого", "лют"),
|
||||||
|
List("березня", "бер"),
|
||||||
|
List("квітня", "квіт", "кві"),
|
||||||
|
List("травня", "трав", "тра"),
|
||||||
|
List("червня", "черв", "чер"),
|
||||||
|
List("липня", "лип"),
|
||||||
|
List("серпня", "серп", "сер"),
|
||||||
|
List("вересня", "вер"),
|
||||||
|
List("жовтня", "жовт", "жов"),
|
||||||
|
List("листопада", "лист", "лис"),
|
||||||
|
List("грудня", "груд", "гру")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -287,4 +287,82 @@ class DateFindTest extends FunSuite {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find ukrainian dates") {
|
||||||
|
// officially used ones
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates(
|
||||||
|
"Цей текст був написаний 5 листопада 2022 року. Слава Україні!",
|
||||||
|
Language.Ukrainian
|
||||||
|
)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 11, 5),
|
||||||
|
NerLabel("5 листопада 2022", NerTag.Date, 24, 40)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("05.11.2022 — це субота", Language.Ukrainian)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 11, 5),
|
||||||
|
NerLabel("05.11.2022", NerTag.Date, 0, 10)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
// less common but also used
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates(
|
||||||
|
"Сьогодні 5 лист. 2022 р. Слава Україні!",
|
||||||
|
Language.Ukrainian
|
||||||
|
)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 11, 5),
|
||||||
|
NerLabel("5 лист. 2022", NerTag.Date, 9, 21)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("Дата: 2022.11.05", Language.Ukrainian)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 11, 5),
|
||||||
|
NerLabel("2022.11.05", NerTag.Date, 6, 16)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
// vernacular variants
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("Ілля Рєпін народився 5-го серпня 1844-го року.", Language.Ukrainian)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(1844, 8, 5),
|
||||||
|
NerLabel("5-го серпня 1844", NerTag.Date, 21, 37)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("3-тє жовт., 2022-й рік — це 33 дні тому", Language.Ukrainian)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 10, 3),
|
||||||
|
NerLabel("3-тє жовт., 2022", NerTag.Date, 0, 16)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -138,6 +138,11 @@ object Language {
|
|||||||
val iso3 = "est"
|
val iso3 = "est"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case object Ukrainian extends Language {
|
||||||
|
val iso2 = "uk"
|
||||||
|
val iso3 = "ukr"
|
||||||
|
}
|
||||||
|
|
||||||
val all: List[Language] =
|
val all: List[Language] =
|
||||||
List(
|
List(
|
||||||
German,
|
German,
|
||||||
@ -160,7 +165,8 @@ object Language {
|
|||||||
Hebrew,
|
Hebrew,
|
||||||
Lithuanian,
|
Lithuanian,
|
||||||
Polish,
|
Polish,
|
||||||
Estonian
|
Estonian,
|
||||||
|
Ukrainian
|
||||||
)
|
)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
|
@ -205,5 +205,6 @@ object FtsRepository extends DoobieMeta {
|
|||||||
case Language.Lithuanian => "simple"
|
case Language.Lithuanian => "simple"
|
||||||
case Language.Polish => "simple"
|
case Language.Polish => "simple"
|
||||||
case Language.Estonian => "simple"
|
case Language.Estonian => "simple"
|
||||||
|
case Language.Ukrainian => "simple"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -161,7 +161,13 @@ object SolrSetup {
|
|||||||
28,
|
28,
|
||||||
"Add Estonian",
|
"Add Estonian",
|
||||||
addContentField(Language.Estonian)
|
addContentField(Language.Estonian)
|
||||||
)
|
),
|
||||||
|
SolrMigration[F](
|
||||||
|
29,
|
||||||
|
"Add Ukrainian",
|
||||||
|
addContentField(Language.Ukrainian)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(30, "Re-Index after adding Estonian and Ukrainian")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
|
@ -35,6 +35,7 @@ type Language
|
|||||||
| Lithuanian
|
| Lithuanian
|
||||||
| Polish
|
| Polish
|
||||||
| Estonian
|
| Estonian
|
||||||
|
| Ukrainian
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -102,6 +103,9 @@ fromString str =
|
|||||||
else if str == "est" || str == "et" || str == "estonian" then
|
else if str == "est" || str == "et" || str == "estonian" then
|
||||||
Just Estonian
|
Just Estonian
|
||||||
|
|
||||||
|
else if str == "ukr" || str == "uk" || str == "ukrainian" then
|
||||||
|
Just Ukrainian
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -172,6 +176,9 @@ toIso3 lang =
|
|||||||
Estonian ->
|
Estonian ->
|
||||||
"est"
|
"est"
|
||||||
|
|
||||||
|
Ukrainian ->
|
||||||
|
"ukr"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -196,4 +203,5 @@ all =
|
|||||||
, Lithuanian
|
, Lithuanian
|
||||||
, Polish
|
, Polish
|
||||||
, Estonian
|
, Estonian
|
||||||
|
, Ukrainian
|
||||||
]
|
]
|
||||||
|
@ -80,6 +80,9 @@ gb lang =
|
|||||||
Estonian ->
|
Estonian ->
|
||||||
"Estonian"
|
"Estonian"
|
||||||
|
|
||||||
|
Ukrainian ->
|
||||||
|
"Ukrainian"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -147,6 +150,9 @@ de lang =
|
|||||||
Estonian ->
|
Estonian ->
|
||||||
"Estnisch"
|
"Estnisch"
|
||||||
|
|
||||||
|
Ukrainian ->
|
||||||
|
"Ukrainisch"
|
||||||
|
|
||||||
|
|
||||||
fr : Language -> String
|
fr : Language -> String
|
||||||
fr lang =
|
fr lang =
|
||||||
@ -213,3 +219,6 @@ fr lang =
|
|||||||
|
|
||||||
Estonian ->
|
Estonian ->
|
||||||
"Estonien"
|
"Estonien"
|
||||||
|
|
||||||
|
Ukrainian ->
|
||||||
|
"Ukrainien"
|
||||||
|
@ -21,7 +21,11 @@ relevant things to do. These are:
|
|||||||
the `all` list (then fix compile errors)
|
the `all` list (then fix compile errors)
|
||||||
- define a list of month names to support date recognition and update
|
- define a list of month names to support date recognition and update
|
||||||
`DateFind.scala` to recognize date patterns for that language. Add
|
`DateFind.scala` to recognize date patterns for that language. Add
|
||||||
some tests to `DateFindTest`.
|
some tests to `DateFindTest`. While writing test-cases, you can check
|
||||||
|
them via `sbt`'s command prompt as follows:
|
||||||
|
```
|
||||||
|
testOnly docspell.analysis.date.DateFindTest
|
||||||
|
```
|
||||||
- add it to joex' dockerfile to be available for tesseract
|
- add it to joex' dockerfile to be available for tesseract
|
||||||
- update the solr migration/field definitions in `SolrSetup`. Create a
|
- update the solr migration/field definitions in `SolrSetup`. Create a
|
||||||
new solr migration that adds the content field for the new
|
new solr migration that adds the content field for the new
|
||||||
|
Reference in New Issue
Block a user