mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-14 08:20:21 +00:00
Add Latvian language
This commit is contained in:
parent
e4ef299582
commit
9991ad5fcc
@ -26,6 +26,7 @@ RUN apk add --no-cache openjdk11-jre \
|
|||||||
tesseract-ocr-data-swe \
|
tesseract-ocr-data-swe \
|
||||||
tesseract-ocr-data-rus \
|
tesseract-ocr-data-rus \
|
||||||
tesseract-ocr-data-ron \
|
tesseract-ocr-data-ron \
|
||||||
|
tesseract-ocr-data-lav \
|
||||||
unpaper \
|
unpaper \
|
||||||
wkhtmltopdf \
|
wkhtmltopdf \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -17,6 +17,7 @@ object DateFind {
|
|||||||
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
||||||
TextSplitter
|
TextSplitter
|
||||||
.splitToken(text, " \t.,\n\r/".toSet)
|
.splitToken(text, " \t.,\n\r/".toSet)
|
||||||
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
.sliding(3)
|
.sliding(3)
|
||||||
.filter(_.length == 3)
|
.filter(_.length == 3)
|
||||||
.flatMap(q =>
|
.flatMap(q =>
|
||||||
@ -55,6 +56,10 @@ object DateFind {
|
|||||||
case ((m, d), y) =>
|
case ((m, d), y) =>
|
||||||
List(SimpleDate(y, m, d))
|
List(SimpleDate(y, m, d))
|
||||||
}
|
}
|
||||||
|
// Reader for the year-first long form used for Latvian dates: it reads a year,
// then a day, then a Latvian month name (via readMonth(Language.Latvian)) and
// yields a single-element list holding SimpleDate(y, m, d).
// The word "gada" that sits between year and day in such text is filtered out
// by findDates for Language.Latvian before the tokens reach this reader.
// NOTE(review): `>>` appears to sequence readers and pair up their results
// (see the `((y, d), m)` pattern) - confirm against the Reader definition.
def lavLong =
|
||||||
|
(readYear >> readDay >> readMonth(Language.Latvian)).map { case ((y, d), m) =>
|
||||||
|
List(SimpleDate(y, m, d))
|
||||||
|
}
|
||||||
|
|
||||||
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
|
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
|
||||||
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
|
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
|
||||||
@ -77,6 +82,7 @@ object DateFind {
|
|||||||
case Language.Russian => dmy.or(ymd).or(mdy)
|
case Language.Russian => dmy.or(ymd).or(mdy)
|
||||||
case Language.Swedish => ymd.or(dmy).or(mdy)
|
case Language.Swedish => ymd.or(dmy).or(mdy)
|
||||||
case Language.Dutch => dmy.or(ymd).or(mdy)
|
case Language.Dutch => dmy.or(ymd).or(mdy)
|
||||||
|
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
|
@ -42,6 +42,8 @@ object MonthName {
|
|||||||
finnish
|
finnish
|
||||||
case Language.Russian =>
|
case Language.Russian =>
|
||||||
russian
|
russian
|
||||||
|
case Language.Latvian =>
|
||||||
|
latvian
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -267,4 +269,19 @@ object MonthName {
|
|||||||
List("nov", "november"),
|
List("nov", "november"),
|
||||||
List("dec", "december")
|
List("dec", "december")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Latvian month names in calendar order (January first, December last).
// Each inner list holds the accepted spellings for one month: the full name
// and, where one is given, a dotted abbreviation ("marts" and "maijs" have none).
// NOTE(review): DateFind.findDates splits its input on '.', so tokens reaching
// the month lookup presumably never carry a trailing dot - verify that the
// dotted abbreviations below can actually match.
private val latvian = List(
|
||||||
|
List("janvāris", "janv."),
|
||||||
|
List("februāris", "febr."),
|
||||||
|
List("marts"),
|
||||||
|
List("aprīlis", "apr."),
|
||||||
|
List("maijs"),
|
||||||
|
List("jūnijs", "jūn."),
|
||||||
|
List("jūlijs", "jūl."),
|
||||||
|
List("augusts", "aug."),
|
||||||
|
List("septembris", "sept."),
|
||||||
|
List("oktobris", "okt."),
|
||||||
|
List("novembris", "nov."),
|
||||||
|
List("decembris", "dec.")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -103,4 +103,36 @@ object DateFindSpec extends SimpleTestSuite {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies DateFind.findDates with Language.Latvian on three inputs:
//   1. year-first long form with spaces after the dots ("2020. gada 30. jūlijs"),
//   2. numeric day.month.year ("30.07.2020"),
//   3. year-first long form without spaces after the dots ("2020.gada 30.oktobris").
// Each case expects exactly one NerDateLabel: the parsed LocalDate plus a
// NerLabel holding the matched text, NerTag.Date, and the start (inclusive)
// and end (exclusive) character offsets of the match within the input string.
test("find latvian dates") {
|
||||||
|
assertEquals(
|
||||||
|
DateFind.findDates("on 2020. gada 30. jūlijs there", Language.Latvian).toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2020, 7, 30),
|
||||||
|
NerLabel("2020. gada 30. jūlijs", NerTag.Date, 3, 24)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind.findDates("Lai gan 30.07.2020", Language.Latvian).toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2020, 7, 30),
|
||||||
|
NerLabel("30.07.2020", NerTag.Date, 8, 18)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("Es nevaru šodien 2020.gada 30.oktobris iet uz", Language.Latvian)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2020, 10, 30),
|
||||||
|
NerLabel("2020.gada 30.oktobris", NerTag.Date, 17, 38)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -97,6 +97,11 @@ object Language {
|
|||||||
val iso3 = "nld"
|
val iso3 = "nld"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The Latvian language, identified by the two-letter code "lv" and the
// three-letter code "lav".
case object Latvian extends Language {
|
||||||
|
val iso2 = "lv"
|
||||||
|
val iso3 = "lav"
|
||||||
|
}
|
||||||
|
|
||||||
val all: List[Language] =
|
val all: List[Language] =
|
||||||
List(
|
List(
|
||||||
German,
|
German,
|
||||||
@ -112,7 +117,8 @@ object Language {
|
|||||||
Norwegian,
|
Norwegian,
|
||||||
Swedish,
|
Swedish,
|
||||||
Russian,
|
Russian,
|
||||||
Romanian
|
Romanian,
|
||||||
|
Latvian
|
||||||
)
|
)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
|
@ -22,6 +22,7 @@ type Language
|
|||||||
| Russian
|
| Russian
|
||||||
| Romanian
|
| Romanian
|
||||||
| Dutch
|
| Dutch
|
||||||
|
| Latvian
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -68,6 +69,9 @@ fromString str =
|
|||||||
else if str == "ron" || str == "ro" || str == "romanian" then
|
else if str == "ron" || str == "ro" || str == "romanian" then
|
||||||
Just Romanian
|
Just Romanian
|
||||||
|
|
||||||
|
else if str == "lav" || str == "lv" || str == "latvian" then
|
||||||
|
Just Latvian
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -117,6 +121,9 @@ toIso3 lang =
|
|||||||
Dutch ->
|
Dutch ->
|
||||||
"nld"
|
"nld"
|
||||||
|
|
||||||
|
Latvian ->
|
||||||
|
"lav"
|
||||||
|
|
||||||
|
|
||||||
toName : Language -> String
|
toName : Language -> String
|
||||||
toName lang =
|
toName lang =
|
||||||
@ -163,6 +170,9 @@ toName lang =
|
|||||||
Dutch ->
|
Dutch ->
|
||||||
"Dutch"
|
"Dutch"
|
||||||
|
|
||||||
|
Latvian ->
|
||||||
|
"Latvian"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -180,4 +190,5 @@ all =
|
|||||||
, Swedish
|
, Swedish
|
||||||
, Russian
|
, Russian
|
||||||
, Romanian
|
, Romanian
|
||||||
|
, Latvian
|
||||||
]
|
]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user