Add Latvian language

This commit is contained in:
Eike Kettner 2021-03-09 00:19:33 +01:00
parent e4ef299582
commit 9991ad5fcc
6 changed files with 74 additions and 1 deletions

View File

@ -26,6 +26,7 @@ RUN apk add --no-cache openjdk11-jre \
tesseract-ocr-data-swe \ tesseract-ocr-data-swe \
tesseract-ocr-data-rus \ tesseract-ocr-data-rus \
tesseract-ocr-data-ron \ tesseract-ocr-data-ron \
tesseract-ocr-data-lav \
unpaper \ unpaper \
wkhtmltopdf \ wkhtmltopdf \
libreoffice \ libreoffice \

View File

@ -17,6 +17,7 @@ object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
TextSplitter TextSplitter
.splitToken(text, " \t.,\n\r/".toSet) .splitToken(text, " \t.,\n\r/".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.sliding(3) .sliding(3)
.filter(_.length == 3) .filter(_.length == 3)
.flatMap(q => .flatMap(q =>
@ -55,6 +56,10 @@ object DateFind {
case ((m, d), y) => case ((m, d), y) =>
List(SimpleDate(y, m, d)) List(SimpleDate(y, m, d))
} }
// Latvian long form: the parts are read in the order year, day, month name
// (e.g. "2020. gada 30. jūlijs" once the filler word "gada" has been dropped).
def lavLong =
  (readYear >> readDay >> readMonth(Language.Latvian)).map { case ((year, day), month) =>
    SimpleDate(year, month, day) :: Nil
  }
// ymd , ydm, dmy , dym, myd, mdy // ymd , ydm, dmy , dym, myd, mdy
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
@ -77,6 +82,7 @@ object DateFind {
case Language.Russian => dmy.or(ymd).or(mdy) case Language.Russian => dmy.or(ymd).or(mdy)
case Language.Swedish => ymd.or(dmy).or(mdy) case Language.Swedish => ymd.or(dmy).or(mdy)
case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy)
case Language.Latvian => dmy.or(lavLong).or(ymd)
} }
p.read(parts) match { p.read(parts) match {
case Result.Success(sds, _) => case Result.Success(sds, _) =>

View File

@ -42,6 +42,8 @@ object MonthName {
finnish finnish
case Language.Russian => case Language.Russian =>
russian russian
case Language.Latvian =>
latvian
} }
private val numbers = List( private val numbers = List(
@ -267,4 +269,19 @@ object MonthName {
List("nov", "november"), List("nov", "november"),
List("dec", "december") List("dec", "december")
) )
// Latvian month names, January first. Each inner list holds the accepted
// spellings of one month: the full name followed by its abbreviations.
//
// Abbreviations are listed both with and without the trailing dot. DateFind
// tokenizes its input on '.' (among other separators), so a token presumably
// never carries the dot when it reaches the month lookup and the dotted form
// alone would not match -- TODO confirm against TextSplitter. The dotted
// spellings are kept so that any existing exact-match behavior is preserved.
private val latvian = List(
  List("janvāris", "janv.", "janv"),
  List("februāris", "febr.", "febr"),
  List("marts"),
  List("aprīlis", "apr.", "apr"),
  List("maijs"),
  List("jūnijs", "jūn.", "jūn"),
  List("jūlijs", "jūl.", "jūl"),
  List("augusts", "aug.", "aug"),
  List("septembris", "sept.", "sept"),
  List("oktobris", "okt.", "okt"),
  List("novembris", "nov.", "nov"),
  List("decembris", "dec.", "dec")
)
} }

View File

@ -103,4 +103,36 @@ object DateFindSpec extends SimpleTestSuite {
) )
} }
test("find latvian dates") {
  // Runs the date finder with the Latvian rules and expects exactly one
  // label covering `matched` at the character range [start, end).
  def expectSingle(
      input: String,
      date: LocalDate,
      matched: String,
      start: Int,
      end: Int
  ): Unit =
    assertEquals(
      DateFind.findDates(input, Language.Latvian).toVector,
      Vector(NerDateLabel(date, NerLabel(matched, NerTag.Date, start, end)))
    )

  // Long form with the filler word "gada" between year and day.
  expectSingle(
    "on 2020. gada 30. jūlijs there",
    LocalDate.of(2020, 7, 30),
    "2020. gada 30. jūlijs",
    3,
    24
  )

  // Purely numeric day.month.year form.
  expectSingle(
    "Lai gan 30.07.2020",
    LocalDate.of(2020, 7, 30),
    "30.07.2020",
    8,
    18
  )

  // Long form without spaces after the dots.
  expectSingle(
    "Es nevaru šodien 2020.gada 30.oktobris iet uz",
    LocalDate.of(2020, 10, 30),
    "2020.gada 30.oktobris",
    17,
    38
  )
}
} }

View File

@ -97,6 +97,11 @@ object Language {
val iso3 = "nld" val iso3 = "nld"
} }
/** The Latvian language, identified by the two-letter code "lv" and the
  * three-letter code "lav".
  */
case object Latvian extends Language {
val iso2 = "lv"
val iso3 = "lav"
}
val all: List[Language] = val all: List[Language] =
List( List(
German, German,
@ -112,7 +117,8 @@ object Language {
Norwegian, Norwegian,
Swedish, Swedish,
Russian, Russian,
Romanian Romanian,
Latvian
) )
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {

View File

@ -22,6 +22,7 @@ type Language
| Russian | Russian
| Romanian | Romanian
| Dutch | Dutch
| Latvian
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -68,6 +69,9 @@ fromString str =
else if str == "ron" || str == "ro" || str == "romanian" then else if str == "ron" || str == "ro" || str == "romanian" then
Just Romanian Just Romanian
else if str == "lav" || str == "lv" || str == "latvian" then
Just Latvian
else else
Nothing Nothing
@ -117,6 +121,9 @@ toIso3 lang =
Dutch -> Dutch ->
"nld" "nld"
Latvian ->
"lav"
toName : Language -> String toName : Language -> String
toName lang = toName lang =
@ -163,6 +170,9 @@ toName lang =
Dutch -> Dutch ->
"Dutch" "Dutch"
Latvian ->
"Latvian"
all : List Language all : List Language
all = all =
@ -180,4 +190,5 @@ all =
, Swedish , Swedish
, Russian , Russian
, Romanian , Romanian
, Latvian
] ]