Add Japanese document language

This commit is contained in:
eikek 2021-07-28 20:05:38 +02:00
parent 0104cdc825
commit f994d4b248
7 changed files with 66 additions and 2 deletions

View File

@ -29,6 +29,7 @@ RUN JDKPKG="openjdk11"; \
tesseract-ocr-data-rus \ tesseract-ocr-data-rus \
tesseract-ocr-data-ron \ tesseract-ocr-data-ron \
tesseract-ocr-data-lav \ tesseract-ocr-data-lav \
tesseract-ocr-data-jpn \
unpaper \ unpaper \
wkhtmltopdf \ wkhtmltopdf \
libreoffice \ libreoffice \

View File

@ -22,7 +22,7 @@ object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
TextSplitter TextSplitter
.splitToken(text, " \t.,\n\r/".toSet) .splitToken(text, " \t.,\n\r/年月日".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Latvian || w.value != "gada")
.sliding(3) .sliding(3)
.filter(_.size == 3) .filter(_.size == 3)
@ -89,6 +89,7 @@ object DateFind {
case Language.Swedish => ymd.or(dmy).or(mdy) case Language.Swedish => ymd.or(dmy).or(mdy)
case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy)
case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd
} }
p.read(parts) match { p.read(parts) match {
case Result.Success(sds, _) => case Result.Success(sds, _) =>

View File

@ -50,6 +50,8 @@ object MonthName {
russian russian
case Language.Latvian => case Language.Latvian =>
latvian latvian
case Language.Japanese =>
japanese
} }
private val numbers = List( private val numbers = List(
@ -290,4 +292,19 @@ object MonthName {
List("novembris", "nov."), List("novembris", "nov."),
List("decembris", "dec.") List("decembris", "dec.")
) )
// Accepted spellings for each month in Japanese, in calendar order: the
// Arabic numeral first, then the matching kanji numeral.
// NOTE(review): presumably the trailing 月 marker is removed by the tokenizer
// before these entries are consulted — confirm against the caller.
private val japanese = {
  val kanjiNumerals =
    List("一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "十一", "十二")

  // The position in the list (1-based) doubles as the Arabic spelling.
  kanjiNumerals.zipWithIndex.map { case (kanji, idx) =>
    List((idx + 1).toString, kanji)
  }
}
} }

View File

@ -143,4 +143,29 @@ class DateFindSpec extends FunSuite {
) )
} }
// Japanese dates are expected to be found both in dotted notation and when
// written with the 年/月/日 markers. In the latter case the reported label
// stops before the trailing 日, so both labels span offsets 22 to 31.
test("find japanese dates") {
  val expectedDay = LocalDate.of(2021, 7, 21)

  def datesIn(text: String): Vector[NerDateLabel] =
    DateFind.findDates(text, Language.Japanese).toVector

  assertEquals(
    datesIn("some text in japanese 2021.7.21 and more"),
    Vector(
      NerDateLabel(expectedDay, NerLabel("2021.7.21", NerTag.Date, 22, 31))
    )
  )

  assertEquals(
    datesIn("some text in japanese 2021年7月21日 and more"),
    Vector(
      NerDateLabel(expectedDay, NerLabel("2021年7月21", NerTag.Date, 22, 31))
    )
  )
}
} }

View File

@ -108,6 +108,11 @@ object Language {
val iso3 = "lav" val iso3 = "lav"
} }
/** The Japanese document language, carrying its two-letter code ("ja") and
  * its three-letter code ("jpn").
  */
case object Japanese extends Language {
val iso2 = "ja"
val iso3 = "jpn"
}
val all: List[Language] = val all: List[Language] =
List( List(
German, German,
@ -124,7 +129,8 @@ object Language {
Swedish, Swedish,
Russian, Russian,
Romanian, Romanian,
Latvian Latvian,
Japanese
) )
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {

View File

@ -29,6 +29,7 @@ type Language
| Romanian | Romanian
| Dutch | Dutch
| Latvian | Latvian
| Japanese
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -78,6 +79,9 @@ fromString str =
else if str == "lav" || str == "lv" || str == "latvian" then else if str == "lav" || str == "lv" || str == "latvian" then
Just Latvian Just Latvian
else if str == "jpn" || str == "ja" || str == "japanese" then
Just Japanese
else else
Nothing Nothing
@ -130,6 +134,9 @@ toIso3 lang =
Latvian -> Latvian ->
"lav" "lav"
Japanese ->
"jpn"
all : List Language all : List Language
all = all =
@ -148,4 +155,5 @@ all =
, Russian , Russian
, Romanian , Romanian
, Latvian , Latvian
, Japanese
] ]

View File

@ -61,6 +61,9 @@ gb lang =
Latvian -> Latvian ->
"Latvian" "Latvian"
Japanese ->
"Japanese"
de : Language -> String de : Language -> String
de lang = de lang =
@ -109,3 +112,6 @@ de lang =
Latvian -> Latvian ->
"Lettisch" "Lettisch"
Japanese ->
"Japanisch"