mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 02:49:32 +00:00
Add Japanese document language
This commit is contained in:
parent
0104cdc825
commit
f994d4b248
@ -29,6 +29,7 @@ RUN JDKPKG="openjdk11"; \
|
|||||||
tesseract-ocr-data-rus \
|
tesseract-ocr-data-rus \
|
||||||
tesseract-ocr-data-ron \
|
tesseract-ocr-data-ron \
|
||||||
tesseract-ocr-data-lav \
|
tesseract-ocr-data-lav \
|
||||||
|
tesseract-ocr-data-jpn \
|
||||||
unpaper \
|
unpaper \
|
||||||
wkhtmltopdf \
|
wkhtmltopdf \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -22,7 +22,7 @@ object DateFind {
|
|||||||
|
|
||||||
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
||||||
TextSplitter
|
TextSplitter
|
||||||
.splitToken(text, " \t.,\n\r/".toSet)
|
.splitToken(text, " \t.,\n\r/年月日".toSet)
|
||||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
.sliding(3)
|
.sliding(3)
|
||||||
.filter(_.size == 3)
|
.filter(_.size == 3)
|
||||||
@ -89,6 +89,7 @@ object DateFind {
|
|||||||
case Language.Swedish => ymd.or(dmy).or(mdy)
|
case Language.Swedish => ymd.or(dmy).or(mdy)
|
||||||
case Language.Dutch => dmy.or(ymd).or(mdy)
|
case Language.Dutch => dmy.or(ymd).or(mdy)
|
||||||
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
||||||
|
case Language.Japanese => ymd
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
|
@ -50,6 +50,8 @@ object MonthName {
|
|||||||
russian
|
russian
|
||||||
case Language.Latvian =>
|
case Language.Latvian =>
|
||||||
latvian
|
latvian
|
||||||
|
case Language.Japanese =>
|
||||||
|
japanese
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -290,4 +292,19 @@ object MonthName {
|
|||||||
List("novembris", "nov."),
|
List("novembris", "nov."),
|
||||||
List("decembris", "dec.")
|
List("decembris", "dec.")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private val japanese = List(
|
||||||
|
List("1", "一"),
|
||||||
|
List("2", "二"),
|
||||||
|
List("3", "三"),
|
||||||
|
List("4", "四"),
|
||||||
|
List("5", "五"),
|
||||||
|
List("6", "六"),
|
||||||
|
List("7", "七"),
|
||||||
|
List("8", "八"),
|
||||||
|
List("9", "九"),
|
||||||
|
List("10", "十"),
|
||||||
|
List("11", "十一"),
|
||||||
|
List("12", "十二")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -143,4 +143,29 @@ class DateFindSpec extends FunSuite {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find japanese dates") {
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("some text in japanese 2021.7.21 and more", Language.Japanese)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2021, 7, 21),
|
||||||
|
NerLabel("2021.7.21", NerTag.Date, 22, 31)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("some text in japanese 2021年7月21日 and more", Language.Japanese)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2021, 7, 21),
|
||||||
|
NerLabel("2021年7月21", NerTag.Date, 22, 31)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -108,6 +108,11 @@ object Language {
|
|||||||
val iso3 = "lav"
|
val iso3 = "lav"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case object Japanese extends Language {
|
||||||
|
val iso2 = "ja"
|
||||||
|
val iso3 = "jpn"
|
||||||
|
}
|
||||||
|
|
||||||
val all: List[Language] =
|
val all: List[Language] =
|
||||||
List(
|
List(
|
||||||
German,
|
German,
|
||||||
@ -124,7 +129,8 @@ object Language {
|
|||||||
Swedish,
|
Swedish,
|
||||||
Russian,
|
Russian,
|
||||||
Romanian,
|
Romanian,
|
||||||
Latvian
|
Latvian,
|
||||||
|
Japanese
|
||||||
)
|
)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
|
@ -29,6 +29,7 @@ type Language
|
|||||||
| Romanian
|
| Romanian
|
||||||
| Dutch
|
| Dutch
|
||||||
| Latvian
|
| Latvian
|
||||||
|
| Japanese
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -78,6 +79,9 @@ fromString str =
|
|||||||
else if str == "lav" || str == "lv" || str == "latvian" then
|
else if str == "lav" || str == "lv" || str == "latvian" then
|
||||||
Just Latvian
|
Just Latvian
|
||||||
|
|
||||||
|
else if str == "jpn" || str == "ja" || str == "japanese" then
|
||||||
|
Just Japanese
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -130,6 +134,9 @@ toIso3 lang =
|
|||||||
Latvian ->
|
Latvian ->
|
||||||
"lav"
|
"lav"
|
||||||
|
|
||||||
|
Japanese ->
|
||||||
|
"jpn"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -148,4 +155,5 @@ all =
|
|||||||
, Russian
|
, Russian
|
||||||
, Romanian
|
, Romanian
|
||||||
, Latvian
|
, Latvian
|
||||||
|
, Japanese
|
||||||
]
|
]
|
||||||
|
@ -61,6 +61,9 @@ gb lang =
|
|||||||
Latvian ->
|
Latvian ->
|
||||||
"Latvian"
|
"Latvian"
|
||||||
|
|
||||||
|
Japanese ->
|
||||||
|
"Japanese"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -109,3 +112,6 @@ de lang =
|
|||||||
|
|
||||||
Latvian ->
|
Latvian ->
|
||||||
"Lettisch"
|
"Lettisch"
|
||||||
|
|
||||||
|
Japanese ->
|
||||||
|
"Japanisch"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user