mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 10:59:33 +00:00
Merge pull request #962 from wallace11/japanese-test-improvements
Japanese test improvements
This commit is contained in:
commit
16ade6934a
@ -21,9 +21,7 @@ import docspell.common._
|
|||||||
object DateFind {
|
object DateFind {
|
||||||
|
|
||||||
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
||||||
TextSplitter
|
splitWords(text, lang)
|
||||||
.splitToken(text, " \t.,\n\r/年月日".toSet)
|
|
||||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
|
||||||
.sliding(3)
|
.sliding(3)
|
||||||
.filter(_.size == 3)
|
.filter(_.size == 3)
|
||||||
.flatMap(q =>
|
.flatMap(q =>
|
||||||
@ -44,6 +42,20 @@ object DateFind {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private[this] val jpnChars =
|
||||||
|
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||||
|
|
||||||
|
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||||
|
val stext =
|
||||||
|
if (lang == Language.Japanese) {
|
||||||
|
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||||
|
} else text
|
||||||
|
|
||||||
|
TextSplitter
|
||||||
|
.splitToken(stext, " \t.,\n\r/年月日".toSet)
|
||||||
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
|
}
|
||||||
|
|
||||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||||
def toLocalDate: LocalDate =
|
def toLocalDate: LocalDate =
|
||||||
LocalDate.of(if (year < 100) 2000 + year else year, month, day)
|
LocalDate.of(if (year < 100) 2000 + year else year, month, day)
|
||||||
|
@ -146,23 +146,34 @@ class DateFindSpec extends FunSuite {
|
|||||||
test("find japanese dates") {
|
test("find japanese dates") {
|
||||||
assertEquals(
|
assertEquals(
|
||||||
DateFind
|
DateFind
|
||||||
.findDates("some text in japanese 2021.7.21 and more", Language.Japanese)
|
.findDates("今日の日付は2021.7.21です。", Language.Japanese)
|
||||||
.toVector,
|
.toVector,
|
||||||
Vector(
|
Vector(
|
||||||
NerDateLabel(
|
NerDateLabel(
|
||||||
LocalDate.of(2021, 7, 21),
|
LocalDate.of(2021, 7, 21),
|
||||||
NerLabel("2021.7.21", NerTag.Date, 22, 31)
|
NerLabel("2021.7.21", NerTag.Date, 6, 15)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
assertEquals(
|
assertEquals(
|
||||||
DateFind
|
DateFind
|
||||||
.findDates("some text in japanese 2021年7月21日 and more", Language.Japanese)
|
.findDates("今日の日付は2021年7月21日です。", Language.Japanese)
|
||||||
.toVector,
|
.toVector,
|
||||||
Vector(
|
Vector(
|
||||||
NerDateLabel(
|
NerDateLabel(
|
||||||
LocalDate.of(2021, 7, 21),
|
LocalDate.of(2021, 7, 21),
|
||||||
NerLabel("2021年7月21", NerTag.Date, 22, 31)
|
NerLabel("2021年7月21", NerTag.Date, 6, 15)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("年月日2021年7月21日(日)", Language.Japanese)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2021, 7, 21),
|
||||||
|
NerLabel("2021年7月21", NerTag.Date, 3, 12)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user