Merge pull request #962 from wallace11/japanese-test-improvements

Japanese test improvements
This commit is contained in:
eikek 2021-07-29 01:47:04 +02:00 committed by GitHub
commit 16ade6934a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 7 deletions

View File

@ -21,9 +21,7 @@ import docspell.common._
object DateFind { object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
TextSplitter splitWords(text, lang)
.splitToken(text, " \t.,\n\r/年月日".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.sliding(3) .sliding(3)
.filter(_.size == 3) .filter(_.size == 3)
.flatMap(q => .flatMap(q =>
@ -44,6 +42,20 @@ object DateFind {
) )
) )
/** Characters that are kept when pre-processing Japanese text: the date
  * markers 年, 月 and 日 plus every character occurring in the values
  * returned by `MonthName.getAll(Language.Japanese)`.
  *
  * NOTE(review): presumably the Japanese month names contain the ASCII
  * digits, otherwise numeric dates would be blanked out as well -- confirm
  * against `MonthName`.
  */
private[this] val jpnChars =
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
/** Splits `text` into a stream of words.
  *
  * For Japanese, every character that is not contained in `jpnChars` is
  * replaced by a space before splitting. For all other languages the text
  * is passed on unchanged. When the language is Latvian, words whose value
  * is "gada" are removed from the result.
  *
  * @param text the text to split
  * @param lang the language of `text`
  * @return the words produced by `TextSplitter.splitToken`
  */
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
val stext =
if (lang == Language.Japanese) {
// Replace character by character so that the string keeps its length;
// presumably this keeps word positions aligned with the original
// text -- confirm against TextSplitter.
text.map(c => if (jpnChars.contains(c)) c else ' ')
} else text
TextSplitter
// Separators: whitespace, '.', ',', '/' and the date markers 年, 月, 日.
.splitToken(stext, " \t.,\n\r/年月日".toSet)
// NOTE(review): "gada" is presumably the Latvian word for "year" that
// appears inside written dates -- confirm.
.filter(w => lang != Language.Latvian || w.value != "gada")
}
case class SimpleDate(year: Int, month: Int, day: Int) { case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate = def toLocalDate: LocalDate =
LocalDate.of(if (year < 100) 2000 + year else year, month, day) LocalDate.of(if (year < 100) 2000 + year else year, month, day)

View File

@ -146,23 +146,34 @@ class DateFindSpec extends FunSuite {
test("find japanese dates") { test("find japanese dates") {
assertEquals( assertEquals(
DateFind DateFind
.findDates("some text in japanese 2021.7.21 and more", Language.Japanese) .findDates("今日の日付は2021.7.21です。", Language.Japanese)
.toVector, .toVector,
Vector( Vector(
NerDateLabel( NerDateLabel(
LocalDate.of(2021, 7, 21), LocalDate.of(2021, 7, 21),
NerLabel("2021.7.21", NerTag.Date, 22, 31) NerLabel("2021.7.21", NerTag.Date, 6, 15)
) )
) )
) )
assertEquals( assertEquals(
DateFind DateFind
.findDates("some text in japanese 2021年7月21日 and more", Language.Japanese) .findDates("今日の日付は2021年7月21日です。", Language.Japanese)
.toVector, .toVector,
Vector( Vector(
NerDateLabel( NerDateLabel(
LocalDate.of(2021, 7, 21), LocalDate.of(2021, 7, 21),
NerLabel("2021年7月21", NerTag.Date, 22, 31) NerLabel("2021年7月21", NerTag.Date, 6, 15)
)
)
)
assertEquals(
DateFind
.findDates("年月日2021年7月21日", Language.Japanese)
.toVector,
Vector(
NerDateLabel(
LocalDate.of(2021, 7, 21),
NerLabel("2021年7月21", NerTag.Date, 3, 12)
) )
) )
) )