diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index ebb7ad7a..4d90324e 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -21,9 +21,7 @@ import docspell.common._ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = - TextSplitter - .splitToken(text, " \t.,\n\r/年月日".toSet) - .filter(w => lang != Language.Latvian || w.value != "gada") + splitWords(text, lang) .sliding(3) .filter(_.size == 3) .flatMap(q => @@ -44,6 +42,20 @@ object DateFind { ) ) + private[this] val jpnChars = + ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet + + private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + val stext = + if (lang == Language.Japanese) { + text.map(c => if (jpnChars.contains(c)) c else ' ') + } else text + + TextSplitter + .splitToken(stext, " \t.,\n\r/年月日".toSet) + .filter(w => lang != Language.Latvian || w.value != "gada") + } + case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = LocalDate.of(if (year < 100) 2000 + year else year, month, day) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index f7109b70..a41eb6d3 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -146,23 +146,34 @@ class DateFindSpec extends FunSuite { test("find japanese dates") { assertEquals( DateFind - .findDates("some text in japanese 2021.7.21 and more", Language.Japanese) + .findDates("今日の日付は2021.7.21です。", Language.Japanese) .toVector, Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021.7.21", NerTag.Date, 22, 31) + NerLabel("2021.7.21", NerTag.Date, 6, 15) ) ) ) assertEquals( DateFind - .findDates("some text in japanese 2021年7月21日 and more", Language.Japanese) + .findDates("今日の日付は2021年7月21日です。", Language.Japanese) .toVector, Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021年7月21", NerTag.Date, 22, 31) + NerLabel("2021年7月21", NerTag.Date, 6, 15) + ) + ) + ) + assertEquals( + DateFind + .findDates("年月日2021年7月21日(日)", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021年7月21", NerTag.Date, 3, 12) ) ) )