diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index ebb7ad7a..4d90324e 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -21,9 +21,7 @@ import docspell.common._ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = - TextSplitter - .splitToken(text, " \t.,\n\r/年月日".toSet) - .filter(w => lang != Language.Latvian || w.value != "gada") + splitWords(text, lang) .sliding(3) .filter(_.size == 3) .flatMap(q => @@ -44,6 +42,20 @@ object DateFind { ) ) + private[this] val jpnChars = + ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet + + private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + val stext = + if (lang == Language.Japanese) { + text.map(c => if (jpnChars.contains(c)) c else ' ') + } else text + + TextSplitter + .splitToken(stext, " \t.,\n\r/年月日".toSet) + .filter(w => lang != Language.Latvian || w.value != "gada") + } + case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = LocalDate.of(if (year < 100) 2000 + year else year, month, day) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 07a09a4c..a41eb6d3 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -151,7 +151,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021.7.21", NerTag.Date, 22, 31) + NerLabel("2021.7.21", NerTag.Date, 6, 15) ) ) ) @@ -162,7 +162,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021年7月21", NerTag.Date, 22, 31) + NerLabel("2021年7月21", NerTag.Date, 6, 15) ) ) ) @@ -173,7 +173,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021年7月21", NerTag.Date, 22, 31) + NerLabel("2021年7月21", NerTag.Date, 3, 12) ) ) )