From 119a4ffdc95677756f774d6a58c0dd3475845932 Mon Sep 17 00:00:00 2001 From: wallace11 <11wallace11@gmail.com> Date: Thu, 29 Jul 2021 01:08:48 +0300 Subject: [PATCH 1/4] Update Japanese tests with more sensible data --- .../src/test/scala/docspell/analysis/date/DateFindSpec.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index f7109b70..48a5ed1d 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -146,7 +146,7 @@ class DateFindSpec extends FunSuite { test("find japanese dates") { assertEquals( DateFind - .findDates("some text in japanese 2021.7.21 and more", Language.Japanese) + .findDates("今日の日付は2021.7.21です。", Language.Japanese) .toVector, Vector( NerDateLabel( @@ -157,7 +157,7 @@ class DateFindSpec extends FunSuite { ) assertEquals( DateFind - .findDates("some text in japanese 2021年7月21日 and more", Language.Japanese) + .findDates("今日の日付は2021年7月21日です。", Language.Japanese) .toVector, Vector( NerDateLabel( From 1095a7d56f7b870831f2a465259665b27c873090 Mon Sep 17 00:00:00 2001 From: wallace11 <11wallace11@gmail.com> Date: Thu, 29 Jul 2021 01:13:22 +0300 Subject: [PATCH 2/4] Add another Japanese test --- .../scala/docspell/analysis/date/DateFindSpec.scala | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 48a5ed1d..4ba026f1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -166,6 +166,17 @@ class DateFindSpec extends FunSuite { ) ) ) + assertEquals( + DateFind + .findDates("年月日2021年7月21日(日)", 
Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021年7月21", NerTag.Date, 22, 31) + ) + ) + ) } } From e8348e2809cd69542e47eafca16f6e7913ae88bd Mon Sep 17 00:00:00 2001 From: wallace <11wallace11@gmail.com> Date: Thu, 29 Jul 2021 02:08:48 +0300 Subject: [PATCH 3/4] Remove excessive spaces --- .../src/test/scala/docspell/analysis/date/DateFindSpec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 4ba026f1..07a09a4c 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -176,7 +176,7 @@ class DateFindSpec extends FunSuite { NerLabel("2021年7月21", NerTag.Date, 22, 31) ) ) - ) + ) } } From 4af8dd0950e949b9e142123dbf3915b048075fdd Mon Sep 17 00:00:00 2001 From: eikek Date: Thu, 29 Jul 2021 01:35:15 +0200 Subject: [PATCH 4/4] Preprocess Japanese texts to find dates Not very efficient, but should work to find the position of dates in Japanese text. 
--- .../docspell/analysis/date/DateFind.scala | 18 +++++++++++++++--- .../docspell/analysis/date/DateFindSpec.scala | 6 +++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index ebb7ad7a..4d90324e 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -21,9 +21,7 @@ import docspell.common._ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = - TextSplitter - .splitToken(text, " \t.,\n\r/年月日".toSet) - .filter(w => lang != Language.Latvian || w.value != "gada") + splitWords(text, lang) .sliding(3) .filter(_.size == 3) .flatMap(q => @@ -44,6 +42,20 @@ object DateFind { ) ) + private[this] val jpnChars = + ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet + + private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + val stext = + if (lang == Language.Japanese) { + text.map(c => if (jpnChars.contains(c)) c else ' ') + } else text + + TextSplitter + .splitToken(stext, " \t.,\n\r/年月日".toSet) + .filter(w => lang != Language.Latvian || w.value != "gada") + } + case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = LocalDate.of(if (year < 100) 2000 + year else year, month, day) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 07a09a4c..a41eb6d3 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -151,7 +151,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021.7.21", NerTag.Date, 22, 31) + 
NerLabel("2021.7.21", NerTag.Date, 6, 15) ) ) ) @@ -162,7 +162,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021年7月21", NerTag.Date, 22, 31) + NerLabel("2021年7月21", NerTag.Date, 6, 15) ) ) ) @@ -173,7 +173,7 @@ class DateFindSpec extends FunSuite { Vector( NerDateLabel( LocalDate.of(2021, 7, 21), - NerLabel("2021年7月21", NerTag.Date, 22, 31) + NerLabel("2021年7月21", NerTag.Date, 3, 12) ) ) )