Preprocess Japanese texts to find dates

Not very efficient, but should work to find the position of dates in
Japanese text.
This commit is contained in:
eikek 2021-07-29 01:35:15 +02:00
parent e8348e2809
commit 4af8dd0950
2 changed files with 18 additions and 6 deletions

View File

@ -21,9 +21,7 @@ import docspell.common._
object DateFind { object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
TextSplitter splitWords(text, lang)
.splitToken(text, " \t.,\n\r/年月日".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.sliding(3) .sliding(3)
.filter(_.size == 3) .filter(_.size == 3)
.flatMap(q => .flatMap(q =>
@ -44,6 +42,20 @@ object DateFind {
) )
) )
// Characters that are kept when Japanese text is pre-processed in splitWords:
// the date markers 年, 月 and 日 plus every character occurring in the values
// returned by MonthName.getAll(Language.Japanese).
// NOTE(review): presumably those month names contain the digits 0-9; if they
// did not, numbers would be blanked out as well — confirm against MonthName.
private[this] val jpnChars =
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
/** Tokenizes `text` into the words that are scanned for dates.
  *
  * For Japanese input every character that is not contained in `jpnChars` is
  * first replaced by a single space. The replacement is one-for-one, so the
  * prepared string has the same length as `text` (presumably so that word
  * positions still refer to the original text — confirm against
  * `TextSplitter`). Text in any other language is passed on unchanged.
  *
  * The result of `TextSplitter.splitToken` is then filtered: for Latvian, the
  * word "gada" is dropped.
  *
  * @param text the text to tokenize
  * @param lang the language of `text`
  * @return the stream of words produced by the splitter
  */
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
  val separators: Set[Char] = " \t.,\n\r/年月日".toSet
  val isJapanese = lang == Language.Japanese

  // Blank out everything that cannot be part of a Japanese date.
  val prepared =
    if (isJapanese) text.map(ch => if (jpnChars(ch)) ch else ' ')
    else text

  TextSplitter
    .splitToken(prepared, separators)
    .filter(word => !(lang == Language.Latvian && word.value == "gada"))
}
case class SimpleDate(year: Int, month: Int, day: Int) { case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate = def toLocalDate: LocalDate =
LocalDate.of(if (year < 100) 2000 + year else year, month, day) LocalDate.of(if (year < 100) 2000 + year else year, month, day)

View File

@ -151,7 +151,7 @@ class DateFindSpec extends FunSuite {
Vector( Vector(
NerDateLabel( NerDateLabel(
LocalDate.of(2021, 7, 21), LocalDate.of(2021, 7, 21),
NerLabel("2021.7.21", NerTag.Date, 22, 31) NerLabel("2021.7.21", NerTag.Date, 6, 15)
) )
) )
) )
@ -162,7 +162,7 @@ class DateFindSpec extends FunSuite {
Vector( Vector(
NerDateLabel( NerDateLabel(
LocalDate.of(2021, 7, 21), LocalDate.of(2021, 7, 21),
NerLabel("2021年7月21", NerTag.Date, 22, 31) NerLabel("2021年7月21", NerTag.Date, 6, 15)
) )
) )
) )
@ -173,7 +173,7 @@ class DateFindSpec extends FunSuite {
Vector( Vector(
NerDateLabel( NerDateLabel(
LocalDate.of(2021, 7, 21), LocalDate.of(2021, 7, 21),
NerLabel("2021年7月21", NerTag.Date, 22, 31) NerLabel("2021年7月21", NerTag.Date, 3, 12)
) )
) )
) )