mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-02-15 20:33:26 +00:00
Preprocess Japanese texts to find dates
Not very efficient, but should work to find the position of dates in Japanese text.
This commit is contained in:
parent
e8348e2809
commit
4af8dd0950
@ -21,9 +21,7 @@ import docspell.common._
|
||||
object DateFind {
|
||||
|
||||
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
|
||||
TextSplitter
|
||||
.splitToken(text, " \t.,\n\r/年月日".toSet)
|
||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||
splitWords(text, lang)
|
||||
.sliding(3)
|
||||
.filter(_.size == 3)
|
||||
.flatMap(q =>
|
||||
@ -44,6 +42,20 @@ object DateFind {
|
||||
)
|
||||
)
|
||||
|
||||
/** Characters that are kept when pre-processing Japanese text: the date
  * markers 年, 月 and 日 plus every character that occurs in any of the
  * Japanese month names reported by `MonthName.getAll`.
  */
private[this] val jpnChars = {
  val dateMarkers = "年月日"
  // Concatenate all spellings of all months into one string of characters.
  val monthNameChars =
    MonthName.getAll(Language.Japanese).map(_.mkString).mkString
  (dateMarkers + monthNameChars).toSet
}
|
||||
|
||||
/** Splits `text` into the words that are scanned for dates.
  *
  * For Japanese input, every character that is not in `jpnChars` is first
  * replaced by a single space. The replacement is one character for one
  * character, so the length of the text is unchanged.
  *
  * The text is then tokenised on whitespace, `.`, `,`, `/` and the
  * characters 年, 月 and 日. For Latvian input the word "gada" is dropped.
  *
  * @param text the text to split
  * @param lang the language of `text`
  * @return the stream of words produced by `TextSplitter.splitToken`
  */
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
  val prepared =
    if (lang != Language.Japanese) text
    else text.map(ch => if (jpnChars.contains(ch)) ch else ' ')

  val separators = " \t.,\n\r/年月日".toSet
  val tokens = TextSplitter.splitToken(prepared, separators)

  // Keep everything except the Latvian filler word "gada".
  tokens.filter(word => !(lang == Language.Latvian && word.value == "gada"))
}
|
||||
|
||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||
def toLocalDate: LocalDate =
|
||||
LocalDate.of(if (year < 100) 2000 + year else year, month, day)
|
||||
|
@ -151,7 +151,7 @@ class DateFindSpec extends FunSuite {
|
||||
Vector(
|
||||
NerDateLabel(
|
||||
LocalDate.of(2021, 7, 21),
|
||||
NerLabel("2021.7.21", NerTag.Date, 22, 31)
|
||||
NerLabel("2021.7.21", NerTag.Date, 6, 15)
|
||||
)
|
||||
)
|
||||
)
|
||||
@ -162,7 +162,7 @@ class DateFindSpec extends FunSuite {
|
||||
Vector(
|
||||
NerDateLabel(
|
||||
LocalDate.of(2021, 7, 21),
|
||||
NerLabel("2021年7月21", NerTag.Date, 22, 31)
|
||||
NerLabel("2021年7月21", NerTag.Date, 6, 15)
|
||||
)
|
||||
)
|
||||
)
|
||||
@ -173,7 +173,7 @@ class DateFindSpec extends FunSuite {
|
||||
Vector(
|
||||
NerDateLabel(
|
||||
LocalDate.of(2021, 7, 21),
|
||||
NerLabel("2021年7月21", NerTag.Date, 22, 31)
|
||||
NerLabel("2021年7月21", NerTag.Date, 3, 12)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user