From fb05e997abc80db6e005f42a9041f4b8e369667a Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 10 Jan 2021 08:45:31 +0100 Subject: [PATCH 1/2] Provide multiple date suggestions for English Issue: #561 --- build.sbt | 4 +- .../docspell/analysis/date/DateFind.scala | 81 ++++++++++++------- .../docspell/analysis/date/DateFindSpec.scala | 26 ++++++ 3 files changed, 79 insertions(+), 32 deletions(-) diff --git a/build.sbt b/build.sbt index 5eb05301..91016ca0 100644 --- a/build.sbt +++ b/build.sbt @@ -40,9 +40,9 @@ val sharedSettings = Seq( packageTools(logger, dir, v) }, scalacOptions in (Compile, console) := - (scalacOptions.value.filter(o => !o.contains("Xlint")) ++ Seq("-Xlint:_,-unused")), + (scalacOptions.value.filter(o => !o.contains("-Xlint") && !o.contains("-W"))), scalacOptions in (Test, console) := - (scalacOptions.value.filter(o => !o.contains("Xlint")) ++ Seq("-Xlint:_,-unused")) + (scalacOptions.value.filter(o => !o.contains("-Xlint") && !o.contains("-W"))) ) ++ scalafixSettings val testSettings = Seq( diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 84c9dd55..1c1bb686 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -17,51 +17,53 @@ object DateFind { .splitToken(text, " \t.,\n\r/".toSet) .sliding(3) .filter(_.length == 3) - .map(q => - SimpleDate - .fromParts(q.toList, lang) - .map(sd => - NerDateLabel( - sd.toLocalDate, - NerLabel( - text.substring(q.head.begin, q(2).end), - NerTag.Date, - q.head.begin, - q(1).end + .flatMap(q => + Stream.emits( + SimpleDate + .fromParts(q.toList, lang) + .map(sd => + NerDateLabel( + sd.toLocalDate, + NerLabel( + text.substring(q.head.begin, q(2).end), + NerTag.Date, + q.head.begin, + q(1).end + ) ) ) - ) + ) ) - .collect({ case Some(d) => d }) - private case class SimpleDate(year: Int, month: Int, day: Int) { + case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = LocalDate.of(if (year < 100) 2000 + year else year, month, day) } - private object SimpleDate { + object SimpleDate { val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) => - SimpleDate(y, m, d) + List(SimpleDate(y, m, d)) } val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) => - SimpleDate(y, m, d) + List(SimpleDate(y, m, d)) } val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) => - SimpleDate(y, m, d) + List(SimpleDate(y, m, d)) } // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔ - def fromParts(parts: List[Word], lang: Language): Option[SimpleDate] = { + def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = { val p = lang match { - case Language.English => p2.or(p0).or(p1) - case Language.German => p1.or(p0).or(p2) - case Language.French => p1.or(p0).or(p2) + case Language.English => + p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1) + case Language.German => p1.or(p0).or(p2) + case Language.French => p1.or(p0).or(p2) } p.read(parts) match { - case Result.Success(sd, _) => - Either.catchNonFatal(sd.toLocalDate).map(_ => sd).toOption + case Result.Success(sds, _) => + sds.flatMap(sd => Either.catchNonFatal(sd.toLocalDate).toOption.map(_ => sd)) case Result.Failure => - None + Nil } } @@ -89,6 +91,15 @@ object DateFind { def map[B](f: A => B): Reader[B] = Reader(read.andThen(_.map(f))) + def flatMap[B](f: A => Reader[B]): Reader[B] = + Reader(read.andThen { + case Result.Success(a, rest) => f(a).read(rest) + case Result.Failure => Result.Failure + }) + + def alt(other: Reader[A]): Reader[(A, A)] = + Reader(words => Result.combine(read(words), other.read(words))) + def or(other: Reader[A]): Reader[A] = Reader(words => read(words) match { @@ -113,21 +124,31 @@ object DateFind { sealed trait Result[+A] { def toOption: Option[A] def map[B](f: A => B): Result[B] + def flatMap[B](f: A => Result[B]): Result[B] def next[B](r: Reader[B]): Result[(A, B)] } object Result { final case class Success[A](value: A, rest: List[Word]) extends Result[A] { - val toOption = Some(value) - def map[B](f: A => B): Result[B] = Success(f(value), rest) + val toOption = Some(value) + def flatMap[B](f: A => Result[B]): Result[B] = f(value) + def map[B](f: A => B): Result[B] = Success(f(value), rest) def next[B](r: Reader[B]): Result[(A, B)] = r.read(rest).map(b => (value, b)) } final case object Failure extends Result[Nothing] { - val toOption = None - def map[B](f: Nothing => B): Result[B] = this - def next[B](r: Reader[B]): Result[(Nothing, B)] = this + val toOption = None + def flatMap[B](f: Nothing => Result[B]): Result[B] = this + def map[B](f: Nothing => B): Result[B] = this + def next[B](r: Reader[B]): Result[(Nothing, B)] = this } + def combine[A](r0: Result[A], r1: Result[A]): Result[(A, A)] = + (r0, r1) match { + case (Success(a0, _), Success(a1, r1)) => + Success((a0, a1), r1) + case _ => + Failure + } } private val months = List( diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index d954f7a7..11566f3c 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -43,4 +43,30 @@ object DateFindSpec extends SimpleTestSuite { Vector.empty ) } + + test("different date formats") { + assertEquals( + DateFind.findDates("on 11/05/2020", Language.English).toVector, + Vector( + NerDateLabel( + LocalDate.of(2020, 11, 5), + NerLabel("11/05/2020", NerTag.Date, 3, 8) + ), + NerDateLabel( + LocalDate.of(2020, 5, 11), + NerLabel("11/05/2020", NerTag.Date, 3, 8) + ) + ) + ) + assertEquals( + DateFind.findDates("on 21/05/2020", Language.English).toVector, + Vector( + NerDateLabel( + LocalDate.of(2020, 5, 21), + NerLabel("21/05/2020", NerTag.Date, 3, 8) + ) + ) + ) + } + } From 75986c461f75cf1286024961a09d2d7e468663a0 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 10 Jan 2021 09:10:39 +0100 Subject: [PATCH 2/2] Fix ner date label boundary reporting --- .../scala/docspell/analysis/date/DateFind.scala | 2 +- .../docspell/analysis/date/DateFindSpec.scala | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 1c1bb686..90fcd8cd 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -28,7 +28,7 @@ object DateFind { text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, - q(1).end + q(2).end ) ) ) diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 11566f3c..cb971f75 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -11,19 +11,19 @@ object DateFindSpec extends SimpleTestSuite { val expect = Vector( NerDateLabel( LocalDate.parse("2016-11-07"), - NerLabel("November 7, 2016", NerTag.Date, 50, 60) + NerLabel("November 7, 2016", NerTag.Date, 50, 66) ), NerDateLabel( LocalDate.parse("2016-11-07"), - NerLabel("November 7, 2016", NerTag.Date, 119, 129) + NerLabel("November 7, 2016", NerTag.Date, 119, 135) ), NerDateLabel( LocalDate.parse("2019-09-03"), - NerLabel("September 3, 2019", NerTag.Date, 249, 260) + NerLabel("September 3, 2019", NerTag.Date, 249, 266) ), NerDateLabel( LocalDate.parse("2016-12-12"), - NerLabel("December 12, 2016", NerTag.Date, 1076, 1087) + NerLabel("December 12, 2016", NerTag.Date, 1076, 1093) ) ) @@ -50,11 +50,11 @@ object DateFindSpec extends SimpleTestSuite { Vector( NerDateLabel( LocalDate.of(2020, 11, 5), - NerLabel("11/05/2020", NerTag.Date, 3, 8) + NerLabel("11/05/2020", NerTag.Date, 3, 13) ), NerDateLabel( LocalDate.of(2020, 5, 11), - NerLabel("11/05/2020", NerTag.Date, 3, 8) + NerLabel("11/05/2020", NerTag.Date, 3, 13) ) ) ) @@ -63,7 +63,7 @@ object DateFindSpec extends SimpleTestSuite { Vector( NerDateLabel( LocalDate.of(2020, 5, 21), - NerLabel("21/05/2020", NerTag.Date, 3, 8) + NerLabel("21/05/2020", NerTag.Date, 3, 13) ) ) )