From c7d4c77e6d246148ca5835ed87cbf187f4dbd1cf Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 26 Feb 2021 00:34:57 +0100 Subject: [PATCH] Allow more suggestions for date variants in English --- .../docspell/analysis/date/DateFind.scala | 33 ++++++++++------ .../docspell/analysis/date/DateFindSpec.scala | 38 ++++++++++++++++++- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 698606f0..f67c32f0 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -4,7 +4,9 @@ import java.time.LocalDate import scala.util.Try +import cats.data.{NonEmptyList => Nel} import cats.implicits._ +import cats.kernel.Semigroup import fs2.{Pure, Stream} import docspell.analysis.split._ @@ -61,8 +63,7 @@ object DateFind { val mdy = pattern2(lang) // most is from wikipedia… val p = lang match { - case Language.English => - mdy.alt(dmy).map(t => t._1 ++ t._2).or(mdy).or(ymd).or(dmy) + case Language.English => Reader.all(dmy, mdy, ymd) case Language.German => dmy.or(ymd).or(mdy) case Language.French => dmy.or(ymd).or(mdy) case Language.Italian => dmy.or(ymd).or(mdy) @@ -117,9 +118,6 @@ object DateFind { case Result.Failure => Result.Failure }) - def alt(other: Reader[A]): Reader[(A, A)] = - Reader(words => Result.combine(read(words), other.read(words))) - def or(other: Reader[A]): Reader[A] = Reader(words => read(words) match { @@ -133,6 +131,9 @@ object DateFind { def fail[A]: Reader[A] = Reader(_ => Result.Failure) + def all[A: Semigroup](reader: Reader[A], more: Reader[A]*): Reader[A] = + Reader(words => Nel.of(reader, more: _*).map(_.read(words)).reduce) + def readFirst[A](f: Word => Option[A]): Reader[A] = Reader({ case Nil => Result.Failure @@ -162,12 +163,22 @@ object DateFind { def map[B](f: Nothing => B): Result[B] = this def next[B](r: Reader[B]): Result[(Nothing, B)] = this } - def combine[A](r0: Result[A], r1: Result[A]): Result[(A, A)] = - (r0, r1) match { - case (Success(a0, _), Success(a1, r1)) => - Success((a0, a1), r1) - case _ => - Failure + + implicit def resultSemigroup[A: Semigroup]: Semigroup[Result[A]] = + Semigroup.instance { (r0, r1) => + (r0, r1) match { + case (Success(a0, r0), Success(a1, r1)) => + Success(Semigroup[A].combine(a0, a1), if (r0.size < r1.size) r0 else r1) + + case (s @ Success(_, _), Failure) => + s + + case (Failure, s @ Success(_, _)) => + s + + case (Failure, Failure) => + Failure + } } } } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index cb971f75..800db6d1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -49,11 +49,11 @@ object DateFindSpec extends SimpleTestSuite { DateFind.findDates("on 11/05/2020", Language.English).toVector, Vector( NerDateLabel( - LocalDate.of(2020, 11, 5), + LocalDate.of(2020, 5, 11), NerLabel("11/05/2020", NerTag.Date, 3, 13) ), NerDateLabel( - LocalDate.of(2020, 5, 11), + LocalDate.of(2020, 11, 5), NerLabel("11/05/2020", NerTag.Date, 3, 13) ) ) @@ -69,4 +69,38 @@ object DateFindSpec extends SimpleTestSuite { ) } + test("more english variants") { + assertEquals( + DateFind.findDates("on 26/01/15", Language.English).toVector, + Vector( + NerDateLabel( + LocalDate.of(2015, 1, 26), + NerLabel("26/01/15", NerTag.Date, 3, 11) + ), + NerDateLabel( + LocalDate.of(2026, 1, 15), + NerLabel("26/01/15", NerTag.Date, 3, 11) + ) + ) + ) + + assertEquals( + DateFind.findDates("on 10/09/11", Language.English).toVector, + Vector( + NerDateLabel( + LocalDate.of(2011, 9, 10), + NerLabel("10/09/11", NerTag.Date, 3, 11) + ), + NerDateLabel( + LocalDate.of(2011, 10, 9), + NerLabel("10/09/11", NerTag.Date, 3, 11) + ), + NerDateLabel( + LocalDate.of(2010, 9, 11), + NerLabel("10/09/11", NerTag.Date, 3, 11) + ) + ) + ) + } + }