From f994d4b2488e64668ee064676f8c6469d9ccc1be Mon Sep 17 00:00:00 2001 From: eikek Date: Wed, 28 Jul 2021 20:05:38 +0200 Subject: [PATCH] Add japanese document language --- docker/dockerfiles/joex.dockerfile | 1 + .../docspell/analysis/date/DateFind.scala | 3 ++- .../docspell/analysis/date/MonthName.scala | 17 +++++++++++++ .../docspell/analysis/date/DateFindSpec.scala | 25 +++++++++++++++++++ .../main/scala/docspell/common/Language.scala | 8 +++++- modules/webapp/src/main/elm/Data/Language.elm | 8 ++++++ .../src/main/elm/Messages/Data/Language.elm | 6 +++++ 7 files changed, 66 insertions(+), 2 deletions(-) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 130f7c30..a75de16c 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -29,6 +29,7 @@ RUN JDKPKG="openjdk11"; \ tesseract-ocr-data-rus \ tesseract-ocr-data-ron \ tesseract-ocr-data-lav \ + tesseract-ocr-data-jpn \ unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 038dba08..ebb7ad7a 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -22,7 +22,7 @@ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = TextSplitter - .splitToken(text, " \t.,\n\r/".toSet) + .splitToken(text, " \t.,\n\r/年月日".toSet) .filter(w => lang != Language.Latvian || w.value != "gada") .sliding(3) .filter(_.size == 3) @@ -89,6 +89,7 @@ object DateFind { case Language.Swedish => ymd.or(dmy).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Latvian => dmy.or(lavLong).or(ymd) + case Language.Japanese => ymd } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 8a5852d0..37b16852 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -50,6 +50,8 @@ object MonthName { russian case Language.Latvian => latvian + case Language.Japanese => + japanese } private val numbers = List( @@ -290,4 +292,19 @@ object MonthName { List("novembris", "nov."), List("decembris", "dec.") ) + + private val japanese = List( + List("1", "一"), + List("2", "二"), + List("3", "三"), + List("4", "四"), + List("5", "五"), + List("6", "六"), + List("7", "七"), + List("8", "八"), + List("9", "九"), + List("10", "十"), + List("11", "十一"), + List("12", "十二") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 7784fd2f..f7109b70 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -143,4 +143,29 @@ class DateFindSpec extends FunSuite { ) } + test("find japanese dates") { + assertEquals( + DateFind + .findDates("some text in japanese 2021.7.21 and more", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021.7.21", NerTag.Date, 22, 31) + ) + ) + ) + assertEquals( + DateFind + .findDates("some text in japanese 2021年7月21日 and more", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021年7月21", NerTag.Date, 22, 31) + ) + ) + ) + } + } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index a3e012fa..d46aba3a 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -108,6 +108,11 @@ object Language { val iso3 = "lav" } + case object Japanese extends Language { + val iso2 = "ja" + val iso3 = "jpn" + } + val all: List[Language] = List( German, @@ -124,7 +129,8 @@ object Language { Swedish, Russian, Romanian, - Latvian + Latvian, + Japanese ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 94b8a033..41522878 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -29,6 +29,7 @@ type Language | Romanian | Dutch | Latvian + | Japanese fromString : String -> Maybe Language @@ -78,6 +79,9 @@ fromString str = else if str == "lav" || str == "lv" || str == "latvian" then Just Latvian + else if str == "jpn" || str == "ja" || str == "japanese" then + Just Japanese + else Nothing @@ -130,6 +134,9 @@ toIso3 lang = Latvian -> "lav" + Japanese -> + "jpn" + all : List Language all = @@ -148,4 +155,5 @@ all = , Russian , Romanian , Latvian + , Japanese ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 3f66c7f6..60070c56 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -61,6 +61,9 @@ gb lang = Latvian -> "Latvian" + Japanese -> + "Japanese" + de : Language -> String de lang = @@ -109,3 +112,6 @@ de lang = Latvian -> "Lettisch" + + Japanese -> + "Japanisch"