From 2a89942ae0b5e8fe7e1d411eee607fdcc9d3bedc Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:51:12 +0700 Subject: [PATCH 01/12] add tesseract lang for khmer --- docker/dockerfiles/joex.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index f641607e..89033c4f 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -33,6 +33,7 @@ RUN apk update && \ tesseract-ocr-data-pol \ tesseract-ocr-data-est \ tesseract-ocr-data-ukr \ + tesseract-ocr-data-khm \ unpaper \ weasyprint \ libreoffice \ From 3511e1ef19c48d7efcd95900f28edeb61935ac52 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:51:28 +0700 Subject: [PATCH 02/12] enable analysis-extras module for icu tokenizer --- docker/docker-compose/docker-compose.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose/docker-compose.yml b/docker/docker-compose/docker-compose.yml index fc07ee4d..bad80db3 100644 --- a/docker/docker-compose/docker-compose.yml +++ b/docker/docker-compose/docker-compose.yml @@ -126,8 +126,9 @@ services: volumes: - docspell-solr_data:/var/solr command: - - solr-precreate - - docspell + - bash + - -c + - 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras' healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"] interval: 1m From 7c6fc5daddb51db9d0ba409316cadd8dfcc2fda4 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:51:41 +0700 Subject: [PATCH 03/12] add khmer lang to ui --- modules/webapp/src/main/elm/Data/Language.elm | 8 ++++++++ modules/webapp/src/main/elm/Messages/Data/Language.elm | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index e94a805f..61144660 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm
+++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -36,6 +36,7 @@ type Language | Polish | Estonian | Ukrainian + | Khmer fromString : String -> Maybe Language @@ -106,6 +107,9 @@ fromString str = else if str == "ukr" || str == "uk" || str == "ukrainian" then Just Ukrainian + else if str == "khm" || str == "kh" || str == "khmer" then + Just Khmer + else Nothing @@ -179,6 +183,9 @@ toIso3 lang = Ukrainian -> "ukr" + Khmer -> + "khm" + all : List Language all = @@ -204,4 +211,5 @@ all = , Polish , Estonian , Ukrainian + , Khmer ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index f6e70488..71912beb 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -83,6 +83,9 @@ gb lang = Ukrainian -> "Ukrainian" + Khmer -> + "Khmer" + de : Language -> String de lang = @@ -153,6 +156,9 @@ de lang = Ukrainian -> "Ukrainisch" + Khmer -> + "Khmer" + fr : Language -> String fr lang = @@ -222,3 +228,6 @@ fr lang = Ukrainian -> "Ukrainien" + + Khmer -> + "Khmer" From 71548b93a7921c3fb635034cef2c7a41a827d335 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:51:50 +0700 Subject: [PATCH 04/12] add khmer lang --- .../src/main/scala/docspell/common/Language.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index fb857041..349feb38 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -36,7 +36,7 @@ object Language { case object German extends NLPLanguage { val iso2 = "de" val iso3 = "deu" - } + } case object English extends NLPLanguage { val iso2 = "en" @@ -73,6 +73,11 @@ object Language { val iso3 = "ces" } + case object Khmer extends Language { + val iso2 = "kh" + 
val iso3 = "khm" + } + case object Danish extends Language { val iso2 = "da" val iso3 = "dan" @@ -166,7 +171,8 @@ object Language { Lithuanian, Polish, Estonian, - Ukrainian + Ukrainian, + Khmer ) def fromString(str: String): Either[String, Language] = { From 54deaf2cd7ab94c091592444df40ece629428a45 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:52:03 +0700 Subject: [PATCH 05/12] specify khmer lang date pattern --- .../src/main/scala/docspell/analysis/date/DateFind.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index b3ef4915..3e4973ae 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -129,6 +129,7 @@ object DateFind { case Language.Lithuanian => ymd case Language.Polish => dmy case Language.Estonian => dmy + case Language.Khmer => dmy case Language.Ukrainian => dmy.or(ymd) } p.read(parts) match { From 4b01a399e44ef05130d48c8d81ad3a12287ed641 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:52:14 +0700 Subject: [PATCH 06/12] add khmer lang month name --- .../docspell/analysis/date/MonthName.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 776d36c5..a97aa53d 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -64,6 +64,8 @@ object MonthName { estonian case Language.Ukrainian => ukrainian + case Language.Khmer => + khmer } private val numbers = List( @@ -81,6 +83,21 @@ object MonthName { List("12") ) + private val khmer = List( + List("០១", "មករា"), + List("០២", "កុម្ភៈ"), +
List("០៣", "មីនា", "មិនា" /* "មីនា" is the standard spelling of March; the short-vowel variant is kept for backward compatibility */), + List("០៤", "មេសា"), + List("០៥", "ឧសភា"), + List("០៦", "មិថុនា"), + List("០៧", "កក្កដា"), + List("០៨", "សីហា"), + List("០៩", "កញ្ញា"), + List("១០", "តុលា"), + List("១១", "វិច្ឆិកា"), + List("១២", "ធ្នូ") + ) + private val english = List( List("jan", "january"), List("feb", "february"), From b118c5076aed20af3d56731840bd9a4ccb663259 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:52:26 +0700 Subject: [PATCH 07/12] update default pg config for khmer lang --- .../fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index b2582dfb..f25d64ce 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta { case Language.Polish => "simple" case Language.Estonian => "simple" case Language.Ukrainian => "simple" + case Language.Khmer => "simple" } } From 05d75743b87e52509cdba7917db2e338de6af1bb Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Thu, 16 Mar 2023 23:53:03 +0700 Subject: [PATCH 08/12] add solr `content_kh` field type with icu tokenizer for khmer content --- .../main/scala/docspell/ftssolr/Field.scala | 1 + .../scala/docspell/ftssolr/SolrSetup.scala | 23 ++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 2b960671..d331fc56 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -30,6 +30,7 @@ object Field { val content_de = contentField(Language.German) val content_en = contentField(Language.English) val content_fr =
contentField(Language.French) + val content_kh = contentField(Language.Khmer) val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index b76fac57..0e736bdf 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -172,7 +172,18 @@ object SolrSetup { "Add Ukrainian", addContentField(Language.Ukrainian) ), - SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian") + SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"), + SolrMigration[F]( + 32, + "Add new field type for khmer content", + addFieldType(AddFieldType.textKhm) + ), + SolrMigration[F]( + 33, + "Add Khmer", + addContentField(Language.Khmer) + ), + SolrMigration.reIndexAll(34, "Re-Index after adding Khmer") ) def addFolderField: F[Unit] = @@ -347,6 +358,16 @@ object SolrSetup { ) ) + val textKhm = AddFieldType( + "text_kh", + "solr.TextField", + Analyzer( + Tokenizer("solr.ICUTokenizerFactory", Map.empty), + List( + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) From 7202f9f117536ec6d114f57b6d489022b10114cf Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Fri, 17 Mar 2023 09:02:01 +0700 Subject: [PATCH 09/12] fixed format by `sbt fix` --- modules/common/src/main/scala/docspell/common/Language.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 349feb38..cb874eb0 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ 
b/modules/common/src/main/scala/docspell/common/Language.scala @@ -36,7 +36,7 @@ object Language { case object German extends NLPLanguage { val iso2 = "de" val iso3 = "deu" - } + } case object English extends NLPLanguage { val iso2 = "en" From 40642dea1016406fad2e5ae883227c20a77ecfff Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Fri, 17 Mar 2023 17:50:48 +0700 Subject: [PATCH 10/12] temporary download khmer traineddata before the package being added to registry --- docker/dockerfiles/joex.dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 89033c4f..6214b4b1 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -33,7 +33,6 @@ RUN apk update && \ tesseract-ocr-data-pol \ tesseract-ocr-data-est \ tesseract-ocr-data-ukr \ - tesseract-ocr-data-khm \ unpaper \ weasyprint \ libreoffice \ @@ -70,6 +69,12 @@ RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$versi ln -snf docspell-joex-* docspell-joex && \ rm docspell-joex/conf/docspell-joex.conf +# temporary download traineddata directly for khmer lang +# before tesseract-ocr-data-khm being added to the registry +RUN \ + wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \ + mv khm.traineddata /usr/share/tessdata + # Using these data files for japanese, because they work better. 
See #973 RUN \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ From c576f08c53ad2901c9f01122eec1d4cf6cd2b1c9 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Wed, 29 Mar 2023 17:48:45 +0700 Subject: [PATCH 11/12] add khmer font --- docker/dockerfiles/joex.dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 6214b4b1..a65abc30 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -40,6 +40,7 @@ RUN apk update && \ ttf-dejavu \ ttf-freefont \ ttf-liberation \ + font-noto-khmer \ libxml2-dev \ libxslt-dev \ pngquant \ @@ -63,6 +64,7 @@ RUN apk update && \ RUN apk add --no-cache py3-setuptools && ocrmypdf --version WORKDIR /opt + RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \ unzip docspell-joex-*.zip && \ rm docspell-joex-*.zip && \ From 0678c33870057b4b4dd18f18fdf4c0aff7cc3e49 Mon Sep 17 00:00:00 2001 From: xshadowlegendx Date: Sat, 1 Apr 2023 12:18:27 +0700 Subject: [PATCH 12/12] specify extra module has to be enabled for khmer lang to work --- website/site/content/docs/install/prereq.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/site/content/docs/install/prereq.md b/website/site/content/docs/install/prereq.md index a11166c4..bfcceca3 100644 --- a/website/site/content/docs/install/prereq.md +++ b/website/site/content/docs/install/prereq.md @@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html). That will provide you with the connection url (the last part is the core name). 
+Then start SOLR with `-Dsolr.modules=analysis-extras` +to enable additional analyzers such as `icu`, which docspell uses +for tokenizing and segmenting `Khmer` text, as described +[here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer). + When using the provided `docker-compose.yml` setup, SOLR is already setup. SOLR must be reachable from all joex and all rest server components.