diff --git a/docker/docker-compose/docker-compose.yml b/docker/docker-compose/docker-compose.yml index fc07ee4d..bad80db3 100644 --- a/docker/docker-compose/docker-compose.yml +++ b/docker/docker-compose/docker-compose.yml @@ -126,8 +126,9 @@ services: volumes: - docspell-solr_data:/var/solr command: - - solr-precreate - - docspell + - bash + - -c + - 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras' healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"] interval: 1m diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index f641607e..a65abc30 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -40,6 +40,7 @@ RUN apk update && \ ttf-dejavu \ ttf-freefont \ ttf-liberation \ + font-noto-khmer \ libxml2-dev \ libxslt-dev \ pngquant \ @@ -63,12 +64,19 @@ RUN apk update && \ RUN apk add --no-cache py3-setuptools && ocrmypdf --version WORKDIR /opt + RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \ unzip docspell-joex-*.zip && \ rm docspell-joex-*.zip && \ ln -snf docspell-joex-* docspell-joex && \ rm docspell-joex/conf/docspell-joex.conf +# temporary download traineddata directly for khmer lang +# before tesseract-ocr-data-khm being added to the registry +RUN \ + wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \ + mv khm.traineddata /usr/share/tessdata + # Using these data files for japanese, because they work better. 
See #973 RUN \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index b3ef4915..3e4973ae 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -129,6 +129,7 @@ object DateFind { case Language.Lithuanian => ymd case Language.Polish => dmy case Language.Estonian => dmy + case Language.Khmer => dmy case Language.Ukrainian => dmy.or(ymd) } p.read(parts) match { diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 776d36c5..a97aa53d 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -64,6 +64,8 @@ object MonthName { estonian case Language.Ukrainian => ukrainian + case Language.Khmer => + khmer } private val numbers = List( @@ -81,6 +83,22 @@ object MonthName { List("12") ) + // Khmer digits and month names; March lists the standard spelling "មីនា" plus the previously used variant "មិនា". + private val khmer = List( + List("០១", "មករា"), + List("០២", "កុម្ភៈ"), + List("០៣", "មីនា", "មិនា"), + List("០៤", "មេសា"), + List("០៥", "ឧសភា"), + List("០៦", "មិថុនា"), + List("០៧", "កក្កដា"), + List("០៨", "សីហា"), + List("០៩", "កញ្ញា"), + List("១០", "តុលា"), + List("១១", "វិច្ឆិកា"), + List("១២", "ធ្នូ") + ) + private val english = List( List("jan", "january"), List("feb", "february"), diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index fb857041..cb874eb0 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -73,6 +73,11 @@ object Language { val iso3 = "ces" } + case object Khmer extends Language { +
val iso2 = "kh" + val iso3 = "khm" + } + case object Danish extends Language { val iso2 = "da" val iso3 = "dan" @@ -166,7 +171,8 @@ object Language { Lithuanian, Polish, Estonian, - Ukrainian + Ukrainian, + Khmer ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index b2582dfb..f25d64ce 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta { case Language.Polish => "simple" case Language.Estonian => "simple" case Language.Ukrainian => "simple" + case Language.Khmer => "simple" } } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index 2b960671..d331fc56 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -30,6 +30,7 @@ object Field { val content_de = contentField(Language.German) val content_en = contentField(Language.English) val content_fr = contentField(Language.French) + val content_kh = contentField(Language.Khmer) val itemName = Field("itemName") val itemNotes = Field("itemNotes") val folderId = Field("folder") diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index b76fac57..0e736bdf 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -172,7 +172,18 @@ object SolrSetup { "Add Ukrainian", addContentField(Language.Ukrainian) ), - SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian") + SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and 
Ukrainian"), + SolrMigration[F]( + 32, + "Add new field type for khmer content", + addFieldType(AddFieldType.textKhm) + ), + SolrMigration[F]( + 33, + "Add Khmer", + addContentField(Language.Khmer) + ), + SolrMigration.reIndexAll(34, "Re-Index after adding Khmer") ) def addFolderField: F[Unit] = @@ -347,6 +358,16 @@ object SolrSetup { ) ) + val textKhm = AddFieldType( + "text_kh", + "solr.TextField", + Analyzer( + Tokenizer("solr.ICUTokenizerFactory", Map.empty), + List( + ) + ) + ) + final case class Filter(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index e94a805f..61144660 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -36,6 +36,7 @@ type Language | Polish | Estonian | Ukrainian + | Khmer fromString : String -> Maybe Language @@ -106,6 +107,9 @@ fromString str = else if str == "ukr" || str == "uk" || str == "ukrainian" then Just Ukrainian + else if str == "khm" || str == "kh" || str == "khmer" then + Just Khmer + else Nothing @@ -179,6 +183,9 @@ toIso3 lang = Ukrainian -> "ukr" + Khmer -> + "khm" + all : List Language all = @@ -204,4 +211,5 @@ all = , Polish , Estonian , Ukrainian + , Khmer ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index f6e70488..71912beb 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -83,6 +83,9 @@ gb lang = Ukrainian -> "Ukrainian" + Khmer -> + "Khmer" + de : Language -> String de lang = @@ -153,6 +156,9 @@ de lang = Ukrainian -> "Ukrainisch" + Khmer -> + "Khmer" + fr : Language -> String fr lang = @@ -222,3 +228,6 @@ fr lang = Ukrainian -> "Ukrainien" + + Khmer -> +
"Khmer" diff --git a/website/site/content/docs/install/prereq.md b/website/site/content/docs/install/prereq.md index a11166c4..bfcceca3 100644 --- a/website/site/content/docs/install/prereq.md +++ b/website/site/content/docs/install/prereq.md @@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html). That will provide you with the connection url (the last part is the core name). +Then start SOLR with `-Dsolr.modules=analysis-extras` +to enable additional analyzers, such as `icu`, for `Khmer` and other languages, +as described [here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer). +Docspell uses the `icu` analyzer for tokenization and segmentation of `Khmer` text. + When using the provided `docker-compose.yml` setup, SOLR is already setup. SOLR must be reachable from all joex and all rest server components.