Merge pull request from xshadowlegendx/add-khmer-lang

add khmer lang
This commit is contained in:
eikek 2023-04-05 19:59:09 +02:00 committed by GitHub
commit fd6b7ceee3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 82 additions and 4 deletions
docker
docker-compose
dockerfiles
modules
analysis/src/main/scala/docspell/analysis/date
common/src/main/scala/docspell/common
fts-psql/src/main/scala/docspell/ftspsql
fts-solr/src/main/scala/docspell/ftssolr
webapp/src/main/elm
Data
Messages/Data
website/site/content/docs/install

@ -126,8 +126,9 @@ services:
volumes:
- docspell-solr_data:/var/solr
command:
- solr-precreate
- docspell
- bash
- -c
- 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras'
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
interval: 1m

@ -40,6 +40,7 @@ RUN apk update && \
ttf-dejavu \
ttf-freefont \
ttf-liberation \
font-noto-khmer \
libxml2-dev \
libxslt-dev \
pngquant \
@ -63,12 +64,19 @@ RUN apk update && \
RUN apk add --no-cache py3-setuptools && ocrmypdf --version
WORKDIR /opt
# Download the joex release archive (from $joex_url when set, otherwise from the
# GitHub release matching $version), unpack it and expose it under the stable
# path `docspell-joex` via a symlink. The archive is removed before the symlink
# is created, so the `docspell-joex-*` glob no longer matches the zip file.
# Finally the config file shipped inside the archive is deleted.
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
unzip docspell-joex-*.zip && \
rm docspell-joex-*.zip && \
ln -snf docspell-joex-* docspell-joex && \
rm docspell-joex/conf/docspell-joex.conf
# Temporary workaround: download the Khmer traineddata file directly
# until a tesseract-ocr-data-khm package is added to the registry.
# NOTE(review): the URL follows the `main` branch, so the downloaded file
# is not pinned to a fixed revision.
RUN \
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
mv khm.traineddata /usr/share/tessdata
# Using these data files for japanese, because they work better. See #973
RUN \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \

@ -129,6 +129,7 @@ object DateFind {
case Language.Lithuanian => ymd
case Language.Polish => dmy
case Language.Estonian => dmy
case Language.Khmer => dmy
case Language.Ukrainian => dmy.or(ymd)
}
p.read(parts) match {

@ -64,6 +64,8 @@ object MonthName {
estonian
case Language.Ukrainian =>
ukrainian
case Language.Khmer =>
khmer
}
private val numbers = List(
@ -81,6 +83,21 @@ object MonthName {
List("12")
)
// Khmer month names, ordered January to December. Each inner list holds the
// accepted spellings for one month: the zero-padded month number written in
// Khmer digits, followed by the month name(s).
private val khmer = List(
  List("០១", "មករា"),
  List("០២", "កុម្ភៈ"),
  // "មីនា" is the standard spelling of March; the variant "មិនា" is kept so
  // that text using it continues to be recognized.
  List("០៣", "មិនា", "មីនា"),
  List("០៤", "មេសា"),
  List("០៥", "ឧសភា"),
  List("០៦", "មិថុនា"),
  List("០៧", "កក្កដា"),
  List("០៨", "សីហា"),
  List("០៩", "កញ្ញា"),
  List("១០", "តុលា"),
  List("១១", "វិច្ឆិកា"),
  List("១២", "ធ្នូ")
)
private val english = List(
List("jan", "january"),
List("feb", "february"),

@ -73,6 +73,11 @@ object Language {
val iso3 = "ces"
}
/** The Khmer language.
  *
  * NOTE(review): ISO 639-1 lists "km" for Khmer ("kh" is the ISO 3166 country
  * code for Cambodia). The two-letter value is left as is because identifiers
  * such as `content_kh` and `text_kh` appear to be built on it -- confirm
  * before changing.
  */
case object Khmer extends Language {
  val iso2: String = "kh"
  val iso3: String = "khm"
}
case object Danish extends Language {
val iso2 = "da"
val iso3 = "dan"
@ -166,7 +171,8 @@ object Language {
Lithuanian,
Polish,
Estonian,
Ukrainian
Ukrainian,
Khmer
)
def fromString(str: String): Either[String, Language] = {

@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta {
case Language.Polish => "simple"
case Language.Estonian => "simple"
case Language.Ukrainian => "simple"
case Language.Khmer => "simple"
}
}

@ -30,6 +30,7 @@ object Field {
val content_de = contentField(Language.German)
val content_en = contentField(Language.English)
val content_fr = contentField(Language.French)
val content_kh = contentField(Language.Khmer)
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")
val folderId = Field("folder")

@ -172,7 +172,18 @@ object SolrSetup {
"Add Ukrainian",
addContentField(Language.Ukrainian)
),
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian")
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"),
SolrMigration[F](
32,
"Add new field type for khmer content",
addFieldType(AddFieldType.textKhm)
),
SolrMigration[F](
33,
"Add Khmer",
addContentField(Language.Khmer)
),
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
)
def addFolderField: F[Unit] =
@ -347,6 +358,16 @@ object SolrSetup {
)
)
/** Field type `text_kh`: a `solr.TextField` whose analyzer splits text with
  * the ICU tokenizer and applies no further filters.
  */
val textKhm = {
  val icuTokenizer = Tokenizer("solr.ICUTokenizerFactory", Map.empty)
  AddFieldType("text_kh", "solr.TextField", Analyzer(icuTokenizer, Nil))
}
final case class Filter(`class`: String, attr: Map[String, String])
final case class Tokenizer(`class`: String, attr: Map[String, String])
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

@ -36,6 +36,7 @@ type Language
| Polish
| Estonian
| Ukrainian
| Khmer
fromString : String -> Maybe Language
@ -106,6 +107,9 @@ fromString str =
else if str == "ukr" || str == "uk" || str == "ukrainian" then
Just Ukrainian
else if str == "khm" || str == "kh" || str == "khmer" then
Just Khmer
else
Nothing
@ -179,6 +183,9 @@ toIso3 lang =
Ukrainian ->
"ukr"
Khmer ->
"khm"
all : List Language
all =
@ -204,4 +211,5 @@ all =
, Polish
, Estonian
, Ukrainian
, Khmer
]

@ -83,6 +83,9 @@ gb lang =
Ukrainian ->
"Ukrainian"
Khmer ->
"Khmer"
de : Language -> String
de lang =
@ -153,6 +156,9 @@ de lang =
Ukrainian ->
"Ukrainisch"
Khmer ->
"Khmer"
fr : Language -> String
fr lang =
@ -222,3 +228,6 @@ fr lang =
Ukrainian ->
"Ukrainien"
Khmer ->
"Khmer"

@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html).
That will provide you with the connection url (the last part is the
core name).
Then start SOLR with `-Dsolr.modules=analysis-extras`. This enables
additional analyzers such as `icu`, as described
[here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer).
Docspell uses the `icu` analyzer for tokenization and segmentation of `Khmer` text.
When using the provided `docker-compose.yml` setup, SOLR is already setup.
SOLR must be reachable from all joex and all rest server components.