Merge pull request #2012 from xshadowlegendx/add-khmer-lang

add khmer lang
This commit is contained in:
eikek
2023-04-05 19:59:09 +02:00
committed by GitHub
11 changed files with 82 additions and 4 deletions

View File

@ -126,8 +126,9 @@ services:
volumes: volumes:
- docspell-solr_data:/var/solr - docspell-solr_data:/var/solr
command: command:
- solr-precreate - bash
- docspell - -c
- 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras'
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"] test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
interval: 1m interval: 1m

View File

@ -40,6 +40,7 @@ RUN apk update && \
ttf-dejavu \ ttf-dejavu \
ttf-freefont \ ttf-freefont \
ttf-liberation \ ttf-liberation \
font-noto-khmer \
libxml2-dev \ libxml2-dev \
libxslt-dev \ libxslt-dev \
pngquant \ pngquant \
@ -63,12 +64,19 @@ RUN apk update && \
RUN apk add --no-cache py3-setuptools && ocrmypdf --version RUN apk add --no-cache py3-setuptools && ocrmypdf --version
WORKDIR /opt WORKDIR /opt
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \ RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
unzip docspell-joex-*.zip && \ unzip docspell-joex-*.zip && \
rm docspell-joex-*.zip && \ rm docspell-joex-*.zip && \
ln -snf docspell-joex-* docspell-joex && \ ln -snf docspell-joex-* docspell-joex && \
rm docspell-joex/conf/docspell-joex.conf rm docspell-joex/conf/docspell-joex.conf
# Temporary workaround: download the Khmer traineddata file directly until the
# tesseract-ocr-data-khm package has been added to the package registry.
# The file is fetched into the current working directory and then moved to
# /usr/share/tessdata.
RUN \
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
mv khm.traineddata /usr/share/tessdata
# Using these data files for japanese, because they work better. See #973 # Using these data files for japanese, because they work better. See #973
RUN \ RUN \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \

View File

@ -129,6 +129,7 @@ object DateFind {
case Language.Lithuanian => ymd case Language.Lithuanian => ymd
case Language.Polish => dmy case Language.Polish => dmy
case Language.Estonian => dmy case Language.Estonian => dmy
case Language.Khmer => dmy
case Language.Ukrainian => dmy.or(ymd) case Language.Ukrainian => dmy.or(ymd)
} }
p.read(parts) match { p.read(parts) match {

View File

@ -64,6 +64,8 @@ object MonthName {
estonian estonian
case Language.Ukrainian => case Language.Ukrainian =>
ukrainian ukrainian
case Language.Khmer =>
khmer
} }
private val numbers = List( private val numbers = List(
@ -81,6 +83,21 @@ object MonthName {
List("12") List("12")
) )
/** Khmer month identifiers, ordered from January to December.
  *
  * Each inner list starts with the two-digit month number written in Khmer
  * numerals, followed by the accepted month name spellings in Khmer script.
  */
private val khmer = List(
  List("០១", "មករា"),
  List("០២", "កុម្ភៈ"),
  // "មីនា" is the standard spelling of March. The earlier spelling "មិនា"
  // stays as an accepted variant so text that matched before still matches.
  List("០៣", "មីនា", "មិនា"),
  List("០៤", "មេសា"),
  List("០៥", "ឧសភា"),
  List("០៦", "មិថុនា"),
  List("០៧", "កក្កដា"),
  List("០៨", "សីហា"),
  List("០៩", "កញ្ញា"),
  List("១០", "តុលា"),
  List("១១", "វិច្ឆិកា"),
  List("១២", "ធ្នូ")
)
private val english = List( private val english = List(
List("jan", "january"), List("jan", "january"),
List("feb", "february"), List("feb", "february"),

View File

@ -73,6 +73,11 @@ object Language {
val iso3 = "ces" val iso3 = "ces"
} }
/** The Khmer language, identified by the ISO 639-3 code `khm`. */
case object Khmer extends Language {
// NOTE(review): the ISO 639-1 code for Khmer is "km"; "kh" is the ISO 3166
// country code for Cambodia. Other parts of this change use "kh" as well
// (the `content_kh` field and the web client's `fromString`), so confirm
// the impact on them before changing this value.
val iso2 = "kh"
val iso3 = "khm"
}
case object Danish extends Language { case object Danish extends Language {
val iso2 = "da" val iso2 = "da"
val iso3 = "dan" val iso3 = "dan"
@ -166,7 +171,8 @@ object Language {
Lithuanian, Lithuanian,
Polish, Polish,
Estonian, Estonian,
Ukrainian Ukrainian,
Khmer
) )
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {

View File

@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta {
case Language.Polish => "simple" case Language.Polish => "simple"
case Language.Estonian => "simple" case Language.Estonian => "simple"
case Language.Ukrainian => "simple" case Language.Ukrainian => "simple"
case Language.Khmer => "simple"
} }
} }

View File

@ -30,6 +30,7 @@ object Field {
val content_de = contentField(Language.German) val content_de = contentField(Language.German)
val content_en = contentField(Language.English) val content_en = contentField(Language.English)
val content_fr = contentField(Language.French) val content_fr = contentField(Language.French)
val content_kh = contentField(Language.Khmer)
val itemName = Field("itemName") val itemName = Field("itemName")
val itemNotes = Field("itemNotes") val itemNotes = Field("itemNotes")
val folderId = Field("folder") val folderId = Field("folder")

View File

@ -172,7 +172,18 @@ object SolrSetup {
"Add Ukrainian", "Add Ukrainian",
addContentField(Language.Ukrainian) addContentField(Language.Ukrainian)
), ),
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian") SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"),
SolrMigration[F](
32,
"Add new field type for khmer content",
addFieldType(AddFieldType.textKhm)
),
SolrMigration[F](
33,
"Add Khmer",
addContentField(Language.Khmer)
),
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
) )
def addFolderField: F[Unit] = def addFolderField: F[Unit] =
@ -347,6 +358,16 @@ object SolrSetup {
) )
) )
// Field type "text_kh": a `solr.TextField` whose analyzer uses the ICU
// tokenizer and applies no filters.
val textKhm = AddFieldType(
  "text_kh",
  "solr.TextField",
  Analyzer(
    tokenizer = Tokenizer("solr.ICUTokenizerFactory", Map.empty),
    filter = Nil
  )
)
final case class Filter(`class`: String, attr: Map[String, String]) final case class Filter(`class`: String, attr: Map[String, String])
final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String])
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@ -36,6 +36,7 @@ type Language
| Polish | Polish
| Estonian | Estonian
| Ukrainian | Ukrainian
| Khmer
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -106,6 +107,9 @@ fromString str =
else if str == "ukr" || str == "uk" || str == "ukrainian" then else if str == "ukr" || str == "uk" || str == "ukrainian" then
Just Ukrainian Just Ukrainian
else if str == "khm" || str == "kh" || str == "khmer" then
Just Khmer
else else
Nothing Nothing
@ -179,6 +183,9 @@ toIso3 lang =
Ukrainian -> Ukrainian ->
"ukr" "ukr"
Khmer ->
"khm"
all : List Language all : List Language
all = all =
@ -204,4 +211,5 @@ all =
, Polish , Polish
, Estonian , Estonian
, Ukrainian , Ukrainian
, Khmer
] ]

View File

@ -83,6 +83,9 @@ gb lang =
Ukrainian -> Ukrainian ->
"Ukrainian" "Ukrainian"
Khmer ->
"Khmer"
de : Language -> String de : Language -> String
de lang = de lang =
@ -153,6 +156,9 @@ de lang =
Ukrainian -> Ukrainian ->
"Ukrainisch" "Ukrainisch"
Khmer ->
"Khmer"
fr : Language -> String fr : Language -> String
fr lang = fr lang =
@ -222,3 +228,6 @@ fr lang =
Ukrainian -> Ukrainian ->
"Ukrainien" "Ukrainien"
Khmer ->
"Khmer"

View File

@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html).
That will provide you with the connection url (the last part is the That will provide you with the connection url (the last part is the
core name). core name).
Then start SOLR with `-Dsolr.modules=analysis-extras` to enable
additional analyzers such as `icu`, as described
[here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer).
Docspell uses the ICU analyzer for tokenization and segmentation of `Khmer` text.
When using the provided `docker-compose.yml` setup, SOLR is already setup. When using the provided `docker-compose.yml` setup, SOLR is already setup.
SOLR must be reachable from all joex and all rest server components. SOLR must be reachable from all joex and all rest server components.