mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Merge pull request #2012 from xshadowlegendx/add-khmer-lang
add khmer lang
This commit is contained in:
@ -126,8 +126,9 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
- docspell-solr_data:/var/solr
|
- docspell-solr_data:/var/solr
|
||||||
command:
|
command:
|
||||||
- solr-precreate
|
- bash
|
||||||
- docspell
|
- -c
|
||||||
|
- 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras'
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
|
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
|
||||||
interval: 1m
|
interval: 1m
|
||||||
|
@ -40,6 +40,7 @@ RUN apk update && \
|
|||||||
ttf-dejavu \
|
ttf-dejavu \
|
||||||
ttf-freefont \
|
ttf-freefont \
|
||||||
ttf-liberation \
|
ttf-liberation \
|
||||||
|
font-noto-khmer \
|
||||||
libxml2-dev \
|
libxml2-dev \
|
||||||
libxslt-dev \
|
libxslt-dev \
|
||||||
pngquant \
|
pngquant \
|
||||||
@ -63,12 +64,19 @@ RUN apk update && \
|
|||||||
RUN apk add --no-cache py3-setuptools && ocrmypdf --version
|
RUN apk add --no-cache py3-setuptools && ocrmypdf --version
|
||||||
|
|
||||||
WORKDIR /opt
|
WORKDIR /opt
|
||||||
|
|
||||||
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
|
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
|
||||||
unzip docspell-joex-*.zip && \
|
unzip docspell-joex-*.zip && \
|
||||||
rm docspell-joex-*.zip && \
|
rm docspell-joex-*.zip && \
|
||||||
ln -snf docspell-joex-* docspell-joex && \
|
ln -snf docspell-joex-* docspell-joex && \
|
||||||
rm docspell-joex/conf/docspell-joex.conf
|
rm docspell-joex/conf/docspell-joex.conf
|
||||||
|
|
||||||
|
# Temporarily download the Khmer traineddata directly,
|
||||||
|
# until the tesseract-ocr-data-khm package is added to the registry
|
||||||
|
RUN \
|
||||||
|
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
|
||||||
|
mv khm.traineddata /usr/share/tessdata
|
||||||
|
|
||||||
# Using these data files for japanese, because they work better. See #973
|
# Using these data files for japanese, because they work better. See #973
|
||||||
RUN \
|
RUN \
|
||||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
||||||
|
@ -129,6 +129,7 @@ object DateFind {
|
|||||||
case Language.Lithuanian => ymd
|
case Language.Lithuanian => ymd
|
||||||
case Language.Polish => dmy
|
case Language.Polish => dmy
|
||||||
case Language.Estonian => dmy
|
case Language.Estonian => dmy
|
||||||
|
case Language.Khmer => dmy
|
||||||
case Language.Ukrainian => dmy.or(ymd)
|
case Language.Ukrainian => dmy.or(ymd)
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
|
@ -64,6 +64,8 @@ object MonthName {
|
|||||||
estonian
|
estonian
|
||||||
case Language.Ukrainian =>
|
case Language.Ukrainian =>
|
||||||
ukrainian
|
ukrainian
|
||||||
|
case Language.Khmer =>
|
||||||
|
khmer
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -81,6 +83,21 @@ object MonthName {
|
|||||||
List("12")
|
List("12")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Khmer month names, one inner list per month (January first).
  *
  * Each entry holds the zero-padded month number written in Khmer
  * numerals followed by the spelled-out month name(s).
  *
  * March is spelled "មីនា"; the variant "មិនា" (short vowel) is kept as
  * a tolerated alias so that text matched by the previous version of
  * this list still matches.
  */
private val khmer = List(
  List("០១", "មករា"),
  List("០២", "កុម្ភៈ"),
  List("០៣", "មីនា", "មិនា"),
  List("០៤", "មេសា"),
  List("០៥", "ឧសភា"),
  List("០៦", "មិថុនា"),
  List("០៧", "កក្កដា"),
  List("០៨", "សីហា"),
  List("០៩", "កញ្ញា"),
  List("១០", "តុលា"),
  List("១១", "វិច្ឆិកា"),
  List("១២", "ធ្នូ")
)
|
||||||
|
|
||||||
private val english = List(
|
private val english = List(
|
||||||
List("jan", "january"),
|
List("jan", "january"),
|
||||||
List("feb", "february"),
|
List("feb", "february"),
|
||||||
|
@ -73,6 +73,11 @@ object Language {
|
|||||||
val iso3 = "ces"
|
val iso3 = "ces"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** The Khmer language.
  *
  * NOTE(review): the ISO 639-1 code for Khmer is "km"; "kh" is the
  * ISO 3166 country code for Cambodia. The value is left as "kh" here
  * because other parts of the application (frontend language parsing,
  * full-text field naming) appear to rely on it — confirm before
  * changing, since it would need a coordinated migration.
  */
case object Khmer extends Language {
  val iso2: String = "kh"
  val iso3: String = "khm"
}
|
||||||
|
|
||||||
case object Danish extends Language {
|
case object Danish extends Language {
|
||||||
val iso2 = "da"
|
val iso2 = "da"
|
||||||
val iso3 = "dan"
|
val iso3 = "dan"
|
||||||
@ -166,7 +171,8 @@ object Language {
|
|||||||
Lithuanian,
|
Lithuanian,
|
||||||
Polish,
|
Polish,
|
||||||
Estonian,
|
Estonian,
|
||||||
Ukrainian
|
Ukrainian,
|
||||||
|
Khmer
|
||||||
)
|
)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
|
@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta {
|
|||||||
case Language.Polish => "simple"
|
case Language.Polish => "simple"
|
||||||
case Language.Estonian => "simple"
|
case Language.Estonian => "simple"
|
||||||
case Language.Ukrainian => "simple"
|
case Language.Ukrainian => "simple"
|
||||||
|
case Language.Khmer => "simple"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,7 @@ object Field {
|
|||||||
val content_de = contentField(Language.German)
|
val content_de = contentField(Language.German)
|
||||||
val content_en = contentField(Language.English)
|
val content_en = contentField(Language.English)
|
||||||
val content_fr = contentField(Language.French)
|
val content_fr = contentField(Language.French)
|
||||||
|
val content_kh = contentField(Language.Khmer)
|
||||||
val itemName = Field("itemName")
|
val itemName = Field("itemName")
|
||||||
val itemNotes = Field("itemNotes")
|
val itemNotes = Field("itemNotes")
|
||||||
val folderId = Field("folder")
|
val folderId = Field("folder")
|
||||||
|
@ -172,7 +172,18 @@ object SolrSetup {
|
|||||||
"Add Ukrainian",
|
"Add Ukrainian",
|
||||||
addContentField(Language.Ukrainian)
|
addContentField(Language.Ukrainian)
|
||||||
),
|
),
|
||||||
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian")
|
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"),
|
||||||
|
SolrMigration[F](
|
||||||
|
32,
|
||||||
|
"Add new field type for khmer content",
|
||||||
|
addFieldType(AddFieldType.textKhm)
|
||||||
|
),
|
||||||
|
SolrMigration[F](
|
||||||
|
33,
|
||||||
|
"Add Khmer",
|
||||||
|
addContentField(Language.Khmer)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
@ -347,6 +358,16 @@ object SolrSetup {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Field type `text_kh` for Khmer content: a `solr.TextField` whose
  * analyzer consists of the ICU tokenizer only, with no additional
  * filters.
  *
  * NOTE(review): `solr.ICUTokenizerFactory` presumably requires Solr's
  * `analysis-extras` module to be enabled — confirm for the deployment.
  */
val textKhm = AddFieldType(
  "text_kh",
  "solr.TextField",
  Analyzer(Tokenizer("solr.ICUTokenizerFactory", Map.empty), Nil)
)
|
||||||
|
|
||||||
final case class Filter(`class`: String, attr: Map[String, String])
|
final case class Filter(`class`: String, attr: Map[String, String])
|
||||||
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
||||||
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
||||||
|
@ -36,6 +36,7 @@ type Language
|
|||||||
| Polish
|
| Polish
|
||||||
| Estonian
|
| Estonian
|
||||||
| Ukrainian
|
| Ukrainian
|
||||||
|
| Khmer
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -106,6 +107,9 @@ fromString str =
|
|||||||
else if str == "ukr" || str == "uk" || str == "ukrainian" then
|
else if str == "ukr" || str == "uk" || str == "ukrainian" then
|
||||||
Just Ukrainian
|
Just Ukrainian
|
||||||
|
|
||||||
|
else if str == "khm" || str == "kh" || str == "khmer" then
|
||||||
|
Just Khmer
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -179,6 +183,9 @@ toIso3 lang =
|
|||||||
Ukrainian ->
|
Ukrainian ->
|
||||||
"ukr"
|
"ukr"
|
||||||
|
|
||||||
|
Khmer ->
|
||||||
|
"khm"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -204,4 +211,5 @@ all =
|
|||||||
, Polish
|
, Polish
|
||||||
, Estonian
|
, Estonian
|
||||||
, Ukrainian
|
, Ukrainian
|
||||||
|
, Khmer
|
||||||
]
|
]
|
||||||
|
@ -83,6 +83,9 @@ gb lang =
|
|||||||
Ukrainian ->
|
Ukrainian ->
|
||||||
"Ukrainian"
|
"Ukrainian"
|
||||||
|
|
||||||
|
Khmer ->
|
||||||
|
"Khmer"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -153,6 +156,9 @@ de lang =
|
|||||||
Ukrainian ->
|
Ukrainian ->
|
||||||
"Ukrainisch"
|
"Ukrainisch"
|
||||||
|
|
||||||
|
Khmer ->
|
||||||
|
"Khmer"
|
||||||
|
|
||||||
|
|
||||||
fr : Language -> String
|
fr : Language -> String
|
||||||
fr lang =
|
fr lang =
|
||||||
@ -222,3 +228,6 @@ fr lang =
|
|||||||
|
|
||||||
Ukrainian ->
|
Ukrainian ->
|
||||||
"Ukrainien"
|
"Ukrainien"
|
||||||
|
|
||||||
|
Khmer ->
|
||||||
|
"Khmer"
|
||||||
|
@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html).
|
|||||||
That will provide you with the connection url (the last part is the
|
That will provide you with the connection url (the last part is the
|
||||||
core name).
|
core name).
|
||||||
|
|
||||||
|
Then start solr with `-Dsolr.modules=analysis-extras`
|
||||||
|
to enable additional analyzers such as `icu` for the `Khmer` language,
|
||||||
|
as described [here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer),
|
||||||
|
which docspell uses for tokenization and segmentation of `Khmer` text.
|
||||||
|
|
||||||
When using the provided `docker-compose.yml` setup, SOLR is already setup.
|
When using the provided `docker-compose.yml` setup, SOLR is already setup.
|
||||||
|
|
||||||
SOLR must be reachable from all joex and all rest server components.
|
SOLR must be reachable from all joex and all rest server components.
|
||||||
|
Reference in New Issue
Block a user