mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-07 06:39:32 +00:00
Merge pull request #2012 from xshadowlegendx/add-khmer-lang
add khmer lang
This commit is contained in:
commit
fd6b7ceee3
docker
modules
analysis/src/main/scala/docspell/analysis/date
common/src/main/scala/docspell/common
fts-psql/src/main/scala/docspell/ftspsql
fts-solr/src/main/scala/docspell/ftssolr
webapp/src/main/elm
website/site/content/docs/install
@ -126,8 +126,9 @@ services:
|
||||
volumes:
|
||||
- docspell-solr_data:/var/solr
|
||||
command:
|
||||
- solr-precreate
|
||||
- docspell
|
||||
- bash
|
||||
- -c
|
||||
- 'precreate-core docspell; exec solr -f -Dsolr.modules=analysis-extras'
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8983/solr/docspell/admin/ping"]
|
||||
interval: 1m
|
||||
|
@ -40,6 +40,7 @@ RUN apk update && \
|
||||
ttf-dejavu \
|
||||
ttf-freefont \
|
||||
ttf-liberation \
|
||||
font-noto-khmer \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
pngquant \
|
||||
@ -63,12 +64,19 @@ RUN apk update && \
|
||||
RUN apk add --no-cache py3-setuptools && ocrmypdf --version
|
||||
|
||||
WORKDIR /opt
|
||||
|
||||
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
|
||||
unzip docspell-joex-*.zip && \
|
||||
rm docspell-joex-*.zip && \
|
||||
ln -snf docspell-joex-* docspell-joex && \
|
||||
rm docspell-joex/conf/docspell-joex.conf
|
||||
|
||||
# Temporarily download the Khmer traineddata directly,
|
||||
# until tesseract-ocr-data-khm is added to the package registry.
|
||||
RUN \
|
||||
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
|
||||
mv khm.traineddata /usr/share/tessdata
|
||||
|
||||
# Using these data files for japanese, because they work better. See #973
|
||||
RUN \
|
||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
||||
|
@ -129,6 +129,7 @@ object DateFind {
|
||||
case Language.Lithuanian => ymd
|
||||
case Language.Polish => dmy
|
||||
case Language.Estonian => dmy
|
||||
case Language.Khmer => dmy
|
||||
case Language.Ukrainian => dmy.or(ymd)
|
||||
}
|
||||
p.read(parts) match {
|
||||
|
@ -64,6 +64,8 @@ object MonthName {
|
||||
estonian
|
||||
case Language.Ukrainian =>
|
||||
ukrainian
|
||||
case Language.Khmer =>
|
||||
khmer
|
||||
}
|
||||
|
||||
private val numbers = List(
|
||||
@ -81,6 +83,21 @@ object MonthName {
|
||||
List("12")
|
||||
)
|
||||
|
||||
/** Khmer month names, one inner list per month, January first.
  *
  * Each inner list starts with the two-digit month number written in Khmer
  * numerals, followed by the spelled-out month name(s).
  */
private val khmer = List(
  List("០១", "មករា"),
  List("០២", "កុម្ភៈ"),
  // "មីនា" is the standard spelling of March. The variant "មិនា" was the only
  // spelling listed before and is kept first so existing behaviour is unchanged.
  List("០៣", "មិនា", "មីនា"),
  List("០៤", "មេសា"),
  List("០៥", "ឧសភា"),
  List("០៦", "មិថុនា"),
  List("០៧", "កក្កដា"),
  List("០៨", "សីហា"),
  List("០៩", "កញ្ញា"),
  List("១០", "តុលា"),
  List("១១", "វិច្ឆិកា"),
  List("១២", "ធ្នូ")
)
|
||||
|
||||
private val english = List(
|
||||
List("jan", "january"),
|
||||
List("feb", "february"),
|
||||
|
@ -73,6 +73,11 @@ object Language {
|
||||
val iso3 = "ces"
|
||||
}
|
||||
|
||||
/** The Khmer language.
  *
  * NOTE(review): ISO 639-1 assigns "km" to Khmer; "kh" is the ISO 3166
  * country code for Cambodia. Do not change the value in isolation: other
  * code in this change uses "kh" as well (the Solr field type name
  * "text_kh", the `content_kh` field, and the web app's `fromString`).
  * Confirm how `iso2` is used before renaming.
  */
case object Khmer extends Language {
|
||||
val iso2 = "kh"
|
||||
val iso3 = "khm"
|
||||
}
|
||||
|
||||
case object Danish extends Language {
|
||||
val iso2 = "da"
|
||||
val iso3 = "dan"
|
||||
@ -166,7 +171,8 @@ object Language {
|
||||
Lithuanian,
|
||||
Polish,
|
||||
Estonian,
|
||||
Ukrainian
|
||||
Ukrainian,
|
||||
Khmer
|
||||
)
|
||||
|
||||
def fromString(str: String): Either[String, Language] = {
|
||||
|
@ -206,5 +206,6 @@ object FtsRepository extends DoobieMeta {
|
||||
case Language.Polish => "simple"
|
||||
case Language.Estonian => "simple"
|
||||
case Language.Ukrainian => "simple"
|
||||
case Language.Khmer => "simple"
|
||||
}
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ object Field {
|
||||
val content_de = contentField(Language.German)
|
||||
val content_en = contentField(Language.English)
|
||||
val content_fr = contentField(Language.French)
|
||||
val content_kh = contentField(Language.Khmer)
|
||||
val itemName = Field("itemName")
|
||||
val itemNotes = Field("itemNotes")
|
||||
val folderId = Field("folder")
|
||||
|
@ -172,7 +172,18 @@ object SolrSetup {
|
||||
"Add Ukrainian",
|
||||
addContentField(Language.Ukrainian)
|
||||
),
|
||||
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian")
|
||||
SolrMigration.reIndexAll(31, "Re-Index after adding Estonian and Ukrainian"),
|
||||
SolrMigration[F](
|
||||
32,
|
||||
"Add new field type for khmer content",
|
||||
addFieldType(AddFieldType.textKhm)
|
||||
),
|
||||
SolrMigration[F](
|
||||
33,
|
||||
"Add Khmer",
|
||||
addContentField(Language.Khmer)
|
||||
),
|
||||
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
|
||||
)
|
||||
|
||||
def addFolderField: F[Unit] =
|
||||
@ -347,6 +358,16 @@ object SolrSetup {
|
||||
)
|
||||
)
|
||||
|
||||
/** Field type `text_kh` for Khmer content: a `solr.TextField` whose analyzer
  * tokenizes with `solr.ICUTokenizerFactory` (no attributes) and applies no
  * filters.
  */
val textKhm = {
  val icuTokenizer = Tokenizer("solr.ICUTokenizerFactory", Map.empty)
  AddFieldType("text_kh", "solr.TextField", Analyzer(icuTokenizer, Nil))
}
|
||||
|
||||
/** A filter of an analyzer: a `class` name plus a map of string attributes.
  * NOTE(review): presumably `class` is a Solr filter factory name — confirm.
  */
final case class Filter(`class`: String, attr: Map[String, String])
|
||||
/** The tokenizer of an analyzer: a `class` name (e.g. "solr.ICUTokenizerFactory"
  * for the Khmer field type) plus a map of string attributes.
  */
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
||||
/** An analyzer definition: one tokenizer and a (possibly empty) list of filters. */
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
||||
|
@ -36,6 +36,7 @@ type Language
|
||||
| Polish
|
||||
| Estonian
|
||||
| Ukrainian
|
||||
| Khmer
|
||||
|
||||
|
||||
fromString : String -> Maybe Language
|
||||
@ -106,6 +107,9 @@ fromString str =
|
||||
else if str == "ukr" || str == "uk" || str == "ukrainian" then
|
||||
Just Ukrainian
|
||||
|
||||
else if str == "khm" || str == "kh" || str == "khmer" then
|
||||
Just Khmer
|
||||
|
||||
else
|
||||
Nothing
|
||||
|
||||
@ -179,6 +183,9 @@ toIso3 lang =
|
||||
Ukrainian ->
|
||||
"ukr"
|
||||
|
||||
Khmer ->
|
||||
"khm"
|
||||
|
||||
|
||||
all : List Language
|
||||
all =
|
||||
@ -204,4 +211,5 @@ all =
|
||||
, Polish
|
||||
, Estonian
|
||||
, Ukrainian
|
||||
, Khmer
|
||||
]
|
||||
|
@ -83,6 +83,9 @@ gb lang =
|
||||
Ukrainian ->
|
||||
"Ukrainian"
|
||||
|
||||
Khmer ->
|
||||
"Khmer"
|
||||
|
||||
|
||||
de : Language -> String
|
||||
de lang =
|
||||
@ -153,6 +156,9 @@ de lang =
|
||||
Ukrainian ->
|
||||
"Ukrainisch"
|
||||
|
||||
Khmer ->
|
||||
"Khmer"
|
||||
|
||||
|
||||
fr : Language -> String
|
||||
fr lang =
|
||||
@ -222,3 +228,6 @@ fr lang =
|
||||
|
||||
Ukrainian ->
|
||||
"Ukrainien"
|
||||
|
||||
Khmer ->
|
||||
"Khmer"
|
||||
|
@ -79,6 +79,11 @@ documentation](https://solr.apache.org/guide/8_4/installing-solr.html).
|
||||
That will provide you with the connection url (the last part is the
|
||||
core name).
|
||||
|
||||
Then start SOLR with `-Dsolr.modules=analysis-extras`
|
||||
to enable additional analyzers, such as `icu` for the `Khmer` language,
|
||||
as described [here](https://solr.apache.org/guide/solr/latest/indexing-guide/language-analysis.html#hebrew-lao-myanmar-khmer).
|
||||
Docspell uses the `icu` analyzer for tokenization and segmentation of `Khmer` text.
|
||||
|
||||
When using the provided `docker-compose.yml` setup, SOLR is already set up.
|
||||
|
||||
SOLR must be reachable from all joex and all rest server components.
|
||||
|
Loading…
x
Reference in New Issue
Block a user