mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-02-15 20:33:26 +00:00
Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)
* Add Japanese Vertical Support * Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
parent
36c00cc9ec
commit
e731d822dc
@ -77,7 +77,7 @@ RUN \
|
||||
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
|
||||
mv khm.traineddata /usr/share/tessdata
|
||||
|
||||
# Using these data files for japanese, because they work better. See #973
|
||||
# Using these data files for Japanese, because they work better. Includes vertical data. See #973 and #2445.
|
||||
RUN \
|
||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \
|
||||
|
@ -125,6 +125,7 @@ object DateFind {
|
||||
case Language.Dutch => dmy.or(ymd).or(mdy)
|
||||
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
||||
case Language.Japanese => ymd
|
||||
case Language.JpnVert => ymd
|
||||
case Language.Hebrew => dmy
|
||||
case Language.Lithuanian => ymd
|
||||
case Language.Polish => dmy
|
||||
|
@ -54,6 +54,8 @@ object MonthName {
|
||||
latvian
|
||||
case Language.Japanese =>
|
||||
japanese
|
||||
case Language.JpnVert =>
|
||||
japanese
|
||||
case Language.Hebrew =>
|
||||
hebrew
|
||||
case Language.Lithuanian =>
|
||||
|
@ -123,6 +123,11 @@ object Language {
|
||||
val iso3 = "jpn"
|
||||
}
|
||||
|
||||
/* Not an ISO value, but it needs to be unique, and tesseract needs jpn_vert for its scan, as configured in /etc/docspell-joex/docspell-joex.conf. */
|
||||
case object JpnVert extends Language {
|
||||
val iso2 = "ja_vert"
|
||||
val iso3 = "jpn_vert"
|
||||
}
|
||||
case object Hebrew extends Language {
|
||||
val iso2 = "he"
|
||||
val iso3 = "heb"
|
||||
@ -172,6 +177,7 @@ object Language {
|
||||
Romanian,
|
||||
Latvian,
|
||||
Japanese,
|
||||
JpnVert,
|
||||
Hebrew,
|
||||
Lithuanian,
|
||||
Polish,
|
||||
|
@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta {
|
||||
case Language.Czech => "simple"
|
||||
case Language.Latvian => "simple"
|
||||
case Language.Japanese => "simple"
|
||||
case Language.JpnVert => "simple"
|
||||
case Language.Hebrew => "simple"
|
||||
case Language.Lithuanian => "simple"
|
||||
case Language.Polish => "simple"
|
||||
|
@ -593,13 +593,32 @@ Docpell Update Check
|
||||
# To convert image files to PDF files, tesseract is used. This
|
||||
# also extracts the text in one go.
|
||||
tesseract = {
|
||||
# Custom Language Mappings Below
|
||||
# Japanese Vertical Mapping
|
||||
arg-mappings = {
|
||||
"tesseract_lang" = {
|
||||
value = "{{lang}}"
|
||||
mappings = [
|
||||
{
|
||||
matches = "jpn_vert"
|
||||
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
|
||||
},
|
||||
# Start Other Custom Language Mappings Here
|
||||
# Default Mapping Below
|
||||
{
|
||||
matches = ".*"
|
||||
args = [ "-l", "{{lang}}" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
command = {
|
||||
program = "tesseract"
|
||||
# Default arguments for all processing go below.
|
||||
args = [
|
||||
"{{infile}}",
|
||||
"out",
|
||||
"-l",
|
||||
"{{lang}}",
|
||||
"{{tesseract_lang}}",
|
||||
"pdf",
|
||||
"txt"
|
||||
]
|
||||
@ -648,11 +667,32 @@ Docpell Update Check
|
||||
# (where ocr is not necessary). In this case, the pdf will be
|
||||
# converted to PDF/A.
|
||||
ocrmypdf = {
|
||||
# Custom argument mappings for this program.
|
||||
arg-mappings = {
|
||||
"ocr_lang" = {
|
||||
value = "{{lang}}"
|
||||
# Custom Language Mappings Below
|
||||
# Japanese Vertical Mapping
|
||||
mappings = [
|
||||
{
|
||||
matches = "jpn_vert"
|
||||
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
|
||||
},
|
||||
# Start Other Custom Language Mappings Here
|
||||
# Default Mapping Below
|
||||
{
|
||||
matches = ".*"
|
||||
args = [ "-l", "{{lang}}" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
enabled = true
|
||||
command = {
|
||||
program = "ocrmypdf"
|
||||
# Default arguments for all processing go below.
|
||||
args = [
|
||||
"-l", "{{lang}}",
|
||||
"{{ocr_lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j", "1",
|
||||
@ -893,4 +933,4 @@ Docpell Update Check
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,5 +6,8 @@
|
||||
"@fortawesome/fontawesome-free": "^6.0.0",
|
||||
"@tailwindcss/forms": "^0.5.0",
|
||||
"flag-icons": "^7.2.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"tailwindcss": "^3.4.1"
|
||||
}
|
||||
}
|
||||
|
@ -30,6 +30,7 @@ type Language
|
||||
| Dutch
|
||||
| Latvian
|
||||
| Japanese
|
||||
| JpnVert
|
||||
| Hebrew
|
||||
| Hungarian
|
||||
| Lithuanian
|
||||
@ -90,6 +91,9 @@ fromString str =
|
||||
else if str == "jpn" || str == "ja" || str == "japanese" then
|
||||
Just Japanese
|
||||
|
||||
else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then
|
||||
Just JpnVert
|
||||
|
||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||
Just Hebrew
|
||||
|
||||
@ -169,6 +173,9 @@ toIso3 lang =
|
||||
Japanese ->
|
||||
"jpn"
|
||||
|
||||
JpnVert ->
|
||||
"jpn_vert"
|
||||
|
||||
Hebrew ->
|
||||
"heb"
|
||||
|
||||
@ -212,6 +219,7 @@ all =
|
||||
, Romanian
|
||||
, Latvian
|
||||
, Japanese
|
||||
, JpnVert
|
||||
, Hebrew
|
||||
, Hungarian
|
||||
, Lithuanian
|
||||
|
@ -65,6 +65,9 @@ gb lang =
|
||||
Japanese ->
|
||||
"Japanese"
|
||||
|
||||
JpnVert ->
|
||||
"JpnVert"
|
||||
|
||||
Hebrew ->
|
||||
"Hebrew"
|
||||
|
||||
@ -141,6 +144,9 @@ de lang =
|
||||
Japanese ->
|
||||
"Japanisch"
|
||||
|
||||
JpnVert ->
|
||||
"JpnVert"
|
||||
|
||||
Hebrew ->
|
||||
"Hebräisch"
|
||||
|
||||
@ -217,6 +223,9 @@ fr lang =
|
||||
Japanese ->
|
||||
"Japonnais"
|
||||
|
||||
JpnVert ->
|
||||
"JpnVert"
|
||||
|
||||
Hebrew ->
|
||||
"Hébreu"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user