Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)

* Add Japanese Vertical Support 
* Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
tenpai 2024-04-16 18:24:57 +00:00 committed by GitHub
parent 36c00cc9ec
commit e731d822dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 75 additions and 5 deletions

View File

@ -77,7 +77,7 @@ RUN \
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
mv khm.traineddata /usr/share/tessdata
# Using these data files for japanese, because they work better. See #973
# Using these data files for Japanese because they work better; they include the vertical-text data. See #973 and #2445.
RUN \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \

View File

@ -125,6 +125,7 @@ object DateFind {
case Language.Dutch => dmy.or(ymd).or(mdy)
case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd
case Language.JpnVert => ymd
case Language.Hebrew => dmy
case Language.Lithuanian => ymd
case Language.Polish => dmy

View File

@ -54,6 +54,8 @@ object MonthName {
latvian
case Language.Japanese =>
japanese
case Language.JpnVert =>
japanese
case Language.Hebrew =>
hebrew
case Language.Lithuanian =>

View File

@ -123,6 +123,11 @@ object Language {
val iso3 = "jpn"
}
/* Japanese vertical text. "ja_vert" and "jpn_vert" are not ISO values, but the
 * values need to be unique, and tesseract needs "jpn_vert" as the language for
 * its scan, taken from the config in /etc/docspell-joex/docspell-joex.conf. */
case object JpnVert extends Language {
val iso2 = "ja_vert"
val iso3 = "jpn_vert"
}
case object Hebrew extends Language {
val iso2 = "he"
val iso3 = "heb"
@ -172,6 +177,7 @@ object Language {
Romanian,
Latvian,
Japanese,
JpnVert,
Hebrew,
Lithuanian,
Polish,

View File

@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta {
case Language.Czech => "simple"
case Language.Latvian => "simple"
case Language.Japanese => "simple"
case Language.JpnVert => "simple"
case Language.Hebrew => "simple"
case Language.Lithuanian => "simple"
case Language.Polish => "simple"

View File

@ -593,13 +593,32 @@ Docpell Update Check
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = {
# Custom Language Mappings Below
# Japanese Vertical Mapping
arg-mappings = {
"tesseract_lang" = {
value = "{{lang}}"
mappings = [
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
},
# Start Other Custom Language Mappings Here
# Default Mapping Below
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
command = {
program = "tesseract"
# Default arguments for all processing go below.
args = [
"{{infile}}",
"out",
"-l",
"{{lang}}",
"{{tesseract_lang}}",
"pdf",
"txt"
]
@ -648,11 +667,32 @@ Docpell Update Check
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
# Custom argument mappings for this program.
arg-mappings = {
"ocr_lang" = {
value = "{{lang}}"
# Custom Language Mappings Below
# Japanese Vertical Mapping
mappings = [
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
},
# Start Other Custom Language Mappings Here
# Default Mapping Below
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
enabled = true
command = {
program = "ocrmypdf"
# Default arguments for all processing go below.
args = [
"-l", "{{lang}}",
"{{ocr_lang}}",
"--skip-text",
"--deskew",
"-j", "1",
@ -893,4 +933,4 @@ Docpell Update Check
}
}
}
}
}

View File

@ -6,5 +6,8 @@
"@fortawesome/fontawesome-free": "^6.0.0",
"@tailwindcss/forms": "^0.5.0",
"flag-icons": "^7.2.0"
},
"dependencies": {
"tailwindcss": "^3.4.1"
}
}

View File

@ -30,6 +30,7 @@ type Language
| Dutch
| Latvian
| Japanese
| JpnVert
| Hebrew
| Hungarian
| Lithuanian
@ -90,6 +91,9 @@ fromString str =
else if str == "jpn" || str == "ja" || str == "japanese" then
Just Japanese
else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then
Just JpnVert
else if str == "heb" || str == "he" || str == "hebrew" then
Just Hebrew
@ -169,6 +173,9 @@ toIso3 lang =
Japanese ->
"jpn"
JpnVert ->
"jpn_vert"
Hebrew ->
"heb"
@ -212,6 +219,7 @@ all =
, Romanian
, Latvian
, Japanese
, JpnVert
, Hebrew
, Hungarian
, Lithuanian

View File

@ -65,6 +65,9 @@ gb lang =
Japanese ->
"Japanese"
JpnVert ->
"JpnVert"
Hebrew ->
"Hebrew"
@ -141,6 +144,9 @@ de lang =
Japanese ->
"Japanisch"
JpnVert ->
"JpnVert"
Hebrew ->
"Hebräisch"
@ -217,6 +223,9 @@ fr lang =
Japanese ->
"Japonnais"
JpnVert ->
"JpnVert"
Hebrew ->
"Hébreu"