Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)

* Add Japanese Vertical Support 
* Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
tenpai
2024-04-16 18:24:57 +00:00
committed by GitHub
parent 36c00cc9ec
commit e731d822dc
9 changed files with 75 additions and 5 deletions

View File

@ -77,7 +77,7 @@ RUN \
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \ wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
mv khm.traineddata /usr/share/tessdata mv khm.traineddata /usr/share/tessdata
# Using these data files for japanese, because they work better. See #973 # Using these data files for japanese, because they work better. Includes vertical data. See #973 and #2445.
RUN \ RUN \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \

View File

@ -125,6 +125,7 @@ object DateFind {
case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy)
case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd case Language.Japanese => ymd
case Language.JpnVert => ymd
case Language.Hebrew => dmy case Language.Hebrew => dmy
case Language.Lithuanian => ymd case Language.Lithuanian => ymd
case Language.Polish => dmy case Language.Polish => dmy

View File

@ -54,6 +54,8 @@ object MonthName {
latvian latvian
case Language.Japanese => case Language.Japanese =>
japanese japanese
case Language.JpnVert =>
japanese
case Language.Hebrew => case Language.Hebrew =>
hebrew hebrew
case Language.Lithuanian => case Language.Lithuanian =>

View File

@ -123,6 +123,11 @@ object Language {
val iso3 = "jpn" val iso3 = "jpn"
} }
/* Japanese with vertical text layout.
 *
 * Neither code below is a real ISO value; each only needs to be unique among
 * the languages. `jpn_vert` is the name tesseract needs for its scan, as used
 * in the joex configuration (/etc/docspell-joex/docspell-joex.conf).
 */
case object JpnVert extends Language {
val iso2 = "ja_vert"
val iso3 = "jpn_vert"
}
case object Hebrew extends Language { case object Hebrew extends Language {
val iso2 = "he" val iso2 = "he"
val iso3 = "heb" val iso3 = "heb"
@ -172,6 +177,7 @@ object Language {
Romanian, Romanian,
Latvian, Latvian,
Japanese, Japanese,
JpnVert,
Hebrew, Hebrew,
Lithuanian, Lithuanian,
Polish, Polish,

View File

@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta {
case Language.Czech => "simple" case Language.Czech => "simple"
case Language.Latvian => "simple" case Language.Latvian => "simple"
case Language.Japanese => "simple" case Language.Japanese => "simple"
case Language.JpnVert => "simple"
case Language.Hebrew => "simple" case Language.Hebrew => "simple"
case Language.Lithuanian => "simple" case Language.Lithuanian => "simple"
case Language.Polish => "simple" case Language.Polish => "simple"

View File

@ -593,13 +593,32 @@ Docpell Update Check
# To convert image files to PDF files, tesseract is used. This # To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go. # also extracts the text in one go.
tesseract = { tesseract = {
# Custom language mappings for tesseract.
#
# Defines the "tesseract_lang" mapping: its value is "{{lang}}", and each
# entry in `mappings` pairs a `matches` pattern with the `args` to use.
# NOTE(review): presumably the first entry whose pattern matches the value
# supplies the arguments substituted for "{{tesseract_lang}}" in the command
# below -- confirm against the joex documentation.
arg-mappings = {
"tesseract_lang" = {
value = "{{lang}}"
mappings = [
# Japanese vertical mapping: selects the jpn_vert language data and
# passes an extra tesseract config variable via "-c".
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
},
# Add other custom language mappings here, before the default.
# Default mapping: matches any language and passes it through via "-l".
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
command = { command = {
program = "tesseract" program = "tesseract"
# Default arguments for all processing go below.
args = [ args = [
"{{infile}}", "{{infile}}",
"out", "out",
"-l", "{{tesseract_lang}}",
"{{lang}}",
"pdf", "pdf",
"txt" "txt"
] ]
@ -648,11 +667,32 @@ Docpell Update Check
# (where ocr is not necessary). In this case, the pdf will be # (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A. # converted to PDF/A.
ocrmypdf = { ocrmypdf = {
# Custom argument mappings for ocrmypdf.
#
# Defines the "ocr_lang" mapping: its value is "{{lang}}", and each entry in
# `mappings` pairs a `matches` pattern with the `args` to use.
# NOTE(review): presumably the first entry whose pattern matches the value
# supplies the arguments substituted for "{{ocr_lang}}" in the command
# below -- confirm against the joex documentation.
arg-mappings = {
"ocr_lang" = {
value = "{{lang}}"
# Custom language mappings follow.
mappings = [
# Japanese vertical mapping: selects the jpn_vert language data and adds
# renderer and page-segmentation options for this language only.
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
},
# Add other custom language mappings here, before the default.
# Default mapping: matches any language and passes it through via "-l".
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
enabled = true enabled = true
command = { command = {
program = "ocrmypdf" program = "ocrmypdf"
# Default arguments for all processing go below.
args = [ args = [
"-l", "{{lang}}", "{{ocr_lang}}",
"--skip-text", "--skip-text",
"--deskew", "--deskew",
"-j", "1", "-j", "1",
@ -893,4 +933,4 @@ Docpell Update Check
} }
} }
} }
} }

View File

@ -6,5 +6,8 @@
"@fortawesome/fontawesome-free": "^6.0.0", "@fortawesome/fontawesome-free": "^6.0.0",
"@tailwindcss/forms": "^0.5.0", "@tailwindcss/forms": "^0.5.0",
"flag-icons": "^7.2.0" "flag-icons": "^7.2.0"
},
"dependencies": {
"tailwindcss": "^3.4.1"
} }
} }

View File

@ -30,6 +30,7 @@ type Language
| Dutch | Dutch
| Latvian | Latvian
| Japanese | Japanese
| JpnVert
| Hebrew | Hebrew
| Hungarian | Hungarian
| Lithuanian | Lithuanian
@ -90,6 +91,9 @@ fromString str =
else if str == "jpn" || str == "ja" || str == "japanese" then else if str == "jpn" || str == "ja" || str == "japanese" then
Just Japanese Just Japanese
else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then
Just JpnVert
else if str == "heb" || str == "he" || str == "hebrew" then else if str == "heb" || str == "he" || str == "hebrew" then
Just Hebrew Just Hebrew
@ -169,6 +173,9 @@ toIso3 lang =
Japanese -> Japanese ->
"jpn" "jpn"
JpnVert ->
"jpn_vert"
Hebrew -> Hebrew ->
"heb" "heb"
@ -212,6 +219,7 @@ all =
, Romanian , Romanian
, Latvian , Latvian
, Japanese , Japanese
, JpnVert
, Hebrew , Hebrew
, Hungarian , Hungarian
, Lithuanian , Lithuanian

View File

@ -65,6 +65,9 @@ gb lang =
Japanese -> Japanese ->
"Japanese" "Japanese"
JpnVert ->
"JpnVert"
Hebrew -> Hebrew ->
"Hebrew" "Hebrew"
@ -141,6 +144,9 @@ de lang =
Japanese -> Japanese ->
"Japanisch" "Japanisch"
JpnVert ->
"JpnVert"
Hebrew -> Hebrew ->
"Hebräisch" "Hebräisch"
@ -217,6 +223,9 @@ fr lang =
Japanese -> Japanese ->
"Japonnais" "Japonnais"
JpnVert ->
"JpnVert"
Hebrew -> Hebrew ->
"Hébreu" "Hébreu"