mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)
* Add Japanese Vertical Support * Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
@ -77,7 +77,7 @@ RUN \
|
|||||||
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
|
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
|
||||||
mv khm.traineddata /usr/share/tessdata
|
mv khm.traineddata /usr/share/tessdata
|
||||||
|
|
||||||
# Using these data files for japanese, because they work better. See #973
|
# Using these data files for Japanese, because they work better. Includes vertical data. See #973 and #2445.
|
||||||
RUN \
|
RUN \
|
||||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
|
||||||
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \
|
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \
|
||||||
|
@ -125,6 +125,7 @@ object DateFind {
|
|||||||
case Language.Dutch => dmy.or(ymd).or(mdy)
|
case Language.Dutch => dmy.or(ymd).or(mdy)
|
||||||
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
||||||
case Language.Japanese => ymd
|
case Language.Japanese => ymd
|
||||||
|
case Language.JpnVert => ymd
|
||||||
case Language.Hebrew => dmy
|
case Language.Hebrew => dmy
|
||||||
case Language.Lithuanian => ymd
|
case Language.Lithuanian => ymd
|
||||||
case Language.Polish => dmy
|
case Language.Polish => dmy
|
||||||
|
@ -54,6 +54,8 @@ object MonthName {
|
|||||||
latvian
|
latvian
|
||||||
case Language.Japanese =>
|
case Language.Japanese =>
|
||||||
japanese
|
japanese
|
||||||
|
case Language.JpnVert =>
|
||||||
|
japanese
|
||||||
case Language.Hebrew =>
|
case Language.Hebrew =>
|
||||||
hebrew
|
hebrew
|
||||||
case Language.Lithuanian =>
|
case Language.Lithuanian =>
|
||||||
|
@ -123,6 +123,11 @@ object Language {
|
|||||||
val iso3 = "jpn"
|
val iso3 = "jpn"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This is not an ISO value, but it needs to be unique, and tesseract needs jpn_vert for its scan, as set in the config at /etc/docspell-joex/docspell-joex.conf. */
|
||||||
|
case object JpnVert extends Language {
|
||||||
|
val iso2 = "ja_vert"
|
||||||
|
val iso3 = "jpn_vert"
|
||||||
|
}
|
||||||
case object Hebrew extends Language {
|
case object Hebrew extends Language {
|
||||||
val iso2 = "he"
|
val iso2 = "he"
|
||||||
val iso3 = "heb"
|
val iso3 = "heb"
|
||||||
@ -172,6 +177,7 @@ object Language {
|
|||||||
Romanian,
|
Romanian,
|
||||||
Latvian,
|
Latvian,
|
||||||
Japanese,
|
Japanese,
|
||||||
|
JpnVert,
|
||||||
Hebrew,
|
Hebrew,
|
||||||
Lithuanian,
|
Lithuanian,
|
||||||
Polish,
|
Polish,
|
||||||
|
@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta {
|
|||||||
case Language.Czech => "simple"
|
case Language.Czech => "simple"
|
||||||
case Language.Latvian => "simple"
|
case Language.Latvian => "simple"
|
||||||
case Language.Japanese => "simple"
|
case Language.Japanese => "simple"
|
||||||
|
case Language.JpnVert => "simple"
|
||||||
case Language.Hebrew => "simple"
|
case Language.Hebrew => "simple"
|
||||||
case Language.Lithuanian => "simple"
|
case Language.Lithuanian => "simple"
|
||||||
case Language.Polish => "simple"
|
case Language.Polish => "simple"
|
||||||
|
@ -593,13 +593,32 @@ Docpell Update Check
|
|||||||
# To convert image files to PDF files, tesseract is used. This
|
# To convert image files to PDF files, tesseract is used. This
|
||||||
# also extracts the text in one go.
|
# also extracts the text in one go.
|
||||||
tesseract = {
|
tesseract = {
|
||||||
|
# Custom Language Mappings Below
|
||||||
|
# Japanese Vertical Mapping
|
||||||
|
arg-mappings = {
|
||||||
|
"tesseract_lang" = {
|
||||||
|
value = "{{lang}}"
|
||||||
|
mappings = [
|
||||||
|
{
|
||||||
|
matches = "jpn_vert"
|
||||||
|
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
|
||||||
|
},
|
||||||
|
# Start Other Custom Language Mappings Here
|
||||||
|
# Default Mapping Below
|
||||||
|
{
|
||||||
|
matches = ".*"
|
||||||
|
args = [ "-l", "{{lang}}" ]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
command = {
|
command = {
|
||||||
program = "tesseract"
|
program = "tesseract"
|
||||||
|
# Default arguments for all processing go below.
|
||||||
args = [
|
args = [
|
||||||
"{{infile}}",
|
"{{infile}}",
|
||||||
"out",
|
"out",
|
||||||
"-l",
|
"{{tesseract_lang}}",
|
||||||
"{{lang}}",
|
|
||||||
"pdf",
|
"pdf",
|
||||||
"txt"
|
"txt"
|
||||||
]
|
]
|
||||||
@ -648,11 +667,32 @@ Docpell Update Check
|
|||||||
# (where ocr is not necessary). In this case, the pdf will be
|
# (where ocr is not necessary). In this case, the pdf will be
|
||||||
# converted to PDF/A.
|
# converted to PDF/A.
|
||||||
ocrmypdf = {
|
ocrmypdf = {
|
||||||
|
# Custom argument mappings for this program.
|
||||||
|
arg-mappings = {
|
||||||
|
"ocr_lang" = {
|
||||||
|
value = "{{lang}}"
|
||||||
|
# Custom Language Mappings Below
|
||||||
|
# Japanese Vertical Mapping
|
||||||
|
mappings = [
|
||||||
|
{
|
||||||
|
matches = "jpn_vert"
|
||||||
|
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
|
||||||
|
},
|
||||||
|
# Start Other Custom Language Mappings Here
|
||||||
|
# Default Mapping Below
|
||||||
|
{
|
||||||
|
matches = ".*"
|
||||||
|
args = [ "-l", "{{lang}}" ]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
enabled = true
|
enabled = true
|
||||||
command = {
|
command = {
|
||||||
program = "ocrmypdf"
|
program = "ocrmypdf"
|
||||||
|
# Default arguments for all processing go below.
|
||||||
args = [
|
args = [
|
||||||
"-l", "{{lang}}",
|
"{{ocr_lang}}",
|
||||||
"--skip-text",
|
"--skip-text",
|
||||||
"--deskew",
|
"--deskew",
|
||||||
"-j", "1",
|
"-j", "1",
|
||||||
@ -893,4 +933,4 @@ Docpell Update Check
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,5 +6,8 @@
|
|||||||
"@fortawesome/fontawesome-free": "^6.0.0",
|
"@fortawesome/fontawesome-free": "^6.0.0",
|
||||||
"@tailwindcss/forms": "^0.5.0",
|
"@tailwindcss/forms": "^0.5.0",
|
||||||
"flag-icons": "^7.2.0"
|
"flag-icons": "^7.2.0"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"tailwindcss": "^3.4.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,7 @@ type Language
|
|||||||
| Dutch
|
| Dutch
|
||||||
| Latvian
|
| Latvian
|
||||||
| Japanese
|
| Japanese
|
||||||
|
| JpnVert
|
||||||
| Hebrew
|
| Hebrew
|
||||||
| Hungarian
|
| Hungarian
|
||||||
| Lithuanian
|
| Lithuanian
|
||||||
@ -90,6 +91,9 @@ fromString str =
|
|||||||
else if str == "jpn" || str == "ja" || str == "japanese" then
|
else if str == "jpn" || str == "ja" || str == "japanese" then
|
||||||
Just Japanese
|
Just Japanese
|
||||||
|
|
||||||
|
else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then
|
||||||
|
Just JpnVert
|
||||||
|
|
||||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||||
Just Hebrew
|
Just Hebrew
|
||||||
|
|
||||||
@ -169,6 +173,9 @@ toIso3 lang =
|
|||||||
Japanese ->
|
Japanese ->
|
||||||
"jpn"
|
"jpn"
|
||||||
|
|
||||||
|
JpnVert ->
|
||||||
|
"jpn_vert"
|
||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"heb"
|
"heb"
|
||||||
|
|
||||||
@ -212,6 +219,7 @@ all =
|
|||||||
, Romanian
|
, Romanian
|
||||||
, Latvian
|
, Latvian
|
||||||
, Japanese
|
, Japanese
|
||||||
|
, JpnVert
|
||||||
, Hebrew
|
, Hebrew
|
||||||
, Hungarian
|
, Hungarian
|
||||||
, Lithuanian
|
, Lithuanian
|
||||||
|
@ -65,6 +65,9 @@ gb lang =
|
|||||||
Japanese ->
|
Japanese ->
|
||||||
"Japanese"
|
"Japanese"
|
||||||
|
|
||||||
|
JpnVert ->
|
||||||
|
"JpnVert"
|
||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebrew"
|
"Hebrew"
|
||||||
|
|
||||||
@ -141,6 +144,9 @@ de lang =
|
|||||||
Japanese ->
|
Japanese ->
|
||||||
"Japanisch"
|
"Japanisch"
|
||||||
|
|
||||||
|
JpnVert ->
|
||||||
|
"JpnVert"
|
||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebräisch"
|
"Hebräisch"
|
||||||
|
|
||||||
@ -217,6 +223,9 @@ fr lang =
|
|||||||
Japanese ->
|
Japanese ->
|
||||||
"Japonnais"
|
"Japonnais"
|
||||||
|
|
||||||
|
JpnVert ->
|
||||||
|
"JpnVert"
|
||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hébreu"
|
"Hébreu"
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user