Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)

* Add Japanese Vertical Support 
* Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
tenpai
2024-04-16 18:24:57 +00:00
committed by GitHub
parent 36c00cc9ec
commit e731d822dc
9 changed files with 75 additions and 5 deletions

View File

@ -593,13 +593,32 @@ Docpell Update Check
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = {
# Custom argument mappings for tesseract. Each entry defines a named
# placeholder; "tesseract_lang" is referenced as "{{tesseract_lang}}"
# in the `args` of the command below.
arg-mappings = {
"tesseract_lang" = {
# NOTE(review): presumably this is the value the `matches` patterns
# are tested against - confirm.
value = "{{lang}}"
# NOTE(review): the catch-all ".*" entry is listed last, so the order
# of the entries presumably matters - confirm before reordering.
mappings = [
# Japanese vertical mapping: passes "-l jpn_vert" and additionally
# sets the tesseract config variable preserve_interword_spaces=1.
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
},
# Start other custom language mappings here.
# Default mapping: passes the language through unchanged as "-l <lang>".
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
command = {
program = "tesseract"
# Default arguments for all processing go below.
args = [
"{{infile}}",
"out",
"-l",
"{{lang}}",
"{{tesseract_lang}}",
"pdf",
"txt"
]
@ -648,11 +667,32 @@ Docpell Update Check
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
# Custom argument mappings for ocrmypdf. Each entry defines a named
# placeholder; "ocr_lang" is referenced as "{{ocr_lang}}" in the `args`
# of the command below.
arg-mappings = {
"ocr_lang" = {
# NOTE(review): presumably this is the value the `matches` patterns
# are tested against - confirm.
value = "{{lang}}"
# Custom language mappings.
# NOTE(review): the catch-all ".*" entry is listed last, so the order
# of the entries presumably matters - confirm before reordering.
mappings = [
# Japanese vertical mapping: passes "-l jpn_vert" together with
# "--pdf-renderer sandwich" and "--tesseract-pagesegmode 5".
# NOTE(review): page segmentation mode 5 should be tesseract's
# "single uniform block of vertically aligned text" - confirm.
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
},
# Start other custom language mappings here.
# Default mapping: passes the language through unchanged as "-l <lang>".
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
enabled = true
command = {
program = "ocrmypdf"
# Default arguments for all processing go below.
args = [
"-l", "{{lang}}",
"{{ocr_lang}}",
"--skip-text",
"--deskew",
"-j", "1",
@ -893,4 +933,4 @@ Docpell Update Check
}
}
}
}
}