mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)
* Add Japanese Vertical Support * Adds Japanese Vertical mappings to default configuration.
This commit is contained in:
@ -593,13 +593,32 @@ Docpell Update Check
|
||||
# To convert image files to PDF files, tesseract is used. This
|
||||
# also extracts the text in one go.
|
||||
tesseract = {
|
||||
# Custom language mappings for the {{tesseract_lang}} argument are defined below.
|
||||
# Includes a mapping for vertical Japanese text (jpn_vert).
|
||||
arg-mappings = {
|
||||
"tesseract_lang" = {
|
||||
value = "{{lang}}"
|
||||
mappings = [
|
||||
{
|
||||
matches = "jpn_vert"
|
||||
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
|
||||
},
|
||||
# Add other custom language mappings here, before the default mapping.
|
||||
# Default mapping: the pattern ".*" matches any language.
|
||||
{
|
||||
matches = ".*"
|
||||
args = [ "-l", "{{lang}}" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
command = {
|
||||
program = "tesseract"
|
||||
# Default arguments for all processing go below.
|
||||
args = [
|
||||
"{{infile}}",
|
||||
"out",
|
||||
"-l",
|
||||
"{{lang}}",
|
||||
"{{tesseract_lang}}",
|
||||
"pdf",
|
||||
"txt"
|
||||
]
|
||||
@ -648,11 +667,32 @@ Docpell Update Check
|
||||
# (where ocr is not necessary). In this case, the pdf will be
|
||||
# converted to PDF/A.
|
||||
ocrmypdf = {
|
||||
# Custom argument mappings for this program.
|
||||
arg-mappings = {
|
||||
"ocr_lang" = {
|
||||
value = "{{lang}}"
|
||||
# Custom language mappings for the {{ocr_lang}} argument are defined below.
|
||||
# Includes a mapping for vertical Japanese text (jpn_vert).
|
||||
mappings = [
|
||||
{
|
||||
matches = "jpn_vert"
|
||||
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
|
||||
},
|
||||
# Add other custom language mappings here, before the default mapping.
|
||||
# Default mapping: the pattern ".*" matches any language.
|
||||
{
|
||||
matches = ".*"
|
||||
args = [ "-l", "{{lang}}" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
enabled = true
|
||||
command = {
|
||||
program = "ocrmypdf"
|
||||
# Default arguments for all processing go below.
|
||||
args = [
|
||||
"-l", "{{lang}}",
|
||||
"{{ocr_lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j", "1",
|
||||
@ -893,4 +933,4 @@ Docpell Update Check
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user