Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505)

* Add Japanese Vertical Support * Adds Japanese Vertical mappings to default configuration.
2025-10-16 21:01:51 +00:00 · 2024-04-16 18:24:57 +00:00
parent 36c00cc9ec
commit e731d822dc
9 changed files with 75 additions and 5 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -593,13 +593,32 @@ Docpell Update Check
    # To convert image files to PDF files, tesseract is used. This
    # also extracts the text in one go.
    tesseract = {
+    # Custom argument mappings for this program: the document language
+    # selects the tesseract arguments (includes Japanese vertical, jpn_vert).
+    arg-mappings = {
+      "tesseract_lang" = {
+        value = "{{lang}}"
+        mappings = [
+          {
+            matches = "jpn_vert"
+            args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
+          },
+          # Add other custom language mappings here.
+          # Default mapping below: its pattern ".*" matches any language.
+          {
+            matches = ".*"
+            args = [ "-l", "{{lang}}" ] 
+          }
+        ]
+      }
+    }
      command = {
        program = "tesseract"
+        # Default arguments for all processing go below.
        args = [
          "{{infile}}",
          "out",
-          "-l",
-          "{{lang}}",
+          "{{tesseract_lang}}",
          "pdf",
          "txt"
        ]
@@ -648,11 +667,32 @@ Docpell Update Check
    # (where ocr is not necessary). In this case, the pdf will be
    # converted to PDF/A.
    ocrmypdf = {
+    # Custom argument mappings for this program.
+    arg-mappings = {
+      "ocr_lang" = {
+        value = "{{lang}}"
+        # Custom Language Mappings Below
+        # Japanese Vertical Mapping
+        mappings = [
+          {
+            matches = "jpn_vert"
+            args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
+          },
+          # Add other custom language mappings here.
+          # Default mapping below: its pattern ".*" matches any language.
+          {
+            matches = ".*"
+            args = [ "-l", "{{lang}}" ]
+          }
+        ]
+      }
+    }
      enabled = true
      command = {
        program = "ocrmypdf"
+        # Default arguments for all processing go below.
        args = [
-          "-l", "{{lang}}",
+          "{{ocr_lang}}",
          "--skip-text",
          "--deskew",
          "-j", "1",
@@ -893,4 +933,4 @@ Docpell Update Check
      }
    }
  }
-}
+}