Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf, which also adds the extracted text as another layer into the pdf. - Although not used, the external conversion routine will now check for an existing text file that is named like the pdf file but with the extension `.txt`. If present, it is included in the conversion result and will be used as the extracted text. - Text extraction for pdf files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice. - Errors during conversion are not fatal; processing continues without a converted file.
2025-09-15 21:46:53 +00:00 · 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -339,6 +339,39 @@ docspell.joex {
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }
+
+    # The tool ocrmypdf can be used to convert pdf files to pdf files
+    # in order to add extracted text as a separate layer. This makes
+    # image-only pdfs searchable and you can select and copy/paste the
+    # text. It also converts pdfs into pdf/a type pdfs, which are best
+    # suited for archiving. So it makes sense to use this even for
+    # text-only pdfs.
+    #
+    # It is recommended to install ocrmypdf, but it is optional.
+    # If it is enabled but fails, the error is not fatal and the
+    # processing will continue using the original pdf for extracting
+    # text. You can also disable it to remove the errors from the
+    # processing logs.
+    #
+    # The `--skip-text` option is necessary to not fail on "text" pdfs
+    # (where ocr is not necessary). In this case, the pdf will be
+    # converted to PDF/A.
+    ocrmypdf = {
+      enabled = true
+      command = {
+        program = "ocrmypdf"
+        args = [
+          "-l", "{{lang}}",
+          "--skip-text",
+          "--deskew",
+          "-j", "1",
+          "{{infile}}",
+          "{{outfile}}"
+        ]
+        timeout = "5 minutes"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
  }

  # General config for processing documents