Use the ocrmypdf tool to create PDF/A files during conversion

- Use another external tool (ocrmypdf) to convert PDF to PDF; it also adds the extracted text as an additional layer to the PDF.
- Although this is currently not used, the external conversion routine now checks for an existing text file whose name is the PDF file's name with `.txt` appended. If present, it is included in the conversion result and used as the extracted text.
- Text extraction for PDF files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice.
- Errors during conversion are never fatal; processing continues without a converted file.
2025-08-05 02:24:52 +00:00 · 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions
--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@@ -8,7 +8,7 @@ import fs2._

 import docspell.common._
 import docspell.convert.ConversionResult.Handler
-import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
+import docspell.convert.extern._
 import docspell.convert.flexmark.Markdown
 import docspell.files.{ImageSize, TikaMimetype}

@@ -35,7 +35,8 @@ object Conversion {
      ): F[A] =
        TikaMimetype.resolve(dataType, in).flatMap {
          case MimeType.PdfMatch(_) =>
-            handler.run(ConversionResult.successPdf(in))
+            OcrMyPdf
+              .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)

          case MimeType.HtmlMatch(mt) =>
            val cs = mt.charsetOrUtf8
--- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
@@ -1,5 +1,6 @@
 package docspell.convert

+import docspell.convert.extern.OcrMyPdfConfig
 import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
 import docspell.convert.flexmark.MarkdownConfig

@@ -9,5 +10,6 @@ case class ConvertConfig(
    markdown: MarkdownConfig,
    wkhtmlpdf: WkHtmlPdfConfig,
    tesseract: TesseractConfig,
-    unoconv: UnoconvConfig
+    unoconv: UnoconvConfig,
+    ocrmypdf: OcrMyPdfConfig
 )
--- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala
@@ -41,7 +41,7 @@ private[extern] object ExternConv {

        in.through(createInput).flatMap { _ =>
          SystemCommand
-            .execSuccess[F](
+            .exec[F](
              sysCfg,
              blocker,
              logger,
@@ -65,11 +65,20 @@ private[extern] object ExternConv {
      logger: Logger[F]
  )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
    File.existsNonEmpty[F](out).flatMap {
-      case true =>
-        if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
-        else
-          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
+      case true if result.rc == 0 =>
+        val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
+        File.existsNonEmpty[F](outTxt).flatMap {
+          case true =>
+            successPdfTxt(
+              File.readAll(out, blocker, chunkSize),
+              File.readText(outTxt, blocker)
+            ).pure[F]
+          case false =>
            successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
+        }
+      case true if result.rc != 0 =>
+        logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
+          successPdf(File.readAll(out, blocker, chunkSize)).pure[F]

      case false =>
        ConversionResult
--- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
@@ -0,0 +1,37 @@
+package docspell.convert.extern
+
+import java.nio.file.Path
+
+import cats.effect._
+import fs2.Stream
+
+import docspell.common._
+import docspell.convert.ConversionResult
+import docspell.convert.ConversionResult.Handler
+
+object OcrMyPdf {
+
+  def toPDF[F[_]: Sync: ContextShift, A](
+      cfg: OcrMyPdfConfig,
+      lang: Language,
+      chunkSize: Int,
+      blocker: Blocker,
+      logger: Logger[F]
+  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
+    if (cfg.enabled) {
+      val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
+        ExternConv.readResult[F](blocker, chunkSize, logger)
+
+      ExternConv.toPDF[F, A](
+        "ocrmypdf",
+        cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
+        cfg.workingDir,
+        false,
+        blocker,
+        logger,
+        reader
+      )(in, handler)
+    } else
+      handler(ConversionResult.unsupportedFormat(MimeType.pdf))
+
+}
--- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
@@ -0,0 +1,11 @@
+package docspell.convert.extern
+
+import java.nio.file.Path
+
+import docspell.common.SystemCommand
+
+case class OcrMyPdfConfig(
+    enabled: Boolean,
+    command: SystemCommand.Config,
+    workingDir: Path
+)
--- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
+++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
@@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
 import docspell.convert.flexmark.MarkdownConfig
 import docspell.files.{ExampleFiles, TestFiles}
 import minitest.SimpleTestSuite
+import docspell.convert.extern.OcrMyPdfConfig

 object ConversionTest extends SimpleTestSuite with FileChecks {
  val blocker     = TestFiles.blocker
@@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
        Duration.seconds(20)
      ),
      target
+    ),
+    OcrMyPdfConfig(
+      true,
+      SystemCommand.Config(
+        "ocrmypdf",
+        Seq(
+          "-l",
+          "{{lang}}",
+          "--skip-text",
+          "--deskew",
+          "-j",
+          "1",
+          "{{infile}}",
+          "{{outfile}}"
+        ),
+        Duration.seconds(20)
+      ),
+      target
    )
  )