Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf which also adds the extracted text as another layer into the pdf - Although not used, the external conversion routine will now check for an existing text file that is named as the pdf file with extension `.txt`. If present it is included in the conversion result and will be used as the extracted text. - text extraction for pdf files happens now on the converted file, because it may already contain the text from the conversion step and thus avoids running OCR twice. - All errors during conversion are not fatal; processing continues without a converted file.
2025-07-30 19:14:52 +00:00 · 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions
--- a/docker/joex.dockerfile
+++ b/docker/joex.dockerfile
@ -19,6 +19,17 @@ RUN apk add --no-cache openjdk11-jre \
    ttf-dejavu \
    ttf-freefont \
    ttf-liberation \
+    libxml2-dev \
+    libxslt-dev \
+    pngquant \
+    zlib-dev \
+    g++ \
+    qpdf \
+    python3-dev \
+    libffi-dev\
+    qpdf-dev \
+  && pip3 install --upgrade pip \
+  && pip3 install ocrmypdf \
  && curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
  && chmod +x /usr/local/bin/unoconv \
  && ln -s /usr/bin/python3 /usr/bin/python \
@ -27,7 +38,7 @@ RUN apk add --no-cache openjdk11-jre \
  && curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
  && unzip docspell.zip \
  && rm docspell.zip \
-  && apk del curl unzip
+  && apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev

 COPY entrypoint-joex.sh /opt/entrypoint.sh

--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@ -8,7 +8,7 @@ import fs2._

 import docspell.common._
 import docspell.convert.ConversionResult.Handler
-import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
+import docspell.convert.extern._
 import docspell.convert.flexmark.Markdown
 import docspell.files.{ImageSize, TikaMimetype}

@ -35,7 +35,8 @@ object Conversion {
      ): F[A] =
        TikaMimetype.resolve(dataType, in).flatMap {
          case MimeType.PdfMatch(_) =>
-            handler.run(ConversionResult.successPdf(in))
+            OcrMyPdf
+              .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)

          case MimeType.HtmlMatch(mt) =>
            val cs = mt.charsetOrUtf8
--- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
@ -1,5 +1,6 @@
 package docspell.convert

+import docspell.convert.extern.OcrMyPdfConfig
 import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
 import docspell.convert.flexmark.MarkdownConfig

@ -9,5 +10,6 @@ case class ConvertConfig(
    markdown: MarkdownConfig,
    wkhtmlpdf: WkHtmlPdfConfig,
    tesseract: TesseractConfig,
-    unoconv: UnoconvConfig
+    unoconv: UnoconvConfig,
+    ocrmypdf: OcrMyPdfConfig
 )
--- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala
@ -41,7 +41,7 @@ private[extern] object ExternConv {

        in.through(createInput).flatMap { _ =>
          SystemCommand
-            .execSuccess[F](
+            .exec[F](
              sysCfg,
              blocker,
              logger,
@ -65,11 +65,20 @@ private[extern] object ExternConv {
      logger: Logger[F]
  )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
    File.existsNonEmpty[F](out).flatMap {
-      case true =>
-        if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
-        else
-          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
+      case true if result.rc == 0 =>
+        val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
+        File.existsNonEmpty[F](outTxt).flatMap {
+          case true =>
+            successPdfTxt(
+              File.readAll(out, blocker, chunkSize),
+              File.readText(outTxt, blocker)
+            ).pure[F]
+          case false =>
            successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
+        }
+      case true if result.rc != 0 =>
+        logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
+          successPdf(File.readAll(out, blocker, chunkSize)).pure[F]

      case false =>
        ConversionResult
--- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
@ -0,0 +1,37 @@
+package docspell.convert.extern
+
+import java.nio.file.Path
+
+import cats.effect._
+import fs2.Stream
+
+import docspell.common._
+import docspell.convert.ConversionResult
+import docspell.convert.ConversionResult.Handler
+
+object OcrMyPdf {
+
+  /** Converts a PDF into a PDF by running the configured `ocrmypdf`
+    * command through `ExternConv`. If the tool is disabled, `handler`
+    * receives an "unsupported format" result for `MimeType.pdf`.
+    */
+  def toPDF[F[_]: Sync: ContextShift, A](
+      cfg: OcrMyPdfConfig,
+      lang: Language,
+      chunkSize: Int,
+      blocker: Blocker,
+      logger: Logger[F]
+  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
+    if (!cfg.enabled) handler(ConversionResult.unsupportedFormat(MimeType.pdf))
+    else {
+      val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
+        ExternConv.readResult[F](blocker, chunkSize, logger)
+      // the `{{lang}}` placeholder of the configured command becomes `lang.iso3`
+      val command = cfg.command.replace(Map("{{lang}}" -> lang.iso3))
+      ExternConv.toPDF[F, A](
+        "ocrmypdf", command, cfg.workingDir, false, blocker, logger, readOutput
+      )(in, handler)
+    }
+
+}
--- a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
@ -0,0 +1,11 @@
+package docspell.convert.extern
+
+import java.nio.file.Path
+
+import docspell.common.SystemCommand
+
+case class OcrMyPdfConfig( // settings for the external `ocrmypdf` pdf-to-pdf conversion
+    enabled: Boolean, // when false, OcrMyPdf.toPDF does not run the tool and reports pdf as unsupported format
+    command: SystemCommand.Config, // the system command to run; `{{lang}}` is replaced with the language's iso3 code
+    workingDir: Path // passed on to ExternConv.toPDF together with the command
+)
--- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
+++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
 import docspell.convert.flexmark.MarkdownConfig
 import docspell.files.{ExampleFiles, TestFiles}
 import minitest.SimpleTestSuite
+import docspell.convert.extern.OcrMyPdfConfig

 object ConversionTest extends SimpleTestSuite with FileChecks {
  val blocker     = TestFiles.blocker
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
        Duration.seconds(20)
      ),
      target
+    ),
+    OcrMyPdfConfig(
+      true,
+      SystemCommand.Config(
+        "ocrmypdf",
+        Seq(
+          "-l",
+          "{{lang}}",
+          "--skip-text",
+          "--deskew",
+          "-j",
+          "1",
+          "{{infile}}",
+          "{{outfile}}"
+        ),
+        Duration.seconds(20)
+      ),
+      target
    )
  )

--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -339,6 +339,39 @@ docspell.joex {
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }
+
+    # The tool ocrmypdf can be used to convert pdf files to pdf files
+    # in order to add extracted text as a separate layer. This makes
+    # image-only pdfs searchable and you can select and copy/paste the
+    # text. It also converts pdfs into pdf/a type pdfs, which are best
+    # suited for archiving. So it makes sense to use this even for
+    # text-only pdfs.
+    #
+    # It is recommended to install ocrmypdf, but it is also optional.
+    # If it is enabled but fails, the error is not fatal and the
+    # processing will continue using the original pdf for extracting
+    # text. You can also disable it to remove the errors from the
+    # processing logs.
+    #
+    # The `--skip-text` option is necessary to not fail on "text" pdfs
+    # (where ocr is not necessary). In this case, the pdf will be
+    # converted to PDF/A.
+    ocrmypdf = {
+      enabled = true
+      command = {
+        program = "ocrmypdf"
+        args = [
+          "-l", "{{lang}}",
+          "--skip-text",
+          "--deskew",
+          "-j", "1",
+          "{{infile}}",
+          "{{outfile}}"
+        ]
+        timeout = "5 minutes"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
  }

  # General config for processing documents
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -64,10 +64,6 @@ object ConvertPdf {
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
    Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
      mime.toLocal match {
-        case MimeType.PdfMatch(_) =>
-          ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
-            (ra, None: Option[RAttachmentMeta]).pure[F]
-
        case mt =>
          val data = ctx.store.bitpeace
            .get(ra.fileId.id)
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@ -85,9 +85,10 @@ object TextExtraction {
      item: ItemData
  )(ra: RAttachment): F[RAttachmentMeta] =
    for {
-      _   <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
-      dst <- Duration.stopTime[F]
-      txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
+      _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
+      dst  <- Duration.stopTime[F]
+      fids <- filesToExtract(ctx)(item, ra)
+      txt  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
      meta = item.changeMeta(
        ra.id,
        rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
@ -151,11 +152,24 @@ object TextExtraction {

  /** Returns the fileIds to extract text from. First, the source file
    * is tried. If that fails, the converted file is tried.
+    *
+    * If the source file is a PDF, then use the converted file. This
+    * may then already contain the text if ocrmypdf is enabled. If it
+    * is disabled, both files are the same.
    */
-  private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
+  private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
+      item: ItemData,
+      ra: RAttachment
+  ): F[List[Ident]] =
    item.originFile.get(ra.id) match {
-      case Some(sid) => List(sid, ra.fileId).distinct
-      case None      => List(ra.fileId)
+      case Some(sid) =>
+        ctx.store.transact(RFileMeta.findMime(sid)).map {
+          case Some(MimeType.PdfMatch(_)) =>
+            List(ra.fileId)
+          case _ =>
+            List(sid, ra.fileId).distinct
+        }
+      case None => List(ra.fileId).pure[F]
    }

  private def stripAttachmentName(ra: RAttachment): String =
--- a/modules/microsite/docs/dev/adr.md
+++ b/modules/microsite/docs/dev/adr.md
@ -23,3 +23,4 @@ Some early information about certain details can be found in a few
 - [0012 Periodic Tasks](adr/0012_periodic_tasks)
 - [0013 Archive Files](adr/0013_archive_files)
 - [0014 Full-Text Search](adr/0014_fulltext_search_engine)
+- [0015 Convert PDF files](adr/0015_convert_pdf_files)
--- a/modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
+++ b/modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
@ -0,0 +1,67 @@
+---
+layout: docs
+title: Convert PDF Files
+permalink: dev/adr/0015_convert_pdf_files
+---
+
+# {{ page.title }}
+
+## Context and Problem Statement
+
+Some PDFs contain only images (when coming from a scanner) and
+therefore one is not able to click into the pdf and select text for
+copy&paste. Also it is not searchable in a PDF viewer. These are
+really shortcomings that can be fixed, especially when there is
+already OCR built in.
+
+For images, this works already as tesseract is used to create the PDF
+files. Tesseract creates the files with an additional text layer
+containing the OCRed text.
+
+## Considered Options
+
+* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
+  OCR text layer to scanned PDF files, allowing them to be searched
+
+
+### ocrmypdf
+
+This is a very nice python tool, that uses tesseract to do OCR on each
+page and add the extracted text as a pdf text layer to the page.
+Additionally it creates PDF/A type pdfs, which are great for
+archiving. This fixes exactly the things stated above.
+
+#### Integration
+
+Docspell already has this built in for images. When converting images
+to a PDF (which is done early in processing), the process creates a
+text and a PDF file. Docspell then sets the text in this step and the
+text extraction step skips doing its work, if there is already text
+available.
+
+It would be possible to use the `--sidecar` option with ocrmypdf to
+create a text file of the extracted text with one run, too (exactly
+like it works for tesseract). But for "text" pdfs, ocrmypdf writes
+some info-message into this text file:
+
+```
+[OCR skipped on page 1][OCR skipped on page 2]
+```
+
+Docspell cannot reliably tell, whether this is extracted text or not.
+It would be required to load the pdf and check its contents. This is a
+bit of bad luck, because everything would just work already. So it
+requires a (small) change in the text-extraction step. By default,
+text extraction happens on the source file. For PDFs, text extraction
+should now be run on the converted file, to avoid running OCR twice.
+
+The converted pdf file is either a text-pdf in the first place,
+where ocrmypdf would only convert it to a PDF/A file; or it may be a
+converted file containing the OCR-ed text as a pdf layer. If ocrmypdf
+is disabled, the converted file and the source file are the same for
+PDFs.
+
+## Decision Outcome
+
+Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
+distributed under the GPL-3 license.
--- a/modules/microsite/docs/doc/install.md
+++ b/modules/microsite/docs/doc/install.md
@ -77,6 +77,10 @@ component.
  office documents into PDF files. It uses libreoffice/openoffice.
 - [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
  PDF files.
+- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
+  used to convert PDF to PDF files. It adds an OCR layer to scanned
+  PDF files to make them searchable. It also creates PDF/A files from
+  the input pdf.

 The performance of `unoconv` can be improved by starting `unoconv -l`
 in a separate process. This runs a libreoffice/openoffice listener
@ -87,7 +91,7 @@ therefore avoids starting one each time `unoconv` is called.
 On Debian this should install all joex requirements:

 ``` bash
-sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf
+sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
 ```


--- a/modules/microsite/docs/features.md
+++ b/modules/microsite/docs/features.md
@ -13,7 +13,9 @@ permalink: features
 - OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
 - [Full-Text Search](doc/finding#full-text-search) based on [Apache
  SOLR](https://lucene.apache.org/solr)
- Conversion to PDF: all files are converted into a PDF file
+- Conversion to PDF: all files are converted into a PDF file. PDFs
+  with only images (as often returned from scanners) are converted
+  into searchable PDF/A pdfs.
 - Non-destructive: all your uploaded files are never modified and can
  always be downloaded untouched
 - Text is analysed to find and attach meta data automatically
--- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
@ -3,8 +3,10 @@ package docspell.store.records
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
+import docspell.store.syntax.MimeTypes._

 import bitpeace.FileMeta
+import bitpeace.Mimetype
 import doobie._
 import doobie.implicits._

@ -30,4 +32,13 @@ object RFileMeta {

    selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
  }
+
+  def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
+    import bitpeace.sql._
+
+    // Select only the mimetype column of the file with the given id.
+    val mimeOfFile = selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid))
+    // Convert the stored bitpeace `Mimetype`, if any, via `toLocal`.
+    mimeOfFile.query[Mimetype].option.map(found => found.map(mt => mt.toLocal))
+  }
 }
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@ -131,6 +131,23 @@ let
        };
        working-dir = "/tmp/docspell-convert";
      };
+
+      ocrmypdf = {
+        enabled = true;
+        command = {
+          program = "${pkgs.ocrmypdf}/bin/ocrmypdf";
+          args = [
+          "-l" "{{lang}}"
+          "--skip-text"
+          "--deskew"
+          "-j" "1"
+          "{{infile}}"
+          "{{outfile}}"
+          ];
+          timeout = "5 minutes";
+        };
+        working-dir = "/tmp/docspell-convert";
+      };
    };
    files = {
      chunk-size = 524288;
@ -860,6 +877,66 @@ in {
                process.
              '';
            };
+
+            ocrmypdf = mkOption {
+              type = types.submodule({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.convert.ocrmypdf.enabled;
+                    description = "Whether to use ocrmypdf to convert pdf to pdf/a.";
+                  };
+                  working-dir = mkOption {
+                    type = types.str;
+                    default = defaults.convert.ocrmypdf.working-dir;
+                    description = "Directory where the conversion processes can put their temp files";
+                  };
+                  command = mkOption {
+                    type = types.submodule({
+                      options = {
+                        program = mkOption {
+                          type = types.str;
+                          default = defaults.convert.ocrmypdf.command.program;
+                          description = "The path to the executable.";
+                        };
+                        args = mkOption {
+                          type = types.listOf types.str;
+                          default = defaults.convert.ocrmypdf.command.args;
+                          description = "The arguments to the program";
+                        };
+                        timeout = mkOption {
+                          type = types.str;
+                          default = defaults.convert.ocrmypdf.command.timeout;
+                          description = "The timeout when executing the command";
+                        };
+                      };
+                    });
+                    default = defaults.convert.ocrmypdf.command;
+                    description = "The system command";
+                  };
+                };
+              });
+              default = defaults.convert.ocrmypdf;
+              description = ''
+                The tool ocrmypdf can be used to convert pdf files to pdf files
+                in order to add extracted text as a separate layer. This makes
+                image-only pdfs searchable and you can select and copy/paste the
+                text. It also converts pdfs into pdf/a type pdfs, which are best
+                suited for archiving. So it makes sense to use this even for
+                text-only pdfs.

+                It is recommended to install ocrmypdf, but it also is optional.
+                If it is enabled but fails, the error is not fatal and the
+                processing will continue using the original pdf for extracting
+                text. You can also disable it to remove the errors from the
+                processing logs.

+                The `--skip-text` option is necessary to not fail on "text" pdfs
+                (where ocr is not necessary). In this case, the pdf will be
+                converted to PDF/A.
+              '';
+            };
+
          };
        });
        default = defaults.convert;