Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf, which also adds the extracted text as another layer into the pdf. - Although not used, the external conversion routine will now check for an existing text file that has the same name as the pdf file but with the extension `.txt`. If present, it is included in the conversion result and will be used as the extracted text. - Text extraction for pdf files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice. - Errors during conversion are never fatal; processing continues without a converted file.
2025-08-05 02:24:52 +00:00 · 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@ -339,6 +339,39 @@ docspell.joex {
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }
+
+    # The tool ocrmypdf can be used to convert pdf files to pdf files
+    # in order to add extracted text as a separate layer. This makes
+    # image-only pdfs searchable and you can select and copy/paste the
+    # text. It also converts pdfs into pdf/a type pdfs, which are best
+    # suited for archiving. So it makes sense to use this even for
+    # text-only pdfs.
+    #
+    # It is recommended to install ocrmypdf, but it is optional.
+    # If it is enabled but fails, the error is not fatal and the
+    # processing will continue using the original pdf for extracting
+    # text. You can also disable it to remove the errors from the
+    # processing logs.
+    #
+    # The `--skip-text` option is necessary to not fail on "text" pdfs
+    # (where ocr is not necessary). In this case, the pdf will be
+    # converted to PDF/A.
+    ocrmypdf = {
+      enabled = true
+      command = {
+        program = "ocrmypdf"
+        args = [
+          "-l", "{{lang}}",
+          "--skip-text",
+          "--deskew",
+          "-j", "1",
+          "{{infile}}",
+          "{{outfile}}"
+        ]
+        timeout = "5 minutes"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
  }

  # General config for processing documents
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -64,10 +64,6 @@ object ConvertPdf {
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
    Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
      mime.toLocal match {
-        case MimeType.PdfMatch(_) =>
-          ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
-            (ra, None: Option[RAttachmentMeta]).pure[F]
-
        case mt =>
          val data = ctx.store.bitpeace
            .get(ra.fileId.id)
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@ -85,9 +85,10 @@ object TextExtraction {
      item: ItemData
  )(ra: RAttachment): F[RAttachmentMeta] =
    for {
-      _   <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
-      dst <- Duration.stopTime[F]
-      txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
+      _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
+      dst  <- Duration.stopTime[F]
+      fids <- filesToExtract(ctx)(item, ra)
+      txt  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
      meta = item.changeMeta(
        ra.id,
        rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
@ -151,11 +152,24 @@ object TextExtraction {

  /** Returns the fileIds to extract text from. First, the source file
    * is tried. If that fails, the converted file is tried.
+    *
+    * If the source file is a PDF, then use the converted file. This
+    * may then already contain the text if ocrmypdf is enabled. If it
+    * is disabled, both files are the same.
    */
-  private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
+  private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
+      item: ItemData,
+      ra: RAttachment
+  ): F[List[Ident]] =
    item.originFile.get(ra.id) match {
-      case Some(sid) => List(sid, ra.fileId).distinct
-      case None      => List(ra.fileId)
+      case Some(sid) =>
+        ctx.store.transact(RFileMeta.findMime(sid)).map {
+          case Some(MimeType.PdfMatch(_)) =>
+            List(ra.fileId)
+          case _ =>
+            List(sid, ra.fileId).distinct
+        }
+      case None => List(ra.fileId).pure[F]
    }

  private def stripAttachmentName(ra: RAttachment): String =