Allow always using OCR-extracted text

Fixes: #1628
2025-11-01 02:20:10 +00:00 · 2022-07-07 17:58:03 +02:00
parent d0d8a8fbe7
commit d413b16b03
2 changed files with 12 additions and 3 deletions
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -48,15 +48,21 @@ object PdfExtract {
    // maybe better: inspect the pdf and decide whether ocr or not
    for {
      pdfboxRes <-
-        logger.debug("Trying to strip text from pdf using pdfbox.") *>
-          PdfboxExtract.getTextAndMetaData[F](in)
+        if (stripMinLen > 0)
+          logger.debug("Trying to strip text from pdf using pdfbox.") *>
+            PdfboxExtract.getTextAndMetaData[F](in)
+        else
+          logger
+            .debug(s"Not stripping text from pdf, min-length=$stripMinLen")
+            .as(Right(Text("") -> None))
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
            s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
          ) >> runOcr.map(txt => Result(txt, None)).attempt,
        pair =>
-          if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
+          if (pair._1.length >= stripMinLen && stripMinLen > 0)
+            Result(pair).pure[F].attempt
          else
            logger
              .info(
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -298,6 +298,9 @@ Docpell Update Check
    # and images. If the returned text is shorter than the value
    # below, OCR is run afterwards. Then both extracted texts are
    # compared and the longer will be used.
+    #
+    # If you set this to 0 (or a negative value), any text embedded in
+    # the PDF is ignored; OCR is always run and its result is used.
    pdf {
      min-text-len = 500
    }