From d413b16b031c46616cddddd079bd814b3352f777 Mon Sep 17 00:00:00 2001 From: eikek Date: Thu, 7 Jul 2022 17:58:03 +0200 Subject: [PATCH] Allow to always use OCR extracted text Fixes: #1628 --- .../src/main/scala/docspell/extract/PdfExtract.scala | 12 +++++++++--- modules/joex/src/main/resources/reference.conf | 3 +++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index a9bf2858..400a1a4e 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -48,15 +48,21 @@ object PdfExtract { // maybe better: inspect the pdf and decide whether ocr or not for { pdfboxRes <- - logger.debug("Trying to strip text from pdf using pdfbox.") *> - PdfboxExtract.getTextAndMetaData[F](in) + if (stripMinLen > 0) + logger.debug("Trying to strip text from pdf using pdfbox.") *> + PdfboxExtract.getTextAndMetaData[F](in) + else + logger + .debug(s"Not stripping text from pdf, min-length=$stripMinLen") + .as(Right(Text("") -> None)) res <- pdfboxRes.fold( ex => logger.info( s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " ) >> runOcr.map(txt => Result(txt, None)).attempt, pair => - if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt + if (pair._1.length >= stripMinLen && stripMinLen > 0) + Result(pair).pure[F].attempt else logger .info( diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 1cda608a..3421ffef 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -298,6 +298,9 @@ Docpell Update Check # and images. If the returned text is shorter than the value # below, OCR is run afterwards. Then both extracted texts are # compared and the longer will be used. + # + # If you set this to 0 (or a negative value), then the text parts + # of a PDF are ignored and OCR is always run and its result used. pdf { min-text-len = 500 }