Merge pull request #1634 from eikek/force-ocr

Allow always using the OCR-extracted text
This commit is contained in:
mergify[bot] 2022-07-07 16:12:07 +00:00 committed by GitHub
commit 275901267d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 3 deletions

View File

@@ -48,15 +48,21 @@ object PdfExtract {
// maybe better: inspect the pdf and decide whether ocr or not
for {
pdfboxRes <-
if (stripMinLen > 0)
logger.debug("Trying to strip text from pdf using pdfbox.") *>
PdfboxExtract.getTextAndMetaData[F](in)
else
logger
.debug(s"Not stripping text from pdf, min-length=$stripMinLen")
.as(Right(Text("") -> None))
res <- pdfboxRes.fold(
ex =>
logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) >> runOcr.map(txt => Result(txt, None)).attempt,
pair =>
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
if (pair._1.length >= stripMinLen && stripMinLen > 0)
Result(pair).pure[F].attempt
else
logger
.info(

View File

@@ -298,6 +298,9 @@ Docpell Update Check
# and images. If the returned text is shorter than the value
# below, OCR is run afterwards. Then both extracted texts are
# compared and the longer one is used.
#
# If you set this to 0 (or a negative value), the text parts of a
# PDF are ignored: OCR is always run and its result is used.
pdf {
min-text-len = 500
}