Merge pull request #1634 from eikek/force-ocr

Allow always using the OCR-extracted text
This commit is contained in:
mergify[bot]
2022-07-07 16:12:07 +00:00
committed by GitHub
2 changed files with 12 additions and 3 deletions

View File

@@ -48,15 +48,21 @@ object PdfExtract {
// maybe better: inspect the pdf and decide whether ocr or not // maybe better: inspect the pdf and decide whether ocr or not
for { for {
pdfboxRes <- pdfboxRes <-
if (stripMinLen > 0)
logger.debug("Trying to strip text from pdf using pdfbox.") *> logger.debug("Trying to strip text from pdf using pdfbox.") *>
PdfboxExtract.getTextAndMetaData[F](in) PdfboxExtract.getTextAndMetaData[F](in)
else
logger
.debug(s"Not stripping text from pdf, min-length=$stripMinLen")
.as(Right(Text("") -> None))
res <- pdfboxRes.fold( res <- pdfboxRes.fold(
ex => ex =>
logger.info( logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) >> runOcr.map(txt => Result(txt, None)).attempt, ) >> runOcr.map(txt => Result(txt, None)).attempt,
pair => pair =>
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt if (pair._1.length >= stripMinLen && stripMinLen > 0)
Result(pair).pure[F].attempt
else else
logger logger
.info( .info(

View File

@@ -298,6 +298,9 @@ Docspell Update Check
# and images. If the returned text is shorter than the value
# below, OCR is run afterwards. Then both extracted texts are
# compared and the longer will be used.
#
# If you set this to 0 (or a negative value), the text parts of a
# PDF are ignored; OCR is always run and its result is used.
pdf { pdf {
min-text-len = 500 min-text-len = 500
} }