mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 10:59:33 +00:00
Merge pull request #1634 from eikek/force-ocr
Allow always using OCR-extracted text
This commit is contained in:
commit
275901267d
@ -48,15 +48,21 @@ object PdfExtract {
|
||||
// maybe better: inspect the pdf and decide whether ocr or not
|
||||
for {
|
||||
pdfboxRes <-
|
||||
if (stripMinLen > 0)
|
||||
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
||||
PdfboxExtract.getTextAndMetaData[F](in)
|
||||
else
|
||||
logger
|
||||
.debug(s"Not stripping text from pdf, min-length=$stripMinLen")
|
||||
.as(Right(Text("") -> None))
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
||||
pair =>
|
||||
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
|
||||
if (pair._1.length >= stripMinLen && stripMinLen > 0)
|
||||
Result(pair).pure[F].attempt
|
||||
else
|
||||
logger
|
||||
.info(
|
||||
|
@ -298,6 +298,9 @@ Docspell Update Check
|
||||
# and images. If the returned text is shorter than the value
|
||||
# below, OCR is run afterwards. Then both extracted texts are
|
||||
# compared and the longer will be used.
|
||||
#
|
||||
# If you set this to 0 (or a negative value), the text parts of a
|
||||
# PDF are ignored; OCR is always run and its result is used.
|
||||
pdf {
|
||||
min-text-len = 500
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user