mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 10:59:33 +00:00
Merge pull request #1634 from eikek/force-ocr
Allow always using the OCR-extracted text
This commit is contained in:
commit
275901267d
@ -48,15 +48,21 @@ object PdfExtract {
|
|||||||
// maybe better: inspect the pdf and decide whether ocr or not
|
// maybe better: inspect the pdf and decide whether ocr or not
|
||||||
for {
|
for {
|
||||||
pdfboxRes <-
|
pdfboxRes <-
|
||||||
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
if (stripMinLen > 0)
|
||||||
PdfboxExtract.getTextAndMetaData[F](in)
|
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
||||||
|
PdfboxExtract.getTextAndMetaData[F](in)
|
||||||
|
else
|
||||||
|
logger
|
||||||
|
.debug(s"Not stripping text from pdf, min-length=$stripMinLen")
|
||||||
|
.as(Right(Text("") -> None))
|
||||||
res <- pdfboxRes.fold(
|
res <- pdfboxRes.fold(
|
||||||
ex =>
|
ex =>
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||||
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
||||||
pair =>
|
pair =>
|
||||||
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
|
if (pair._1.length >= stripMinLen && stripMinLen > 0)
|
||||||
|
Result(pair).pure[F].attempt
|
||||||
else
|
else
|
||||||
logger
|
logger
|
||||||
.info(
|
.info(
|
||||||
|
@ -298,6 +298,9 @@ Docspell Update Check
|
|||||||
# and images. If the returned text is shorter than the value
|
# and images. If the returned text is shorter than the value
|
||||||
# below, OCR is run afterwards. Then both extracted texts are
|
# below, OCR is run afterwards. Then both extracted texts are
|
||||||
# compared and the longer will be used.
|
# compared and the longer will be used.
|
||||||
|
#
|
||||||
|
# If you set this to 0 (or a negative value), then the text parts
|
||||||
|
# of a PDF are ignored; OCR is always run and its result is used.
|
||||||
pdf {
|
pdf {
|
||||||
min-text-len = 500
|
min-text-len = 500
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user