Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf which also adds the
  extracted text as another layer into the pdf

- Although not used, the external conversion routine will now check
  for an existing text file that is named as the pdf file with extension
  `.txt`. If present it is included in the conversion result and will be
  used as the extracted text.

- Text extraction for pdf files now happens on the converted file,
  because it may already contain the text from the conversion step;
  this avoids running OCR twice.

- Errors during conversion are never fatal; processing continues
  without a converted file.
This commit is contained in:
Eike Kettner
2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions

View File

@ -339,6 +339,39 @@ docspell.joex {
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# The tool ocrmypdf can be used to convert pdf files to pdf files
# in order to add extracted text as a separate layer. This makes
# image-only pdfs searchable and you can select and copy/paste the
# text. It also converts pdfs into pdf/a type pdfs, which are best
# suited for archiving. So it makes sense to use this even for
# text-only pdfs.
#
# It is recommended to install ocrmypdf, but it is optional.
# If it is enabled but fails, the error is not fatal and the
# processing will continue using the original pdf for extracting
# text. You can also disable it to remove the errors from the
# processing logs.
#
# The `--skip-text` option is necessary to not fail on "text" pdfs
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
enabled = true
command = {
program = "ocrmypdf"
args = [
"-l", "{{lang}}",
"--skip-text",
"--deskew",
"-j", "1",
"{{infile}}",
"{{outfile}}"
]
timeout = "5 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
}
# General config for processing documents

View File

@ -64,10 +64,6 @@ object ConvertPdf {
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
mime.toLocal match {
case MimeType.PdfMatch(_) =>
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
case mt =>
val data = ctx.store.bitpeace
.get(ra.fileId.id)

View File

@ -85,9 +85,10 @@ object TextExtraction {
item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] =
for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F]
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F]
fids <- filesToExtract(ctx)(item, ra)
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta(
ra.id,
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
@ -151,11 +152,24 @@ object TextExtraction {
/** Returns the fileIds to extract text from. First, the source file
* is tried. If that fails, the converted file is tried.
*
* If the source file is a PDF, then use the converted file. This
* may then already contain the text if ocrmypdf is enabled. If it
* is disabled, both files are the same.
*/
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
item: ItemData,
ra: RAttachment
): F[List[Ident]] =
item.originFile.get(ra.id) match {
case Some(sid) => List(sid, ra.fileId).distinct
case None => List(ra.fileId)
case Some(sid) =>
ctx.store.transact(RFileMeta.findMime(sid)).map {
case Some(MimeType.PdfMatch(_)) =>
List(ra.fileId)
case _ =>
List(sid, ra.fileId).distinct
}
case None => List(ra.fileId).pure[F]
}
private def stripAttachmentName(ra: RAttachment): String =