mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 09:58:26 +00:00
Use ocrmypdf tool to create pdf/a during conversion
- Use another external tool to convert pdf to pdf, which also adds the extracted text as another layer to the pdf. - Although not currently used, the external conversion routine now checks for an existing text file that has the same name as the pdf file but with the extension `.txt`. If present, it is included in the conversion result and used as the extracted text. - Text extraction for pdf files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice. - Errors during conversion are never fatal; processing continues without a converted file.
This commit is contained in:
@ -339,6 +339,39 @@ docspell.joex {
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||
}
|
||||
|
||||
# The tool ocrmypdf can be used to convert pdf files to pdf files
|
||||
# in order to add extracted text as a separate layer. This makes
|
||||
# image-only pdfs searchable and you can select and copy/paste the
|
||||
# text. It also converts pdfs into pdf/a type pdfs, which are best
|
||||
# suited for archiving. So it makes sense to use this even for
|
||||
# text-only pdfs.
|
||||
#
|
||||
# It is recommended to install ocrmypdf, but it is optional.
|
||||
# If it is enabled but fails, the error is not fatal and the
|
||||
# processing will continue using the original pdf for extracting
|
||||
# text. You can also disable it to remove the errors from the
|
||||
# processing logs.
|
||||
#
|
||||
# The `--skip-text` option is necessary to not fail on "text" pdfs
|
||||
# (where ocr is not necessary). In this case, the pdf will be
|
||||
# converted to PDF/A.
|
||||
ocrmypdf = {
|
||||
enabled = true
|
||||
command = {
|
||||
program = "ocrmypdf"
|
||||
args = [
|
||||
"-l", "{{lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j", "1",
|
||||
"{{infile}}",
|
||||
"{{outfile}}"
|
||||
]
|
||||
timeout = "5 minutes"
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||
}
|
||||
}
|
||||
|
||||
# General config for processing documents
|
||||
|
@ -64,10 +64,6 @@ object ConvertPdf {
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
|
||||
mime.toLocal match {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
case mt =>
|
||||
val data = ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
|
@ -85,9 +85,10 @@ object TextExtraction {
|
||||
item: ItemData
|
||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
||||
for {
|
||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||
dst <- Duration.stopTime[F]
|
||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
|
||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||
dst <- Duration.stopTime[F]
|
||||
fids <- filesToExtract(ctx)(item, ra)
|
||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||
meta = item.changeMeta(
|
||||
ra.id,
|
||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
||||
@ -151,11 +152,24 @@ object TextExtraction {
|
||||
|
||||
/** Returns the fileIds to extract text from. First, the source file
|
||||
* is tried. If that fails, the converted file is tried.
|
||||
*
|
||||
* If the source file is a PDF, then use the converted file. This
|
||||
* may then already contain the text if ocrmypdf is enabled. If it
|
||||
* is disabled, both files are the same.
|
||||
*/
|
||||
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
|
||||
private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
|
||||
item: ItemData,
|
||||
ra: RAttachment
|
||||
): F[List[Ident]] =
|
||||
item.originFile.get(ra.id) match {
|
||||
case Some(sid) => List(sid, ra.fileId).distinct
|
||||
case None => List(ra.fileId)
|
||||
case Some(sid) =>
|
||||
ctx.store.transact(RFileMeta.findMime(sid)).map {
|
||||
case Some(MimeType.PdfMatch(_)) =>
|
||||
List(ra.fileId)
|
||||
case _ =>
|
||||
List(sid, ra.fileId).distinct
|
||||
}
|
||||
case None => List(ra.fileId).pure[F]
|
||||
}
|
||||
|
||||
private def stripAttachmentName(ra: RAttachment): String =
|
||||
|
Reference in New Issue
Block a user