Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf which also adds the
  extracted text as another layer into the pdf

- Although not used, the external conversion routine will now check
  for an existing text file that is named as the pdf file with extension
  `.txt`. If present it is included in the conversion result and will be
  used as the extracted text.

- text extraction for pdf files happens now on the converted file,
  because it may already contain the text from the conversion step and
  thus avoids running OCR twice.

- Errors during conversion are not fatal; processing continues
  without a converted file.
This commit is contained in:
Eike Kettner 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions

View File

@ -19,6 +19,17 @@ RUN apk add --no-cache openjdk11-jre \
ttf-dejavu \
ttf-freefont \
ttf-liberation \
libxml2-dev \
libxslt-dev \
pngquant \
zlib-dev \
g++ \
qpdf \
python3-dev \
libffi-dev\
qpdf-dev \
&& pip3 install --upgrade pip \
&& pip3 install ocrmypdf \
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
&& chmod +x /usr/local/bin/unoconv \
&& ln -s /usr/bin/python3 /usr/bin/python \
@ -27,7 +38,7 @@ RUN apk add --no-cache openjdk11-jre \
&& curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
&& unzip docspell.zip \
&& rm docspell.zip \
&& apk del curl unzip
&& apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev
COPY entrypoint-joex.sh /opt/entrypoint.sh

View File

@ -8,7 +8,7 @@ import fs2._
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
import docspell.convert.extern._
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
@ -35,7 +35,8 @@ object Conversion {
): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.PdfMatch(_) =>
handler.run(ConversionResult.successPdf(in))
OcrMyPdf
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8

View File

@ -1,5 +1,6 @@
package docspell.convert
import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
@ -9,5 +10,6 @@ case class ConvertConfig(
markdown: MarkdownConfig,
wkhtmlpdf: WkHtmlPdfConfig,
tesseract: TesseractConfig,
unoconv: UnoconvConfig
unoconv: UnoconvConfig,
ocrmypdf: OcrMyPdfConfig
)

View File

@ -41,7 +41,7 @@ private[extern] object ExternConv {
in.through(createInput).flatMap { _ =>
SystemCommand
.execSuccess[F](
.exec[F](
sysCfg,
blocker,
logger,
@ -65,11 +65,20 @@ private[extern] object ExternConv {
logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
File.existsNonEmpty[F](out).flatMap {
case true =>
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
else
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
case true if result.rc == 0 =>
val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
File.existsNonEmpty[F](outTxt).flatMap {
case true =>
successPdfTxt(
File.readAll(out, blocker, chunkSize),
File.readText(outTxt, blocker)
).pure[F]
case false =>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
}
case true if result.rc != 0 =>
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
case false =>
ConversionResult

View File

@ -0,0 +1,37 @@
package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
/** Optional PDF-to-PDF conversion that delegates to the external
  * `ocrmypdf` command through [[ExternConv]].
  */
object OcrMyPdf {

  /** Converts the PDF bytes in `in` using the configured command and
    * passes the outcome to `handler`.
    *
    * If the conversion is disabled in `cfg`, no command is run and
    * the handler receives an "unsupported format" result for
    * `MimeType.pdf`. Otherwise the `{{lang}}` placeholder of the
    * command is replaced with `lang.iso3` before the command is
    * handed to `ExternConv.toPDF`.
    *
    * @param cfg       enabled flag, command and working directory
    * @param lang      language whose `iso3` value fills `{{lang}}`
    * @param chunkSize passed on to `ExternConv.readResult`
    * @param blocker   passed on to `ExternConv`
    * @param logger    passed on to `ExternConv`
    * @param in        the PDF data to convert
    * @param handler   consumes the conversion result
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: OcrMyPdfConfig,
      lang: Language,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
    if (!cfg.enabled)
      handler(ConversionResult.unsupportedFormat(MimeType.pdf))
    else {
      // The explicit function type turns the second parameter list of
      // `readResult` into the callback value expected by `toPDF`.
      val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
        ExternConv.readResult[F](blocker, chunkSize, logger)
      val command = cfg.command.replace(Map("{{lang}}" -> lang.iso3))

      ExternConv.toPDF[F, A](
        "ocrmypdf",
        command,
        cfg.workingDir,
        false,
        blocker,
        logger,
        readOutput
      )(in, handler)
    }
}

View File

@ -0,0 +1,11 @@
package docspell.convert.extern
import java.nio.file.Path
import docspell.common.SystemCommand
/** Settings for the pdf-to-pdf conversion done by `OcrMyPdf`.
  *
  * @param enabled    when `false`, `OcrMyPdf.toPDF` does not run the
  *                   command and reports the pdf as unsupported format
  * @param command    the system command to execute; its `{{lang}}`
  *                   placeholder is replaced before execution
  * @param workingDir directory handed to the external conversion
  *                   routine (presumably for its temporary files)
  */
case class OcrMyPdfConfig(
enabled: Boolean,
command: SystemCommand.Config,
workingDir: Path
)

View File

@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import docspell.convert.extern.OcrMyPdfConfig
object ConversionTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
Duration.seconds(20)
),
target
),
OcrMyPdfConfig(
true,
SystemCommand.Config(
"ocrmypdf",
Seq(
"-l",
"{{lang}}",
"--skip-text",
"--deskew",
"-j",
"1",
"{{infile}}",
"{{outfile}}"
),
Duration.seconds(20)
),
target
)
)

View File

@ -339,6 +339,39 @@ docspell.joex {
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# The tool ocrmypdf can be used to convert pdf files to pdf files
# in order to add extracted text as a separate layer. This makes
# image-only pdfs searchable and you can select and copy/paste the
# text. It also converts pdfs into pdf/a type pdfs, which are best
# suited for archiving. So it makes sense to use this even for
# text-only pdfs.
#
# It is recommended to install ocrmypdf, but it is optional.
# If it is enabled but fails, the error is not fatal and the
# processing will continue using the original pdf for extracting
# text. You can also disable it to remove the errors from the
# processing logs.
#
# The `--skip-text` option is necessary to not fail on "text" pdfs
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
enabled = true
command = {
program = "ocrmypdf"
args = [
"-l", "{{lang}}",
"--skip-text",
"--deskew",
"-j", "1",
"{{infile}}",
"{{outfile}}"
]
timeout = "5 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
}
# General config for processing documents

View File

@ -64,10 +64,6 @@ object ConvertPdf {
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
mime.toLocal match {
case MimeType.PdfMatch(_) =>
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
case mt =>
val data = ctx.store.bitpeace
.get(ra.fileId.id)

View File

@ -85,9 +85,10 @@ object TextExtraction {
item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] =
for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F]
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F]
fids <- filesToExtract(ctx)(item, ra)
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta(
ra.id,
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
@ -151,11 +152,24 @@ object TextExtraction {
/** Returns the fileIds to extract text from. First, the source file
* is tried. If that fails, the converted file is tried.
*
* If the source file is a PDF, then use the converted file. This
* may then already contain the text if ocrmypdf is enabled. If it
* is disabled, both files are the same.
*/
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
item: ItemData,
ra: RAttachment
): F[List[Ident]] =
item.originFile.get(ra.id) match {
case Some(sid) => List(sid, ra.fileId).distinct
case None => List(ra.fileId)
case Some(sid) =>
ctx.store.transact(RFileMeta.findMime(sid)).map {
case Some(MimeType.PdfMatch(_)) =>
List(ra.fileId)
case _ =>
List(sid, ra.fileId).distinct
}
case None => List(ra.fileId).pure[F]
}
private def stripAttachmentName(ra: RAttachment): String =

View File

@ -23,3 +23,4 @@ Some early information about certain details can be found in a few
- [0012 Periodic Tasks](adr/0012_periodic_tasks)
- [0013 Archive Files](adr/0013_archive_files)
- [0014 Full-Text Search](adr/0014_fulltext_search_engine)
- [0015 Convert PDF files](adr/0015_convert_pdf_files)

View File

@ -0,0 +1,67 @@
---
layout: docs
title: Convert PDF Files
permalink: dev/adr/0015_convert_pdf_files
---
# {{ page.title }}
## Context and Problem Statement
Some PDFs contain only images (when coming from a scanner) and
therefore one is not able to click into the pdf and select text for
copy&paste. Also it is not searchable in a PDF viewer. These are
real shortcomings that can be fixed, especially when there is
already OCR built in.
For images, this works already as tesseract is used to create the PDF
files. Tesseract creates the files with an additional text layer
containing the OCRed text.
## Considered Options
* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
OCR text layer to scanned PDF files, allowing them to be searched
### ocrmypdf
This is a very nice python tool, that uses tesseract to do OCR on each
page and add the extracted text as a pdf text layer to the page.
Additionally it creates PDF/A type pdfs, which are great for
archiving. This fixes exactly the things stated above.
#### Integration
Docspell already has this built in for images. When converting images
to a PDF (which is done early in processing), the process creates a
text and a PDF file. Docspell then sets the text in this step and the
text extraction step skips doing its work, if there is already text
available.
It would be possible to use the `--sidecar` option with ocrmypdf to
create a text file of the extracted text with one run, too (exactly
like it works for tesseract). But for "text" pdfs, ocrmypdf writes
some info-message into this text file:
```
[OCR skipped on page 1] [OCR skipped on page 2]
```
Docspell cannot reliably tell whether this is extracted text or not.
It would be required to load the pdf and check its contents. This is a
bit of bad luck, because everything would just work already. So it
requires a (small) change in the text-extraction step. By default,
text extraction happens on the source file. For PDFs, text extraction
should now be run on the converted file, to avoid running OCR twice.
The converted pdf file is either a text-pdf in the first place,
where ocrmypdf would only convert it to a PDF/A file; or it may be a
converted file containing the OCR-ed text as a pdf layer. If ocrmypdf
is disabled, the converted file and the source file are the same for
PDFs.
## Decision Outcome
Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
distributed under the GPL-3 license.

View File

@ -77,6 +77,10 @@ component.
office documents into PDF files. It uses libreoffice/openoffice.
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
PDF files.
- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
used to convert PDF to PDF files. It adds an OCR layer to scanned
PDF files to make them searchable. It also creates PDF/A files from
the input pdf.
The performance of `unoconv` can be improved by starting `unoconv -l`
in a separate process. This runs a libreoffice/openoffice listener
@ -87,7 +91,7 @@ therefore avoids starting one each time `unoconv` is called.
On Debian this should install all joex requirements:
``` bash
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
```

View File

@ -13,7 +13,9 @@ permalink: features
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
- [Full-Text Search](doc/finding#full-text-search) based on [Apache
SOLR](https://lucene.apache.org/solr)
- Conversion to PDF: all files are converted into a PDF file
- Conversion to PDF: all files are converted into a PDF file. PDFs
with only images (as often returned from scanners) are converted
into searchable PDF/A pdfs.
- Non-destructive: all your uploaded files are never modified and can
always be downloaded untouched
- Text is analysed to find and attach meta data automatically

View File

@ -3,8 +3,10 @@ package docspell.store.records
import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._
import docspell.store.syntax.MimeTypes._
import bitpeace.FileMeta
import bitpeace.Mimetype
import doobie._
import doobie.implicits._
@ -30,4 +32,13 @@ object RFileMeta {
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
}
/** Selects only the mimetype column of the file meta row with the
  * given id and converts the stored value to the local `MimeType`.
  * The result is empty when the query yields no row.
  */
def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
  import bitpeace.sql._

  val mimeQuery =
    selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid)).query[Mimetype]

  for (stored <- mimeQuery.option) yield stored.map(_.toLocal)
}
}

View File

@ -131,6 +131,23 @@ let
};
working-dir = "/tmp/docspell-convert";
};
# Default settings for the optional ocrmypdf pdf-to-pdf conversion.
ocrmypdf = {
enabled = true;
command = {
program = "${pkgs.ocrmypdf}/bin/ocrmypdf";
# `{{lang}}` is replaced by joex with the document language before
# the command runs; `{{infile}}` and `{{outfile}}` are presumably
# filled in by the external conversion routine (not visible here).
# `--skip-text` keeps ocrmypdf from failing on pdfs that already
# contain text.
args = [
"-l" "{{lang}}"
"--skip-text"
"--deskew"
"-j" "1"
"{{infile}}"
"{{outfile}}"
];
timeout = "5 minutes";
};
working-dir = "/tmp/docspell-convert";
};
};
files = {
chunk-size = 524288;
@ -860,6 +877,66 @@ in {
process.
'';
};
# Option declaration for the optional ocrmypdf pdf-to-pdf conversion.
# Every nested default is taken from `defaults.convert.ocrmypdf`.
ocrmypdf = mkOption {
  type = types.submodule({
    options = {
      enabled = mkOption {
        type = types.bool;
        default = defaults.convert.ocrmypdf.enabled;
        description = "Whether to use ocrmypdf to convert pdf to pdf/a.";
      };
      working-dir = mkOption {
        type = types.str;
        default = defaults.convert.ocrmypdf.working-dir;
        description = "Directory where the conversion processes can put their temp files";
      };
      command = mkOption {
        type = types.submodule({
          options = {
            program = mkOption {
              type = types.str;
              default = defaults.convert.ocrmypdf.command.program;
              description = "The path to the executable.";
            };
            args = mkOption {
              type = types.listOf types.str;
              default = defaults.convert.ocrmypdf.command.args;
              description = "The arguments to the program";
            };
            timeout = mkOption {
              type = types.str;
              default = defaults.convert.ocrmypdf.command.timeout;
              description = "The timeout when executing the command";
            };
          };
        });
        default = defaults.convert.ocrmypdf.command;
        description = "The system command";
      };
    };
  });
  # Was `defaults.convert.orcmypdf`, an attribute that does not exist;
  # it must use the same `ocrmypdf` path as the nested defaults above.
  default = defaults.convert.ocrmypdf;
  description = ''
    The tool ocrmypdf can be used to convert pdf files to pdf files
    in order to add extracted text as a separate layer. This makes
    image-only pdfs searchable and you can select and copy/paste the
    text. It also converts pdfs into pdf/a type pdfs, which are best
    suited for archiving. So it makes sense to use this even for
    text-only pdfs.
    It is recommended to install ocrmypdf, but it also is optional.
    If it is enabled but fails, the error is not fatal and the
    processing will continue using the original pdf for extracting
    text. You can also disable it to remove the errors from the
    processing logs.
    The `--skip-text` option is necessary to not fail on "text" pdfs
    (where ocr is not necessary). In this case, the pdf will be
    converted to PDF/A.
  '';
};
};
});
default = defaults.convert;