mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-27 17:35:04 +00:00
Use ocrmypdf tool to create pdf/a during conversion
- Use another external tool to convert pdf to pdf which also adds the extracted text as another layer into the pdf - Although not used, the external conversion routine will now check for an existing text file that is named as the pdf file with extension `.txt`. If present it is included in the conversion result and will be used as the extracted text. - text extraction for pdf files happens now on the converted file, because it may already contain the text from the conversion step and thus avoids running OCR twice. - All errors during conversion are not fatal; processing continues without a converted file.
This commit is contained in:
parent
99210365ce
commit
3d49ceaab5
@ -19,6 +19,17 @@ RUN apk add --no-cache openjdk11-jre \
|
||||
ttf-dejavu \
|
||||
ttf-freefont \
|
||||
ttf-liberation \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
pngquant \
|
||||
zlib-dev \
|
||||
g++ \
|
||||
qpdf \
|
||||
python3-dev \
|
||||
libffi-dev\
|
||||
qpdf-dev \
|
||||
&& pip3 install --upgrade pip \
|
||||
&& pip3 install ocrmypdf \
|
||||
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
|
||||
&& chmod +x /usr/local/bin/unoconv \
|
||||
&& ln -s /usr/bin/python3 /usr/bin/python \
|
||||
@ -27,7 +38,7 @@ RUN apk add --no-cache openjdk11-jre \
|
||||
&& curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
|
||||
&& unzip docspell.zip \
|
||||
&& rm docspell.zip \
|
||||
&& apk del curl unzip
|
||||
&& apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev
|
||||
|
||||
COPY entrypoint-joex.sh /opt/entrypoint.sh
|
||||
|
||||
|
@ -8,7 +8,7 @@ import fs2._
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
|
||||
import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.Markdown
|
||||
import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
@ -35,7 +35,8 @@ object Conversion {
|
||||
): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
OcrMyPdf
|
||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
|
||||
case MimeType.HtmlMatch(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
|
@ -1,5 +1,6 @@
|
||||
package docspell.convert
|
||||
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
|
||||
@ -9,5 +10,6 @@ case class ConvertConfig(
|
||||
markdown: MarkdownConfig,
|
||||
wkhtmlpdf: WkHtmlPdfConfig,
|
||||
tesseract: TesseractConfig,
|
||||
unoconv: UnoconvConfig
|
||||
unoconv: UnoconvConfig,
|
||||
ocrmypdf: OcrMyPdfConfig
|
||||
)
|
||||
|
@ -41,7 +41,7 @@ private[extern] object ExternConv {
|
||||
|
||||
in.through(createInput).flatMap { _ =>
|
||||
SystemCommand
|
||||
.execSuccess[F](
|
||||
.exec[F](
|
||||
sysCfg,
|
||||
blocker,
|
||||
logger,
|
||||
@ -65,11 +65,20 @@ private[extern] object ExternConv {
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||
File.existsNonEmpty[F](out).flatMap {
|
||||
case true =>
|
||||
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
else
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
case true if result.rc == 0 =>
|
||||
val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
|
||||
File.existsNonEmpty[F](outTxt).flatMap {
|
||||
case true =>
|
||||
successPdfTxt(
|
||||
File.readAll(out, blocker, chunkSize),
|
||||
File.readText(outTxt, blocker)
|
||||
).pure[F]
|
||||
case false =>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
}
|
||||
case true if result.rc != 0 =>
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
ConversionResult
|
||||
|
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
|
||||
/** Pdf-to-pdf conversion that delegates to the external `ocrmypdf` command. */
object OcrMyPdf {

  /** Runs the configured command on the pdf bytes in `in` and passes the
    * outcome to `handler`.
    *
    * If `cfg.enabled` is false, no command is run and the handler receives
    * `ConversionResult.unsupportedFormat(MimeType.pdf)`. Otherwise the
    * `{{lang}}` placeholder of the command is replaced by `lang.iso3` and
    * the call is forwarded to `ExternConv.toPDF`; the command's output is
    * read through `ExternConv.readResult`.
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: OcrMyPdfConfig,
      lang: Language,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
    if (!cfg.enabled)
      handler(ConversionResult.unsupportedFormat(MimeType.pdf))
    else {
      // Turns the output path and the command result into a ConversionResult.
      val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
        ExternConv.readResult[F](blocker, chunkSize, logger)

      val localizedCmd = cfg.command.replace(Map("{{lang}}" -> lang.iso3))

      ExternConv.toPDF[F, A](
        "ocrmypdf",
        localizedCmd,
        cfg.workingDir,
        false,
        blocker,
        logger,
        readOutput
      )(in, handler)
    }

}
|
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
|
||||
/** Settings for the optional pdf-to-pdf conversion via ocrmypdf.
  *
  * @param enabled    whether the external command is run at all
  * @param command    the system command to execute
  * @param workingDir directory handed to the external conversion routine
  */
case class OcrMyPdfConfig(enabled: Boolean, command: SystemCommand.Config, workingDir: Path)
|
@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
|
||||
object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
),
|
||||
OcrMyPdfConfig(
|
||||
true,
|
||||
SystemCommand.Config(
|
||||
"ocrmypdf",
|
||||
Seq(
|
||||
"-l",
|
||||
"{{lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j",
|
||||
"1",
|
||||
"{{infile}}",
|
||||
"{{outfile}}"
|
||||
),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -339,6 +339,39 @@ docspell.joex {
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||
}
|
||||
|
||||
# The tool ocrmypdf can be used to convert pdf files to pdf files
|
||||
# in order to add extracted text as a separate layer. This makes
|
||||
# image-only pdfs searchable and you can select and copy/paste the
|
||||
# text. It also converts pdfs into pdf/a type pdfs, which are best
|
||||
# suited for archiving. So it makes sense to use this even for
|
||||
# text-only pdfs.
|
||||
#
|
||||
# It is recommended to install ocrmypdf, but it also is optional.
|
||||
# If it is enabled but fails, the error is not fatal and the
|
||||
# processing will continue using the original pdf for extracting
|
||||
# text. You can also disable it to remove the errors from the
|
||||
# processing logs.
|
||||
#
|
||||
# The `--skip-text` option is necessary to not fail on "text" pdfs
|
||||
# (where ocr is not necessary). In this case, the pdf will be
|
||||
# converted to PDF/A.
|
||||
ocrmypdf = {
|
||||
enabled = true
|
||||
command = {
|
||||
program = "ocrmypdf"
|
||||
args = [
|
||||
"-l", "{{lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j", "1",
|
||||
"{{infile}}",
|
||||
"{{outfile}}"
|
||||
]
|
||||
timeout = "5 minutes"
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||
}
|
||||
}
|
||||
|
||||
# General config for processing documents
|
||||
|
@ -64,10 +64,6 @@ object ConvertPdf {
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
|
||||
mime.toLocal match {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
case mt =>
|
||||
val data = ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
|
@ -85,9 +85,10 @@ object TextExtraction {
|
||||
item: ItemData
|
||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
||||
for {
|
||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||
dst <- Duration.stopTime[F]
|
||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
|
||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||
dst <- Duration.stopTime[F]
|
||||
fids <- filesToExtract(ctx)(item, ra)
|
||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||
meta = item.changeMeta(
|
||||
ra.id,
|
||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
||||
@ -151,11 +152,24 @@ object TextExtraction {
|
||||
|
||||
/** Returns the fileIds to extract text from. First, the source file
|
||||
* is tried. If that fails, the converted file is tried.
|
||||
*
|
||||
* If the source file is a PDF, then use the converted file. This
|
||||
* may then already contain the text if ocrmypdf is enabled. If it
|
||||
* is disabled, both files are the same.
|
||||
*/
|
||||
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
|
||||
private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
|
||||
item: ItemData,
|
||||
ra: RAttachment
|
||||
): F[List[Ident]] =
|
||||
item.originFile.get(ra.id) match {
|
||||
case Some(sid) => List(sid, ra.fileId).distinct
|
||||
case None => List(ra.fileId)
|
||||
case Some(sid) =>
|
||||
ctx.store.transact(RFileMeta.findMime(sid)).map {
|
||||
case Some(MimeType.PdfMatch(_)) =>
|
||||
List(ra.fileId)
|
||||
case _ =>
|
||||
List(sid, ra.fileId).distinct
|
||||
}
|
||||
case None => List(ra.fileId).pure[F]
|
||||
}
|
||||
|
||||
private def stripAttachmentName(ra: RAttachment): String =
|
||||
|
@ -23,3 +23,4 @@ Some early information about certain details can be found in a few
|
||||
- [0012 Periodic Tasks](adr/0012_periodic_tasks)
|
||||
- [0013 Archive Files](adr/0013_archive_files)
|
||||
- [0014 Full-Text Search](adr/0014_fulltext_search_engine)
|
||||
- [0015 Convert PDF files](adr/0015_convert_pdf_files)
|
||||
|
67
modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
Normal file
67
modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
Normal file
@ -0,0 +1,67 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert PDF Files
|
||||
permalink: dev/adr/0015_convert_pdf_files
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Some PDFs contain only images (when coming from a scanner) and
|
||||
therefore one is not able to click into the pdf and select text for
|
||||
copy&paste. Also it is not searchable in a PDF viewer. These are
|
||||
really shortcomings that can be fixed, especially when there is
|
||||
already OCR built in.
|
||||
|
||||
For images, this works already as tesseract is used to create the PDF
|
||||
files. Tesseract creates the files with an additional text layer
|
||||
containing the OCRed text.
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
|
||||
OCR text layer to scanned PDF files, allowing them to be searched
|
||||
|
||||
|
||||
### ocrmypdf
|
||||
|
||||
This is a very nice python tool, that uses tesseract to do OCR on each
|
||||
page and add the extracted text as a pdf text layer to the page.
|
||||
Additionally it creates PDF/A type pdfs, which are great for
|
||||
archiving. This fixes exactly the things stated above.
|
||||
|
||||
#### Integration
|
||||
|
||||
Docspell already has this built in for images. When converting images
|
||||
to a PDF (which is done early in processing), the process creates a
|
||||
text and a PDF file. Docspell then sets the text in this step and the
|
||||
text extraction step skips doing its work, if there is already text
|
||||
available.
|
||||
|
||||
It would be possible to use the `--sidecar` option with ocrmypdf to
|
||||
create a text file of the extracted text with one run, too (exactly
|
||||
like it works for tesseract). But for "text" pdfs, ocrmypdf writes
|
||||
some info-message into this text file:
|
||||
|
||||
```
|
||||
[OCR skipped on page 1][OCR skipped on page 2]
|
||||
```
|
||||
|
||||
Docspell cannot reliably tell whether this is extracted text or not.
|
||||
It would be required to load the pdf and check its contents. This is a
|
||||
bit of bad luck, because everything would just work already. So it
|
||||
requires a (small) change in the text-extraction step. By default,
|
||||
text extraction happens on the source file. For PDFs, text extraction
|
||||
should now be run on the converted file, to avoid running OCR twice.
|
||||
|
||||
The converted pdf file is either a text-pdf in the first place,
|
||||
where ocrmypdf would only convert it to a PDF/A file; or it may be a
|
||||
converted file containing the OCR-ed text as a pdf layer. If ocrmypdf
|
||||
is disabled, the converted file and the source file are the same for
|
||||
PDFs.
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
|
||||
distributed under the GPL-3 license.
|
@ -77,6 +77,10 @@ component.
|
||||
office documents into PDF files. It uses libreoffice/openoffice.
|
||||
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
|
||||
PDF files.
|
||||
- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
|
||||
used to convert PDF to PDF files. It adds an OCR layer to scanned
|
||||
PDF files to make them searchable. It also creates PDF/A files from
|
||||
the input pdf.
|
||||
|
||||
The performance of `unoconv` can be improved by starting `unoconv -l`
|
||||
in a separate process. This runs a libreoffice/openoffice listener
|
||||
@ -87,7 +91,7 @@ therefore avoids starting one each time `unoconv` is called.
|
||||
On Debian this should install all joex requirements:
|
||||
|
||||
``` bash
|
||||
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf
|
||||
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
|
||||
```
|
||||
|
||||
|
||||
|
@ -13,7 +13,9 @@ permalink: features
|
||||
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
|
||||
- [Full-Text Search](doc/finding#full-text-search) based on [Apache
|
||||
SOLR](https://lucene.apache.org/solr)
|
||||
- Conversion to PDF: all files are converted into a PDF file
|
||||
- Conversion to PDF: all files are converted into a PDF file. PDFs
|
||||
with only images (as often returned from scanners) are converted
|
||||
into searchable PDF/A pdfs.
|
||||
- Non-destructive: all your uploaded files are never modified and can
|
||||
always be downloaded untouched
|
||||
- Text is analysed to find and attach meta data automatically
|
||||
|
@ -3,8 +3,10 @@ package docspell.store.records
|
||||
import docspell.common._
|
||||
import docspell.store.impl.Implicits._
|
||||
import docspell.store.impl._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.FileMeta
|
||||
import bitpeace.Mimetype
|
||||
import doobie._
|
||||
import doobie.implicits._
|
||||
|
||||
@ -30,4 +32,13 @@ object RFileMeta {
|
||||
|
||||
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
|
||||
}
|
||||
|
||||
/** Loads only the mimetype of the file with the given id.
  *
  * Selects the `mimetype` column of the row whose id equals `fid` and
  * converts the bitpeace `Mimetype` into the local `MimeType` via
  * `toLocal`. Yields `None` when the query returns no row.
  */
def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
  import bitpeace.sql._

  val mimeQuery =
    selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid)).query[Mimetype]

  mimeQuery.option.map(found => found.map(mt => mt.toLocal))
}
|
||||
}
|
||||
|
@ -131,6 +131,23 @@ let
|
||||
};
|
||||
working-dir = "/tmp/docspell-convert";
|
||||
};
|
||||
|
||||
ocrmypdf = {
|
||||
enabled = true;
|
||||
command = {
|
||||
program = "${pkgs.ocrmypdf}/bin/ocrmypdf";
|
||||
args = [
|
||||
"-l" "{{lang}}"
|
||||
"--skip-text"
|
||||
"--deskew"
|
||||
"-j" "1"
|
||||
"{{infile}}"
|
||||
"{{outfile}}"
|
||||
];
|
||||
timeout = "5 minutes";
|
||||
};
|
||||
working-dir = "/tmp/docspell-convert";
|
||||
};
|
||||
};
|
||||
files = {
|
||||
chunk-size = 524288;
|
||||
@ -860,6 +877,66 @@ in {
|
||||
process.
|
||||
'';
|
||||
};
|
||||
|
||||
ocrmypdf = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
enabled = mkOption {
|
||||
type = types.bool;
|
||||
default = defaults.convert.ocrmypdf.enabled;
|
||||
description = "Whether to use ocrmypdf to convert pdf to pdf/a.";
|
||||
};
|
||||
working-dir = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.convert.ocrmypdf.working-dir;
|
||||
description = "Directory where the conversion processes can put their temp files";
|
||||
};
|
||||
command = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
program = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.convert.ocrmypdf.command.program;
|
||||
description = "The path to the executable.";
|
||||
};
|
||||
args = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = defaults.convert.ocrmypdf.command.args;
|
||||
description = "The arguments to the program";
|
||||
};
|
||||
timeout = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.convert.ocrmypdf.command.timeout;
|
||||
description = "The timeout when executing the command";
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.convert.ocrmypdf.command;
|
||||
description = "The system command";
|
||||
};
|
||||
};
|
||||
});
|
||||
# Must name the `ocrmypdf` attribute of `defaults.convert`; the former
# spelling `orcmypdf` does not exist there.
default = defaults.convert.ocrmypdf;
|
||||
description = ''
|
||||
The tool ocrmypdf can be used to convert pdf files to pdf files
|
||||
in order to add extracted text as a separate layer. This makes
|
||||
image-only pdfs searchable and you can select and copy/paste the
|
||||
text. It also converts pdfs into pdf/a type pdfs, which are best
|
||||
suited for archiving. So it makes sense to use this even for
|
||||
text-only pdfs.
|
||||
|
||||
It is recommended to install ocrmypdf, but it also is optional.
|
||||
If it is enabled but fails, the error is not fatal and the
|
||||
processing will continue using the original pdf for extracting
|
||||
text. You can also disable it to remove the errors from the
|
||||
processing logs.
|
||||
|
||||
The `--skip-text` option is necessary to not fail on "text" pdfs
|
||||
(where ocr is not necessary). In this case, the pdf will be
|
||||
converted to PDF/A.
|
||||
'';
|
||||
};
|
||||
|
||||
};
|
||||
});
|
||||
default = defaults.convert;
|
||||
|
Loading…
x
Reference in New Issue
Block a user