Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf which also adds the
  extracted text as another layer into the pdf

- Although not used yet, the external conversion routine now checks
  for an existing text file named like the pdf file with `.txt`
  appended (e.g. `file.pdf.txt`). If present, it is included in the
  conversion result and used as the extracted text.

- Text extraction for pdf files now happens on the converted file,
  because it may already contain the text from the conversion step;
  this avoids running OCR twice.

- No error during conversion is fatal; processing continues
  without a converted file.
This commit is contained in:
Eike Kettner
2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions

View File

@ -8,7 +8,7 @@ import fs2._
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
import docspell.convert.extern._
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
@ -35,7 +35,8 @@ object Conversion {
): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.PdfMatch(_) =>
handler.run(ConversionResult.successPdf(in))
OcrMyPdf
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8

View File

@ -1,5 +1,6 @@
package docspell.convert
import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
@ -9,5 +10,6 @@ case class ConvertConfig(
markdown: MarkdownConfig,
wkhtmlpdf: WkHtmlPdfConfig,
tesseract: TesseractConfig,
unoconv: UnoconvConfig
unoconv: UnoconvConfig,
ocrmypdf: OcrMyPdfConfig
)

View File

@ -41,7 +41,7 @@ private[extern] object ExternConv {
in.through(createInput).flatMap { _ =>
SystemCommand
.execSuccess[F](
.exec[F](
sysCfg,
blocker,
logger,
@ -65,11 +65,20 @@ private[extern] object ExternConv {
logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
File.existsNonEmpty[F](out).flatMap {
case true =>
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
else
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
case true if result.rc == 0 =>
val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
File.existsNonEmpty[F](outTxt).flatMap {
case true =>
successPdfTxt(
File.readAll(out, blocker, chunkSize),
File.readText(outTxt, blocker)
).pure[F]
case false =>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
}
case true if result.rc != 0 =>
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
case false =>
ConversionResult

View File

@ -0,0 +1,37 @@
package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
/** Converts a PDF into another PDF by running the external command
  * configured in [[OcrMyPdfConfig]].
  */
object OcrMyPdf {

  /** Runs the configured command on `in` and passes the outcome to `handler`.
    *
    * If `cfg.enabled` is `false`, the handler is invoked directly with
    * `ConversionResult.unsupportedFormat(MimeType.pdf)`. Otherwise the work is
    * delegated to `ExternConv.toPDF`, and the command output is read back via
    * `ExternConv.readResult`.
    *
    * @param cfg       switch, command and working directory for the external tool
    * @param lang      language whose `iso3` value replaces `{{lang}}` in the command
    * @param chunkSize forwarded to `ExternConv.readResult`
    * @param blocker   forwarded to `ExternConv`
    * @param logger    forwarded to `ExternConv`
    * @param in        the bytes of the input file
    * @param handler   receives the conversion result and produces the final value
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: OcrMyPdfConfig,
      lang: Language,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
    if (!cfg.enabled)
      handler(ConversionResult.unsupportedFormat(MimeType.pdf))
    else {
      val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
        ExternConv.readResult[F](blocker, chunkSize, logger)

      // Only the language placeholder is substituted here.
      val command = cfg.command.replace(Map("{{lang}}" -> lang.iso3))

      // NOTE(review): the meaning of the positional `false` flag is defined by
      // ExternConv.toPDF, which is not visible here -- confirm before changing.
      ExternConv.toPDF[F, A](
        "ocrmypdf",
        command,
        cfg.workingDir,
        false,
        blocker,
        logger,
        readOutput
      )(in, handler)
    }
}

View File

@ -0,0 +1,11 @@
package docspell.convert.extern
import java.nio.file.Path
import docspell.common.SystemCommand
/** Settings for the external pdf-to-pdf conversion done by `OcrMyPdf`.
  *
  * @param enabled    when `false`, `OcrMyPdf.toPDF` does not call `ExternConv`
  *                   and reports the pdf as an unsupported format
  * @param command    the system command to run; `OcrMyPdf.toPDF` replaces the
  *                   `{{lang}}` placeholder with the language's iso3 value
  * @param workingDir directory passed on to `ExternConv.toPDF`
  *                   (presumably where the command's files are placed -- TODO confirm)
  */
case class OcrMyPdfConfig(
enabled: Boolean,
command: SystemCommand.Config,
workingDir: Path
)

View File

@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import docspell.convert.extern.OcrMyPdfConfig
object ConversionTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
Duration.seconds(20)
),
target
),
OcrMyPdfConfig(
true,
SystemCommand.Config(
"ocrmypdf",
Seq(
"-l",
"{{lang}}",
"--skip-text",
"--deskew",
"-j",
"1",
"{{infile}}",
"{{outfile}}"
),
Duration.seconds(20)
),
target
)
)