mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Use ocrmypdf tool to create pdf/a during conversion
- Use another external tool to convert PDF to PDF, which also adds the extracted text as another layer into the PDF.
- Although not used yet, the external conversion routine now checks for an existing text file that is named like the PDF file with the extension `.txt` appended. If present, it is included in the conversion result and is used as the extracted text.
- Text extraction for PDF files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice.
- Errors during conversion are not fatal; processing continues without a converted file.
This commit is contained in:
@ -8,7 +8,7 @@ import fs2._
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
|
||||
import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.Markdown
|
||||
import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
@ -35,7 +35,8 @@ object Conversion {
|
||||
): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
OcrMyPdf
|
||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
|
||||
case MimeType.HtmlMatch(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
|
@ -1,5 +1,6 @@
|
||||
package docspell.convert
|
||||
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
|
||||
@ -9,5 +10,6 @@ case class ConvertConfig(
|
||||
markdown: MarkdownConfig,
|
||||
wkhtmlpdf: WkHtmlPdfConfig,
|
||||
tesseract: TesseractConfig,
|
||||
unoconv: UnoconvConfig
|
||||
unoconv: UnoconvConfig,
|
||||
ocrmypdf: OcrMyPdfConfig
|
||||
)
|
||||
|
@ -41,7 +41,7 @@ private[extern] object ExternConv {
|
||||
|
||||
in.through(createInput).flatMap { _ =>
|
||||
SystemCommand
|
||||
.execSuccess[F](
|
||||
.exec[F](
|
||||
sysCfg,
|
||||
blocker,
|
||||
logger,
|
||||
@ -65,11 +65,20 @@ private[extern] object ExternConv {
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||
File.existsNonEmpty[F](out).flatMap {
|
||||
case true =>
|
||||
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
else
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
case true if result.rc == 0 =>
|
||||
val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
|
||||
File.existsNonEmpty[F](outTxt).flatMap {
|
||||
case true =>
|
||||
successPdfTxt(
|
||||
File.readAll(out, blocker, chunkSize),
|
||||
File.readText(outTxt, blocker)
|
||||
).pure[F]
|
||||
case false =>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
}
|
||||
case true if result.rc != 0 =>
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
ConversionResult
|
||||
|
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
|
||||
/** PDF-to-PDF conversion that delegates to the externally configured
  * `ocrmypdf` command.
  */
object OcrMyPdf {

  /** Converts the PDF given by `in` via the external command in `cfg` and
    * passes the conversion result to `handler`.
    *
    * If `cfg.enabled` is `false`, no command is run; the handler receives
    * `ConversionResult.unsupportedFormat(MimeType.pdf)` instead.
    *
    * @param cfg       the tool configuration (switch, command, working directory)
    * @param lang      its `iso3` value replaces the `{{lang}}` placeholder in the command
    * @param chunkSize forwarded to `ExternConv.readResult`
    * @param blocker   forwarded to `ExternConv`
    * @param logger    forwarded to `ExternConv`
    * @param in        the bytes of the PDF to convert
    * @param handler   consumes the conversion result and yields the final value
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: OcrMyPdfConfig,
      lang: Language,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
    if (!cfg.enabled)
      handler(ConversionResult.unsupportedFormat(MimeType.pdf))
    else {
      // Fill in the language placeholder before handing the command over.
      val command = cfg.command.replace(Map("{{lang}}" -> lang.iso3))

      // Turns the output file and the command result into a ConversionResult.
      val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
        ExternConv.readResult[F](blocker, chunkSize, logger)

      ExternConv.toPDF[F, A](
        "ocrmypdf",
        command,
        cfg.workingDir,
        false, // NOTE(review): meaning of this flag is defined by ExternConv.toPDF — confirm there
        blocker,
        logger,
        readOutput
      )(in, handler)
    }

}
|
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
|
||||
/** Settings for the external PDF-to-PDF conversion done by `OcrMyPdf`.
  *
  * @param enabled    when `false`, `OcrMyPdf.toPDF` does not run the command
  *                   and reports the PDF format as unsupported
  * @param command    the system command to execute; `OcrMyPdf.toPDF` replaces
  *                   its `{{lang}}` placeholder before running it
  * @param workingDir directory handed to the external conversion routine
  */
case class OcrMyPdfConfig(
    enabled: Boolean,
    command: SystemCommand.Config,
    workingDir: Path
)
|
@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
|
||||
object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
),
|
||||
OcrMyPdfConfig(
|
||||
true,
|
||||
SystemCommand.Config(
|
||||
"ocrmypdf",
|
||||
Seq(
|
||||
"-l",
|
||||
"{{lang}}",
|
||||
"--skip-text",
|
||||
"--deskew",
|
||||
"-j",
|
||||
"1",
|
||||
"{{infile}}",
|
||||
"{{outfile}}"
|
||||
),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
)
|
||||
)
|
||||
|
||||
|
Reference in New Issue
Block a user