Streamline extern-conv stdin/infile

This commit is contained in:
Eike Kettner
2020-02-18 12:43:47 +01:00
parent 0dcc00836b
commit 5869e2ee6e
6 changed files with 68 additions and 93 deletions

View File

@ -18,6 +18,13 @@ object SystemCommand {
def mapArgs(f: String => String): Config = def mapArgs(f: String => String): Config =
Config(program, args.map(f), timeout) Config(program, args.map(f), timeout)
/** Returns a copy of this config where, in every argument, each key of
  * `repl` is substituted by its associated value.
  *
  * The substitutions are applied one after another while folding over the
  * map entries. Only the arguments are rewritten, because the work is
  * delegated to `mapArgs`.
  *
  * @param repl placeholder-to-replacement pairs
  * @return the config with rewritten arguments
  */
def replace(repl: Map[String, String]): Config =
  mapArgs { arg =>
    repl.foldLeft(arg) { (acc, entry) =>
      val (placeholder, value) = entry
      acc.replace(placeholder, value)
    }
  }
def toCmd: List[String] = def toCmd: List[String] =
program :: args.toList program :: args.toList

View File

@ -13,50 +13,28 @@ object ExternConv {
cmdCfg: SystemCommand.Config, cmdCfg: SystemCommand.Config,
wd: Path, wd: Path,
chunkSize: Int, chunkSize: Int,
useStdin: Boolean,
blocker: Blocker, blocker: Blocker,
logger: Logger[F] logger: Logger[F]
): Pipe[F, Byte, Byte] = ): Pipe[F, Byte, Byte] =
in => in =>
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
val out = dir.resolve("out.pdf") val inFile = dir.resolve("infile").toAbsolutePath.normalize
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
val sysCfg = val sysCfg =
cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) cmdCfg.replace(
Map("{{outfile}}" -> out.toString) ++
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
else Map.empty)
)
val createInput: Pipe[F, Byte, Unit] =
if (useStdin) _ => Stream.emit(())
else storeDataToFile(name, blocker, logger, inFile)
in.through(createInput).flatMap { _ =>
SystemCommand SystemCommand
.execSuccess[F](sysCfg, blocker, logger, Some(dir), in) .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
.flatMap(result =>
logResult(name, result, logger) ++ readResult[F](
out,
result,
blocker,
chunkSize,
logger
)
)
}
def toPDFviaFile[F[_]: Sync: ContextShift](
name: String,
cmdCfg: SystemCommand.Config,
wd: Path,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F]
): Pipe[F, Byte, Byte] =
in =>
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
val inFile = dir.resolve("infile")
val out = dir.resolve("out.pdf")
val sysCfg =
cmdCfg.mapArgs(
_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)
.replace("{{infile}}", inFile.toAbsolutePath.normalize.toString)
)
(Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ =>
SystemCommand
.execSuccess[F](sysCfg, blocker, logger, Some(dir))
.flatMap(result => .flatMap(result =>
logResult(name, result, logger) ++ readResult[F]( logResult(name, result, logger) ++ readResult[F](
out, out,
@ -69,7 +47,7 @@ object ExternConv {
} }
} }
private def readResult[F[_]: Sync: ContextShift]( def readResult[F[_]: Sync: ContextShift](
out: Path, out: Path,
result: SystemCommand.Result, result: SystemCommand.Result,
blocker: Blocker, blocker: Blocker,
@ -91,6 +69,11 @@ object ExternConv {
) )
} }
/** Builds a pipe that writes its incoming bytes to `inFile`.
  *
  * A debug message naming the target file and the conversion `name` is
  * logged first; afterwards the effect produced by `storeFile` is evaluated
  * and its result is emitted.
  *
  * @param name    name of the external conversion, used in the log message only
  * @param blocker passed through to `storeFile`
  * @param logger  receives the debug message
  * @param inFile  file the input is stored to
  */
private def storeDataToFile[F[_]: Sync: ContextShift](
    name: String,
    blocker: Blocker,
    logger: Logger[F],
    inFile: Path
): Pipe[F, Byte, Unit] = { data =>
  // The log stream is drained so that it contributes no output elements.
  val logStart =
    Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain
  val writeInput = Stream.eval(storeFile(data, inFile, blocker))
  logStart ++ writeInput
}
private def logResult[F[_]: Sync]( private def logResult[F[_]: Sync](
name: String, name: String,
result: SystemCommand.Result, result: SystemCommand.Result,

View File

@ -0,0 +1,5 @@
package docspell.convert.extern
/** Placeholder object; it currently declares no members. */
object Tesseract

View File

@ -12,7 +12,7 @@ object Unoconv {
blocker: Blocker, blocker: Blocker,
logger: Logger[F], logger: Logger[F],
): Pipe[F, Byte, Byte] = ): Pipe[F, Byte, Byte] =
ExternConv.toPDFviaFile[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger)
} }

View File

@ -12,7 +12,7 @@ object WkHtmlPdf {
blocker: Blocker, blocker: Blocker,
logger: Logger[F], logger: Logger[F],
): Pipe[F, Byte, Byte] = ): Pipe[F, Byte, Byte] =
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger)
} }

View File

@ -19,12 +19,10 @@ object Ocr {
): F[Option[String]] = ): F[Option[String]] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
runGhostscript(pdf, config, wd, blocker, logger) runGhostscript(pdf, config, wd, blocker, logger)
.flatMap({ tmpImg => .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config))
runTesseractFile(tmpImg, blocker, logger, lang, config) .fold1(_ + "\n\n\n" + _)
}) .compile
.fold1(_ + "\n\n\n" + _). .last
compile.
last
} }
/** Extract the text from the given image file /** Extract the text from the given image file
@ -47,12 +45,10 @@ object Ocr {
): F[Option[String]] = ): F[Option[String]] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
.flatMap({ tif => .flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config))
runTesseractFile(tif, blocker, logger, lang, config) .fold1(_ + "\n\n\n" + _)
}) .compile
.fold1(_ + "\n\n\n" + _). .last
compile.
last
} }
def extractImageFile[F[_]: Sync: ContextShift]( def extractImageFile[F[_]: Sync: ContextShift](
@ -80,19 +76,15 @@ object Ocr {
else cfg.ghostscript.command.args else cfg.ghostscript.command.args
val cmd = cfg.ghostscript.command val cmd = cfg.ghostscript.command
.copy(args = xargs) .copy(args = xargs)
.mapArgs( .replace(
replace(
Map( Map(
"{{infile}}" -> "-", "{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif" "{{outfile}}" -> "%d.tif"
) )
) )
)
SystemCommand SystemCommand
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
.evalMap({ _ => .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
File.listFiles(pathEndsWith(".tif"), wd)
})
.flatMap(fs => Stream.emits(fs)) .flatMap(fs => Stream.emits(fs))
} }
@ -106,19 +98,15 @@ object Ocr {
blocker: Blocker, blocker: Blocker,
logger: Logger[F] logger: Logger[F]
): Stream[F, Path] = { ): Stream[F, Path] = {
val cmd = ghostscript.mapArgs( val cmd = ghostscript.replace(
replace(
Map( Map(
"{{infile}}" -> pdf.toAbsolutePath.toString, "{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif" "{{outfile}}" -> "%d.tif"
) )
) )
)
SystemCommand SystemCommand
.execSuccess[F](cmd, blocker, logger, wd = Some(wd)) .execSuccess[F](cmd, blocker, logger, wd = Some(wd))
.evalMap({ _ => .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
File.listFiles(pathEndsWith(".tif"), wd)
})
.flatMap(fs => Stream.emits(fs)) .flatMap(fs => Stream.emits(fs))
} }
@ -136,16 +124,16 @@ object Ocr {
logger: Logger[F] logger: Logger[F]
): Stream[F, Path] = { ): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs( val cmd = unpaper.replace(
replace(
Map( Map(
"{{infile}}" -> img.toAbsolutePath.toString, "{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString "{{outfile}}" -> targetFile.toString
) )
) )
) SystemCommand
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith { .execSuccess[F](cmd, blocker, logger, wd = Some(wd))
th => .map(_ => targetFile)
.handleErrorWith { th =>
logger logger
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
Stream.emit(img) Stream.emit(img)
@ -165,9 +153,8 @@ object Ocr {
// tesseract cannot cope with absolute filenames // tesseract cannot cope with absolute filenames
// so use the parent as working dir // so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
val cmd = config.tesseract.command.mapArgs( val cmd = config.tesseract.command
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) .replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
)
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
} }
@ -182,17 +169,10 @@ object Ocr {
config: OcrConfig config: OcrConfig
): Stream[F, String] = { ): Stream[F, String] = {
val cmd = config.tesseract.command val cmd = config.tesseract.command
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
} }
/** Creates a function that substitutes, in its input string, every
  * occurrence of each key of `repl` by the associated value.
  *
  * The pairs are applied sequentially while folding over the map.
  */
private def replace(repl: Map[String, String]): String => String = { input =>
  repl.foldLeft(input) { (acc, entry) =>
    val (placeholder, value) = entry
    acc.replace(placeholder, value)
  }
}
private def fixLanguage(lang: String): String = private def fixLanguage(lang: String): String =
lang match { lang match {
case "de" => "deu" case "de" => "deu"