mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Streamline extern-conv stdin/infile
This commit is contained in:
@ -18,6 +18,13 @@ object SystemCommand {
|
|||||||
def mapArgs(f: String => String): Config =
|
def mapArgs(f: String => String): Config =
|
||||||
Config(program, args.map(f), timeout)
|
Config(program, args.map(f), timeout)
|
||||||
|
|
||||||
|
def replace(repl: Map[String, String]): Config =
|
||||||
|
mapArgs(s =>
|
||||||
|
repl.foldLeft(s) {
|
||||||
|
case (res, (k, v)) =>
|
||||||
|
res.replace(k, v)
|
||||||
|
})
|
||||||
|
|
||||||
def toCmd: List[String] =
|
def toCmd: List[String] =
|
||||||
program :: args.toList
|
program :: args.toList
|
||||||
|
|
||||||
|
@ -13,50 +13,28 @@ object ExternConv {
|
|||||||
cmdCfg: SystemCommand.Config,
|
cmdCfg: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
|
useStdin: Boolean,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Pipe[F, Byte, Byte] =
|
): Pipe[F, Byte, Byte] =
|
||||||
in =>
|
in =>
|
||||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
||||||
val out = dir.resolve("out.pdf")
|
val inFile = dir.resolve("infile").toAbsolutePath.normalize
|
||||||
|
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
||||||
val sysCfg =
|
val sysCfg =
|
||||||
cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString))
|
cmdCfg.replace(
|
||||||
|
Map("{{outfile}}" -> out.toString) ++
|
||||||
SystemCommand
|
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
||||||
.execSuccess[F](sysCfg, blocker, logger, Some(dir), in)
|
else Map.empty)
|
||||||
.flatMap(result =>
|
|
||||||
logResult(name, result, logger) ++ readResult[F](
|
|
||||||
out,
|
|
||||||
result,
|
|
||||||
blocker,
|
|
||||||
chunkSize,
|
|
||||||
logger
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
def toPDFviaFile[F[_]: Sync: ContextShift](
|
|
||||||
name: String,
|
|
||||||
cmdCfg: SystemCommand.Config,
|
|
||||||
wd: Path,
|
|
||||||
chunkSize: Int,
|
|
||||||
blocker: Blocker,
|
|
||||||
logger: Logger[F]
|
|
||||||
): Pipe[F, Byte, Byte] =
|
|
||||||
in =>
|
|
||||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
|
||||||
val inFile = dir.resolve("infile")
|
|
||||||
val out = dir.resolve("out.pdf")
|
|
||||||
val sysCfg =
|
|
||||||
cmdCfg.mapArgs(
|
|
||||||
_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)
|
|
||||||
.replace("{{infile}}", inFile.toAbsolutePath.normalize.toString)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
(Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
val createInput: Pipe[F, Byte, Unit] =
|
||||||
Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ =>
|
if (useStdin) _ => Stream.emit(())
|
||||||
|
else storeDataToFile(name, blocker, logger, inFile)
|
||||||
|
|
||||||
|
in.through(createInput).flatMap { _ =>
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](sysCfg, blocker, logger, Some(dir))
|
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
|
||||||
.flatMap(result =>
|
.flatMap(result =>
|
||||||
logResult(name, result, logger) ++ readResult[F](
|
logResult(name, result, logger) ++ readResult[F](
|
||||||
out,
|
out,
|
||||||
@ -69,7 +47,7 @@ object ExternConv {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private def readResult[F[_]: Sync: ContextShift](
|
def readResult[F[_]: Sync: ContextShift](
|
||||||
out: Path,
|
out: Path,
|
||||||
result: SystemCommand.Result,
|
result: SystemCommand.Result,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
@ -91,6 +69,11 @@ object ExternConv {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] =
|
||||||
|
in =>
|
||||||
|
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
||||||
|
Stream.eval(storeFile(in, inFile, blocker))
|
||||||
|
|
||||||
private def logResult[F[_]: Sync](
|
private def logResult[F[_]: Sync](
|
||||||
name: String,
|
name: String,
|
||||||
result: SystemCommand.Result,
|
result: SystemCommand.Result,
|
||||||
|
5
modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
vendored
Normal file
5
modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
object Tesseract {
|
||||||
|
|
||||||
|
}
|
@ -12,7 +12,7 @@ object Unoconv {
|
|||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
): Pipe[F, Byte, Byte] =
|
): Pipe[F, Byte, Byte] =
|
||||||
ExternConv.toPDFviaFile[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger)
|
ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,7 @@ object WkHtmlPdf {
|
|||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
): Pipe[F, Byte, Byte] =
|
): Pipe[F, Byte, Byte] =
|
||||||
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger)
|
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger)
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -19,12 +19,10 @@ object Ocr {
|
|||||||
): F[Option[String]] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscript(pdf, config, wd, blocker, logger)
|
runGhostscript(pdf, config, wd, blocker, logger)
|
||||||
.flatMap({ tmpImg =>
|
.flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config))
|
||||||
runTesseractFile(tmpImg, blocker, logger, lang, config)
|
.fold1(_ + "\n\n\n" + _)
|
||||||
})
|
.compile
|
||||||
.fold1(_ + "\n\n\n" + _).
|
.last
|
||||||
compile.
|
|
||||||
last
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Extract the text from the given image file
|
/** Extract the text from the given image file
|
||||||
@ -47,12 +45,10 @@ object Ocr {
|
|||||||
): F[Option[String]] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
|
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
|
||||||
.flatMap({ tif =>
|
.flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config))
|
||||||
runTesseractFile(tif, blocker, logger, lang, config)
|
.fold1(_ + "\n\n\n" + _)
|
||||||
})
|
.compile
|
||||||
.fold1(_ + "\n\n\n" + _).
|
.last
|
||||||
compile.
|
|
||||||
last
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def extractImageFile[F[_]: Sync: ContextShift](
|
def extractImageFile[F[_]: Sync: ContextShift](
|
||||||
@ -68,11 +64,11 @@ object Ocr {
|
|||||||
* files are stored to a temporary location on disk and returned.
|
* files are stored to a temporary location on disk and returned.
|
||||||
*/
|
*/
|
||||||
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
||||||
pdf: Stream[F, Byte],
|
pdf: Stream[F, Byte],
|
||||||
cfg: OcrConfig,
|
cfg: OcrConfig,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val xargs =
|
val xargs =
|
||||||
if (cfg.pageRange.begin > 0)
|
if (cfg.pageRange.begin > 0)
|
||||||
@ -80,19 +76,15 @@ object Ocr {
|
|||||||
else cfg.ghostscript.command.args
|
else cfg.ghostscript.command.args
|
||||||
val cmd = cfg.ghostscript.command
|
val cmd = cfg.ghostscript.command
|
||||||
.copy(args = xargs)
|
.copy(args = xargs)
|
||||||
.mapArgs(
|
.replace(
|
||||||
replace(
|
Map(
|
||||||
Map(
|
"{{infile}}" -> "-",
|
||||||
"{{infile}}" -> "-",
|
"{{outfile}}" -> "%d.tif"
|
||||||
"{{outfile}}" -> "%d.tif"
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
|
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
|
||||||
.evalMap({ _ =>
|
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||||
File.listFiles(pathEndsWith(".tif"), wd)
|
|
||||||
})
|
|
||||||
.flatMap(fs => Stream.emits(fs))
|
.flatMap(fs => Stream.emits(fs))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,19 +98,15 @@ object Ocr {
|
|||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val cmd = ghostscript.mapArgs(
|
val cmd = ghostscript.replace(
|
||||||
replace(
|
Map(
|
||||||
Map(
|
"{{infile}}" -> pdf.toAbsolutePath.toString,
|
||||||
"{{infile}}" -> pdf.toAbsolutePath.toString,
|
"{{outfile}}" -> "%d.tif"
|
||||||
"{{outfile}}" -> "%d.tif"
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||||
.evalMap({ _ =>
|
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||||
File.listFiles(pathEndsWith(".tif"), wd)
|
|
||||||
})
|
|
||||||
.flatMap(fs => Stream.emits(fs))
|
.flatMap(fs => Stream.emits(fs))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,20 +124,20 @@ object Ocr {
|
|||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
||||||
val cmd = unpaper.mapArgs(
|
val cmd = unpaper.replace(
|
||||||
replace(
|
Map(
|
||||||
Map(
|
"{{infile}}" -> img.toAbsolutePath.toString,
|
||||||
"{{infile}}" -> img.toAbsolutePath.toString,
|
"{{outfile}}" -> targetFile.toString
|
||||||
"{{outfile}}" -> targetFile.toString
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
|
SystemCommand
|
||||||
th =>
|
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||||
|
.map(_ => targetFile)
|
||||||
|
.handleErrorWith { th =>
|
||||||
logger
|
logger
|
||||||
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
||||||
Stream.emit(img)
|
Stream.emit(img)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Run tesseract on the given image file and return the extracted
|
/** Run tesseract on the given image file and return the extracted
|
||||||
@ -165,9 +153,8 @@ object Ocr {
|
|||||||
// tesseract cannot cope with absolute filenames
|
// tesseract cannot cope with absolute filenames
|
||||||
// so use the parent as working dir
|
// so use the parent as working dir
|
||||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
|
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
|
||||||
val cmd = config.tesseract.command.mapArgs(
|
val cmd = config.tesseract.command
|
||||||
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
.replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
||||||
)
|
|
||||||
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
|
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -182,17 +169,10 @@ object Ocr {
|
|||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] = {
|
): Stream[F, String] = {
|
||||||
val cmd = config.tesseract.command
|
val cmd = config.tesseract.command
|
||||||
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
.replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
|
||||||
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
|
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def replace(repl: Map[String, String]): String => String =
|
|
||||||
s =>
|
|
||||||
repl.foldLeft(s) {
|
|
||||||
case (res, (k, v)) =>
|
|
||||||
res.replace(k, v)
|
|
||||||
}
|
|
||||||
|
|
||||||
private def fixLanguage(lang: String): String =
|
private def fixLanguage(lang: String): String =
|
||||||
lang match {
|
lang match {
|
||||||
case "de" => "deu"
|
case "de" => "deu"
|
||||||
|
Reference in New Issue
Block a user