diff --git a/modules/common/src/main/scala/docspell/common/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala index 06876c96..075c2dc7 100644 --- a/modules/common/src/main/scala/docspell/common/SystemCommand.scala +++ b/modules/common/src/main/scala/docspell/common/SystemCommand.scala @@ -18,6 +18,13 @@ object SystemCommand { def mapArgs(f: String => String): Config = Config(program, args.map(f), timeout) + def replace(repl: Map[String, String]): Config = + mapArgs(s => + repl.foldLeft(s) { + case (res, (k, v)) => + res.replace(k, v) + }) + def toCmd: List[String] = program :: args.toList diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala index 909b5b45..ebc96be1 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -13,50 +13,28 @@ object ExternConv { cmdCfg: SystemCommand.Config, wd: Path, chunkSize: Int, + useStdin: Boolean, blocker: Blocker, logger: Logger[F] ): Pipe[F, Byte, Byte] = in => Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => - val out = dir.resolve("out.pdf") + val inFile = dir.resolve("infile").toAbsolutePath.normalize + val out = dir.resolve("out.pdf").toAbsolutePath.normalize val sysCfg = - cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) - - SystemCommand - .execSuccess[F](sysCfg, blocker, logger, Some(dir), in) - .flatMap(result => - logResult(name, result, logger) ++ readResult[F]( - out, - result, - blocker, - chunkSize, - logger - ) - ) - } - - def toPDFviaFile[F[_]: Sync: ContextShift]( - name: String, - cmdCfg: SystemCommand.Config, - wd: Path, - chunkSize: Int, - blocker: Blocker, - logger: Logger[F] - ): Pipe[F, Byte, Byte] = - in => - Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => - val inFile = dir.resolve("infile") - val out = dir.resolve("out.pdf") - val sysCfg = - cmdCfg.mapArgs( - _.replace("{{outfile}}", out.toAbsolutePath.normalize.toString) - .replace("{{infile}}", inFile.toAbsolutePath.normalize.toString) + cmdCfg.replace( + Map("{{outfile}}" -> out.toString) ++ + (if (!useStdin) Map("{{infile}}" -> inFile.toString) + else Map.empty) ) - (Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ - Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ => + val createInput: Pipe[F, Byte, Unit] = + if (useStdin) _ => Stream.emit(()) + else storeDataToFile(name, blocker, logger, inFile) + + in.through(createInput).flatMap { _ => SystemCommand - .execSuccess[F](sysCfg, blocker, logger, Some(dir)) + .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty) .flatMap(result => logResult(name, result, logger) ++ readResult[F]( out, @@ -69,7 +47,7 @@ object ExternConv { } } - private def readResult[F[_]: Sync: ContextShift]( + def readResult[F[_]: Sync: ContextShift]( out: Path, result: SystemCommand.Result, blocker: Blocker, @@ -91,6 +69,11 @@ object ExternConv { ) } + private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] = + in => + Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ + Stream.eval(storeFile(in, inFile, blocker)) + private def logResult[F[_]: Sync]( name: String, result: SystemCommand.Result, diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala new file mode 100644 index 00000000..f7cd017d --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -0,0 +1,5 @@ +package docspell.convert.extern + +object Tesseract { + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala index ee2256d9..7ce10109 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -12,7 +12,7 @@ object Unoconv { blocker: Blocker, logger: Logger[F], ): Pipe[F, Byte, Byte] = - ExternConv.toPDFviaFile[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) + ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger) } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index d736c474..11a7ccda 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -12,7 +12,7 @@ object WkHtmlPdf { blocker: Blocker, logger: Logger[F], ): Pipe[F, Byte, Byte] = - ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) + ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger) } diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index 91ba8af2..ff30710c 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -19,12 +19,10 @@ object Ocr { ): F[Option[String]] = File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => runGhostscript(pdf, config, wd, blocker, logger) - .flatMap({ tmpImg => - runTesseractFile(tmpImg, blocker, logger, lang, config) - }) - .fold1(_ + "\n\n\n" + _). - compile. - last + .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config)) + .fold1(_ + "\n\n\n" + _) + .compile + .last } /** Extract the text from the given image file @@ -47,12 +45,10 @@ object Ocr { ): F[Option[String]] = File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) - .flatMap({ tif => - runTesseractFile(tif, blocker, logger, lang, config) - }) - .fold1(_ + "\n\n\n" + _). - compile. - last + .flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config)) + .fold1(_ + "\n\n\n" + _) + .compile + .last } def extractImageFile[F[_]: Sync: ContextShift]( @@ -68,11 +64,11 @@ object Ocr { * files are stored to a temporary location on disk and returned. */ private[extract] def runGhostscript[F[_]: Sync: ContextShift]( - pdf: Stream[F, Byte], - cfg: OcrConfig, - wd: Path, - blocker: Blocker, - logger: Logger[F] + pdf: Stream[F, Byte], + cfg: OcrConfig, + wd: Path, + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val xargs = if (cfg.pageRange.begin > 0) @@ -80,19 +76,15 @@ object Ocr { else cfg.ghostscript.command.args val cmd = cfg.ghostscript.command .copy(args = xargs) - .mapArgs( - replace( - Map( - "{{infile}}" -> "-", - "{{outfile}}" -> "%d.tif" - ) + .replace( + Map( + "{{infile}}" -> "-", + "{{outfile}}" -> "%d.tif" ) ) SystemCommand .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) - .evalMap({ _ => - File.listFiles(pathEndsWith(".tif"), wd) - }) + .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) .flatMap(fs => Stream.emits(fs)) } @@ -106,19 +98,15 @@ object Ocr { blocker: Blocker, logger: Logger[F] ): Stream[F, Path] = { - val cmd = ghostscript.mapArgs( - replace( - Map( - "{{infile}}" -> pdf.toAbsolutePath.toString, - "{{outfile}}" -> "%d.tif" - ) + val cmd = ghostscript.replace( + Map( + "{{infile}}" -> pdf.toAbsolutePath.toString, + "{{outfile}}" -> "%d.tif" ) ) SystemCommand .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) - .evalMap({ _ => - File.listFiles(pathEndsWith(".tif"), wd) - }) + .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) .flatMap(fs => Stream.emits(fs)) } @@ -136,20 +124,20 @@ object Ocr { logger: Logger[F] ): Stream[F, Path] = { val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath - val cmd = unpaper.mapArgs( - replace( - Map( - "{{infile}}" -> img.toAbsolutePath.toString, - "{{outfile}}" -> targetFile.toString - ) + val cmd = unpaper.replace( + Map( + "{{infile}}" -> img.toAbsolutePath.toString, + "{{outfile}}" -> targetFile.toString ) ) - SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith { - th => + SystemCommand + .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) + .map(_ => targetFile) + .handleErrorWith { th => logger .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") Stream.emit(img) - } + } } /** Run tesseract on the given image file and return the extracted @@ -165,9 +153,8 @@ object Ocr { // tesseract cannot cope with absolute filenames // so use the parent as working dir runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => - val cmd = config.tesseract.command.mapArgs( - replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) - ) + val cmd = config.tesseract.command + .replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) } @@ -182,17 +169,10 @@ object Ocr { config: OcrConfig ): Stream[F, String] = { val cmd = config.tesseract.command - .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) + .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))) SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) } - private def replace(repl: Map[String, String]): String => String = - s => - repl.foldLeft(s) { - case (res, (k, v)) => - res.replace(k, v) - } - private def fixLanguage(lang: String): String = lang match { case "de" => "deu"