diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 41884949..006f9098 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -17,6 +17,9 @@ object File { def mkTempDir[F[_]: Sync](parent: Path, prefix: String): F[Path] = mkDir(parent).map(p => Files.createTempDirectory(p, prefix)) + def mkTempFile[F[_]: Sync](parent: Path, prefix: String, suffix: Option[String] = None): F[Path] = + mkDir(parent).map(p => Files.createTempFile(p, prefix, suffix.orNull)) + def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay { val count = new AtomicInteger(0) Files.walkFileTree( diff --git a/modules/common/src/main/scala/docspell/common/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala index cfa2ab33..4f93d87a 100644 --- a/modules/common/src/main/scala/docspell/common/SystemCommand.scala +++ b/modules/common/src/main/scala/docspell/common/SystemCommand.scala @@ -1,12 +1,15 @@ package docspell.common import java.io.InputStream +import java.lang.ProcessBuilder.Redirect import java.nio.file.Path import java.util.concurrent.TimeUnit + import cats.implicits._ import cats.effect.{Blocker, ContextShift, Sync} import fs2.{Stream, io, text} import org.log4s.getLogger + import scala.jdk.CollectionConverters._ import docspell.common.syntax.all._ @@ -33,7 +36,7 @@ object SystemCommand { wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty ): Stream[F, Result] = - startProcess(cmd, wd) { proc => + startProcess(cmd, wd, stdin) { proc => Stream.eval { for { _ <- writeToProcess(stdin, proc, blocker) @@ -66,15 +69,20 @@ object SystemCommand { else Stream.emit(r) } - private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path])( + private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], stdin: Stream[F, Byte])( f: Process => Stream[F, A] ): Stream[F, A] = { 
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") - val proc = log *> Sync[F].delay { + val hasStdin = stdin.take(1).compile.last.map(_.isDefined) + val proc = log *> hasStdin.flatMap(flag => Sync[F].delay { val pb = new ProcessBuilder(cmd.toCmd.asJava) + .redirectInput(if (flag) Redirect.PIPE else Redirect.INHERIT) + .redirectError(Redirect.PIPE) + .redirectOutput(Redirect.PIPE) + wd.map(_.toFile).foreach(pb.directory) pb.start() - } + }) Stream .bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala new file mode 100644 index 00000000..a45f7b22 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -0,0 +1,108 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.effect._ +import fs2.{Pipe, Stream} +import docspell.common._ + +object ExternConv { + + def toPDF[F[_]: Sync: ContextShift]( + name: String, + cmdCfg: SystemCommand.Config, + wd: Path, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + ): Pipe[F, Byte, Byte] = + in => + Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => + val out = dir.resolve("out.pdf") + val sysCfg = + cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) + + SystemCommand + .execSuccess[F](sysCfg, blocker, Some(dir), in) + .flatMap(result => + logResult(name, result, logger) ++ readResult[F]( + out, + result, + blocker, + chunkSize, + logger + ) + ) + } + + def toPDFviaFile[F[_]: Sync: ContextShift]( + name: String, + cmdCfg: SystemCommand.Config, + wd: Path, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + ): Pipe[F, Byte, Byte] = + in => + Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => + val inFile = dir.resolve("infile") + val out = dir.resolve("out.pdf") + val 
sysCfg = + cmdCfg.mapArgs( + _.replace("{{outfile}}", out.toAbsolutePath.normalize.toString) + .replace("{{infile}}", inFile.toAbsolutePath.normalize.toString) + ) + + (Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ + Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ => + SystemCommand + .execSuccess[F](sysCfg, blocker, Some(dir)) + .flatMap(result => + logResult(name, result, logger) ++ readResult[F]( + out, + result, + blocker, + chunkSize, + logger + ) + ) + } + } + + private def readResult[F[_]: Sync: ContextShift]( + out: Path, + result: SystemCommand.Result, + blocker: Blocker, + chunkSize: Int, + logger: Logger[F] + ): Stream[F, Byte] = + Stream.eval(File.existsNonEmpty[F](out)).flatMap { + case true => + if (result.rc == 0) File.readAll(out, blocker, chunkSize) + else + Stream + .eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists.")) + .drain ++ + File.readAll(out, blocker, chunkSize) + + case false => + Stream.raiseError[F]( + new Exception(s"Command result=${result.rc}. 
No output file found.") + ) + } + + private def logResult[F[_]: Sync]( + name: String, + result: SystemCommand.Result, + logger: Logger[F] + ): Stream[F, Nothing] = + Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++ + Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain + + private def storeFile[F[_]: Sync: ContextShift]( + in: Stream[F, Byte], + target: Path, + blocker: Blocker + ): F[Unit] = + in.through(fs2.io.file.writeAll(target, blocker)).compile.drain +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala new file mode 100644 index 00000000..ee2256d9 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -0,0 +1,18 @@ +package docspell.convert.extern + +import cats.effect._ +import fs2.Pipe +import docspell.common._ + +object Unoconv { + + def toPDF[F[_]: Sync: ContextShift]( + cfg: UnoconvConfig, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F], + ): Pipe[F, Byte, Byte] = + ExternConv.toPDFviaFile[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) + + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala new file mode 100644 index 00000000..da4af43c --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala @@ -0,0 +1,7 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class UnoconvConfig (cmd: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 60fa1fb9..d736c474 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ 
b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -1,9 +1,7 @@ package docspell.convert.extern -import java.nio.file.Path - import cats.effect._ -import fs2.{Pipe, Stream} +import fs2.Pipe import docspell.common._ object WkHtmlPdf { @@ -12,39 +10,9 @@ object WkHtmlPdf { cfg: WkHtmlPdfConfig, chunkSize: Int, blocker: Blocker, - logger: Logger[F] + logger: Logger[F], ): Pipe[F, Byte, Byte] = - in => - Stream.resource(File.withTempDir[F](cfg.workingDir, "docspell-wkhtmltopdf")).flatMap { dir => - val out = dir.resolve("out.pdf") - val sysCfg = - cfg.cmd.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) + ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) - Stream.eval(logger.info(s"Running ${sysCfg.program}")).drain ++ - SystemCommand - .execSuccess[F](sysCfg, blocker, Some(dir), in) - .flatMap(result => readResult[F](out, result, blocker, chunkSize, logger)) - } - private def readResult[F[_]: Sync: ContextShift]( - out: Path, - result: SystemCommand.Result, - blocker: Blocker, - chunkSize: Int, - logger: Logger[F] - ): Stream[F, Byte] = - Stream.eval(File.existsNonEmpty[F](out)).flatMap { - case true => - if (result.rc == 0) File.readAll(out, blocker, chunkSize) - else - Stream - .eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists.")) - .drain ++ - File.readAll(out, blocker, chunkSize) - - case false => - Stream.raiseError( - new Exception(s"Command result=${result.rc}. No output file found. 
${result.stderr}") - ) - } } diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala index 94b32811..f895e44f 100644 --- a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala +++ b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala @@ -40,8 +40,10 @@ object Markdown { map(str => toHtml(str, cfg)) private def wrapHtml(body: String, cfg: MarkdownConfig): String = { - s""" + s""" + | | + | | diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala new file mode 100644 index 00000000..7f81b694 --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -0,0 +1,80 @@ +package docspell.convert.extern + +import java.nio.file.{Files, Path, Paths} + +import fs2.Stream +import cats.effect._ +import docspell.common._ +import docspell.files.{ExampleFiles, TestFiles} +import fs2.Pipe +import minitest.SimpleTestSuite + +object ExternConvTest extends SimpleTestSuite { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val logger = Logger.log4s[IO](org.log4s.getLogger) + val target = Paths.get("target") + + + test("convert html to pdf") { + val cfg = SystemCommand.Config( + "wkhtmltopdf", + Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "wkhtmltopdf") + .use(dir => IO { + val wkCfg = WkHtmlPdfConfig(cfg, target) + val p = ExampleFiles.letter_de_html + .readURL[IO](8192, blocker) + .through(WkHtmlPdf.toPDF[IO](wkCfg, 8192, blocker, logger)) + .through(storeFile(dir.resolve("test.pdf"))) + .compile + .lastOrError + .unsafeRunSync() + + assert(Files.exists(p) && Files.size(p) > 0) + }) + .unsafeRunSync + } + } + + 
test("convert office to pdf") { + val cfg = SystemCommand.Config( + "unoconv", + Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "unoconv") + .use(dir => IO { + val ucCfg = UnoconvConfig(cfg, target) + val p = ExampleFiles.examples_sample_docx + .readURL[IO](8192, blocker) + .through(Unoconv.toPDF[IO](ucCfg, 8192, blocker, logger)) + .through(storeFile(dir.resolve("test.pdf"))) + .compile + .lastOrError + .unsafeRunSync() + + assert(Files.exists(p) && Files.size(p) > 0) + }) + .unsafeRunSync + } + } + + + def storeFile(file: Path): Pipe[IO, Byte, Path] = + in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes)))) + + def commandExists(cmd: String): Boolean = + Runtime.getRuntime().exec(Array("which", cmd)).waitFor() == 0 +} diff --git a/modules/files/src/test/resources/letter-de.html b/modules/files/src/test/resources/letter-de.html new file mode 100755 index 00000000..48ad7be0 --- /dev/null +++ b/modules/files/src/test/resources/letter-de.html @@ -0,0 +1,30 @@ + + + + + + + +
+            
+Max Mustermann
+Lilienweg 21
+12345 Nebendorf
+E-Mail: max.muster@gmail.com
+            
+        
+

Max Mustermann, Lilienweg 21, 12345 Nebendorf

+

EasyCare AG
Abteilung Buchhaltung
Ackerweg 12
12346 Ulmen

+

Nebendorf, 3. September 2019

+

Sehr geehrte Damen und Herren

+

hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum nächstmöglichen Termin.

+

Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbestätigung zu.

+

Vielen Dank im Voraus!

+

Mit freundlichen Grüßen

+

Max Mustermann

+ + diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 829e36fc..23de73de 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -51,11 +51,11 @@ object ConvertPdf { .map(_.mimetype) .getOrElse(Mimetype.`application/octet-stream`) - def convertSafe[F[_]: Sync]( + def convertSafe[F[_]: Sync: ContextShift]( cfg: ConvertConfig, ctx: Context[F, ProcessItemArgs] )(ra: RAttachment, mime: Mimetype): F[RAttachment] = - Conversion.create[F](cfg).use { conv => + Conversion.create[F](cfg, ctx.blocker,ctx.logger).use { conv => ctx.logger .info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv") .map(_ => ra)