mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Extend config for external commands (#2536)
Allows configuring external commands and providing different arguments based on runtime values, such as the language. It extends the current config of a command with an optional `arg-mappings` section. An example for ocrmypdf:

```conf
ocrmypdf = {
  enabled = true
  command = {
    program = "ocrmypdf"
    ### new arg-mappings
    arg-mappings = {
      "mylang" = {
        value = "{{lang}}"
        mappings = [
          {
            matches = "deu"
            args = [ "-l", "deu", "--pdf-renderer", "sandwich" ]
          },
          {
            matches = ".*"
            args = [ "-l", "{{lang}}" ]
          }
        ]
      }
    }
    #### end new arg-mappings
    args = [
      ### will be replaced with corresponding args from "mylang" mapping
      "{{mylang}}",
      "--skip-text",
      "--deskew",
      "-j", "1",
      "{{infile}}",
      "{{outfile}}"
    ]
    timeout = "5 minutes"
  }
  working-dir = ${java.io.tmpdir}"/docspell-convert"
}
```

The whole section is first processed to replace all `{{…}}` patterns with their corresponding values. Then each entry in `arg-mappings` is evaluated: the first match (value == matches) in its `mappings` array is selected, and that mapping's `args` replace the entry's name (here `{{mylang}}`) in the arguments to the command.
This commit is contained in:
@ -11,7 +11,8 @@ import cats.implicits._
|
||||
import fs2.io.file.{Files, Path}
|
||||
import fs2.{Pipe, Stream}
|
||||
|
||||
import docspell.common._
|
||||
import docspell.common.exec.ExternalCommand
|
||||
import docspell.common.exec.SysExec
|
||||
import docspell.common.util.File
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
||||
@ -21,11 +22,11 @@ private[extern] object ExternConv {
|
||||
|
||||
def toPDF[F[_]: Async: Files, A](
|
||||
name: String,
|
||||
cmdCfg: SystemCommand.Config,
|
||||
cmdCfg: ExternalCommand.WithVars,
|
||||
wd: Path,
|
||||
useStdin: Boolean,
|
||||
logger: Logger[F],
|
||||
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
||||
reader: (Path, Int) => F[ConversionResult[F]]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
Stream
|
||||
.resource(File.withTempDir[F](wd, s"docspell-$name"))
|
||||
@ -33,32 +34,21 @@ private[extern] object ExternConv {
|
||||
val inFile = dir.resolve("infile").absolute.normalize
|
||||
val out = dir.resolve("out.pdf").absolute.normalize
|
||||
val sysCfg =
|
||||
cmdCfg.replace(
|
||||
Map(
|
||||
"{{outfile}}" -> out.toString
|
||||
) ++
|
||||
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
||||
else Map.empty)
|
||||
)
|
||||
cmdCfg
|
||||
.withVar("outfile", out.toString)
|
||||
.withVarOption("infile", Option.when(!useStdin)(inFile.toString))
|
||||
.resolved
|
||||
|
||||
val createInput: Pipe[F, Byte, Unit] =
|
||||
if (useStdin) _ => Stream.emit(())
|
||||
else storeDataToFile(name, logger, inFile)
|
||||
|
||||
in.through(createInput).flatMap { _ =>
|
||||
SystemCommand
|
||||
.exec[F](
|
||||
sysCfg,
|
||||
logger,
|
||||
Some(dir),
|
||||
if (useStdin) in
|
||||
else Stream.empty
|
||||
)
|
||||
.evalMap(result =>
|
||||
logResult(name, result, logger)
|
||||
.flatMap(_ => reader(out, result))
|
||||
.flatMap(handler.run)
|
||||
)
|
||||
in.through(createInput).evalMap { _ =>
|
||||
SysExec(sysCfg, logger, Some(dir), Option.when(useStdin)(in))
|
||||
.flatMap(_.logOutputs(logger, name))
|
||||
.use { proc =>
|
||||
proc.waitFor().flatMap(rc => reader(out, rc).flatMap(handler.run))
|
||||
}
|
||||
}
|
||||
}
|
||||
.compile
|
||||
@ -74,9 +64,9 @@ private[extern] object ExternConv {
|
||||
def readResult[F[_]: Async: Files](
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||
)(out: Path, result: Int): F[ConversionResult[F]] =
|
||||
File.existsNonEmpty[F](out).flatMap {
|
||||
case true if result.rc == 0 =>
|
||||
case true if result == 0 =>
|
||||
val outTxt = out.resolveSibling(out.fileName.toString + ".txt")
|
||||
File.existsNonEmpty[F](outTxt).flatMap {
|
||||
case true =>
|
||||
@ -88,13 +78,13 @@ private[extern] object ExternConv {
|
||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||
}
|
||||
case true =>
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
logger.warn(s"Command not successful (rc=${result}), but file exists.") *>
|
||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
ConversionResult
|
||||
.failure[F](
|
||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||
new Exception(s"Command result=${result}. No output file found.")
|
||||
)
|
||||
.pure[F]
|
||||
}
|
||||
@ -103,25 +93,25 @@ private[extern] object ExternConv {
|
||||
outPrefix: String,
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
|
||||
)(out: Path, result: Int): F[ConversionResult[F]] = {
|
||||
val outPdf = out.resolveSibling(s"$outPrefix.pdf")
|
||||
File.existsNonEmpty[F](outPdf).flatMap {
|
||||
case true =>
|
||||
val outTxt = out.resolveSibling(s"$outPrefix.txt")
|
||||
File.exists(outTxt).flatMap { txtExists =>
|
||||
val pdfData = File.readAll(out, chunkSize)
|
||||
if (result.rc == 0)
|
||||
if (result == 0)
|
||||
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F]
|
||||
else successPdf(pdfData).pure[F]
|
||||
else
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
logger.warn(s"Command not successful (rc=${result}), but file exists.") *>
|
||||
successPdf(pdfData).pure[F]
|
||||
}
|
||||
|
||||
case false =>
|
||||
ConversionResult
|
||||
.failure[F](
|
||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||
new Exception(s"Command result=${result}. No output file found.")
|
||||
)
|
||||
.pure[F]
|
||||
}
|
||||
@ -138,14 +128,6 @@ private[extern] object ExternConv {
|
||||
.drain ++
|
||||
Stream.eval(storeFile(in, inFile))
|
||||
|
||||
private def logResult[F[_]: Sync](
|
||||
name: String,
|
||||
result: SystemCommand.Result,
|
||||
logger: Logger[F]
|
||||
): F[Unit] =
|
||||
logger.debug(s"$name stdout: ${result.stdout}") *>
|
||||
logger.debug(s"$name stderr: ${result.stderr}")
|
||||
|
||||
private def storeFile[F[_]: Async: Files](
|
||||
in: Stream[F, Byte],
|
||||
target: Path
|
||||
|
@ -24,12 +24,14 @@ object OcrMyPdf {
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
if (cfg.enabled) {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
val cmd = cfg.command.withVars(Map("lang" -> lang.iso3))
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"ocrmypdf",
|
||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||
cmd,
|
||||
cfg.workingDir,
|
||||
useStdin = false,
|
||||
logger,
|
||||
|
@ -8,10 +8,10 @@ package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class OcrMyPdfConfig(
|
||||
enabled: Boolean,
|
||||
command: SystemCommand.Config,
|
||||
command: ExternalCommand,
|
||||
workingDir: Path
|
||||
)
|
||||
|
@ -24,17 +24,18 @@ object Tesseract {
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val outBase = cfg.command.args.tail.headOption.getOrElse("out")
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||
ExternConv.readResultTesseract[F](outBase, chunkSize, logger)
|
||||
|
||||
val cmd = cfg.command.withVars(Map("lang" -> lang.iso3))
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"tesseract",
|
||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||
cmd,
|
||||
cfg.workingDir,
|
||||
useStdin = false,
|
||||
logger,
|
||||
reader
|
||||
)(in, handler)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class TesseractConfig(command: SystemCommand.Config, workingDir: Path)
|
||||
case class TesseractConfig(command: ExternalCommand, workingDir: Path)
|
||||
|
@ -10,7 +10,6 @@ import cats.effect._
|
||||
import fs2.Stream
|
||||
import fs2.io.file.{Files, Path}
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.logging.Logger
|
||||
@ -22,12 +21,13 @@ object Unoconv {
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
val cmd = cfg.command.withVars(Map.empty)
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"unoconv",
|
||||
cfg.command,
|
||||
cmd,
|
||||
cfg.workingDir,
|
||||
useStdin = false,
|
||||
logger,
|
||||
@ -37,5 +37,4 @@ object Unoconv {
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class UnoconvConfig(command: SystemCommand.Config, workingDir: Path)
|
||||
case class UnoconvConfig(command: ExternalCommand, workingDir: Path)
|
||||
|
@ -27,10 +27,10 @@ object Weasyprint {
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name()))
|
||||
|
||||
// html sanitize should (among other) remove links to invalid
|
||||
// protocols like cid: which is not supported by further
|
||||
@ -51,5 +51,4 @@ object Weasyprint {
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path)
|
||||
case class WeasyprintConfig(command: ExternalCommand, workingDir: Path)
|
||||
|
@ -27,10 +27,10 @@ object WkHtmlPdf {
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name()))
|
||||
|
||||
// html sanitize should (among other) remove links to invalid
|
||||
// protocols like cid: which is not supported by further
|
||||
@ -58,5 +58,4 @@ object WkHtmlPdf {
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class WkHtmlPdfConfig(command: SystemCommand.Config, workingDir: Path)
|
||||
case class WkHtmlPdfConfig(command: ExternalCommand, workingDir: Path)
|
||||
|
@ -15,6 +15,7 @@ import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
import docspell.common.exec._
|
||||
import docspell.common.util.File
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.ConvertConfig.HtmlConverter
|
||||
@ -36,7 +37,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
3000 * 3000,
|
||||
MarkdownConfig("body { padding: 2em 5em; }"),
|
||||
WkHtmlPdfConfig(
|
||||
SystemCommand.Config(
|
||||
ExternalCommand(
|
||||
"wkhtmltopdf",
|
||||
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||
Duration.seconds(20)
|
||||
@ -44,7 +45,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
target
|
||||
),
|
||||
WeasyprintConfig(
|
||||
SystemCommand.Config(
|
||||
ExternalCommand(
|
||||
"weasyprint",
|
||||
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||
Duration.seconds(20)
|
||||
@ -53,7 +54,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
),
|
||||
HtmlConverter.Wkhtmltopdf,
|
||||
TesseractConfig(
|
||||
SystemCommand.Config(
|
||||
ExternalCommand(
|
||||
"tesseract",
|
||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||
Duration.seconds(20)
|
||||
@ -61,7 +62,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
target
|
||||
),
|
||||
UnoconvConfig(
|
||||
SystemCommand.Config(
|
||||
ExternalCommand(
|
||||
"unoconv",
|
||||
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
||||
Duration.seconds(20)
|
||||
@ -70,7 +71,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
),
|
||||
OcrMyPdfConfig(
|
||||
enabled = true,
|
||||
SystemCommand.Config(
|
||||
ExternalCommand(
|
||||
"ocrmypdf",
|
||||
Seq(
|
||||
"-l",
|
||||
|
@ -14,6 +14,7 @@ import cats.effect.unsafe.implicits.global
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common._
|
||||
import docspell.common.exec._
|
||||
import docspell.common.util.File
|
||||
import docspell.convert._
|
||||
import docspell.files.ExampleFiles
|
||||
@ -27,7 +28,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
val target = File.path(Paths.get("target"))
|
||||
|
||||
test("convert html to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
val cfg = ExternalCommand(
|
||||
"wkhtmltopdf",
|
||||
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||
Duration.seconds(20)
|
||||
@ -53,7 +54,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
}
|
||||
|
||||
test("convert office to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
val cfg = ExternalCommand(
|
||||
"unoconv",
|
||||
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
||||
Duration.seconds(20)
|
||||
@ -80,7 +81,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
}
|
||||
|
||||
test("convert image to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
val cfg = ExternalCommand(
|
||||
"tesseract",
|
||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||
Duration.seconds(20)
|
||||
@ -105,5 +106,4 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
)
|
||||
.unsafeRunSync()
|
||||
}
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user