mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-12 05:59:33 +00:00
Extend config for external commands (#2536)
Allows configuring external commands and providing different arguments based on runtime values, like language. It extends the current config of a command to allow an `arg-mappings` section. An example for ocrmypdf: ```conf ocrmypdf = { enabled = true command = { program = "ocrmypdf" ### new arg-mappings arg-mappings = { "mylang" = { value = "{{lang}}" mappings = [ { matches = "deu" args = [ "-l", "deu", "--pdf-renderer", "sandwich" ] }, { matches = ".*" args = [ "-l", "{{lang}}" ] } ] } } #### end new arg-mappings args = [ ### will be replaced with corresponding args from "mylang" mapping "{{mylang}}", "--skip-text", "--deskew", "-j", "1", "{{infile}}", "{{outfile}}" ] timeout = "5 minutes" } working-dir = ${java.io.tmpdir}"/docspell-convert" } ``` The whole section is first processed to replace all `{{…}}` patterns with their corresponding values. Then each entry in `arg-mappings` is evaluated: the first element of its `mappings` array whose `matches` regular expression matches the resolved `value` supplies the arguments that replace the entry's name in the arguments to the command.
This commit is contained in:
parent
9c98f08520
commit
8269a73a83
@ -1,212 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright 2020 Eike K. & Contributors
|
|
||||||
*
|
|
||||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
||||||
*/
|
|
||||||
|
|
||||||
package docspell.common
|
|
||||||
|
|
||||||
import java.io.InputStream
|
|
||||||
import java.lang.ProcessBuilder.Redirect
|
|
||||||
import java.util.concurrent.TimeUnit
|
|
||||||
|
|
||||||
import scala.jdk.CollectionConverters._
|
|
||||||
|
|
||||||
import cats.effect._
|
|
||||||
import cats.implicits._
|
|
||||||
import fs2.io.file.Path
|
|
||||||
import fs2.{Stream, io, text}
|
|
||||||
|
|
||||||
import docspell.common.{exec => newExec}
|
|
||||||
import docspell.logging.Logger
|
|
||||||
|
|
||||||
// better use `SysCmd` and `SysExec`
|
|
||||||
object SystemCommand {
|
|
||||||
|
|
||||||
/** Describes how to invoke an external program: the executable, its arguments, a
  * timeout and additional environment variables.
  */
final case class Config(
    program: String,
    args: Seq[String],
    timeout: Duration,
    env: Map[String, String] = Map.empty
) {

  /** Converts this value into the `SysCmd` representation of the `exec` package. */
  def toSysCmd = newExec
    .SysCmd(program, newExec.Args(args))
    .withTimeout(timeout)
    .addEnv(newExec.Env(env))

  /** Applies `f` to every argument.
    *
    * Uses `copy` so that `env` is carried over; building a fresh `Config` from
    * `program`, `args` and `timeout` alone would reset `env` to its default.
    */
  def mapArgs(f: String => String): Config =
    copy(args = args.map(f))

  /** Replaces, in every argument, each occurrence of a key of `repl` with its value.
    * Keys are treated as literal substrings. Only `args` are changed.
    */
  def replace(repl: Map[String, String]): Config =
    mapArgs(s =>
      repl.foldLeft(s) { case (res, (k, v)) =>
        res.replace(k, v)
      }
    )

  /** Sets (or overwrites) a single environment variable. */
  def withEnv(key: String, value: String): Config =
    copy(env = env.updated(key, value))

  /** Adds all given environment variables; entries in `moreEnv` win on conflict. */
  def addEnv(moreEnv: Map[String, String]): Config =
    copy(env = env ++ moreEnv)

  /** Appends the given arguments after the existing ones. */
  def appendArgs(extraArgs: Args): Config =
    copy(args = args ++ extraArgs.args)

  /** Appends the given arguments after the existing ones. */
  def appendArgs(extraArgs: Seq[String]): Config =
    copy(args = args ++ extraArgs)

  /** The program followed by its arguments. */
  def toCmd: List[String] =
    program :: args.toList

  /** Program and arguments joined by single spaces; no quoting is applied. */
  lazy val cmdString: String =
    toCmd.mkString(" ")
}
|
|
||||||
|
|
||||||
/** An immutable, ordered list of command line arguments with helpers for adding
  * entries conditionally.
  */
final case class Args(args: Vector[String]) extends Iterable[String] {
  override def iterator = args.iterator

  /** Puts `a` in front of all existing arguments. */
  def prepend(a: String): Args = copy(args = a +: args)

  /** Prepends `a` only if `flag` is set. */
  def prependWhen(flag: Boolean)(a: String): Args =
    if (flag) prepend(a) else this

  /** Prepends the value if one is present. */
  def prependOption(value: Option[String]): Args =
    value.fold[Args](this)(v => prepend(v))

  /** Adds `a` and then all of `as` to the end. */
  def append(a: String, as: String*): Args =
    copy(args = (args :+ a) ++ as)

  /** Appends the value if one is present. */
  def appendOption(value: Option[String]): Args =
    value.fold[Args](this)(v => append(v))

  /** Appends `first` followed by the value of `second`, but only if `second` is
    * present; otherwise nothing is added.
    */
  def appendOptionVal(first: String, second: Option[String]): Args =
    second.fold[Args](this)(b => append(first, b))

  /** Appends the arguments only if `flag` is set. */
  def appendWhen(flag: Boolean)(a: String, as: String*): Args =
    if (!flag) this else append(a, as: _*)

  /** Appends the arguments only if `flag` is not set. */
  def appendWhenNot(flag: Boolean)(a: String, as: String*): Args =
    appendWhen(!flag)(a, as: _*)

  /** Appends the string form of the given path. */
  def append(p: Path): Args =
    append(p.toString)

  /** Appends all given values in their iteration order. */
  def append(as: Iterable[String]): Args =
    copy(args = args ++ as)
}
|
|
||||||
object Args {

  /** The argument list without any entries. */
  val empty: Args = new Args(Vector.empty)

  /** Creates an argument list from the given values, keeping their order. */
  def apply(as: String*): Args =
    new Args(as.toVector)
}
|
|
||||||
|
|
||||||
/** Outcome of a finished command: exit code plus the collected stdout and stderr text. */
final case class Result(rc: Int, stdout: String, stderr: String)
|
|
||||||
|
|
||||||
/** Starts `cmd`, feeds `stdin` to it and waits up to `cmd.timeout` (in whole
  * seconds) for it to finish.
  *
  * Emits one [[Result]] holding the exit code and the complete stdout and stderr
  * text, both of which are read only after the process has exited. If the process
  * does not finish in time, it is destroyed forcibly and the stream fails.
  */
def exec[F[_]: Sync](
    cmd: Config,
    logger: Logger[F],
    wd: Option[Path] = None,
    stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
  startProcess(cmd, wd, logger, stdin) { proc =>
    // Must be a `def`: `proc.exitValue` may only be queried once the process exited.
    def readResult: F[Result] =
      for {
        _ <- logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
        out <- inputStreamToString[F](proc.getInputStream)
        err <- inputStreamToString[F](proc.getErrorStream)
      } yield Result(proc.exitValue, out, err)

    // `timeoutError` always fails, so the mapping function only serves the types.
    def expired: F[Result] =
      logger.warn(
        s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
      ) *> timeoutError[F](proc, cmd).map(_ => Result(proc.exitValue, "", ""))

    Stream.eval(
      writeToProcess(stdin, proc) *>
        Sync[F]
          .blocking(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
          .flatMap(finished => if (finished) readResult else expired)
    )
  }
|
|
||||||
|
|
||||||
/** Like `exec`, but fails the stream when the command returns a non-zero exit
  * code. The error message contains the command line, the exit code and stderr.
  */
def execSuccess[F[_]: Sync](
    cmd: Config,
    logger: Logger[F],
    wd: Option[Path] = None,
    stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
  exec(cmd, logger, wd, stdin).flatMap {
    case ok if ok.rc == 0 =>
      Stream.emit(ok)
    case r =>
      val message =
        s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"
      Stream.raiseError[F](new Exception(message))
  }
|
|
||||||
|
|
||||||
/** Starts the process described by `cmd` and passes it to `f`. The process is
  * destroyed when the resulting stream terminates.
  *
  * stdout and stderr are always piped. stdin is piped only if `stdin` yields at
  * least one element, otherwise it is inherited from this JVM. `wd`, if given,
  * becomes the working directory and `cmd.env` is added to the environment.
  */
private def startProcess[F[_]: Sync, A](
    cmd: Config,
    wd: Option[Path],
    logger: Logger[F],
    stdin: Stream[F, Byte]
)(
    f: Process => Stream[F, A]
): Stream[F, A] = {
  // Looks at the first element only to find out whether there is any input.
  val stdinPresent = stdin.take(1).compile.last.map(_.isDefined)

  def launch(pipeStdin: Boolean): F[Process] =
    Sync[F].blocking {
      val builder = new ProcessBuilder(cmd.toCmd.asJava)
        .redirectInput(if (pipeStdin) Redirect.PIPE else Redirect.INHERIT)
        .redirectError(Redirect.PIPE)
        .redirectOutput(Redirect.PIPE)

      val environment = builder.environment()
      cmd.env.foreach { case (key, value) =>
        environment.put(key, value)
      }
      wd.map(_.toNioPath.toFile).foreach(builder.directory)
      builder.start()
    }

  val acquire =
    logger.debug(s"Running external command: ${cmd.cmdString}") *>
      stdinPresent.flatMap(launch)

  def release(p: Process): F[Unit] =
    logger.debug(s"Closing process: `${cmd.cmdString}`").map(_ => p.destroy())

  Stream.bracket(acquire)(release).flatMap(f)
}
|
|
||||||
|
|
||||||
/** Reads `in` to its end and decodes the bytes as UTF-8 text. The stream is not
  * closed. An empty input results in an empty string.
  */
private def inputStreamToString[F[_]: Sync](in: InputStream): F[String] =
  io.readInputStream(Sync[F].pure(in), 16 * 1024, closeAfterUse = false)
    .through(text.utf8.decode)
    // `compile.string` accumulates all pieces in one builder, avoiding the
    // repeated copying of a pairwise `_ + _` concatenation.
    .compile
    .string
|
|
||||||
|
|
||||||
/** Writes all bytes of `data` into the standard input of `proc`. */
private def writeToProcess[F[_]: Sync](
    data: Stream[F, Byte],
    proc: Process
): F[Unit] = {
  val processInput = io.writeOutputStream(Sync[F].blocking(proc.getOutputStream))
  data.through(processInput).compile.drain
}
|
|
||||||
|
|
||||||
/** Forcibly destroys `proc`, ignoring any error raised while doing so, and then
  * always fails with an exception describing the timeout.
  */
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] = {
  val kill = Sync[F].blocking(proc.destroyForcibly()).attempt
  val failure =
    new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")

  kill *> Sync[F].raiseError[Unit](failure)
}
|
|
||||||
}
|
|
@ -17,6 +17,9 @@ case class Env(values: Map[String, String]) {
|
|||||||
def addAll(e: Env): Env =
|
def addAll(e: Env): Env =
|
||||||
Env(values ++ e.values)
|
Env(values ++ e.values)
|
||||||
|
|
||||||
|
/** Returns a new `Env` in which `f` was applied to every value; names are kept. */
def modifyValue(f: String => String): Env =
  Env(values.map { case (name, value) => name -> f(value) })
|
||||||
|
|
||||||
def ++(e: Env) = addAll(e)
|
def ++(e: Env) = addAll(e)
|
||||||
|
|
||||||
def foreach(f: (String, String) => Unit): Unit =
|
def foreach(f: (String, String) => Unit): Unit =
|
||||||
|
@ -0,0 +1,89 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.common.exec
|
||||||
|
|
||||||
|
import docspell.common.Duration
|
||||||
|
import docspell.common.Ident
|
||||||
|
import docspell.common.exec.Env
|
||||||
|
import docspell.common.exec.ExternalCommand.ArgMapping
|
||||||
|
import docspell.common.exec.SysCmd
|
||||||
|
|
||||||
|
/** A configured external command whose program, arguments and environment values
  * may contain `{{name}}` placeholders, and whose `argMappings` allow replacing a
  * single argument with a list of arguments chosen at resolve time.
  */
final case class ExternalCommand(
    program: String,
    args: Seq[String],
    timeout: Duration,
    env: Map[String, String] = Map.empty,
    argMappings: Map[Ident, ArgMapping] = Map.empty
) {

  /** Pairs this command with variables to be used when it is resolved. */
  def withVars(vars: Map[String, String]): ExternalCommand.WithVars =
    ExternalCommand.WithVars(this, vars)

  import ExternalCommand.pattern

  /** Produces the runnable command.
    *
    * First all `{{name}}` placeholders are replaced using `vars`. Afterwards every
    * argument that is exactly the placeholder of an `argMappings` key is replaced
    * with the arguments of that mapping's first match (possibly none).
    */
  def resolve(vars: Map[String, String]): SysCmd = {
    val substitute = ExternalCommand.replaceString(vars) _

    // mapping name -> arguments selected by its first matching entry
    val expansions: Map[Ident, List[String]] =
      argMappings.map { case (name, mapping) =>
        name -> mapping.resolve(substitute).firstMatch
      }

    val finalArgs = args.map(substitute).flatMap { arg =>
      expansions
        .collectFirst {
          case (name, replacement) if pattern(name.id) == arg => replacement
        }
        .getOrElse(List(arg))
    }

    SysCmd(substitute(program), finalArgs: _*)
      .withTimeout(timeout)
      .withEnv(_ => Env(env).modifyValue(substitute))
  }
}
|
||||||
|
|
||||||
|
object ExternalCommand {
  private val openPattern = "{{"
  private val closePattern = "}}"

  /** Wraps `s` in the placeholder delimiters. */
  private def pattern(s: String): String = openPattern + s + closePattern

  /** Creates a command without environment variables and without arg mappings. */
  def apply(program: String, args: Seq[String], timeout: Duration): ExternalCommand =
    ExternalCommand(program, args, timeout, Map.empty, Map.empty)

  /** Selects a list of arguments depending on `value`.
    *
    * @param value
    *   the text to test; may contain placeholders
    * @param mappings
    *   candidates that are tried in order
    */
  final case class ArgMapping(
      value: String,
      mappings: List[ArgMatch]
  ) {
    private[exec] def resolve(replace: String => String): ArgMapping =
      copy(value = replace(value), mappings = mappings.map(_.resolve(replace)))

    /** The arguments of the first entry whose `matches` regular expression matches
      * the whole `value`, or an empty list if there is none.
      */
    def firstMatch: List[String] =
      mappings
        .collectFirst { case m if value.matches(m.matches) => m.args }
        .getOrElse(Nil)
  }

  /** A regular expression together with the arguments to use when it matches. */
  final case class ArgMatch(
      matches: String,
      args: List[String]
  ) {
    private[exec] def resolve(replace: String => String): ArgMatch =
      copy(matches = replace(matches), args = args.map(replace))
  }

  /** Replaces the placeholder of every variable in `in` with the variable's value. */
  private def replaceString(vars: Map[String, String])(in: String): String =
    vars.foldLeft(in) { case (acc, (name, value)) =>
      acc.replace(pattern(name), value)
    }

  /** A command together with the variables used to resolve it. */
  final case class WithVars(cmd: ExternalCommand, vars: Map[String, String]) {

    /** Resolves the command using the collected variables. */
    def resolved: SysCmd = cmd.resolve(vars)

    /** Adds the given variables; later entries overwrite earlier ones. */
    def append(more: (String, String)*): WithVars =
      copy(vars = vars ++ more)

    /** Sets (or overwrites) a single variable. */
    def withVar(key: String, value: String): WithVars =
      copy(vars = vars + (key -> value))

    /** Sets the variable only if a value is present. */
    def withVarOption(key: String, value: Option[String]): WithVars =
      value.fold[WithVars](this)(v => withVar(key, v))
  }
}
|
@ -38,6 +38,20 @@ trait SysExec[F[_]] {
|
|||||||
|
|
||||||
def waitFor(timeout: Option[Duration] = None): F[Int]
|
def waitFor(timeout: Option[Duration] = None): F[Int]
|
||||||
|
|
||||||
|
/** Uses `waitFor` and throws when return code is non-zero. Logs stderr and stdout while
|
||||||
|
* waiting.
|
||||||
|
*/
|
||||||
|
def runToSuccess(logger: Logger[F], timeout: Option[Duration] = None)(implicit
|
||||||
|
F: Async[F]
|
||||||
|
): F[Int]
|
||||||
|
|
||||||
|
/** Uses `waitFor` and throws when return code is non-zero. Logs stderr while waiting
|
||||||
|
* and collects stdout once finished successfully.
|
||||||
|
*/
|
||||||
|
def runToSuccessStdout(logger: Logger[F], timeout: Option[Duration] = None)(implicit
|
||||||
|
F: Async[F]
|
||||||
|
): F[String]
|
||||||
|
|
||||||
/** Sends a signal to the process to terminate it immediately */
|
/** Sends a signal to the process to terminate it immediately */
|
||||||
def cancel: F[Unit]
|
def cancel: F[Unit]
|
||||||
|
|
||||||
@ -75,6 +89,12 @@ object SysExec {
|
|||||||
proc <- startProcess(logger, cmd, workdir, stdin)
|
proc <- startProcess(logger, cmd, workdir, stdin)
|
||||||
fibers <- Resource.eval(Ref.of[F, List[F[Unit]]](Nil))
|
fibers <- Resource.eval(Ref.of[F, List[F[Unit]]](Nil))
|
||||||
} yield new SysExec[F] {
|
} yield new SysExec[F] {
|
||||||
|
// Short label for log lines: the part of `cmd.program` after the last file
// separator, or (when there is no separator) at most its last 16 characters.
// `n >= 0` so that a separator at the very first position is recognised too.
private lazy val basicName: String =
  cmd.program.lastIndexOf(java.io.File.separatorChar.toInt) match {
    case n if n >= 0 => cmd.program.drop(n + 1)
    case _           => cmd.program.takeRight(16)
  }
|
||||||
|
|
||||||
def stdout: Stream[F, Byte] =
|
def stdout: Stream[F, Byte] =
|
||||||
fs2.io.readInputStream(
|
fs2.io.readInputStream(
|
||||||
Sync[F].blocking(proc.getInputStream),
|
Sync[F].blocking(proc.getInputStream),
|
||||||
@ -107,6 +127,39 @@ object SysExec {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Waits for the process with `logOutputs` active; yields 0 on success and fails
// with an exception naming the program and the exit code otherwise.
def runToSuccess(logger: Logger[F], timeout: Option[Duration])(implicit
    F: Async[F]
): F[Int] =
  logOutputs(logger, basicName).use { running =>
    running.waitFor(timeout).flatMap { rc =>
      if (rc == 0) Sync[F].pure(0)
      else
        Sync[F].raiseError[Int](
          new Exception(s"Command `${cmd.program}` returned non-zero exit code ${rc}")
        )
    }
  }
|
||||||
|
|
||||||
|
// Logs stderr in the background while waiting for the process. On exit code 0 the
// complete stdout is decoded as UTF-8 and returned; any other exit code fails with
// an exception naming the program and the code.
def runToSuccessStdout(logger: Logger[F], timeout: Option[Duration])(implicit
    F: Async[F]
): F[String] =
  F.background(
    stderrLines
      // `evalMap` runs the logger once per element. With `through`, the function
      // would receive the whole stream instead of a line and never pull from it,
      // leaving stderr unread.
      .evalMap(line => logger.debug(s"[$basicName (err)]: $line"))
      .compile
      .drain
  ).use { f1 =>
    // NOTE(review): stdout is read only after the process has exited; a command
    // producing a lot of output could block on a full pipe — confirm whether
    // stdout should be consumed concurrently as well.
    waitFor(timeout)
      .flatMap {
        case rc if rc == 0 => stdout.through(fs2.text.utf8.decode).compile.string
        case rc =>
          Sync[F].raiseError[String](
            new Exception(
              s"Command `${cmd.program}` returned non-zero exit code ${rc}"
            )
          )
      }
      // wait for the stderr logging to finish before returning
      .flatTap(_ => f1)
  }
|
||||||
|
|
||||||
def consumeOutputs(out: Pipe[F, String, Unit], err: Pipe[F, String, Unit])(implicit
|
def consumeOutputs(out: Pipe[F, String, Unit], err: Pipe[F, String, Unit])(implicit
|
||||||
F: Async[F]
|
F: Async[F]
|
||||||
): Resource[F, SysExec[F]] =
|
): Resource[F, SysExec[F]] =
|
||||||
|
@ -0,0 +1,74 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.common.exec
|
||||||
|
|
||||||
|
import docspell.common.Duration
|
||||||
|
import docspell.common.Ident
|
||||||
|
import docspell.common.exec.Args
|
||||||
|
import docspell.common.exec.Env
|
||||||
|
import docspell.common.exec.ExternalCommand._
|
||||||
|
import docspell.common.exec.SysCmd
|
||||||
|
|
||||||
|
import munit.FunSuite
|
||||||
|
|
||||||
|
class ExternalCommandTest extends FunSuite {

  // The `lang-spec` argument expands to special options for `jpn_vert` and to a
  // plain `-l <lang>` for every other language.
  test("resolve") {
    val langSpec = ArgMapping(
      value = "{{lang}}",
      mappings = List(
        ArgMatch(
          matches = "jpn_vert",
          args = List("-l", "jpn_vert", "-c", "preserve_interword_spaces=1")
        ),
        ArgMatch(
          matches = ".*",
          args = List("-l", "{{lang}}")
        )
      )
    )

    val cmd = ExternalCommand(
      program = "tesseract",
      args = List("{{infile}}", "{{lang-spec}}", "out", "pdf", "txt"),
      timeout = Duration.minutes(5),
      env = Map.empty,
      argMappings = Map(Ident.unsafe("lang-spec") -> langSpec)
    )

    val varsDe = Map("lang" -> "de", "encoding" -> "UTF_8", "infile" -> "text.jpg")
    val expectedDe = SysCmd(
      "tesseract",
      Args.of("text.jpg", "-l", "de", "out", "pdf", "txt"),
      Env.empty,
      Duration.minutes(5)
    )
    assertEquals(cmd.resolve(varsDe), expectedDe)

    val varsJpnVert = varsDe.updated("lang", "jpn_vert")
    val expectedJpnVert = SysCmd(
      "tesseract",
      Args.of(
        "text.jpg",
        "-l",
        "jpn_vert",
        "-c",
        "preserve_interword_spaces=1",
        "out",
        "pdf",
        "txt"
      ),
      Env.empty,
      Duration.minutes(5)
    )
    assertEquals(cmd.resolve(varsJpnVert), expectedJpnVert)
  }
}
|
@ -11,7 +11,8 @@ import cats.implicits._
|
|||||||
import fs2.io.file.{Files, Path}
|
import fs2.io.file.{Files, Path}
|
||||||
import fs2.{Pipe, Stream}
|
import fs2.{Pipe, Stream}
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common.exec.ExternalCommand
|
||||||
|
import docspell.common.exec.SysExec
|
||||||
import docspell.common.util.File
|
import docspell.common.util.File
|
||||||
import docspell.convert.ConversionResult
|
import docspell.convert.ConversionResult
|
||||||
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
||||||
@ -21,11 +22,11 @@ private[extern] object ExternConv {
|
|||||||
|
|
||||||
def toPDF[F[_]: Async: Files, A](
|
def toPDF[F[_]: Async: Files, A](
|
||||||
name: String,
|
name: String,
|
||||||
cmdCfg: SystemCommand.Config,
|
cmdCfg: ExternalCommand.WithVars,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
useStdin: Boolean,
|
useStdin: Boolean,
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
reader: (Path, Int) => F[ConversionResult[F]]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||||
Stream
|
Stream
|
||||||
.resource(File.withTempDir[F](wd, s"docspell-$name"))
|
.resource(File.withTempDir[F](wd, s"docspell-$name"))
|
||||||
@ -33,32 +34,21 @@ private[extern] object ExternConv {
|
|||||||
val inFile = dir.resolve("infile").absolute.normalize
|
val inFile = dir.resolve("infile").absolute.normalize
|
||||||
val out = dir.resolve("out.pdf").absolute.normalize
|
val out = dir.resolve("out.pdf").absolute.normalize
|
||||||
val sysCfg =
|
val sysCfg =
|
||||||
cmdCfg.replace(
|
cmdCfg
|
||||||
Map(
|
.withVar("outfile", out.toString)
|
||||||
"{{outfile}}" -> out.toString
|
.withVarOption("infile", Option.when(!useStdin)(inFile.toString))
|
||||||
) ++
|
.resolved
|
||||||
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
|
||||||
else Map.empty)
|
|
||||||
)
|
|
||||||
|
|
||||||
val createInput: Pipe[F, Byte, Unit] =
|
val createInput: Pipe[F, Byte, Unit] =
|
||||||
if (useStdin) _ => Stream.emit(())
|
if (useStdin) _ => Stream.emit(())
|
||||||
else storeDataToFile(name, logger, inFile)
|
else storeDataToFile(name, logger, inFile)
|
||||||
|
|
||||||
in.through(createInput).flatMap { _ =>
|
in.through(createInput).evalMap { _ =>
|
||||||
SystemCommand
|
SysExec(sysCfg, logger, Some(dir), Option.when(useStdin)(in))
|
||||||
.exec[F](
|
.flatMap(_.logOutputs(logger, name))
|
||||||
sysCfg,
|
.use { proc =>
|
||||||
logger,
|
proc.waitFor().flatMap(rc => reader(out, rc).flatMap(handler.run))
|
||||||
Some(dir),
|
}
|
||||||
if (useStdin) in
|
|
||||||
else Stream.empty
|
|
||||||
)
|
|
||||||
.evalMap(result =>
|
|
||||||
logResult(name, result, logger)
|
|
||||||
.flatMap(_ => reader(out, result))
|
|
||||||
.flatMap(handler.run)
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.compile
|
.compile
|
||||||
@ -74,9 +64,9 @@ private[extern] object ExternConv {
|
|||||||
def readResult[F[_]: Async: Files](
|
def readResult[F[_]: Async: Files](
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
)(out: Path, result: Int): F[ConversionResult[F]] =
|
||||||
File.existsNonEmpty[F](out).flatMap {
|
File.existsNonEmpty[F](out).flatMap {
|
||||||
case true if result.rc == 0 =>
|
case true if result == 0 =>
|
||||||
val outTxt = out.resolveSibling(out.fileName.toString + ".txt")
|
val outTxt = out.resolveSibling(out.fileName.toString + ".txt")
|
||||||
File.existsNonEmpty[F](outTxt).flatMap {
|
File.existsNonEmpty[F](outTxt).flatMap {
|
||||||
case true =>
|
case true =>
|
||||||
@ -88,13 +78,13 @@ private[extern] object ExternConv {
|
|||||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||||
}
|
}
|
||||||
case true =>
|
case true =>
|
||||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
logger.warn(s"Command not successful (rc=${result}), but file exists.") *>
|
||||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||||
|
|
||||||
case false =>
|
case false =>
|
||||||
ConversionResult
|
ConversionResult
|
||||||
.failure[F](
|
.failure[F](
|
||||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
new Exception(s"Command result=${result}. No output file found.")
|
||||||
)
|
)
|
||||||
.pure[F]
|
.pure[F]
|
||||||
}
|
}
|
||||||
@ -103,25 +93,25 @@ private[extern] object ExternConv {
|
|||||||
outPrefix: String,
|
outPrefix: String,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
|
)(out: Path, result: Int): F[ConversionResult[F]] = {
|
||||||
val outPdf = out.resolveSibling(s"$outPrefix.pdf")
|
val outPdf = out.resolveSibling(s"$outPrefix.pdf")
|
||||||
File.existsNonEmpty[F](outPdf).flatMap {
|
File.existsNonEmpty[F](outPdf).flatMap {
|
||||||
case true =>
|
case true =>
|
||||||
val outTxt = out.resolveSibling(s"$outPrefix.txt")
|
val outTxt = out.resolveSibling(s"$outPrefix.txt")
|
||||||
File.exists(outTxt).flatMap { txtExists =>
|
File.exists(outTxt).flatMap { txtExists =>
|
||||||
val pdfData = File.readAll(out, chunkSize)
|
val pdfData = File.readAll(out, chunkSize)
|
||||||
if (result.rc == 0)
|
if (result == 0)
|
||||||
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F]
|
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F]
|
||||||
else successPdf(pdfData).pure[F]
|
else successPdf(pdfData).pure[F]
|
||||||
else
|
else
|
||||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
logger.warn(s"Command not successful (rc=${result}), but file exists.") *>
|
||||||
successPdf(pdfData).pure[F]
|
successPdf(pdfData).pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
case false =>
|
case false =>
|
||||||
ConversionResult
|
ConversionResult
|
||||||
.failure[F](
|
.failure[F](
|
||||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
new Exception(s"Command result=${result}. No output file found.")
|
||||||
)
|
)
|
||||||
.pure[F]
|
.pure[F]
|
||||||
}
|
}
|
||||||
@ -138,14 +128,6 @@ private[extern] object ExternConv {
|
|||||||
.drain ++
|
.drain ++
|
||||||
Stream.eval(storeFile(in, inFile))
|
Stream.eval(storeFile(in, inFile))
|
||||||
|
|
||||||
private def logResult[F[_]: Sync](
|
|
||||||
name: String,
|
|
||||||
result: SystemCommand.Result,
|
|
||||||
logger: Logger[F]
|
|
||||||
): F[Unit] =
|
|
||||||
logger.debug(s"$name stdout: ${result.stdout}") *>
|
|
||||||
logger.debug(s"$name stderr: ${result.stderr}")
|
|
||||||
|
|
||||||
private def storeFile[F[_]: Async: Files](
|
private def storeFile[F[_]: Async: Files](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
target: Path
|
target: Path
|
||||||
|
@ -24,12 +24,14 @@ object OcrMyPdf {
|
|||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||||
if (cfg.enabled) {
|
if (cfg.enabled) {
|
||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResult[F](chunkSize, logger)
|
ExternConv.readResult[F](chunkSize, logger)
|
||||||
|
|
||||||
|
val cmd = cfg.command.withVars(Map("lang" -> lang.iso3))
|
||||||
|
|
||||||
ExternConv.toPDF[F, A](
|
ExternConv.toPDF[F, A](
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
cmd,
|
||||||
cfg.workingDir,
|
cfg.workingDir,
|
||||||
useStdin = false,
|
useStdin = false,
|
||||||
logger,
|
logger,
|
||||||
|
@ -8,10 +8,10 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common.SystemCommand
|
import docspell.common.exec.ExternalCommand
|
||||||
|
|
||||||
case class OcrMyPdfConfig(
|
case class OcrMyPdfConfig(
|
||||||
enabled: Boolean,
|
enabled: Boolean,
|
||||||
command: SystemCommand.Config,
|
command: ExternalCommand,
|
||||||
workingDir: Path
|
workingDir: Path
|
||||||
)
|
)
|
||||||
|
@ -24,17 +24,18 @@ object Tesseract {
|
|||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
val outBase = cfg.command.args.tail.headOption.getOrElse("out")
|
val outBase = cfg.command.args.tail.headOption.getOrElse("out")
|
||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResultTesseract[F](outBase, chunkSize, logger)
|
ExternConv.readResultTesseract[F](outBase, chunkSize, logger)
|
||||||
|
|
||||||
|
val cmd = cfg.command.withVars(Map("lang" -> lang.iso3))
|
||||||
|
|
||||||
ExternConv.toPDF[F, A](
|
ExternConv.toPDF[F, A](
|
||||||
"tesseract",
|
"tesseract",
|
||||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
cmd,
|
||||||
cfg.workingDir,
|
cfg.workingDir,
|
||||||
useStdin = false,
|
useStdin = false,
|
||||||
logger,
|
logger,
|
||||||
reader
|
reader
|
||||||
)(in, handler)
|
)(in, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common.SystemCommand
|
import docspell.common.exec.ExternalCommand
|
||||||
|
|
||||||
case class TesseractConfig(command: SystemCommand.Config, workingDir: Path)
|
case class TesseractConfig(command: ExternalCommand, workingDir: Path)
|
||||||
|
@ -10,7 +10,6 @@ import cats.effect._
|
|||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import fs2.io.file.{Files, Path}
|
import fs2.io.file.{Files, Path}
|
||||||
|
|
||||||
import docspell.common._
|
|
||||||
import docspell.convert.ConversionResult
|
import docspell.convert.ConversionResult
|
||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.logging.Logger
|
import docspell.logging.Logger
|
||||||
@ -22,12 +21,13 @@ object Unoconv {
|
|||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResult[F](chunkSize, logger)
|
ExternConv.readResult[F](chunkSize, logger)
|
||||||
|
val cmd = cfg.command.withVars(Map.empty)
|
||||||
|
|
||||||
ExternConv.toPDF[F, A](
|
ExternConv.toPDF[F, A](
|
||||||
"unoconv",
|
"unoconv",
|
||||||
cfg.command,
|
cmd,
|
||||||
cfg.workingDir,
|
cfg.workingDir,
|
||||||
useStdin = false,
|
useStdin = false,
|
||||||
logger,
|
logger,
|
||||||
@ -37,5 +37,4 @@ object Unoconv {
|
|||||||
handler
|
handler
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common.SystemCommand
|
import docspell.common.exec.ExternalCommand
|
||||||
|
|
||||||
case class UnoconvConfig(command: SystemCommand.Config, workingDir: Path)
|
case class UnoconvConfig(command: ExternalCommand, workingDir: Path)
|
||||||
|
@ -27,10 +27,10 @@ object Weasyprint {
|
|||||||
sanitizeHtml: SanitizeHtml,
|
sanitizeHtml: SanitizeHtml,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResult[F](chunkSize, logger)
|
ExternConv.readResult[F](chunkSize, logger)
|
||||||
|
|
||||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name()))
|
||||||
|
|
||||||
// html sanitize should (among other) remove links to invalid
|
// html sanitize should (among other) remove links to invalid
|
||||||
// protocols like cid: which is not supported by further
|
// protocols like cid: which is not supported by further
|
||||||
@ -51,5 +51,4 @@ object Weasyprint {
|
|||||||
handler
|
handler
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common.SystemCommand
|
import docspell.common.exec.ExternalCommand
|
||||||
|
|
||||||
case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path)
|
case class WeasyprintConfig(command: ExternalCommand, workingDir: Path)
|
||||||
|
@ -27,10 +27,10 @@ object WkHtmlPdf {
|
|||||||
sanitizeHtml: SanitizeHtml,
|
sanitizeHtml: SanitizeHtml,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, Int) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResult[F](chunkSize, logger)
|
ExternConv.readResult[F](chunkSize, logger)
|
||||||
|
|
||||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
val cmdCfg = cfg.command.withVars(Map("encoding" -> charset.name()))
|
||||||
|
|
||||||
// html sanitize should (among other) remove links to invalid
|
// html sanitize should (among other) remove links to invalid
|
||||||
// protocols like cid: which is not supported by further
|
// protocols like cid: which is not supported by further
|
||||||
@ -58,5 +58,4 @@ object WkHtmlPdf {
|
|||||||
handler
|
handler
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,6 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common.SystemCommand
|
import docspell.common.exec.ExternalCommand
|
||||||
|
|
||||||
case class WkHtmlPdfConfig(command: SystemCommand.Config, workingDir: Path)
|
case class WkHtmlPdfConfig(command: ExternalCommand, workingDir: Path)
|
||||||
|
@ -15,6 +15,7 @@ import cats.implicits._
|
|||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.common.exec._
|
||||||
import docspell.common.util.File
|
import docspell.common.util.File
|
||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.convert.ConvertConfig.HtmlConverter
|
import docspell.convert.ConvertConfig.HtmlConverter
|
||||||
@ -36,7 +37,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
3000 * 3000,
|
3000 * 3000,
|
||||||
MarkdownConfig("body { padding: 2em 5em; }"),
|
MarkdownConfig("body { padding: 2em 5em; }"),
|
||||||
WkHtmlPdfConfig(
|
WkHtmlPdfConfig(
|
||||||
SystemCommand.Config(
|
ExternalCommand(
|
||||||
"wkhtmltopdf",
|
"wkhtmltopdf",
|
||||||
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -44,7 +45,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
target
|
target
|
||||||
),
|
),
|
||||||
WeasyprintConfig(
|
WeasyprintConfig(
|
||||||
SystemCommand.Config(
|
ExternalCommand(
|
||||||
"weasyprint",
|
"weasyprint",
|
||||||
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
|
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -53,7 +54,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
),
|
),
|
||||||
HtmlConverter.Wkhtmltopdf,
|
HtmlConverter.Wkhtmltopdf,
|
||||||
TesseractConfig(
|
TesseractConfig(
|
||||||
SystemCommand.Config(
|
ExternalCommand(
|
||||||
"tesseract",
|
"tesseract",
|
||||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -61,7 +62,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
target
|
target
|
||||||
),
|
),
|
||||||
UnoconvConfig(
|
UnoconvConfig(
|
||||||
SystemCommand.Config(
|
ExternalCommand(
|
||||||
"unoconv",
|
"unoconv",
|
||||||
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -70,7 +71,7 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
),
|
),
|
||||||
OcrMyPdfConfig(
|
OcrMyPdfConfig(
|
||||||
enabled = true,
|
enabled = true,
|
||||||
SystemCommand.Config(
|
ExternalCommand(
|
||||||
"ocrmypdf",
|
"ocrmypdf",
|
||||||
Seq(
|
Seq(
|
||||||
"-l",
|
"-l",
|
||||||
|
@ -14,6 +14,7 @@ import cats.effect.unsafe.implicits.global
|
|||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.common.exec._
|
||||||
import docspell.common.util.File
|
import docspell.common.util.File
|
||||||
import docspell.convert._
|
import docspell.convert._
|
||||||
import docspell.files.ExampleFiles
|
import docspell.files.ExampleFiles
|
||||||
@ -27,7 +28,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
val target = File.path(Paths.get("target"))
|
val target = File.path(Paths.get("target"))
|
||||||
|
|
||||||
test("convert html to pdf") {
|
test("convert html to pdf") {
|
||||||
val cfg = SystemCommand.Config(
|
val cfg = ExternalCommand(
|
||||||
"wkhtmltopdf",
|
"wkhtmltopdf",
|
||||||
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -53,7 +54,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test("convert office to pdf") {
|
test("convert office to pdf") {
|
||||||
val cfg = SystemCommand.Config(
|
val cfg = ExternalCommand(
|
||||||
"unoconv",
|
"unoconv",
|
||||||
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -80,7 +81,7 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test("convert image to pdf") {
|
test("convert image to pdf") {
|
||||||
val cfg = SystemCommand.Config(
|
val cfg = ExternalCommand(
|
||||||
"tesseract",
|
"tesseract",
|
||||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
@ -105,5 +106,4 @@ class ExternConvTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
)
|
)
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,8 @@ import cats.effect._
|
|||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import fs2.io.file.{Files, Path}
|
import fs2.io.file.{Files, Path}
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common.exec.ExternalCommand
|
||||||
|
import docspell.common.exec.SysExec
|
||||||
import docspell.common.util.File
|
import docspell.common.util.File
|
||||||
import docspell.logging.Logger
|
import docspell.logging.Logger
|
||||||
|
|
||||||
@ -77,14 +78,17 @@ object Ocr {
|
|||||||
else cfg.ghostscript.command.args
|
else cfg.ghostscript.command.args
|
||||||
val cmd = cfg.ghostscript.command
|
val cmd = cfg.ghostscript.command
|
||||||
.copy(args = xargs)
|
.copy(args = xargs)
|
||||||
.replace(
|
.withVars(
|
||||||
Map(
|
Map(
|
||||||
"{{infile}}" -> "-",
|
"infile" -> "-",
|
||||||
"{{outfile}}" -> "%d.tif"
|
"outfile" -> "%d.tif"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
.resolved
|
||||||
.execSuccess(cmd, logger, wd = Some(wd), stdin = pdf)
|
|
||||||
|
Stream
|
||||||
|
.resource(SysExec(cmd, logger, Some(wd), Some(pdf)))
|
||||||
|
.evalMap(_.runToSuccess(logger))
|
||||||
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,18 +97,22 @@ object Ocr {
|
|||||||
*/
|
*/
|
||||||
private[extract] def runGhostscriptFile[F[_]: Async: Files](
|
private[extract] def runGhostscriptFile[F[_]: Async: Files](
|
||||||
pdf: Path,
|
pdf: Path,
|
||||||
ghostscript: SystemCommand.Config,
|
ghostscript: ExternalCommand,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val cmd = ghostscript.replace(
|
val cmd = ghostscript
|
||||||
|
.withVars(
|
||||||
Map(
|
Map(
|
||||||
"{{infile}}" -> pdf.absolute.toString,
|
"infile" -> pdf.absolute.toString,
|
||||||
"{{outfile}}" -> "%d.tif"
|
"outfile" -> "%d.tif"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
.resolved
|
||||||
.execSuccess[F](cmd, logger, wd = Some(wd))
|
|
||||||
|
Stream
|
||||||
|
.resource(SysExec(cmd, logger, Some(wd)))
|
||||||
|
.evalMap(_.runToSuccess(logger))
|
||||||
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,19 +124,23 @@ object Ocr {
|
|||||||
*/
|
*/
|
||||||
private[extract] def runUnpaperFile[F[_]: Async](
|
private[extract] def runUnpaperFile[F[_]: Async](
|
||||||
img: Path,
|
img: Path,
|
||||||
unpaper: SystemCommand.Config,
|
unpaper: ExternalCommand,
|
||||||
wd: Option[Path],
|
wd: Option[Path],
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val targetFile = img.resolveSibling("u-" + img.fileName.toString).absolute
|
val targetFile = img.resolveSibling("u-" + img.fileName.toString).absolute
|
||||||
val cmd = unpaper.replace(
|
val cmd = unpaper
|
||||||
|
.withVars(
|
||||||
Map(
|
Map(
|
||||||
"{{infile}}" -> img.absolute.toString,
|
"infile" -> img.absolute.toString,
|
||||||
"{{outfile}}" -> targetFile.toString
|
"outfile" -> targetFile.toString
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
.resolved
|
||||||
.execSuccess[F](cmd, logger, wd = wd)
|
|
||||||
|
Stream
|
||||||
|
.resource(SysExec(cmd, logger, wd))
|
||||||
|
.evalMap(_.runToSuccess(logger))
|
||||||
.map(_ => targetFile)
|
.map(_ => targetFile)
|
||||||
.handleErrorWith { th =>
|
.handleErrorWith { th =>
|
||||||
logger
|
logger
|
||||||
@ -150,12 +162,14 @@ object Ocr {
|
|||||||
// so use the parent as working dir
|
// so use the parent as working dir
|
||||||
runUnpaperFile(img, config.unpaper.command, img.parent, logger).flatMap { uimg =>
|
runUnpaperFile(img, config.unpaper.command, img.parent, logger).flatMap { uimg =>
|
||||||
val cmd = config.tesseract.command
|
val cmd = config.tesseract.command
|
||||||
.replace(
|
.withVars(
|
||||||
Map("{{file}}" -> uimg.fileName.toString, "{{lang}}" -> fixLanguage(lang))
|
Map("file" -> uimg.fileName.toString, "lang" -> fixLanguage(lang))
|
||||||
)
|
)
|
||||||
SystemCommand
|
.resolved
|
||||||
.execSuccess[F](cmd, logger, wd = uimg.parent)
|
|
||||||
.map(_.stdout)
|
Stream
|
||||||
|
.resource(SysExec(cmd, logger, uimg.parent))
|
||||||
|
.evalMap(_.runToSuccessStdout(logger))
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Run tesseract on the given image file and return the extracted text. */
|
/** Run tesseract on the given image file and return the extracted text. */
|
||||||
@ -166,8 +180,12 @@ object Ocr {
|
|||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] = {
|
): Stream[F, String] = {
|
||||||
val cmd = config.tesseract.command
|
val cmd = config.tesseract.command
|
||||||
.replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
|
.withVars(Map("file" -> "stdin", "lang" -> fixLanguage(lang)))
|
||||||
SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout)
|
.resolved
|
||||||
|
|
||||||
|
Stream
|
||||||
|
.resource(SysExec(cmd, logger, None, Some(img)))
|
||||||
|
.evalMap(_.runToSuccessStdout(logger))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def fixLanguage(lang: String): String =
|
private def fixLanguage(lang: String): String =
|
||||||
|
@ -6,12 +6,9 @@
|
|||||||
|
|
||||||
package docspell.extract.ocr
|
package docspell.extract.ocr
|
||||||
|
|
||||||
import java.nio.file.Paths
|
|
||||||
|
|
||||||
import fs2.io.file.Path
|
import fs2.io.file.Path
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common.exec.ExternalCommand
|
||||||
import docspell.common.util.File
|
|
||||||
|
|
||||||
case class OcrConfig(
|
case class OcrConfig(
|
||||||
maxImageSize: Int,
|
maxImageSize: Int,
|
||||||
@ -25,43 +22,10 @@ object OcrConfig {
|
|||||||
|
|
||||||
case class PageRange(begin: Int)
|
case class PageRange(begin: Int)
|
||||||
|
|
||||||
case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
|
case class Ghostscript(command: ExternalCommand, workingDir: Path)
|
||||||
|
|
||||||
case class Tesseract(command: SystemCommand.Config)
|
case class Tesseract(command: ExternalCommand)
|
||||||
|
|
||||||
case class Unpaper(command: SystemCommand.Config)
|
case class Unpaper(command: ExternalCommand)
|
||||||
|
|
||||||
val default = OcrConfig(
|
|
||||||
maxImageSize = 3000 * 3000,
|
|
||||||
pageRange = PageRange(10),
|
|
||||||
ghostscript = Ghostscript(
|
|
||||||
SystemCommand.Config(
|
|
||||||
"gs",
|
|
||||||
Seq(
|
|
||||||
"-dNOPAUSE",
|
|
||||||
"-dBATCH",
|
|
||||||
"-dSAFER",
|
|
||||||
"-sDEVICE=tiffscaled8",
|
|
||||||
"-sOutputFile={{outfile}}",
|
|
||||||
"{{infile}}"
|
|
||||||
),
|
|
||||||
Duration.seconds(30)
|
|
||||||
),
|
|
||||||
File.path(
|
|
||||||
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
|
||||||
)
|
|
||||||
),
|
|
||||||
unpaper = Unpaper(
|
|
||||||
SystemCommand
|
|
||||||
.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
|
||||||
),
|
|
||||||
tesseract = Tesseract(
|
|
||||||
SystemCommand
|
|
||||||
.Config(
|
|
||||||
"tesseract",
|
|
||||||
Seq("{{file}}", "stdout", "-l", "{{lang}}"),
|
|
||||||
Duration.minutes(1)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
@ -6,9 +6,14 @@
|
|||||||
|
|
||||||
package docspell.extract.ocr
|
package docspell.extract.ocr
|
||||||
|
|
||||||
|
import java.nio.file.Paths
|
||||||
|
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
import cats.effect.unsafe.implicits.global
|
import cats.effect.unsafe.implicits.global
|
||||||
|
|
||||||
|
import docspell.common.Duration
|
||||||
|
import docspell.common.exec.ExternalCommand
|
||||||
|
import docspell.common.util.File
|
||||||
import docspell.files.TestFiles
|
import docspell.files.TestFiles
|
||||||
import docspell.logging.TestLoggingConfig
|
import docspell.logging.TestLoggingConfig
|
||||||
|
|
||||||
@ -21,7 +26,7 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig {
|
|||||||
|
|
||||||
test("extract english pdf".ignore) {
|
test("extract english pdf".ignore) {
|
||||||
val text = TextExtract
|
val text = TextExtract
|
||||||
.extract[IO](letterSourceEN, logger, "eng", OcrConfig.default)
|
.extract[IO](letterSourceEN, logger, "eng", TextExtractionSuite.defaultConfig)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
@ -31,7 +36,7 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig {
|
|||||||
test("extract german pdf".ignore) {
|
test("extract german pdf".ignore) {
|
||||||
val expect = TestFiles.letterDEText
|
val expect = TestFiles.letterDEText
|
||||||
val extract = TextExtract
|
val extract = TextExtract
|
||||||
.extract[IO](letterSourceDE, logger, "deu", OcrConfig.default)
|
.extract[IO](letterSourceDE, logger, "deu", TextExtractionSuite.defaultConfig)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
@ -39,3 +44,37 @@ class TextExtractionSuite extends FunSuite with TestLoggingConfig {
|
|||||||
assertEquals(extract.value, expect)
|
assertEquals(extract.value, expect)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
object TextExtractionSuite {
|
||||||
|
val defaultConfig = OcrConfig(
|
||||||
|
maxImageSize = 3000 * 3000,
|
||||||
|
pageRange = OcrConfig.PageRange(10),
|
||||||
|
ghostscript = OcrConfig.Ghostscript(
|
||||||
|
ExternalCommand(
|
||||||
|
"gs",
|
||||||
|
Seq(
|
||||||
|
"-dNOPAUSE",
|
||||||
|
"-dBATCH",
|
||||||
|
"-dSAFER",
|
||||||
|
"-sDEVICE=tiffscaled8",
|
||||||
|
"-sOutputFile={{outfile}}",
|
||||||
|
"{{infile}}"
|
||||||
|
),
|
||||||
|
Duration.seconds(30)
|
||||||
|
),
|
||||||
|
File.path(
|
||||||
|
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
||||||
|
)
|
||||||
|
),
|
||||||
|
unpaper = OcrConfig.Unpaper(
|
||||||
|
ExternalCommand("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
||||||
|
),
|
||||||
|
tesseract = OcrConfig.Tesseract(
|
||||||
|
ExternalCommand(
|
||||||
|
"tesseract",
|
||||||
|
Seq("{{file}}", "stdout", "-l", "{{lang}}"),
|
||||||
|
Duration.minutes(1)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user