mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Extend config for external commands (#2536)
Allows configuring external commands and providing different arguments based on runtime values, such as the language. It extends the current config of a command with an optional `arg-mappings` section. An example for ocrmypdf:

```conf
ocrmypdf = {
  enabled = true
  command = {
    program = "ocrmypdf"
    ### new arg-mappings
    arg-mappings = {
      "mylang" = {
        value = "{{lang}}"
        mappings = [
          {
            matches = "deu"
            args = [ "-l", "deu", "--pdf-renderer", "sandwich" ]
          },
          {
            matches = ".*"
            args = [ "-l", "{{lang}}" ]
          }
        ]
      }
    }
    #### end new arg-mappings
    args = [
      ### will be replaced with corresponding args from "mylang" mapping
      "{{mylang}}",
      "--skip-text",
      "--deskew",
      "-j", "1",
      "{{infile}}",
      "{{outfile}}"
    ]
    timeout = "5 minutes"
  }
  working-dir = ${java.io.tmpdir}"/docspell-convert"
}
```

The whole section is first processed to replace all `{{…}}` patterns with their corresponding values. Then each entry in `arg-mappings` is evaluated: the first match (value == matches) in its `mappings` array supplies the arguments that replace the entry's name (here `{{mylang}}`) in the arguments to the command.
This commit is contained in:
@ -10,7 +10,8 @@ import cats.effect._
|
||||
import fs2.Stream
|
||||
import fs2.io.file.{Files, Path}
|
||||
|
||||
import docspell.common._
|
||||
import docspell.common.exec.ExternalCommand
|
||||
import docspell.common.exec.SysExec
|
||||
import docspell.common.util.File
|
||||
import docspell.logging.Logger
|
||||
|
||||
@ -77,14 +78,17 @@ object Ocr {
|
||||
else cfg.ghostscript.command.args
|
||||
val cmd = cfg.ghostscript.command
|
||||
.copy(args = xargs)
|
||||
.replace(
|
||||
.withVars(
|
||||
Map(
|
||||
"{{infile}}" -> "-",
|
||||
"{{outfile}}" -> "%d.tif"
|
||||
"infile" -> "-",
|
||||
"outfile" -> "%d.tif"
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess(cmd, logger, wd = Some(wd), stdin = pdf)
|
||||
.resolved
|
||||
|
||||
Stream
|
||||
.resource(SysExec(cmd, logger, Some(wd), Some(pdf)))
|
||||
.evalMap(_.runToSuccess(logger))
|
||||
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||
}
|
||||
|
||||
@ -93,18 +97,22 @@ object Ocr {
|
||||
*/
|
||||
private[extract] def runGhostscriptFile[F[_]: Async: Files](
|
||||
pdf: Path,
|
||||
ghostscript: SystemCommand.Config,
|
||||
ghostscript: ExternalCommand,
|
||||
wd: Path,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val cmd = ghostscript.replace(
|
||||
Map(
|
||||
"{{infile}}" -> pdf.absolute.toString,
|
||||
"{{outfile}}" -> "%d.tif"
|
||||
val cmd = ghostscript
|
||||
.withVars(
|
||||
Map(
|
||||
"infile" -> pdf.absolute.toString,
|
||||
"outfile" -> "%d.tif"
|
||||
)
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, logger, wd = Some(wd))
|
||||
.resolved
|
||||
|
||||
Stream
|
||||
.resource(SysExec(cmd, logger, Some(wd)))
|
||||
.evalMap(_.runToSuccess(logger))
|
||||
.flatMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||
}
|
||||
|
||||
@ -116,19 +124,23 @@ object Ocr {
|
||||
*/
|
||||
private[extract] def runUnpaperFile[F[_]: Async](
|
||||
img: Path,
|
||||
unpaper: SystemCommand.Config,
|
||||
unpaper: ExternalCommand,
|
||||
wd: Option[Path],
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val targetFile = img.resolveSibling("u-" + img.fileName.toString).absolute
|
||||
val cmd = unpaper.replace(
|
||||
Map(
|
||||
"{{infile}}" -> img.absolute.toString,
|
||||
"{{outfile}}" -> targetFile.toString
|
||||
val cmd = unpaper
|
||||
.withVars(
|
||||
Map(
|
||||
"infile" -> img.absolute.toString,
|
||||
"outfile" -> targetFile.toString
|
||||
)
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, logger, wd = wd)
|
||||
.resolved
|
||||
|
||||
Stream
|
||||
.resource(SysExec(cmd, logger, wd))
|
||||
.evalMap(_.runToSuccess(logger))
|
||||
.map(_ => targetFile)
|
||||
.handleErrorWith { th =>
|
||||
logger
|
||||
@ -150,12 +162,14 @@ object Ocr {
|
||||
// so use the parent as working dir
|
||||
runUnpaperFile(img, config.unpaper.command, img.parent, logger).flatMap { uimg =>
|
||||
val cmd = config.tesseract.command
|
||||
.replace(
|
||||
Map("{{file}}" -> uimg.fileName.toString, "{{lang}}" -> fixLanguage(lang))
|
||||
.withVars(
|
||||
Map("file" -> uimg.fileName.toString, "lang" -> fixLanguage(lang))
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, logger, wd = uimg.parent)
|
||||
.map(_.stdout)
|
||||
.resolved
|
||||
|
||||
Stream
|
||||
.resource(SysExec(cmd, logger, uimg.parent))
|
||||
.evalMap(_.runToSuccessStdout(logger))
|
||||
}
|
||||
|
||||
/** Run tesseract on the given image file and return the extracted text. */
|
||||
@ -166,8 +180,12 @@ object Ocr {
|
||||
config: OcrConfig
|
||||
): Stream[F, String] = {
|
||||
val cmd = config.tesseract.command
|
||||
.replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
|
||||
SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout)
|
||||
.withVars(Map("file" -> "stdin", "lang" -> fixLanguage(lang)))
|
||||
.resolved
|
||||
|
||||
Stream
|
||||
.resource(SysExec(cmd, logger, None, Some(img)))
|
||||
.evalMap(_.runToSuccessStdout(logger))
|
||||
}
|
||||
|
||||
private def fixLanguage(lang: String): String =
|
||||
|
@ -6,12 +6,9 @@
|
||||
|
||||
package docspell.extract.ocr
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common._
|
||||
import docspell.common.util.File
|
||||
import docspell.common.exec.ExternalCommand
|
||||
|
||||
case class OcrConfig(
|
||||
maxImageSize: Int,
|
||||
@ -25,43 +22,10 @@ object OcrConfig {
|
||||
|
||||
case class PageRange(begin: Int)
|
||||
|
||||
case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
|
||||
case class Ghostscript(command: ExternalCommand, workingDir: Path)
|
||||
|
||||
case class Tesseract(command: SystemCommand.Config)
|
||||
case class Tesseract(command: ExternalCommand)
|
||||
|
||||
case class Unpaper(command: SystemCommand.Config)
|
||||
case class Unpaper(command: ExternalCommand)
|
||||
|
||||
val default = OcrConfig(
|
||||
maxImageSize = 3000 * 3000,
|
||||
pageRange = PageRange(10),
|
||||
ghostscript = Ghostscript(
|
||||
SystemCommand.Config(
|
||||
"gs",
|
||||
Seq(
|
||||
"-dNOPAUSE",
|
||||
"-dBATCH",
|
||||
"-dSAFER",
|
||||
"-sDEVICE=tiffscaled8",
|
||||
"-sOutputFile={{outfile}}",
|
||||
"{{infile}}"
|
||||
),
|
||||
Duration.seconds(30)
|
||||
),
|
||||
File.path(
|
||||
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
||||
)
|
||||
),
|
||||
unpaper = Unpaper(
|
||||
SystemCommand
|
||||
.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
||||
),
|
||||
tesseract = Tesseract(
|
||||
SystemCommand
|
||||
.Config(
|
||||
"tesseract",
|
||||
Seq("{{file}}", "stdout", "-l", "{{lang}}"),
|
||||
Duration.minutes(1)
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
Reference in New Issue
Block a user