Move SystemCommand to common module

This commit is contained in:
Eike Kettner 2020-02-10 22:23:06 +01:00
parent ba3865ef5e
commit 3be90d64d5
4 changed files with 30 additions and 31 deletions

View File

@ -1,4 +1,4 @@
package docspell.text.ocr package docspell.common
import java.io.InputStream import java.io.InputStream
import java.nio.file.Path import java.nio.file.Path
@ -11,13 +11,24 @@ import scala.jdk.CollectionConverters._
import docspell.common.syntax.all._ import docspell.common.syntax.all._
object SystemCommand { object SystemCommand {
private[this] val logger = getLogger private[this] val logger = getLogger
/** Describes how to invoke an external program: the executable, its
  * arguments and a timeout.
  *
  * @param program the executable name or path; first element of [[toCmd]]
  * @param args    the arguments that follow `program`, in order
  * @param timeout reported in the timeout error message of this object
  *                (how it is enforced is not visible in this view)
  */
final case class Config(program: String, args: Seq[String], timeout: Duration) {
/** Returns a new `Config` with `f` applied to every argument;
  * `program` and `timeout` are carried over unchanged.
  *
  * NOTE(review): presumably used to substitute placeholders such as
  * `{{infile}}` in the arguments -- confirm against the callers.
  */
def mapArgs(f: String => String): Config =
Config(program, args.map(f), timeout)
/** The program followed by all of its arguments, as a single list. */
def toCmd: List[String] =
program :: args.toList
/** [[toCmd]] joined with single spaces. No quoting or escaping is
  * applied, so this is a display string (it appears in the log and
  * error messages of this object), not a shell-safe command line.
  */
lazy val cmdString: String =
toCmd.mkString(" ")
}
final case class Result(rc: Int, stdout: String, stderr: String) final case class Result(rc: Int, stdout: String, stderr: String)
def exec[F[_]: Sync: ContextShift]( def exec[F[_]: Sync: ContextShift](
cmd: Config.Command, cmd: Config,
blocker: Blocker, blocker: Blocker,
wd: Option[Path] = None, wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty stdin: Stream[F, Byte] = Stream.empty
@ -40,7 +51,7 @@ object SystemCommand {
} }
def execSuccess[F[_]: Sync: ContextShift]( def execSuccess[F[_]: Sync: ContextShift](
cmd: Config.Command, cmd: Config,
blocker: Blocker, blocker: Blocker,
wd: Option[Path] = None, wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty stdin: Stream[F, Byte] = Stream.empty
@ -55,7 +66,7 @@ object SystemCommand {
else Stream.emit(r) else Stream.emit(r)
} }
private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])( private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path])(
f: Process => Stream[F, A] f: Process => Stream[F, A]
): Stream[F, A] = { ): Stream[F, A] = {
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
@ -93,7 +104,7 @@ object SystemCommand {
): F[Unit] = ): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] = private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] =
Sync[F].delay(proc.destroyForcibly()).attempt *> { Sync[F].delay(proc.destroyForcibly()).attempt *> {
Sync[F].raiseError( Sync[F].raiseError(
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})") new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")

View File

@ -67,11 +67,10 @@ docspell.joex {
# Configuration of text extraction # Configuration of text extraction
# #
# Extracting text currently only works for image and pdf files. It # Extracting text currently only works for image and pdf files. It
# will first runs ghostscript to create a gray image from a # will first run ghostscript to create a gray image from a pdf. Then
# pdf. Then unpaper is run to optimize the image for the upcoming # unpaper is run to optimize the image for the upcoming ocr, which
# ocr, which will be done by tesseract. All these programs must be # will be done by tesseract. All these programs must be available in
# available in your PATH or the absolute path can be specified # your PATH or the absolute path can be specified below.
# below.
extraction { extraction {
allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ] allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]

View File

@ -19,21 +19,9 @@ case class Config(
object Config { object Config {
case class PageRange(begin: Int) case class PageRange(begin: Int)
case class Command(program: String, args: Seq[String], timeout: Duration) { case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
case class Tesseract(command: SystemCommand.Config)
def mapArgs(f: String => String): Command = case class Unpaper(command: SystemCommand.Config)
Command(program, args.map(f), timeout)
def toCmd: List[String] =
program :: args.toList
lazy val cmdString: String =
toCmd.mkString(" ")
}
case class Ghostscript(command: Command, workingDir: Path)
case class Tesseract(command: Command)
case class Unpaper(command: Command)
val default = Config( val default = Config(
allowedContentTypes = Set( allowedContentTypes = Set(
@ -44,7 +32,7 @@ object Config {
), ),
pageRange = PageRange(10), pageRange = PageRange(10),
ghostscript = Ghostscript( ghostscript = Ghostscript(
Command( SystemCommand.Config(
"gs", "gs",
Seq( Seq(
"-dNOPAUSE", "-dNOPAUSE",
@ -58,9 +46,9 @@ object Config {
), ),
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
), ),
unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), unpaper = Unpaper(SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
tesseract = Tesseract( tesseract = Tesseract(
Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
) )
) )
} }

View File

@ -5,6 +5,7 @@ import java.nio.file.Path
import cats.effect.{Blocker, ContextShift, Sync} import cats.effect.{Blocker, ContextShift, Sync}
import fs2.Stream import fs2.Stream
import org.log4s._ import org.log4s._
import docspell.common._
object Ocr { object Ocr {
private[this] val logger = getLogger private[this] val logger = getLogger
@ -93,7 +94,7 @@ object Ocr {
*/ */
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path, pdf: Path,
ghostscript: Config.Command, ghostscript: SystemCommand.Config,
wd: Path, wd: Path,
blocker: Blocker blocker: Blocker
): Stream[F, Path] = { ): Stream[F, Path] = {
@ -121,7 +122,7 @@ object Ocr {
*/ */
private[text] def runUnpaperFile[F[_]: Sync: ContextShift]( private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path, img: Path,
unpaper: Config.Command, unpaper: SystemCommand.Config,
wd: Path, wd: Path,
blocker: Blocker blocker: Blocker
): Stream[F, Path] = { ): Stream[F, Path] = {