From 3be90d64d5fd867ef1d3292f7b0477962e3b328b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 10 Feb 2020 22:23:06 +0100 Subject: [PATCH] Move `SystemCommand` to common module --- .../docspell/common}/SystemCommand.scala | 23 +++++++++++++----- .../joex/src/main/resources/reference.conf | 9 ++++--- .../main/scala/docspell/text/ocr/Config.scala | 24 +++++-------------- .../main/scala/docspell/text/ocr/Ocr.scala | 5 ++-- 4 files changed, 30 insertions(+), 31 deletions(-) rename modules/{text/src/main/scala/docspell/text/ocr => common/src/main/scala/docspell/common}/SystemCommand.scala (87%) diff --git a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala similarity index 87% rename from modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala rename to modules/common/src/main/scala/docspell/common/SystemCommand.scala index f433c967..cfa2ab33 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala +++ b/modules/common/src/main/scala/docspell/common/SystemCommand.scala @@ -1,4 +1,4 @@ -package docspell.text.ocr +package docspell.common import java.io.InputStream import java.nio.file.Path @@ -11,13 +11,24 @@ import scala.jdk.CollectionConverters._ import docspell.common.syntax.all._ object SystemCommand { - private[this] val logger = getLogger + final case class Config(program: String, args: Seq[String], timeout: Duration) { + + def mapArgs(f: String => String): Config = + Config(program, args.map(f), timeout) + + def toCmd: List[String] = + program :: args.toList + + lazy val cmdString: String = + toCmd.mkString(" ") + } + final case class Result(rc: Int, stdout: String, stderr: String) def exec[F[_]: Sync: ContextShift]( - cmd: Config.Command, + cmd: Config, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty @@ -40,7 +51,7 @@ object SystemCommand { } def execSuccess[F[_]: Sync: ContextShift]( - cmd: Config.Command, + cmd: Config, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty @@ -55,7 +66,7 @@ object SystemCommand { else Stream.emit(r) } - private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])( + private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path])( f: Process => Stream[F, A] ): Stream[F, A] = { val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") @@ -93,7 +104,7 @@ object SystemCommand { ): F[Unit] = data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain - private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] = + private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] = Sync[F].delay(proc.destroyForcibly()).attempt *> { Sync[F].raiseError( new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})") diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 79b68912..a6a4ee60 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -67,11 +67,10 @@ docspell.joex { # Configuration of text extraction # # Extracting text currently only work for image and pdf files. It - # will first runs ghostscript to create a gray image from a - # pdf. Then unpaper is run to optimize the image for the upcoming - # ocr, which will be done by tesseract. All these programs must be - # available in your PATH or the absolute path can be specified - # below. + # will first run ghostscript to create a gray image from a pdf. Then + # unpaper is run to optimize the image for the upcoming ocr, which + # will be done by tesseract. All these programs must be available in + # your PATH or the absolute path can be specified below. extraction { allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ] diff --git a/modules/text/src/main/scala/docspell/text/ocr/Config.scala b/modules/text/src/main/scala/docspell/text/ocr/Config.scala index f2f8e5d1..5da49ed0 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Config.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/Config.scala @@ -19,21 +19,9 @@ case class Config( object Config { case class PageRange(begin: Int) - case class Command(program: String, args: Seq[String], timeout: Duration) { - - def mapArgs(f: String => String): Command = - Command(program, args.map(f), timeout) - - def toCmd: List[String] = - program :: args.toList - - lazy val cmdString: String = - toCmd.mkString(" ") - } - - case class Ghostscript(command: Command, workingDir: Path) - case class Tesseract(command: Command) - case class Unpaper(command: Command) + case class Ghostscript(command: SystemCommand.Config, workingDir: Path) + case class Tesseract(command: SystemCommand.Config) + case class Unpaper(command: SystemCommand.Config) val default = Config( allowedContentTypes = Set( @@ -44,7 +32,7 @@ object Config { ), pageRange = PageRange(10), ghostscript = Ghostscript( - Command( + SystemCommand.Config( "gs", Seq( "-dNOPAUSE", @@ -58,9 +46,9 @@ object Config { ), Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") ), - unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), + unpaper = Unpaper(SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), tesseract = Tesseract( - Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) + SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) ) ) } diff --git a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala b/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala index 99f558d3..7cda9bdf 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala @@ -5,6 +5,7 @@ import java.nio.file.Path import cats.effect.{Blocker, ContextShift, Sync} import fs2.Stream import org.log4s._ +import docspell.common._ object Ocr { private[this] val logger = getLogger @@ -93,7 +94,7 @@ object Ocr { */ private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( pdf: Path, - ghostscript: Config.Command, + ghostscript: SystemCommand.Config, wd: Path, blocker: Blocker ): Stream[F, Path] = { @@ -121,7 +122,7 @@ object Ocr { */ private[text] def runUnpaperFile[F[_]: Sync: ContextShift]( img: Path, - unpaper: Config.Command, + unpaper: SystemCommand.Config, wd: Path, blocker: Blocker ): Stream[F, Path] = {