mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-04 22:25:58 +00:00
Move SystemCommand
to common module
This commit is contained in:
parent
ba3865ef5e
commit
3be90d64d5
@ -1,4 +1,4 @@
|
|||||||
package docspell.text.ocr
|
package docspell.common
|
||||||
|
|
||||||
import java.io.InputStream
|
import java.io.InputStream
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
@ -11,13 +11,24 @@ import scala.jdk.CollectionConverters._
|
|||||||
import docspell.common.syntax.all._
|
import docspell.common.syntax.all._
|
||||||
|
|
||||||
object SystemCommand {
|
object SystemCommand {
|
||||||
|
|
||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
|
/** Describes how to invoke an external program.
  *
  * @param program the executable to run
  * @param args    the arguments passed to the program, in order
  * @param timeout the time limit associated with running the command
  */
final case class Config(program: String, args: Seq[String], timeout: Duration) {

  /** The command rendered as a single string: the program and its
    * arguments separated by one space. No quoting or escaping is applied.
    */
  lazy val cmdString: String =
    toCmd.mkString(" ")

  /** The command as a list: the program first, followed by all arguments. */
  def toCmd: List[String] =
    (program +: args).toList

  /** Returns a copy of this config with `f` applied to every argument
    * (e.g. to rewrite placeholder arguments); program and timeout are
    * kept unchanged.
    */
  def mapArgs(f: String => String): Config =
    copy(args = args.map(f))
}
|
||||||
|
|
||||||
final case class Result(rc: Int, stdout: String, stderr: String)
|
final case class Result(rc: Int, stdout: String, stderr: String)
|
||||||
|
|
||||||
def exec[F[_]: Sync: ContextShift](
|
def exec[F[_]: Sync: ContextShift](
|
||||||
cmd: Config.Command,
|
cmd: Config,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
wd: Option[Path] = None,
|
wd: Option[Path] = None,
|
||||||
stdin: Stream[F, Byte] = Stream.empty
|
stdin: Stream[F, Byte] = Stream.empty
|
||||||
@ -40,7 +51,7 @@ object SystemCommand {
|
|||||||
}
|
}
|
||||||
|
|
||||||
def execSuccess[F[_]: Sync: ContextShift](
|
def execSuccess[F[_]: Sync: ContextShift](
|
||||||
cmd: Config.Command,
|
cmd: Config,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
wd: Option[Path] = None,
|
wd: Option[Path] = None,
|
||||||
stdin: Stream[F, Byte] = Stream.empty
|
stdin: Stream[F, Byte] = Stream.empty
|
||||||
@ -55,7 +66,7 @@ object SystemCommand {
|
|||||||
else Stream.emit(r)
|
else Stream.emit(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])(
|
private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path])(
|
||||||
f: Process => Stream[F, A]
|
f: Process => Stream[F, A]
|
||||||
): Stream[F, A] = {
|
): Stream[F, A] = {
|
||||||
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
|
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
|
||||||
@ -93,7 +104,7 @@ object SystemCommand {
|
|||||||
): F[Unit] =
|
): F[Unit] =
|
||||||
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain
|
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain
|
||||||
|
|
||||||
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] =
|
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] =
|
||||||
Sync[F].delay(proc.destroyForcibly()).attempt *> {
|
Sync[F].delay(proc.destroyForcibly()).attempt *> {
|
||||||
Sync[F].raiseError(
|
Sync[F].raiseError(
|
||||||
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")
|
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")
|
@ -67,11 +67,10 @@ docspell.joex {
|
|||||||
# Configuration of text extraction
|
# Configuration of text extraction
|
||||||
#
|
#
|
||||||
# Extracting text currently only work for image and pdf files. It
|
# Extracting text currently only works for image and pdf files. It
|
||||||
# will first runs ghostscript to create a gray image from a
|
# will first run ghostscript to create a gray image from a pdf. Then
|
||||||
# pdf. Then unpaper is run to optimize the image for the upcoming
|
# unpaper is run to optimize the image for the upcoming ocr, which
|
||||||
# ocr, which will be done by tesseract. All these programs must be
|
# will be done by tesseract. All these programs must be available in
|
||||||
# available in your PATH or the absolute path can be specified
|
# your PATH or the absolute path can be specified below.
|
||||||
# below.
|
|
||||||
extraction {
|
extraction {
|
||||||
allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
|
allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
|
||||||
|
|
||||||
|
@ -19,21 +19,9 @@ case class Config(
|
|||||||
object Config {
|
object Config {
|
||||||
case class PageRange(begin: Int)
|
case class PageRange(begin: Int)
|
||||||
|
|
||||||
case class Command(program: String, args: Seq[String], timeout: Duration) {
|
case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
|
||||||
|
case class Tesseract(command: SystemCommand.Config)
|
||||||
def mapArgs(f: String => String): Command =
|
case class Unpaper(command: SystemCommand.Config)
|
||||||
Command(program, args.map(f), timeout)
|
|
||||||
|
|
||||||
def toCmd: List[String] =
|
|
||||||
program :: args.toList
|
|
||||||
|
|
||||||
lazy val cmdString: String =
|
|
||||||
toCmd.mkString(" ")
|
|
||||||
}
|
|
||||||
|
|
||||||
case class Ghostscript(command: Command, workingDir: Path)
|
|
||||||
case class Tesseract(command: Command)
|
|
||||||
case class Unpaper(command: Command)
|
|
||||||
|
|
||||||
val default = Config(
|
val default = Config(
|
||||||
allowedContentTypes = Set(
|
allowedContentTypes = Set(
|
||||||
@ -44,7 +32,7 @@ object Config {
|
|||||||
),
|
),
|
||||||
pageRange = PageRange(10),
|
pageRange = PageRange(10),
|
||||||
ghostscript = Ghostscript(
|
ghostscript = Ghostscript(
|
||||||
Command(
|
SystemCommand.Config(
|
||||||
"gs",
|
"gs",
|
||||||
Seq(
|
Seq(
|
||||||
"-dNOPAUSE",
|
"-dNOPAUSE",
|
||||||
@ -58,9 +46,9 @@ object Config {
|
|||||||
),
|
),
|
||||||
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
||||||
),
|
),
|
||||||
unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
|
unpaper = Unpaper(SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
|
||||||
tesseract = Tesseract(
|
tesseract = Tesseract(
|
||||||
Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
|
SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ import java.nio.file.Path
|
|||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{Blocker, ContextShift, Sync}
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import org.log4s._
|
import org.log4s._
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
object Ocr {
|
object Ocr {
|
||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
@ -93,7 +94,7 @@ object Ocr {
|
|||||||
*/
|
*/
|
||||||
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
|
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
|
||||||
pdf: Path,
|
pdf: Path,
|
||||||
ghostscript: Config.Command,
|
ghostscript: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
@ -121,7 +122,7 @@ object Ocr {
|
|||||||
*/
|
*/
|
||||||
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
|
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
|
||||||
img: Path,
|
img: Path,
|
||||||
unpaper: Config.Command,
|
unpaper: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user