mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Make logger configurable in system commands
This commit is contained in:
@ -46,7 +46,7 @@ object Extraction {
|
||||
|
||||
case OcrType(_) =>
|
||||
TextExtract
|
||||
.extractOCR(data, blocker, lang.iso3, cfg.ocr)
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.attempt
|
||||
|
@ -19,7 +19,7 @@ object PdfExtract {
|
||||
): F[Either[Throwable, String]] = {
|
||||
|
||||
val runOcr =
|
||||
TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError
|
||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||
|
||||
def chooseResult(ocrStr: String, strippedStr: String) =
|
||||
if (ocrStr.length > strippedStr.length)
|
||||
|
@ -4,24 +4,23 @@ import java.nio.file.Path
|
||||
|
||||
import cats.effect.{Blocker, ContextShift, Sync}
|
||||
import fs2.Stream
|
||||
import org.log4s._
|
||||
import docspell.common._
|
||||
|
||||
object Ocr {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
/** Extract the text of all pages in the given pdf file.
|
||||
*/
|
||||
def extractPdf[F[_]: Sync: ContextShift](
|
||||
pdf: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscript(pdf, config, wd, blocker)
|
||||
runGhostscript(pdf, config, wd, blocker, logger)
|
||||
.flatMap({ tmpImg =>
|
||||
runTesseractFile(tmpImg, blocker, lang, config)
|
||||
runTesseractFile(tmpImg, blocker, logger, lang, config)
|
||||
})
|
||||
.fold1(_ + "\n\n\n" + _).
|
||||
compile.
|
||||
@ -33,21 +32,23 @@ object Ocr {
|
||||
def extractImage[F[_]: Sync: ContextShift](
|
||||
img: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
runTesseractStdin(img, blocker, lang, config)
|
||||
runTesseractStdin(img, blocker, logger, lang, config)
|
||||
|
||||
def extractPdFFile[F[_]: Sync: ContextShift](
|
||||
pdf: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
|
||||
.flatMap({ tif =>
|
||||
runTesseractFile(tif, blocker, lang, config)
|
||||
runTesseractFile(tif, blocker, logger, lang, config)
|
||||
})
|
||||
.fold1(_ + "\n\n\n" + _).
|
||||
compile.
|
||||
@ -57,10 +58,11 @@ object Ocr {
|
||||
def extractImageFile[F[_]: Sync: ContextShift](
|
||||
img: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
runTesseractFile(img, blocker, lang, config)
|
||||
runTesseractFile(img, blocker, logger, lang, config)
|
||||
|
||||
/** Run ghostscript to extract all pdf pages into tiff files. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
@ -69,7 +71,8 @@ object Ocr {
|
||||
pdf: Stream[F, Byte],
|
||||
cfg: OcrConfig,
|
||||
wd: Path,
|
||||
blocker: Blocker
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val xargs =
|
||||
if (cfg.pageRange.begin > 0)
|
||||
@ -86,7 +89,7 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
|
||||
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
|
||||
.evalMap({ _ =>
|
||||
File.listFiles(pathEndsWith(".tif"), wd)
|
||||
})
|
||||
@ -100,7 +103,8 @@ object Ocr {
|
||||
pdf: Path,
|
||||
ghostscript: SystemCommand.Config,
|
||||
wd: Path,
|
||||
blocker: Blocker
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val cmd = ghostscript.mapArgs(
|
||||
replace(
|
||||
@ -111,7 +115,7 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, blocker, wd = Some(wd))
|
||||
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||
.evalMap({ _ =>
|
||||
File.listFiles(pathEndsWith(".tif"), wd)
|
||||
})
|
||||
@ -128,7 +132,8 @@ object Ocr {
|
||||
img: Path,
|
||||
unpaper: SystemCommand.Config,
|
||||
wd: Path,
|
||||
blocker: Blocker
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
||||
val cmd = unpaper.mapArgs(
|
||||
@ -139,7 +144,7 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
)
|
||||
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
|
||||
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
|
||||
th =>
|
||||
logger
|
||||
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
||||
@ -153,16 +158,17 @@ object Ocr {
|
||||
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
|
||||
img: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
// tesseract cannot cope with absolute filenames
|
||||
// so use the parent as working dir
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
|
||||
val cmd = config.tesseract.command.mapArgs(
|
||||
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
||||
)
|
||||
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
|
||||
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
|
||||
}
|
||||
|
||||
/** Run tesseract on the given image file and return the extracted
|
||||
@ -171,12 +177,13 @@ object Ocr {
|
||||
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
|
||||
img: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] = {
|
||||
val cmd = config.tesseract.command
|
||||
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
||||
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
|
||||
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
|
||||
}
|
||||
|
||||
private def replace(repl: Map[String, String]): String => String =
|
||||
|
@ -10,14 +10,16 @@ object TextExtract {
|
||||
def extract[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
extractOCR(in, blocker, lang, config)
|
||||
extractOCR(in, blocker, logger, lang, config)
|
||||
|
||||
def extractOCR[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
@ -28,10 +30,10 @@ object TextExtract {
|
||||
raiseError(s"File `$mt` not allowed")
|
||||
|
||||
case MimeType.pdf =>
|
||||
Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate
|
||||
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
||||
|
||||
case mt if mt.primary == "image" =>
|
||||
Ocr.extractImage(in, blocker, lang, config)
|
||||
Ocr.extractImage(in, blocker, logger, lang, config)
|
||||
|
||||
case mt =>
|
||||
raiseError(s"File `$mt` not supported")
|
||||
|
@ -1,16 +1,19 @@
|
||||
package docspell.extract.ocr
|
||||
|
||||
import cats.effect.IO
|
||||
import docspell.common.Logger
|
||||
import docspell.files.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object TextExtractionSuite extends SimpleTestSuite {
|
||||
import TestFiles._
|
||||
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
|
||||
test("extract english pdf") {
|
||||
ignore()
|
||||
val text = TextExtract
|
||||
.extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default)
|
||||
.extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
@ -21,7 +24,7 @@ object TextExtractionSuite extends SimpleTestSuite {
|
||||
ignore()
|
||||
val expect = TestFiles.letterDEText
|
||||
val extract = TextExtract
|
||||
.extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default)
|
||||
.extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
|
Reference in New Issue
Block a user