mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 14:45:59 +00:00
Make logger configurable in system commands
This commit is contained in:
parent
bd605b8c94
commit
0dcc00836b
@ -8,13 +8,10 @@ import java.util.concurrent.TimeUnit
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{Blocker, ContextShift, Sync}
|
||||||
import fs2.{Stream, io, text}
|
import fs2.{Stream, io, text}
|
||||||
import org.log4s.getLogger
|
|
||||||
|
|
||||||
import scala.jdk.CollectionConverters._
|
import scala.jdk.CollectionConverters._
|
||||||
import docspell.common.syntax.all._
|
|
||||||
|
|
||||||
object SystemCommand {
|
object SystemCommand {
|
||||||
private[this] val logger = getLogger
|
|
||||||
|
|
||||||
final case class Config(program: String, args: Seq[String], timeout: Duration) {
|
final case class Config(program: String, args: Seq[String], timeout: Duration) {
|
||||||
|
|
||||||
@ -33,17 +30,18 @@ object SystemCommand {
|
|||||||
def exec[F[_]: Sync: ContextShift](
|
def exec[F[_]: Sync: ContextShift](
|
||||||
cmd: Config,
|
cmd: Config,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
wd: Option[Path] = None,
|
wd: Option[Path] = None,
|
||||||
stdin: Stream[F, Byte] = Stream.empty
|
stdin: Stream[F, Byte] = Stream.empty
|
||||||
): Stream[F, Result] =
|
): Stream[F, Result] =
|
||||||
startProcess(cmd, wd, stdin) { proc =>
|
startProcess(cmd, wd, logger, stdin) { proc =>
|
||||||
Stream.eval {
|
Stream.eval {
|
||||||
for {
|
for {
|
||||||
_ <- writeToProcess(stdin, proc, blocker)
|
_ <- writeToProcess(stdin, proc, blocker)
|
||||||
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
|
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
|
||||||
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
|
_ <- if (term) logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
|
||||||
else
|
else
|
||||||
logger.fwarn(
|
logger.warn(
|
||||||
s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
|
s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
|
||||||
)
|
)
|
||||||
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
|
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
|
||||||
@ -56,10 +54,11 @@ object SystemCommand {
|
|||||||
def execSuccess[F[_]: Sync: ContextShift](
|
def execSuccess[F[_]: Sync: ContextShift](
|
||||||
cmd: Config,
|
cmd: Config,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
wd: Option[Path] = None,
|
wd: Option[Path] = None,
|
||||||
stdin: Stream[F, Byte] = Stream.empty
|
stdin: Stream[F, Byte] = Stream.empty
|
||||||
): Stream[F, Result] =
|
): Stream[F, Result] =
|
||||||
exec(cmd, blocker, wd, stdin).flatMap { r =>
|
exec(cmd, blocker, logger, wd, stdin).flatMap { r =>
|
||||||
if (r.rc != 0)
|
if (r.rc != 0)
|
||||||
Stream.raiseError[F](
|
Stream.raiseError[F](
|
||||||
new Exception(
|
new Exception(
|
||||||
@ -69,10 +68,10 @@ object SystemCommand {
|
|||||||
else Stream.emit(r)
|
else Stream.emit(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], stdin: Stream[F, Byte])(
|
private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])(
|
||||||
f: Process => Stream[F, A]
|
f: Process => Stream[F, A]
|
||||||
): Stream[F, A] = {
|
): Stream[F, A] = {
|
||||||
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
|
val log = logger.debug(s"Running external command: ${cmd.cmdString}")
|
||||||
val hasStdin = stdin.take(1).compile.last.map(_.isDefined)
|
val hasStdin = stdin.take(1).compile.last.map(_.isDefined)
|
||||||
val proc = log *> hasStdin.flatMap(flag => Sync[F].delay {
|
val proc = log *> hasStdin.flatMap(flag => Sync[F].delay {
|
||||||
val pb = new ProcessBuilder(cmd.toCmd.asJava)
|
val pb = new ProcessBuilder(cmd.toCmd.asJava)
|
||||||
@ -85,7 +84,7 @@ object SystemCommand {
|
|||||||
})
|
})
|
||||||
Stream
|
Stream
|
||||||
.bracket(proc)(p =>
|
.bracket(proc)(p =>
|
||||||
logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
|
logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ =>
|
||||||
p.destroy()
|
p.destroy()
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -6,17 +6,17 @@ import docspell.common._
|
|||||||
|
|
||||||
trait Conversion[F[_]] {
|
trait Conversion[F[_]] {
|
||||||
|
|
||||||
def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
|
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object Conversion {
|
object Conversion {
|
||||||
|
|
||||||
def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] =
|
def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] =
|
||||||
Resource.pure(new Conversion[F] {
|
Resource.pure(new Conversion[F] {
|
||||||
|
|
||||||
def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = {
|
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = {
|
||||||
println(cfg)
|
println(s"$cfg $blocker $logger")
|
||||||
???
|
???
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
case class ConvertConfig()
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
|
||||||
|
case class ConvertConfig(markdown: MarkdownConfig)
|
||||||
|
@ -23,7 +23,7 @@ object ExternConv {
|
|||||||
cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString))
|
cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString))
|
||||||
|
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](sysCfg, blocker, Some(dir), in)
|
.execSuccess[F](sysCfg, blocker, logger, Some(dir), in)
|
||||||
.flatMap(result =>
|
.flatMap(result =>
|
||||||
logResult(name, result, logger) ++ readResult[F](
|
logResult(name, result, logger) ++ readResult[F](
|
||||||
out,
|
out,
|
||||||
@ -56,7 +56,7 @@ object ExternConv {
|
|||||||
(Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
(Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
||||||
Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ =>
|
Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ =>
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](sysCfg, blocker, Some(dir))
|
.execSuccess[F](sysCfg, blocker, logger, Some(dir))
|
||||||
.flatMap(result =>
|
.flatMap(result =>
|
||||||
logResult(name, result, logger) ++ readResult[F](
|
logResult(name, result, logger) ++ readResult[F](
|
||||||
out,
|
out,
|
||||||
|
@ -46,7 +46,7 @@ object Extraction {
|
|||||||
|
|
||||||
case OcrType(_) =>
|
case OcrType(_) =>
|
||||||
TextExtract
|
TextExtract
|
||||||
.extractOCR(data, blocker, lang.iso3, cfg.ocr)
|
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.attempt
|
.attempt
|
||||||
|
@ -19,7 +19,7 @@ object PdfExtract {
|
|||||||
): F[Either[Throwable, String]] = {
|
): F[Either[Throwable, String]] = {
|
||||||
|
|
||||||
val runOcr =
|
val runOcr =
|
||||||
TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError
|
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||||
|
|
||||||
def chooseResult(ocrStr: String, strippedStr: String) =
|
def chooseResult(ocrStr: String, strippedStr: String) =
|
||||||
if (ocrStr.length > strippedStr.length)
|
if (ocrStr.length > strippedStr.length)
|
||||||
|
@ -4,24 +4,23 @@ import java.nio.file.Path
|
|||||||
|
|
||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{Blocker, ContextShift, Sync}
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import org.log4s._
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
object Ocr {
|
object Ocr {
|
||||||
private[this] val logger = getLogger
|
|
||||||
|
|
||||||
/** Extract the text of all pages in the given pdf file.
|
/** Extract the text of all pages in the given pdf file.
|
||||||
*/
|
*/
|
||||||
def extractPdf[F[_]: Sync: ContextShift](
|
def extractPdf[F[_]: Sync: ContextShift](
|
||||||
pdf: Stream[F, Byte],
|
pdf: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): F[Option[String]] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscript(pdf, config, wd, blocker)
|
runGhostscript(pdf, config, wd, blocker, logger)
|
||||||
.flatMap({ tmpImg =>
|
.flatMap({ tmpImg =>
|
||||||
runTesseractFile(tmpImg, blocker, lang, config)
|
runTesseractFile(tmpImg, blocker, logger, lang, config)
|
||||||
})
|
})
|
||||||
.fold1(_ + "\n\n\n" + _).
|
.fold1(_ + "\n\n\n" + _).
|
||||||
compile.
|
compile.
|
||||||
@ -33,21 +32,23 @@ object Ocr {
|
|||||||
def extractImage[F[_]: Sync: ContextShift](
|
def extractImage[F[_]: Sync: ContextShift](
|
||||||
img: Stream[F, Byte],
|
img: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
runTesseractStdin(img, blocker, lang, config)
|
runTesseractStdin(img, blocker, logger, lang, config)
|
||||||
|
|
||||||
def extractPdFFile[F[_]: Sync: ContextShift](
|
def extractPdFFile[F[_]: Sync: ContextShift](
|
||||||
pdf: Path,
|
pdf: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): F[Option[String]] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
|
||||||
.flatMap({ tif =>
|
.flatMap({ tif =>
|
||||||
runTesseractFile(tif, blocker, lang, config)
|
runTesseractFile(tif, blocker, logger, lang, config)
|
||||||
})
|
})
|
||||||
.fold1(_ + "\n\n\n" + _).
|
.fold1(_ + "\n\n\n" + _).
|
||||||
compile.
|
compile.
|
||||||
@ -57,10 +58,11 @@ object Ocr {
|
|||||||
def extractImageFile[F[_]: Sync: ContextShift](
|
def extractImageFile[F[_]: Sync: ContextShift](
|
||||||
img: Path,
|
img: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
runTesseractFile(img, blocker, lang, config)
|
runTesseractFile(img, blocker, logger, lang, config)
|
||||||
|
|
||||||
/** Run ghostscript to extract all pdf pages into tiff files. The
|
/** Run ghostscript to extract all pdf pages into tiff files. The
|
||||||
* files are stored to a temporary location on disk and returned.
|
* files are stored to a temporary location on disk and returned.
|
||||||
@ -69,7 +71,8 @@ object Ocr {
|
|||||||
pdf: Stream[F, Byte],
|
pdf: Stream[F, Byte],
|
||||||
cfg: OcrConfig,
|
cfg: OcrConfig,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val xargs =
|
val xargs =
|
||||||
if (cfg.pageRange.begin > 0)
|
if (cfg.pageRange.begin > 0)
|
||||||
@ -86,7 +89,7 @@ object Ocr {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
|
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
|
||||||
.evalMap({ _ =>
|
.evalMap({ _ =>
|
||||||
File.listFiles(pathEndsWith(".tif"), wd)
|
File.listFiles(pathEndsWith(".tif"), wd)
|
||||||
})
|
})
|
||||||
@ -100,7 +103,8 @@ object Ocr {
|
|||||||
pdf: Path,
|
pdf: Path,
|
||||||
ghostscript: SystemCommand.Config,
|
ghostscript: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val cmd = ghostscript.mapArgs(
|
val cmd = ghostscript.mapArgs(
|
||||||
replace(
|
replace(
|
||||||
@ -111,7 +115,7 @@ object Ocr {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](cmd, blocker, wd = Some(wd))
|
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||||
.evalMap({ _ =>
|
.evalMap({ _ =>
|
||||||
File.listFiles(pathEndsWith(".tif"), wd)
|
File.listFiles(pathEndsWith(".tif"), wd)
|
||||||
})
|
})
|
||||||
@ -128,7 +132,8 @@ object Ocr {
|
|||||||
img: Path,
|
img: Path,
|
||||||
unpaper: SystemCommand.Config,
|
unpaper: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
||||||
val cmd = unpaper.mapArgs(
|
val cmd = unpaper.mapArgs(
|
||||||
@ -139,7 +144,7 @@ object Ocr {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
|
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
|
||||||
th =>
|
th =>
|
||||||
logger
|
logger
|
||||||
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
||||||
@ -153,16 +158,17 @@ object Ocr {
|
|||||||
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
|
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
|
||||||
img: Path,
|
img: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
// tesseract cannot cope with absolute filenames
|
// tesseract cannot cope with absolute filenames
|
||||||
// so use the parent as working dir
|
// so use the parent as working dir
|
||||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
|
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
|
||||||
val cmd = config.tesseract.command.mapArgs(
|
val cmd = config.tesseract.command.mapArgs(
|
||||||
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
||||||
)
|
)
|
||||||
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
|
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Run tesseract on the given image file and return the extracted
|
/** Run tesseract on the given image file and return the extracted
|
||||||
@ -171,12 +177,13 @@ object Ocr {
|
|||||||
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
|
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
|
||||||
img: Stream[F, Byte],
|
img: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] = {
|
): Stream[F, String] = {
|
||||||
val cmd = config.tesseract.command
|
val cmd = config.tesseract.command
|
||||||
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
||||||
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
|
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def replace(repl: Map[String, String]): String => String =
|
private def replace(repl: Map[String, String]): String => String =
|
||||||
|
@ -10,14 +10,16 @@ object TextExtract {
|
|||||||
def extract[F[_]: Sync: ContextShift](
|
def extract[F[_]: Sync: ContextShift](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
extractOCR(in, blocker, lang, config)
|
extractOCR(in, blocker, logger, lang, config)
|
||||||
|
|
||||||
def extractOCR[F[_]: Sync: ContextShift](
|
def extractOCR[F[_]: Sync: ContextShift](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
@ -28,10 +30,10 @@ object TextExtract {
|
|||||||
raiseError(s"File `$mt` not allowed")
|
raiseError(s"File `$mt` not allowed")
|
||||||
|
|
||||||
case MimeType.pdf =>
|
case MimeType.pdf =>
|
||||||
Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate
|
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
||||||
|
|
||||||
case mt if mt.primary == "image" =>
|
case mt if mt.primary == "image" =>
|
||||||
Ocr.extractImage(in, blocker, lang, config)
|
Ocr.extractImage(in, blocker, logger, lang, config)
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
raiseError(s"File `$mt` not supported")
|
raiseError(s"File `$mt` not supported")
|
||||||
|
@ -1,16 +1,19 @@
|
|||||||
package docspell.extract.ocr
|
package docspell.extract.ocr
|
||||||
|
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
|
import docspell.common.Logger
|
||||||
import docspell.files.TestFiles
|
import docspell.files.TestFiles
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
|
|
||||||
object TextExtractionSuite extends SimpleTestSuite {
|
object TextExtractionSuite extends SimpleTestSuite {
|
||||||
import TestFiles._
|
import TestFiles._
|
||||||
|
|
||||||
|
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||||
|
|
||||||
test("extract english pdf") {
|
test("extract english pdf") {
|
||||||
ignore()
|
ignore()
|
||||||
val text = TextExtract
|
val text = TextExtract
|
||||||
.extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default)
|
.extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
@ -21,7 +24,7 @@ object TextExtractionSuite extends SimpleTestSuite {
|
|||||||
ignore()
|
ignore()
|
||||||
val expect = TestFiles.letterDEText
|
val expect = TestFiles.letterDEText
|
||||||
val extract = TextExtract
|
val extract = TextExtract
|
||||||
.extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default)
|
.extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
|
@ -48,14 +48,15 @@ object TextExtraction {
|
|||||||
ocrConfig: OcrConfig,
|
ocrConfig: OcrConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
store: Store[F],
|
store: Store[F],
|
||||||
blocker: Blocker
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
)(fileId: Ident): F[Option[String]] = {
|
)(fileId: Ident): F[Option[String]] = {
|
||||||
val data = store.bitpeace
|
val data = store.bitpeace
|
||||||
.get(fileId.id)
|
.get(fileId.id)
|
||||||
.unNoneTerminate
|
.unNoneTerminate
|
||||||
.through(store.bitpeace.fetchData2(RangeDef.all))
|
.through(store.bitpeace.fetchData2(RangeDef.all))
|
||||||
|
|
||||||
TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last
|
TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last
|
||||||
}
|
}
|
||||||
|
|
||||||
private def extractTextFallback[F[_]: Sync: ContextShift](
|
private def extractTextFallback[F[_]: Sync: ContextShift](
|
||||||
@ -68,7 +69,7 @@ object TextExtraction {
|
|||||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||||
|
|
||||||
case id :: rest =>
|
case id :: rest =>
|
||||||
extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(id).
|
extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id).
|
||||||
recoverWith({
|
recoverWith({
|
||||||
case ex =>
|
case ex =>
|
||||||
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
|
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
|
||||||
|
Loading…
x
Reference in New Issue
Block a user