diff --git a/modules/common/src/main/scala/docspell/common/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala index 4f93d87a..06876c96 100644 --- a/modules/common/src/main/scala/docspell/common/SystemCommand.scala +++ b/modules/common/src/main/scala/docspell/common/SystemCommand.scala @@ -8,13 +8,10 @@ import java.util.concurrent.TimeUnit import cats.implicits._ import cats.effect.{Blocker, ContextShift, Sync} import fs2.{Stream, io, text} -import org.log4s.getLogger import scala.jdk.CollectionConverters._ -import docspell.common.syntax.all._ object SystemCommand { - private[this] val logger = getLogger final case class Config(program: String, args: Seq[String], timeout: Duration) { @@ -33,17 +30,18 @@ object SystemCommand { def exec[F[_]: Sync: ContextShift]( cmd: Config, blocker: Blocker, + logger: Logger[F], wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty ): Stream[F, Result] = - startProcess(cmd, wd, stdin) { proc => + startProcess(cmd, wd, logger, stdin) { proc => Stream.eval { for { _ <- writeToProcess(stdin, proc, blocker) term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) - _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") + _ <- if (term) logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") else - logger.fwarn( + logger.warn( s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!" ) _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) @@ -56,10 +54,11 @@ object SystemCommand { def execSuccess[F[_]: Sync: ContextShift]( cmd: Config, blocker: Blocker, + logger: Logger[F], wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty ): Stream[F, Result] = - exec(cmd, blocker, wd, stdin).flatMap { r => + exec(cmd, blocker, logger, wd, stdin).flatMap { r => if (r.rc != 0) Stream.raiseError[F]( new Exception( @@ -69,10 +68,10 @@ object SystemCommand { else Stream.emit(r) } - private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], stdin: Stream[F, Byte])( + private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])( f: Process => Stream[F, A] ): Stream[F, A] = { - val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") + val log = logger.debug(s"Running external command: ${cmd.cmdString}") val hasStdin = stdin.take(1).compile.last.map(_.isDefined) val proc = log *> hasStdin.flatMap(flag => Sync[F].delay { val pb = new ProcessBuilder(cmd.toCmd.asJava) @@ -85,7 +84,7 @@ object SystemCommand { }) Stream .bracket(proc)(p => - logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => + logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ => p.destroy() } ) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 96a0776d..a4605280 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -6,17 +6,17 @@ import docspell.common._ trait Conversion[F[_]] { - def toPDF(inType: MimeType): Pipe[F, Byte, Byte] + def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] } object Conversion { - def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] = + def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] = Resource.pure(new Conversion[F] { - def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = { - println(cfg) + def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = { + println(s"$cfg $blocker $logger") ??? } diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index 40f2924e..ddebaa23 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -1,3 +1,5 @@ package docspell.convert -case class ConvertConfig() +import docspell.convert.flexmark.MarkdownConfig + +case class ConvertConfig(markdown: MarkdownConfig) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala index a45f7b22..909b5b45 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -23,7 +23,7 @@ object ExternConv { cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) SystemCommand - .execSuccess[F](sysCfg, blocker, Some(dir), in) + .execSuccess[F](sysCfg, blocker, logger, Some(dir), in) .flatMap(result => logResult(name, result, logger) ++ readResult[F]( out, @@ -56,7 +56,7 @@ object ExternConv { (Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ => SystemCommand - .execSuccess[F](sysCfg, blocker, Some(dir)) + .execSuccess[F](sysCfg, blocker, logger, Some(dir)) .flatMap(result => logResult(name, result, logger) ++ readResult[F]( out, diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 8ef052c8..892ef54d 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -46,7 +46,7 @@ object Extraction { case OcrType(_) => TextExtract - .extractOCR(data, blocker, lang.iso3, cfg.ocr) + .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) .compile .lastOrError .attempt diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 20d800bb..2058e072 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -19,7 +19,7 @@ object PdfExtract { ): F[Either[Throwable, String]] = { val runOcr = - TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError + TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError def chooseResult(ocrStr: String, strippedStr: String) = if (ocrStr.length > strippedStr.length) diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index 4e97e928..91ba8af2 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -4,24 +4,23 @@ import java.nio.file.Path import cats.effect.{Blocker, ContextShift, Sync} import fs2.Stream -import org.log4s._ import docspell.common._ object Ocr { - private[this] val logger = getLogger /** Extract the text of all pages in the given pdf file. */ def extractPdf[F[_]: Sync: ContextShift]( pdf: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): F[Option[String]] = File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => - runGhostscript(pdf, config, wd, blocker) + runGhostscript(pdf, config, wd, blocker, logger) .flatMap({ tmpImg => - runTesseractFile(tmpImg, blocker, lang, config) + runTesseractFile(tmpImg, blocker, logger, lang, config) }) .fold1(_ + "\n\n\n" + _). compile. @@ -33,21 +32,23 @@ object Ocr { def extractImage[F[_]: Sync: ContextShift]( img: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = - runTesseractStdin(img, blocker, lang, config) + runTesseractStdin(img, blocker, logger, lang, config) def extractPdFFile[F[_]: Sync: ContextShift]( pdf: Path, blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): F[Option[String]] = File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => - runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) + runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) .flatMap({ tif => - runTesseractFile(tif, blocker, lang, config) + runTesseractFile(tif, blocker, logger, lang, config) }) .fold1(_ + "\n\n\n" + _). compile. @@ -57,10 +58,11 @@ object Ocr { def extractImageFile[F[_]: Sync: ContextShift]( img: Path, blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = - runTesseractFile(img, blocker, lang, config) + runTesseractFile(img, blocker, logger, lang, config) /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. @@ -69,7 +71,8 @@ object Ocr { pdf: Stream[F, Byte], cfg: OcrConfig, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val xargs = if (cfg.pageRange.begin > 0) @@ -86,7 +89,7 @@ object Ocr { ) ) SystemCommand - .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf) + .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) .evalMap({ _ => File.listFiles(pathEndsWith(".tif"), wd) }) @@ -100,7 +103,8 @@ object Ocr { pdf: Path, ghostscript: SystemCommand.Config, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val cmd = ghostscript.mapArgs( replace( @@ -111,7 +115,7 @@ object Ocr { ) ) SystemCommand - .execSuccess[F](cmd, blocker, wd = Some(wd)) + .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) .evalMap({ _ => File.listFiles(pathEndsWith(".tif"), wd) }) @@ -128,7 +132,8 @@ object Ocr { img: Path, unpaper: SystemCommand.Config, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath val cmd = unpaper.mapArgs( @@ -139,7 +144,7 @@ object Ocr { ) ) ) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith { + SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith { th => logger .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") @@ -153,16 +158,17 @@ object Ocr { private[extract] def runTesseractFile[F[_]: Sync: ContextShift]( img: Path, blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = // tesseract cannot cope with absolute filenames // so use the parent as working dir - runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg => + runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => val cmd = config.tesseract.command.mapArgs( replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) ) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) + SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) } /** Run tesseract on the given image file and return the extracted @@ -171,12 +177,13 @@ object Ocr { private[extract] def runTesseractStdin[F[_]: Sync: ContextShift]( img: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = { val cmd = config.tesseract.command .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) - SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout) + SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) } private def replace(repl: Map[String, String]): String => String = diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index 07238482..c2fd1678 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -10,14 +10,16 @@ object TextExtract { def extract[F[_]: Sync: ContextShift]( in: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = - extractOCR(in, blocker, lang, config) + extractOCR(in, blocker, logger, lang, config) def extractOCR[F[_]: Sync: ContextShift]( in: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, config: OcrConfig ): Stream[F, String] = @@ -28,10 +30,10 @@ object TextExtract { raiseError(s"File `$mt` not allowed") case MimeType.pdf => - Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate + Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate case mt if mt.primary == "image" => - Ocr.extractImage(in, blocker, lang, config) + Ocr.extractImage(in, blocker, logger, lang, config) case mt => raiseError(s"File `$mt` not supported") diff --git a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index e63982ca..ac2998a8 100644 --- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -1,16 +1,19 @@ package docspell.extract.ocr import cats.effect.IO +import docspell.common.Logger import docspell.files.TestFiles import minitest.SimpleTestSuite object TextExtractionSuite extends SimpleTestSuite { import TestFiles._ + val logger = Logger.log4s[IO](org.log4s.getLogger) + test("extract english pdf") { ignore() val text = TextExtract - .extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default) + .extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default) .compile .lastOrError .unsafeRunSync() @@ -21,7 +24,7 @@ object TextExtractionSuite extends SimpleTestSuite { ignore() val expect = TestFiles.letterDEText val extract = TextExtract - .extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default) + .extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default) .compile .lastOrError .unsafeRunSync() diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 94b67109..6ab1c451 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -48,14 +48,15 @@ object TextExtraction { ocrConfig: OcrConfig, lang: Language, store: Store[F], - blocker: Blocker + blocker: Blocker, + logger: Logger[F] )(fileId: Ident): F[Option[String]] = { val data = store.bitpeace .get(fileId.id) .unNoneTerminate .through(store.bitpeace.fetchData2(RangeDef.all)) - TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last + TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last } private def extractTextFallback[F[_]: Sync: ContextShift]( @@ -68,7 +69,7 @@ object TextExtraction { ctx.logger.error(s"Cannot extract text").map(_ => None) case id :: rest => - extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(id). + extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id). recoverWith({ case ex => ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").