mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 09:30:12 +00:00 
			
		
		
		
	Make logger configurable in system commands
This commit is contained in:
		| @@ -8,13 +8,10 @@ import java.util.concurrent.TimeUnit | ||||
| import cats.implicits._ | ||||
| import cats.effect.{Blocker, ContextShift, Sync} | ||||
| import fs2.{Stream, io, text} | ||||
| import org.log4s.getLogger | ||||
|  | ||||
| import scala.jdk.CollectionConverters._ | ||||
| import docspell.common.syntax.all._ | ||||
|  | ||||
| object SystemCommand { | ||||
|   private[this] val logger = getLogger | ||||
|  | ||||
|   final case class Config(program: String, args: Seq[String], timeout: Duration) { | ||||
|  | ||||
| @@ -33,17 +30,18 @@ object SystemCommand { | ||||
|   def exec[F[_]: Sync: ContextShift]( | ||||
|       cmd: Config, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       wd: Option[Path] = None, | ||||
|       stdin: Stream[F, Byte] = Stream.empty | ||||
|   ): Stream[F, Result] = | ||||
|     startProcess(cmd, wd, stdin) { proc => | ||||
|     startProcess(cmd, wd, logger, stdin) { proc => | ||||
|       Stream.eval { | ||||
|         for { | ||||
|           _    <- writeToProcess(stdin, proc, blocker) | ||||
|           term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) | ||||
|           _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") | ||||
|           _ <- if (term) logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") | ||||
|               else | ||||
|                 logger.fwarn( | ||||
|                 logger.warn( | ||||
|                   s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!" | ||||
|                 ) | ||||
|           _   <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) | ||||
| @@ -56,10 +54,11 @@ object SystemCommand { | ||||
|   def execSuccess[F[_]: Sync: ContextShift]( | ||||
|       cmd: Config, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       wd: Option[Path] = None, | ||||
|       stdin: Stream[F, Byte] = Stream.empty | ||||
|   ): Stream[F, Result] = | ||||
|     exec(cmd, blocker, wd, stdin).flatMap { r => | ||||
|     exec(cmd, blocker, logger, wd, stdin).flatMap { r => | ||||
|       if (r.rc != 0) | ||||
|         Stream.raiseError[F]( | ||||
|           new Exception( | ||||
| @@ -69,10 +68,10 @@ object SystemCommand { | ||||
|       else Stream.emit(r) | ||||
|     } | ||||
|  | ||||
|   private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], stdin: Stream[F, Byte])( | ||||
|   private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])( | ||||
|       f: Process => Stream[F, A] | ||||
|   ): Stream[F, A] = { | ||||
|     val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") | ||||
|     val log = logger.debug(s"Running external command: ${cmd.cmdString}") | ||||
|     val hasStdin = stdin.take(1).compile.last.map(_.isDefined) | ||||
|     val proc = log *> hasStdin.flatMap(flag => Sync[F].delay { | ||||
|       val pb = new ProcessBuilder(cmd.toCmd.asJava) | ||||
| @@ -85,7 +84,7 @@ object SystemCommand { | ||||
|     }) | ||||
|     Stream | ||||
|       .bracket(proc)(p => | ||||
|         logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => | ||||
|         logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ => | ||||
|           p.destroy() | ||||
|         } | ||||
|       ) | ||||
|   | ||||
| @@ -6,17 +6,17 @@ import docspell.common._ | ||||
|  | ||||
| trait Conversion[F[_]] { | ||||
|  | ||||
|   def toPDF(inType: MimeType): Pipe[F, Byte, Byte] | ||||
|   def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] | ||||
|  | ||||
| } | ||||
|  | ||||
| object Conversion { | ||||
|  | ||||
|   def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] = | ||||
|   def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] = | ||||
|     Resource.pure(new Conversion[F] { | ||||
|  | ||||
|       def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = { | ||||
|         println(cfg) | ||||
|       def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = { | ||||
|         println(s"$cfg $blocker $logger") | ||||
|         ??? | ||||
|       } | ||||
|  | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| package docspell.convert | ||||
|  | ||||
| case class ConvertConfig() | ||||
| import docspell.convert.flexmark.MarkdownConfig | ||||
|  | ||||
| case class ConvertConfig(markdown: MarkdownConfig) | ||||
|   | ||||
| @@ -23,7 +23,7 @@ object ExternConv { | ||||
|           cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) | ||||
|  | ||||
|         SystemCommand | ||||
|           .execSuccess[F](sysCfg, blocker, Some(dir), in) | ||||
|           .execSuccess[F](sysCfg, blocker, logger, Some(dir), in) | ||||
|           .flatMap(result => | ||||
|             logResult(name, result, logger) ++ readResult[F]( | ||||
|               out, | ||||
| @@ -56,7 +56,7 @@ object ExternConv { | ||||
|         (Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ | ||||
|           Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ => | ||||
|           SystemCommand | ||||
|             .execSuccess[F](sysCfg, blocker, Some(dir)) | ||||
|             .execSuccess[F](sysCfg, blocker, logger, Some(dir)) | ||||
|             .flatMap(result => | ||||
|               logResult(name, result, logger) ++ readResult[F]( | ||||
|                 out, | ||||
|   | ||||
| @@ -46,7 +46,7 @@ object Extraction { | ||||
|  | ||||
|           case OcrType(_) => | ||||
|             TextExtract | ||||
|               .extractOCR(data, blocker, lang.iso3, cfg.ocr) | ||||
|               .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) | ||||
|               .compile | ||||
|               .lastOrError | ||||
|               .attempt | ||||
|   | ||||
| @@ -19,7 +19,7 @@ object PdfExtract { | ||||
|   ): F[Either[Throwable, String]] = { | ||||
|  | ||||
|     val runOcr = | ||||
|       TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError | ||||
|       TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError | ||||
|  | ||||
|     def chooseResult(ocrStr: String, strippedStr: String) = | ||||
|       if (ocrStr.length > strippedStr.length) | ||||
|   | ||||
| @@ -4,24 +4,23 @@ import java.nio.file.Path | ||||
|  | ||||
| import cats.effect.{Blocker, ContextShift, Sync} | ||||
| import fs2.Stream | ||||
| import org.log4s._ | ||||
| import docspell.common._ | ||||
|  | ||||
| object Ocr { | ||||
|   private[this] val logger = getLogger | ||||
|  | ||||
|   /** Extract the text of all pages in the given pdf file. | ||||
|     */ | ||||
|   def extractPdf[F[_]: Sync: ContextShift]( | ||||
|       pdf: Stream[F, Byte], | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): F[Option[String]] = | ||||
|     File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => | ||||
|       runGhostscript(pdf, config, wd, blocker) | ||||
|       runGhostscript(pdf, config, wd, blocker, logger) | ||||
|         .flatMap({ tmpImg => | ||||
|           runTesseractFile(tmpImg, blocker, lang, config) | ||||
|           runTesseractFile(tmpImg, blocker, logger, lang, config) | ||||
|         }) | ||||
|         .fold1(_ + "\n\n\n" + _). | ||||
|         compile. | ||||
| @@ -33,21 +32,23 @@ object Ocr { | ||||
|   def extractImage[F[_]: Sync: ContextShift]( | ||||
|       img: Stream[F, Byte], | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = | ||||
|     runTesseractStdin(img, blocker, lang, config) | ||||
|     runTesseractStdin(img, blocker, logger, lang, config) | ||||
|  | ||||
|   def extractPdFFile[F[_]: Sync: ContextShift]( | ||||
|       pdf: Path, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): F[Option[String]] = | ||||
|     File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => | ||||
|       runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) | ||||
|       runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) | ||||
|         .flatMap({ tif => | ||||
|           runTesseractFile(tif, blocker, lang, config) | ||||
|           runTesseractFile(tif, blocker, logger, lang, config) | ||||
|         }) | ||||
|         .fold1(_ + "\n\n\n" + _). | ||||
|         compile. | ||||
| @@ -57,10 +58,11 @@ object Ocr { | ||||
|   def extractImageFile[F[_]: Sync: ContextShift]( | ||||
|       img: Path, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = | ||||
|     runTesseractFile(img, blocker, lang, config) | ||||
|     runTesseractFile(img, blocker, logger, lang, config) | ||||
|  | ||||
|   /** Run ghostscript to extract all pdf pages into tiff files. The | ||||
|     * files are stored to a temporary location on disk and returned. | ||||
| @@ -69,7 +71,8 @@ object Ocr { | ||||
|                                                                  pdf: Stream[F, Byte], | ||||
|                                                                  cfg: OcrConfig, | ||||
|                                                                  wd: Path, | ||||
|                                                                  blocker: Blocker | ||||
|                                                                  blocker: Blocker, | ||||
|                                                                  logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val xargs = | ||||
|       if (cfg.pageRange.begin > 0) | ||||
| @@ -86,7 +89,7 @@ object Ocr { | ||||
|         ) | ||||
|       ) | ||||
|     SystemCommand | ||||
|       .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf) | ||||
|       .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) | ||||
|       .evalMap({ _ => | ||||
|         File.listFiles(pathEndsWith(".tif"), wd) | ||||
|       }) | ||||
| @@ -100,7 +103,8 @@ object Ocr { | ||||
|       pdf: Path, | ||||
|       ghostscript: SystemCommand.Config, | ||||
|       wd: Path, | ||||
|       blocker: Blocker | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val cmd = ghostscript.mapArgs( | ||||
|       replace( | ||||
| @@ -111,7 +115,7 @@ object Ocr { | ||||
|       ) | ||||
|     ) | ||||
|     SystemCommand | ||||
|       .execSuccess[F](cmd, blocker, wd = Some(wd)) | ||||
|       .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) | ||||
|       .evalMap({ _ => | ||||
|         File.listFiles(pathEndsWith(".tif"), wd) | ||||
|       }) | ||||
| @@ -128,7 +132,8 @@ object Ocr { | ||||
|       img: Path, | ||||
|       unpaper: SystemCommand.Config, | ||||
|       wd: Path, | ||||
|       blocker: Blocker | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath | ||||
|     val cmd = unpaper.mapArgs( | ||||
| @@ -139,7 +144,7 @@ object Ocr { | ||||
|         ) | ||||
|       ) | ||||
|     ) | ||||
|     SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith { | ||||
|     SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith { | ||||
|       th => | ||||
|         logger | ||||
|           .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") | ||||
| @@ -153,16 +158,17 @@ object Ocr { | ||||
|   private[extract] def runTesseractFile[F[_]: Sync: ContextShift]( | ||||
|       img: Path, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = | ||||
|     // tesseract cannot cope with absolute filenames | ||||
|     // so use the parent as working dir | ||||
|     runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg => | ||||
|     runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => | ||||
|       val cmd = config.tesseract.command.mapArgs( | ||||
|         replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) | ||||
|       ) | ||||
|       SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) | ||||
|       SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) | ||||
|     } | ||||
|  | ||||
|   /** Run tesseract on the given image file and return the extracted | ||||
| @@ -171,12 +177,13 @@ object Ocr { | ||||
|   private[extract] def runTesseractStdin[F[_]: Sync: ContextShift]( | ||||
|       img: Stream[F, Byte], | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = { | ||||
|     val cmd = config.tesseract.command | ||||
|       .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) | ||||
|     SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout) | ||||
|     SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) | ||||
|   } | ||||
|  | ||||
|   private def replace(repl: Map[String, String]): String => String = | ||||
|   | ||||
| @@ -10,14 +10,16 @@ object TextExtract { | ||||
|   def extract[F[_]: Sync: ContextShift]( | ||||
|       in: Stream[F, Byte], | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = | ||||
|     extractOCR(in, blocker, lang, config) | ||||
|     extractOCR(in, blocker, logger, lang, config) | ||||
|  | ||||
|   def extractOCR[F[_]: Sync: ContextShift]( | ||||
|       in: Stream[F, Byte], | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|       lang: String, | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = | ||||
| @@ -28,10 +30,10 @@ object TextExtract { | ||||
|           raiseError(s"File `$mt` not allowed") | ||||
|  | ||||
|         case MimeType.pdf => | ||||
|           Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate | ||||
|           Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate | ||||
|  | ||||
|         case mt if mt.primary == "image" => | ||||
|           Ocr.extractImage(in, blocker, lang, config) | ||||
|           Ocr.extractImage(in, blocker, logger, lang, config) | ||||
|  | ||||
|         case mt => | ||||
|           raiseError(s"File `$mt` not supported") | ||||
|   | ||||
| @@ -1,16 +1,19 @@ | ||||
| package docspell.extract.ocr | ||||
|  | ||||
| import cats.effect.IO | ||||
| import docspell.common.Logger | ||||
| import docspell.files.TestFiles | ||||
| import minitest.SimpleTestSuite | ||||
|  | ||||
| object TextExtractionSuite extends SimpleTestSuite { | ||||
|   import TestFiles._ | ||||
|  | ||||
|   val logger = Logger.log4s[IO](org.log4s.getLogger) | ||||
|  | ||||
|   test("extract english pdf") { | ||||
|     ignore() | ||||
|     val text = TextExtract | ||||
|       .extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default) | ||||
|       .extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default) | ||||
|       .compile | ||||
|       .lastOrError | ||||
|       .unsafeRunSync() | ||||
| @@ -21,7 +24,7 @@ object TextExtractionSuite extends SimpleTestSuite { | ||||
|     ignore() | ||||
|     val expect = TestFiles.letterDEText | ||||
|     val extract = TextExtract | ||||
|       .extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default) | ||||
|       .extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default) | ||||
|       .compile | ||||
|       .lastOrError | ||||
|       .unsafeRunSync() | ||||
|   | ||||
| @@ -48,14 +48,15 @@ object TextExtraction { | ||||
|       ocrConfig: OcrConfig, | ||||
|       lang: Language, | ||||
|       store: Store[F], | ||||
|       blocker: Blocker | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   )(fileId: Ident): F[Option[String]] = { | ||||
|     val data = store.bitpeace | ||||
|       .get(fileId.id) | ||||
|       .unNoneTerminate | ||||
|       .through(store.bitpeace.fetchData2(RangeDef.all)) | ||||
|  | ||||
|     TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last | ||||
|     TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last | ||||
|   } | ||||
|  | ||||
|   private def extractTextFallback[F[_]: Sync: ContextShift]( | ||||
| @@ -68,7 +69,7 @@ object TextExtraction { | ||||
|         ctx.logger.error(s"Cannot extract text").map(_ => None) | ||||
|  | ||||
|       case id :: rest => | ||||
|         extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(id). | ||||
|         extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id). | ||||
|           recoverWith({ | ||||
|             case ex => | ||||
|               ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file"). | ||||
|   | ||||
		Reference in New Issue
	
	Block a user