mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	Streamline extern-conv stdin/infile
This commit is contained in:
		| @@ -18,6 +18,13 @@ object SystemCommand { | ||||
|     def mapArgs(f: String => String): Config = | ||||
|       Config(program, args.map(f), timeout) | ||||
|  | ||||
|     def replace(repl: Map[String, String]): Config = | ||||
|       mapArgs(s => | ||||
|         repl.foldLeft(s) { | ||||
|           case (res, (k, v)) => | ||||
|             res.replace(k, v) | ||||
|         }) | ||||
|  | ||||
|     def toCmd: List[String] = | ||||
|       program :: args.toList | ||||
|  | ||||
|   | ||||
| @@ -13,50 +13,28 @@ object ExternConv { | ||||
|       cmdCfg: SystemCommand.Config, | ||||
|       wd: Path, | ||||
|       chunkSize: Int, | ||||
|       useStdin: Boolean, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Pipe[F, Byte, Byte] = | ||||
|     in => | ||||
|       Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => | ||||
|         val out = dir.resolve("out.pdf") | ||||
|         val inFile = dir.resolve("infile").toAbsolutePath.normalize | ||||
|         val out = dir.resolve("out.pdf").toAbsolutePath.normalize | ||||
|         val sysCfg = | ||||
|           cmdCfg.mapArgs(_.replace("{{outfile}}", out.toAbsolutePath.normalize.toString)) | ||||
|  | ||||
|         SystemCommand | ||||
|           .execSuccess[F](sysCfg, blocker, logger, Some(dir), in) | ||||
|           .flatMap(result => | ||||
|             logResult(name, result, logger) ++ readResult[F]( | ||||
|               out, | ||||
|               result, | ||||
|               blocker, | ||||
|               chunkSize, | ||||
|               logger | ||||
|             ) | ||||
|           ) | ||||
|       } | ||||
|  | ||||
|   def toPDFviaFile[F[_]: Sync: ContextShift]( | ||||
|       name: String, | ||||
|       cmdCfg: SystemCommand.Config, | ||||
|       wd: Path, | ||||
|       chunkSize: Int, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Pipe[F, Byte, Byte] = | ||||
|     in => | ||||
|       Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => | ||||
|         val inFile = dir.resolve("infile") | ||||
|         val out    = dir.resolve("out.pdf") | ||||
|         val sysCfg = | ||||
|           cmdCfg.mapArgs( | ||||
|             _.replace("{{outfile}}", out.toAbsolutePath.normalize.toString) | ||||
|               .replace("{{infile}}", inFile.toAbsolutePath.normalize.toString) | ||||
|           cmdCfg.replace( | ||||
|             Map("{{outfile}}" -> out.toString) ++ | ||||
|               (if (!useStdin) Map("{{infile}}" -> inFile.toString) | ||||
|               else Map.empty) | ||||
|           ) | ||||
|  | ||||
|         (Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ | ||||
|           Stream.eval(storeFile(in, inFile, blocker))).flatMap { _ => | ||||
|         val createInput: Pipe[F, Byte, Unit] = | ||||
|           if (useStdin) _ => Stream.emit(()) | ||||
|           else storeDataToFile(name, blocker, logger, inFile) | ||||
|  | ||||
|         in.through(createInput).flatMap { _ => | ||||
|           SystemCommand | ||||
|             .execSuccess[F](sysCfg, blocker, logger, Some(dir)) | ||||
|             .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty) | ||||
|             .flatMap(result => | ||||
|               logResult(name, result, logger) ++ readResult[F]( | ||||
|                 out, | ||||
| @@ -69,7 +47,7 @@ object ExternConv { | ||||
|         } | ||||
|       } | ||||
|  | ||||
|   private def readResult[F[_]: Sync: ContextShift]( | ||||
|   def readResult[F[_]: Sync: ContextShift]( | ||||
|       out: Path, | ||||
|       result: SystemCommand.Result, | ||||
|       blocker: Blocker, | ||||
| @@ -91,6 +69,11 @@ object ExternConv { | ||||
|         ) | ||||
|     } | ||||
|  | ||||
|   private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] = | ||||
|     in => | ||||
|       Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ | ||||
|         Stream.eval(storeFile(in, inFile, blocker)) | ||||
|  | ||||
|   private def logResult[F[_]: Sync]( | ||||
|       name: String, | ||||
|       result: SystemCommand.Result, | ||||
|   | ||||
							
								
								
									
										5
									
								
								modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | ||||
| package docspell.convert.extern | ||||
|  | ||||
| object Tesseract { | ||||
|  | ||||
| } | ||||
| @@ -12,7 +12,7 @@ object Unoconv { | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|   ): Pipe[F, Byte, Byte] = | ||||
|     ExternConv.toPDFviaFile[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) | ||||
|     ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger) | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -12,7 +12,7 @@ object WkHtmlPdf { | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F], | ||||
|   ): Pipe[F, Byte, Byte] = | ||||
|     ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, blocker, logger) | ||||
|     ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger) | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -19,12 +19,10 @@ object Ocr { | ||||
|   ): F[Option[String]] = | ||||
|     File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => | ||||
|       runGhostscript(pdf, config, wd, blocker, logger) | ||||
|         .flatMap({ tmpImg => | ||||
|           runTesseractFile(tmpImg, blocker, logger, lang, config) | ||||
|         }) | ||||
|         .fold1(_ + "\n\n\n" + _). | ||||
|         compile. | ||||
|         last | ||||
|         .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config)) | ||||
|         .fold1(_ + "\n\n\n" + _) | ||||
|         .compile | ||||
|         .last | ||||
|     } | ||||
|  | ||||
|   /** Extract the text from the given image file | ||||
| @@ -47,12 +45,10 @@ object Ocr { | ||||
|   ): F[Option[String]] = | ||||
|     File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => | ||||
|       runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) | ||||
|         .flatMap({ tif => | ||||
|           runTesseractFile(tif, blocker, logger, lang, config) | ||||
|         }) | ||||
|         .fold1(_ + "\n\n\n" + _). | ||||
|         compile. | ||||
|         last | ||||
|         .flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config)) | ||||
|         .fold1(_ + "\n\n\n" + _) | ||||
|         .compile | ||||
|         .last | ||||
|     } | ||||
|  | ||||
|   def extractImageFile[F[_]: Sync: ContextShift]( | ||||
| @@ -68,11 +64,11 @@ object Ocr { | ||||
|     * files are stored to a temporary location on disk and returned. | ||||
|     */ | ||||
|   private[extract] def runGhostscript[F[_]: Sync: ContextShift]( | ||||
|                                                                  pdf: Stream[F, Byte], | ||||
|                                                                  cfg: OcrConfig, | ||||
|                                                                  wd: Path, | ||||
|                                                                  blocker: Blocker, | ||||
|                                                                  logger: Logger[F] | ||||
|       pdf: Stream[F, Byte], | ||||
|       cfg: OcrConfig, | ||||
|       wd: Path, | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val xargs = | ||||
|       if (cfg.pageRange.begin > 0) | ||||
| @@ -80,19 +76,15 @@ object Ocr { | ||||
|       else cfg.ghostscript.command.args | ||||
|     val cmd = cfg.ghostscript.command | ||||
|       .copy(args = xargs) | ||||
|       .mapArgs( | ||||
|         replace( | ||||
|           Map( | ||||
|             "{{infile}}"  -> "-", | ||||
|             "{{outfile}}" -> "%d.tif" | ||||
|           ) | ||||
|       .replace( | ||||
|         Map( | ||||
|           "{{infile}}"  -> "-", | ||||
|           "{{outfile}}" -> "%d.tif" | ||||
|         ) | ||||
|       ) | ||||
|     SystemCommand | ||||
|       .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) | ||||
|       .evalMap({ _ => | ||||
|         File.listFiles(pathEndsWith(".tif"), wd) | ||||
|       }) | ||||
|       .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) | ||||
|       .flatMap(fs => Stream.emits(fs)) | ||||
|   } | ||||
|  | ||||
| @@ -106,19 +98,15 @@ object Ocr { | ||||
|       blocker: Blocker, | ||||
|       logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val cmd = ghostscript.mapArgs( | ||||
|       replace( | ||||
|         Map( | ||||
|           "{{infile}}"  -> pdf.toAbsolutePath.toString, | ||||
|           "{{outfile}}" -> "%d.tif" | ||||
|         ) | ||||
|     val cmd = ghostscript.replace( | ||||
|       Map( | ||||
|         "{{infile}}"  -> pdf.toAbsolutePath.toString, | ||||
|         "{{outfile}}" -> "%d.tif" | ||||
|       ) | ||||
|     ) | ||||
|     SystemCommand | ||||
|       .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) | ||||
|       .evalMap({ _ => | ||||
|         File.listFiles(pathEndsWith(".tif"), wd) | ||||
|       }) | ||||
|       .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) | ||||
|       .flatMap(fs => Stream.emits(fs)) | ||||
|   } | ||||
|  | ||||
| @@ -136,20 +124,20 @@ object Ocr { | ||||
|       logger: Logger[F] | ||||
|   ): Stream[F, Path] = { | ||||
|     val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath | ||||
|     val cmd = unpaper.mapArgs( | ||||
|       replace( | ||||
|         Map( | ||||
|           "{{infile}}"  -> img.toAbsolutePath.toString, | ||||
|           "{{outfile}}" -> targetFile.toString | ||||
|         ) | ||||
|     val cmd = unpaper.replace( | ||||
|       Map( | ||||
|         "{{infile}}"  -> img.toAbsolutePath.toString, | ||||
|         "{{outfile}}" -> targetFile.toString | ||||
|       ) | ||||
|     ) | ||||
|     SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(wd)).map(_ => targetFile).handleErrorWith { | ||||
|       th => | ||||
|     SystemCommand | ||||
|       .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) | ||||
|       .map(_ => targetFile) | ||||
|       .handleErrorWith { th => | ||||
|         logger | ||||
|           .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") | ||||
|         Stream.emit(img) | ||||
|     } | ||||
|       } | ||||
|   } | ||||
|  | ||||
|   /** Run tesseract on the given image file and return the extracted | ||||
| @@ -165,9 +153,8 @@ object Ocr { | ||||
|     // tesseract cannot cope with absolute filenames | ||||
|     // so use the parent as working dir | ||||
|     runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => | ||||
|       val cmd = config.tesseract.command.mapArgs( | ||||
|         replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) | ||||
|       ) | ||||
|       val cmd = config.tesseract.command | ||||
|         .replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) | ||||
|       SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) | ||||
|     } | ||||
|  | ||||
| @@ -182,17 +169,10 @@ object Ocr { | ||||
|       config: OcrConfig | ||||
|   ): Stream[F, String] = { | ||||
|     val cmd = config.tesseract.command | ||||
|       .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) | ||||
|       .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))) | ||||
|     SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) | ||||
|   } | ||||
|  | ||||
|   private def replace(repl: Map[String, String]): String => String = | ||||
|     s => | ||||
|       repl.foldLeft(s) { | ||||
|         case (res, (k, v)) => | ||||
|           res.replace(k, v) | ||||
|       } | ||||
|  | ||||
|   private def fixLanguage(lang: String): String = | ||||
|     lang match { | ||||
|       case "de" => "deu" | ||||
|   | ||||
		Reference in New Issue
	
	Block a user