Reorganize processing code

Use separate modules for text extraction, conversion to PDF, and text analysis.
2025-09-15 21:46:53 +00:00 · 2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions
--- a/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala
@@ -0,0 +1,54 @@
+package docspell.extract.ocr
+
+import java.nio.file.{Path, Paths}
+
+import docspell.common._
+
+/** Settings for OCR based text extraction and the external commands it runs. */
+case class Config(
+    allowedContentTypes: Set[MimeType],
+    ghostscript: Config.Ghostscript,
+    pageRange: Config.PageRange,
+    unpaper: Config.Unpaper,
+    tesseract: Config.Tesseract
+) {
+  /** True if `mt` is contained in `allowedContentTypes`. */
+  def isAllowed(mt: MimeType): Boolean = allowedContentTypes.contains(mt)
+}
+
+object Config {
+  /** `begin` is given to ghostscript as `-dLastPage` if positive (see `Ocr.runGhostscript`). */
+  case class PageRange(begin: Int)
+
+  /** `workingDir` is handed to `File.withTempDir` by the pdf extraction in `Ocr`. */
+  case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
+  case class Tesseract(command: SystemCommand.Config)
+  case class Unpaper(command: SystemCommand.Config)
+
+  /** Default settings. The `{{infile}}`, `{{outfile}}`, `{{file}}` and
+    * `{{lang}}` tokens in the arguments are placeholders that `Ocr`
+    * substitutes before running a command.
+    */
+  val default: Config = {
+    val gsArgs = Seq(
+      "-dNOPAUSE",
+      "-dBATCH",
+      "-dSAFER",
+      "-sDEVICE=tiffscaled8",
+      "-sOutputFile={{outfile}}",
+      "{{infile}}"
+    )
+    val gsTempBase = Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
+    Config(
+      allowedContentTypes = Set(MimeType.pdf, MimeType.png, MimeType.jpeg, MimeType.tiff),
+      pageRange = PageRange(10),
+      ghostscript = Ghostscript(SystemCommand.Config("gs", gsArgs, Duration.seconds(30)), gsTempBase),
+      unpaper = Unpaper(
+        SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
+      ),
+      tesseract = Tesseract(
+        SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
+      )
+    )
+  }
+}
--- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
@@ -0,0 +1,191 @@
+package docspell.extract.ocr
+
+import java.nio.file.Path
+
+import cats.effect.{Blocker, ContextShift, Sync}
+import fs2.Stream
+import org.log4s._
+import docspell.common._
+
+object Ocr {
+  private[this] val logger = getLogger
+
+  /** Extract the text of the pages in the given pdf (limited by
+    * `config.pageRange`, see `runGhostscript`). Every page image goes
+    * through tesseract; the texts are folded into a single string.
+    */
+  def extractPdf[F[_]: Sync: ContextShift](
+      pdf: Stream[F, Byte],
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] =
+    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
+      val pageTexts = runGhostscript(pdf, config, wd, blocker)
+        .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, lang, config))
+      pageTexts.fold1((acc, page) => acc + "\n\n\n" + page)
+    }
+
+  /** Extract the text from the given image data by piping it to
+    * tesseract's stdin.
+    */
+  def extractImage[F[_]: Sync: ContextShift](
+      img: Stream[F, Byte],
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] = runTesseractStdin[F](img, blocker, lang, config)
+
+  /** Like `extractPdf`, but reads the pdf from a file on disk. Unlike
+    * `extractPdf`, `config.pageRange` is not applied here. */
+  def extractPdFFile[F[_]: Sync: ContextShift](
+      pdf: Path,
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] =
+    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
+      runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
+        .flatMap(tif => runTesseractFile(tif, blocker, lang, config))
+        .fold1((acc, page) => acc + "\n\n\n" + page)
+    }
+
+  /** Extract the text from the given image file on disk. */
+  def extractImageFile[F[_]: Sync: ContextShift](
+      img: Path,
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] = runTesseractFile[F](img, blocker, lang, config)
+
+  /** Run ghostscript to extract the pdf pages into tiff files. The
+    * files are stored in `wd` and returned.
+    *
+    * The pdf is fed to the process via stdin: the `{{infile}}`
+    * placeholder becomes `-` and `{{outfile}}` becomes `%d.tif`. If
+    * `cfg.pageRange.begin` is positive, `-dLastPage=<begin>` is put in
+    * front of the configured arguments.
+    * @param pdf the pdf data
+    * @param cfg supplies the ghostscript command and the page range
+    * @param wd working directory of the process; scanned for `.tif` files afterwards
+    * @param blocker passed on to `SystemCommand.execSuccess`
+    * @return the `.tif` files listed from `wd` after the command has run
+    */
+  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
+      pdf: Stream[F, Byte],
+      cfg: Config,
+      wd: Path,
+      blocker: Blocker
+  ): Stream[F, Path] = {
+    val baseArgs = cfg.ghostscript.command.args
+    val xargs =
+      if (cfg.pageRange.begin <= 0) baseArgs
+      else s"-dLastPage=${cfg.pageRange.begin}" +: baseArgs
+    val names = Map("{{infile}}" -> "-", "{{outfile}}" -> "%d.tif")
+    val cmd   = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(names))
+    SystemCommand
+      .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
+      .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
+      .flatMap(tifs => Stream.emits(tifs))
+  }
+
+  /** Run ghostscript on a pdf file on disk to extract its pages into
+    * tiff files. The files are stored in `wd` and returned.
+    *
+    * `{{infile}}` is replaced by the absolute path of `pdf` and
+    * `{{outfile}}` by `%d.tif`. In contrast to `runGhostscript`, no
+    * `-dLastPage` argument is added here.
+    *
+    * @param pdf the pdf file to read
+    * @param ghostscript the command configuration holding the placeholders
+    * @return the `.tif` files listed from `wd` after the command has run
+    */
+  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
+      pdf: Path,
+      ghostscript: SystemCommand.Config,
+      wd: Path,
+      blocker: Blocker
+  ): Stream[F, Path] = {
+    val names = Map("{{infile}}" -> pdf.toAbsolutePath.toString, "{{outfile}}" -> "%d.tif")
+    val cmd   = ghostscript.mapArgs(replace(names))
+    SystemCommand
+      .execSuccess[F](cmd, blocker, wd = Some(wd))
+      .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
+      .flatMap(tifs => Stream.emits(tifs))
+  }
+
+  /** Predicate that is true if the last segment of a path ends with `ext`. */
+  private def pathEndsWith(ext: String): Path => Boolean = _.getFileName.toString.endsWith(ext)
+
+  /** Run unpaper to optimize the image for ocr. The result is written
+    * next to `img`, using its file name prefixed with `u-`, and that
+    * path is returned.
+    *
+    * If the command fails, a warning is logged and `img` itself is
+    * emitted, so a failing unpaper does not fail the extraction.
+    *
+    * @param unpaper command whose `{{infile}}`/`{{outfile}}` arguments are filled in
+    */
+  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
+      img: Path,
+      unpaper: SystemCommand.Config,
+      wd: Path,
+      blocker: Blocker
+  ): Stream[F, Path] = {
+    val outName    = "u-" + img.getFileName.toString
+    val targetFile = img.resolveSibling(outName).toAbsolutePath
+    val names = Map("{{infile}}" -> img.toAbsolutePath.toString, "{{outfile}}" -> targetFile.toString)
+    val cmd   = unpaper.mapArgs(replace(names))
+    val unpapered = SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile)
+    unpapered.handleErrorWith { err =>
+      logger.warn(s"Unpaper command failed: ${err.getMessage}. Using input file for text extraction.")
+      Stream.emit(img)
+    }
+  }
+
+  /** Run tesseract on the given image file and return the extracted
+    * text. The image is passed through `runUnpaperFile` first and
+    * tesseract then reads whatever file that step returns.
+    */
+  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
+      img: Path,
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] =
+    // tesseract cannot cope with absolute filenames, so only the file
+    // name is passed and the parent directory is used as working dir
+    runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { cleaned =>
+      val names        = Map("{{file}}" -> cleaned.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
+      val tesseractCmd = config.tesseract.command.mapArgs(replace(names))
+      SystemCommand.execSuccess[F](tesseractCmd, blocker, wd = Some(cleaned.getParent)).map(_.stdout)
+    }
+
+  /** Run tesseract on image data supplied via stdin and return the
+    * extracted text.
+    */
+  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
+      img: Stream[F, Byte],
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] = {
+    val placeholders = Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))
+    val tesseractCmd = config.tesseract.command.mapArgs(replace(placeholders))
+    SystemCommand.execSuccess(tesseractCmd, blocker, stdin = img).map(_.stdout)
+  }
+
+  /** Builds a function that replaces every occurrence of each key of
+    * `repl` in its argument with the corresponding value.
+    */
+  private def replace(repl: Map[String, String]): String => String =
+    input =>
+      repl.foldLeft(input)((acc, kv) => acc.replace(kv._1, kv._2))
+
+  /** Translates the language codes `de` and `en` into `deu` and `eng`,
+    * the values placed into the tesseract `{{lang}}` argument. Any
+    * other value is returned unchanged.
+    */
+  private def fixLanguage(lang: String): String =
+    Map("de" -> "deu", "en" -> "eng").getOrElse(lang, lang)
+}
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@@ -0,0 +1,42 @@
+package docspell.extract.ocr
+
+import cats.effect.{Blocker, ContextShift, Sync}
+import docspell.common._
+import docspell.files._
+import fs2.Stream
+
+object TextExtract {
+
+  /** Extract text from the given input. This currently delegates to
+    * `extractOCR` without any further processing.
+    */
+  def extract[F[_]: Sync: ContextShift](
+      in: Stream[F, Byte],
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] = extractOCR(in, blocker, lang, config)
+
+  /** Detect the mime type of `in` and run OCR on it. Pdf input goes to
+    * `Ocr.extractPdf` and `image` types go to `Ocr.extractImage`.
+    *
+    * The stream fails with an exception if the detected type is not
+    * allowed by `config`, or if it is neither pdf nor an image.
+    */
+  def extractOCR[F[_]: Sync: ContextShift](
+      in: Stream[F, Byte],
+      blocker: Blocker,
+      lang: String,
+      config: Config
+  ): Stream[F, String] =
+    Stream.eval(TikaMimetype.detect(in, MimeTypeHint.none)).flatMap {
+      case denied if !config.isAllowed(denied) => raiseError(s"File `$denied` not allowed")
+      case MimeType.pdf                        => Ocr.extractPdf(in, blocker, lang, config)
+      case img if img.primary == "image"       => Ocr.extractImage(in, blocker, lang, config)
+      case other                               => raiseError(s"File `$other` not supported")
+    }
+
+  /** A stream that fails with an `Exception` carrying `msg`. */
+  private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
+    Stream.raiseError[F](new Exception(msg))
+}
--- a/modules/extract/src/test/resources/logback.xml
+++ b/modules/extract/src/test/resources/logback.xml
@@ -0,0 +1,14 @@
+<configuration>
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <withJansi>true</withJansi>
+
+        <encoder>
+            <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
+        </encoder>
+    </appender>
+
+    <logger name="docspell" level="debug" />
+    <root level="INFO">
+        <appender-ref ref="STDOUT" />
+    </root>
+</configuration>
--- a/modules/extract/src/test/scala/docspell/extract/TestFiles.scala
+++ b/modules/extract/src/test/scala/docspell/extract/TestFiles.scala
@@ -0,0 +1,30 @@
+package docspell.extract
+
+import fs2.Stream
+import cats.effect.{Blocker, IO}
+import docspell.files._
+
+import scala.concurrent.ExecutionContext
+
+/** Shared fixtures for the extraction tests: example files as byte
+  * streams and the text files read from `ExampleFiles`.
+  */
+object TestFiles {
+  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)
+  // Implicits get an explicit type so that resolution does not depend
+  // on inference and definition order.
+  implicit val CS: cats.effect.ContextShift[IO] = IO.contextShift(ExecutionContext.global)
+
+  val letterSourceDE: Stream[IO, Byte] =
+    ExampleFiles.letter_de_pdf.readURL[IO](16 * 1024, blocker)
+
+  val letterSourceEN: Stream[IO, Byte] =
+    ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024, blocker)
+
+  // `lazy` so that the files are only read when a test accesses them
+  lazy val letterDEText =
+    ExampleFiles.letter_de_txt.readText[IO](16 * 1024, blocker).unsafeRunSync()
+
+  lazy val letterENText =
+    ExampleFiles.letter_en_txt.readText[IO](16 * 1024, blocker).unsafeRunSync()
+}
--- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
+++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
@@ -0,0 +1,42 @@
+package docspell.extract.ocr
+
+import cats.effect.IO
+import docspell.common._
+import docspell.files._
+import docspell.extract.TestFiles
+import minitest.SimpleTestSuite
+
+object TextExtractionSuite extends SimpleTestSuite {
+  import TestFiles._
+
+  // Ignored; when enabled it only prints the extracted text.
+  test("extract english pdf") {
+    ignore()
+    val text = TextExtract
+      .extract[IO](letterSourceEN, blocker, "eng", Config.default)
+      .compile
+      .lastOrError
+      .unsafeRunSync()
+    println(text)
+  }
+
+  // Ignored; when enabled it compares the OCR result with the expected text.
+  test("extract german pdf") {
+    ignore()
+    val expect = TestFiles.letterDEText
+    val extract = TextExtract
+      .extract[IO](letterSourceDE, blocker, "deu", Config.default)
+      .compile
+      .lastOrError
+      .unsafeRunSync()
+    assertEquals(extract.trim, expect.trim)
+  }
+
+  // Prints the mime type detected for every example file.
+  test("find mimetypes") {
+    ExampleFiles.all.foreach { url =>
+      val detected = TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none)
+      // side-effecting call: keep the parentheses, as in the tests above
+      detected.map(mt => println(url.asString + ": " + mt.asString)).unsafeRunSync()
+    }
+  }
+}