Configure pdf extraction; move Logger and DataType to common

2025-08-05 02:24:52 +00:00 · 2020-02-17 14:01:36 +01:00
parent 3d615181e0
commit e0682464b5
12 changed files with 64 additions and 27 deletions
--- a/modules/extract/src/main/scala/docspell/extract/DataType.scala
+++ b/modules/extract/src/main/scala/docspell/extract/DataType.scala
@ -1,21 +0,0 @@
-package docspell.extract
-
-import docspell.common.{MimeType, MimeTypeHint}
-
-sealed trait DataType {
-
-}
-
-object DataType {
-
-  case class Exact(mime: MimeType) extends DataType
-
-  case class Hint(hint: MimeTypeHint) extends DataType
-
-
-  def apply(mt: MimeType): DataType =
-    Exact(mt)
-
-  def filename(name: String): DataType =
-    Hint(MimeTypeHint.filename(name))
-}
--- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
@ -2,4 +2,4 @@ package docspell.extract

 import docspell.extract.ocr.OcrConfig

-case class ExtractConfig(ocr: OcrConfig)
+case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@ -29,14 +29,10 @@ object Extraction {
          dataType: DataType,
          lang: Language
      ): F[ExtractResult] = {
-        val mime = dataType match {
-          case DataType.Exact(mt)  => mt.pure[F]
-          case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
-        }
-        mime.flatMap {
+        TikaMimetype.resolve(dataType, data).flatMap {
          case MimeType.pdf =>
            PdfExtract
-              .get(data, blocker, lang, 5, cfg.ocr, logger)
+              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
              .map(ExtractResult.fromEither)

          case PoiType(mt) =>
--- a/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala
@ -0,0 +1,3 @@
+package docspell.extract
+
+case class PdfConfig (minTextLen: Int)
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@ -43,7 +43,7 @@ object PdfExtract {
          if (str.length >= stripMinLen) str.pure[F].attempt
          else
            logger
-              .info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *>
+              .info(s"Stripped text from PDF is small (${str.length}). Trying with OCR.") *>
              runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
      )
    } yield res
--- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
@ -17,13 +17,15 @@ object Ocr {
      blocker: Blocker,
      lang: String,
      config: OcrConfig
-  ): Stream[F, String] =
-    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
+  ): F[Option[String]] =
+    File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
      runGhostscript(pdf, config, wd, blocker)
        .flatMap({ tmpImg =>
          runTesseractFile(tmpImg, blocker, lang, config)
        })
-        .fold1(_ + "\n\n\n" + _)
+        .fold1(_ + "\n\n\n" + _).
+        compile.
+        last
    }

  /** Extract the text from the given image file
@ -41,13 +43,15 @@ object Ocr {
      blocker: Blocker,
      lang: String,
      config: OcrConfig
-  ): Stream[F, String] =
-    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
+  ): F[Option[String]] =
+    File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
      runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
        .flatMap({ tif =>
          runTesseractFile(tif, blocker, lang, config)
        })
-        .fold1(_ + "\n\n\n" + _)
+        .fold1(_ + "\n\n\n" + _).
+        compile.
+        last
    }

  def extractImageFile[F[_]: Sync: ContextShift](
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@ -28,7 +28,7 @@ object TextExtract {
          raiseError(s"File `$mt` not allowed")

        case MimeType.pdf =>
-          Ocr.extractPdf(in, blocker, lang, config)
+          Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate

        case mt if mt.primary == "image" =>
          Ocr.extractImage(in, blocker, lang, config)