Integrate support for more files into processing and upload

The restriction that only PDF files can be uploaded has been removed. All files can now be uploaded, although processing may not support every file type. It is still possible to restrict uploads to certain file types via configuration.
2025-08-05 02:24:52 +00:00 · 2020-02-19 23:27:00 +01:00
parent 9b1349734e
commit 97305d27ff
21 changed files with 366 additions and 148 deletions
--- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
@ -2,4 +2,4 @@ package docspell.extract

 import docspell.extract.ocr.OcrConfig

-case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
+case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@ -55,10 +55,10 @@ object Extraction {

            ImageSize.get(data).flatMap {
              case Some(dim) =>
-                if (dim.product > cfg.maxImageSize) {
-                  logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
+                if (dim.product > cfg.ocr.maxImageSize) {
+                  logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
                  ExtractResult.failure(new Exception(
-                    s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
+                    s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
                  ).pure[F]
                } else {
                  doExtract
@ -72,6 +72,12 @@ object Extraction {
            logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
              OdfExtract.get(data).map(ExtractResult.fromEither)

+          case mt@MimeType("text", sub) if !sub.contains("html") =>
+            // utf8Decode emits one String per chunk; compile.string joins them all (compile.last kept only the final chunk).
+            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
+            data.through(fs2.text.utf8Decode).compile.string
+              .map(txt => ExtractResult.success(txt.trim))
+
          case mt =>
            ExtractResult.unsupportedFormat(mt).pure[F]

--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@ -33,12 +33,12 @@ object PdfExtract {

    //maybe better: inspect the pdf and decide whether ocr or not
    for {
-      pdfboxRes <- PdfboxExtract.get[F](in)
+      pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
            s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
-          ) *> runOcr.attempt,
+          ) >> runOcr.attempt,
        str =>
          if (str.length >= stripMinLen) str.pure[F].attempt
          else
--- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala
@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths}
 import docspell.common._

 case class OcrConfig(
-    allowedContentTypes: Set[MimeType],
-    ghostscript: OcrConfig.Ghostscript,
+                      maxImageSize: Int,
+                      ghostscript: OcrConfig.Ghostscript,
    pageRange: OcrConfig.PageRange,
    unpaper: OcrConfig.Unpaper,
    tesseract: OcrConfig.Tesseract
 ) {
-
-  def isAllowed(mt: MimeType): Boolean =
-    allowedContentTypes contains mt
 }

 object OcrConfig {
@ -27,12 +24,7 @@ object OcrConfig {
  case class Unpaper(command: SystemCommand.Config)

  val default = OcrConfig(
-    allowedContentTypes = Set(
-      MimeType.pdf,
-      MimeType.png,
-      MimeType.jpeg,
-      MimeType.tiff
-    ),
+    maxImageSize = 3000 * 3000,
    pageRange = PageRange(10),
    ghostscript = Ghostscript(
      SystemCommand.Config(
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@ -26,9 +26,6 @@ object TextExtract {
    Stream
      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap({
-        case mt if !config.isAllowed(mt) =>
-          raiseError(s"File `$mt` not allowed")
-
        case MimeType.pdf =>
          Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate