Integrate support for more files into processing and upload

The restriction that only PDF files can be uploaded is removed; files of any type can now be uploaded, although not every file type can be processed. It is still possible to restrict uploads to certain file types via the configuration.
2025-09-15 21:46:53 +00:00 · 2020-02-19 23:27:00 +01:00
parent 9b1349734e
commit 97305d27ff
21 changed files with 366 additions and 148 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -65,66 +65,143 @@ docspell.joex {
  }

  # Configuration of text extraction
-  #
-  # Extracting text currently only work for image and pdf files. It
-  # will first run ghostscript to create a gray image from a pdf. Then
-  # unpaper is run to optimize the image for the upcoming ocr, which
-  # will be done by tesseract. All these programs must be available in
-  # your PATH or the absolute path can be specified below.
  extraction {
-    allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
-
-    # Defines what pages to process. If a PDF with 600 pages is
-    # submitted, it is probably not necessary to scan through all of
-    # them. This would take a long time and occupy resources for no
-    # value. The first few pages should suffice. The default is first
-    # 10 pages.
-    #
-    # If you want all pages being processed, set this number to -1.
-    #
-    # Note: if you change the ghostscript command below, be aware that
-    # this setting (if not -1) will add another parameter to the
-    # beginning of the command.
-    page-range {
-      begin = 10
+    # For PDF files, the text embedded in the PDF is read first. But
+    # PDFs can be complex documents that contain both text and
+    # images. If the extracted text is shorter than the value below,
+    # OCR is run afterwards. Then both extracted texts are compared
+    # and the longer one is used.
+    pdf {
+      min-text-len = 10
    }

-    # The ghostscript command.
-    ghostscript {
-      command {
-        program = "gs"
-        args = [ "-dNOPAUSE"
-               , "-dBATCH"
-               , "-dSAFER"
-               , "-sDEVICE=tiffscaled8"
-               , "-sOutputFile={{outfile}}"
-               , "{{infile}}"
-               ]
-        timeout = "5 minutes"
+    # Extracting text using OCR works for image and pdf files. It will
+    # first run ghostscript to create a gray image from a pdf. Then
+    # unpaper is run to optimize the image for the upcoming ocr, which
+    # will be done by tesseract. All these programs must be available
+    # in your PATH or the absolute path can be specified below.
+    ocr {
+
+      # Images greater than this size are skipped. Note that every
+      # image is loaded completely into memory for doing OCR.
+      max-image-size = 14000000
+
+      # Defines what pages to process. If a PDF with 600 pages is
+      # submitted, it is probably not necessary to scan through all of
+      # them. This would take a long time and occupy resources for no
+      # value. The first few pages should suffice. The default is first
+      # 10 pages.
+      #
+      # If you want all pages being processed, set this number to -1.
+      #
+      # Note: if you change the ghostscript command below, be aware that
+      # this setting (if not -1) will add another parameter to the
+      # beginning of the command.
+      page-range {
+        begin = 10
      }
-      working-dir = ${java.io.tmpdir}"/docspell-extraction"
-    }

-    # The unpaper command.
-    unpaper {
-      command {
-        program = "unpaper"
-        args = [ "{{infile}}", "{{outfile}}" ]
-        timeout = "5 minutes"
+      # The ghostscript command.
+      ghostscript {
+        command {
+          program = "gs"
+          args = [ "-dNOPAUSE"
+                 , "-dBATCH"
+                 , "-dSAFER"
+                 , "-sDEVICE=tiffscaled8"
+                 , "-sOutputFile={{outfile}}"
+                 , "{{infile}}"
+                 ]
+          timeout = "5 minutes"
+        }
+        working-dir = ${java.io.tmpdir}"/docspell-extraction"
      }
-    }

-    # The tesseract command.
-    tesseract {
-      command {
-        program = "tesseract"
-        args = ["{{file}}"
-               , "stdout"
-               , "-l"
-               , "{{lang}}"
-               ]
-        timeout = "5 minutes"
+      # The unpaper command.
+      unpaper {
+        command {
+          program = "unpaper"
+          args = [ "{{infile}}", "{{outfile}}" ]
+          timeout = "5 minutes"
+        }
+      }
+
+      # The tesseract command.
+      tesseract {
+        command {
+          program = "tesseract"
+          args = ["{{file}}"
+                 , "stdout"
+                 , "-l"
+                 , "{{lang}}"
+                 ]
+          timeout = "5 minutes"
+        }
      }
    }
  }
+
+  # Configuration for converting files into PDFs.
+  #
+  # Most of the work is delegated to external tools, which are
+  # configured below. Each tool must either be available on the PATH
+  # or be given with its full path via the `program` key.
+  convert {
+    # Chunk size that is handed to the file store when a converted PDF
+    # is saved. Presumably given in bytes -- TODO confirm.
+    chunk-size = 524288
+
+    # NOTE(review): presumably images larger than this are not
+    # converted, analogous to `extraction.ocr.max-image-size` --
+    # verify against the convert module.
+    max-image-size = 12000000
+
+    markdown {
+      # NOTE(review): presumably this CSS is embedded into the HTML
+      # that is generated from markdown input -- verify.
+      internal-css = """
+        body { padding: 2em 5em; }
+      """
+    }
+
+    # The wkhtmltopdf command. As configured below, `-` is given as
+    # the input argument and the result is written to `{{outfile}}`.
+    wkhtmlpdf {
+      cmd = {
+        program = "wkhtmltopdf"
+        args = [
+          "-s",
+          "A4",
+          "--encoding",
+          "UTF-8",
+          "-",
+          "{{outfile}}"
+        ]
+        timeout = "20 seconds"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
+
+    # The tesseract command. As configured below, it reads
+    # `{{infile}}`, uses `out` as the output base name, the language
+    # `{{lang}}`, and requests both `pdf` and `txt` output.
+    tesseract = {
+      cmd = {
+        program = "tesseract"
+        args = [
+          "{{infile}}",
+          "out",
+          "-l",
+          "{{lang}}",
+          "pdf",
+          "txt"
+        ]
+        timeout = "120 seconds"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
+
+    # The unoconv command. As configured below, it converts
+    # `{{infile}}` to the format `pdf` and writes it to `{{outfile}}`.
+    unoconv = {
+      cmd = {
+        program = "unoconv"
+        args = [
+          "-f",
+          "pdf",
+          "-o",
+          "{{outfile}}",
+          "{{infile}}"
+        ]
+        timeout = "20 seconds"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-convert"
+    }
+  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -3,17 +3,17 @@ package docspell.joex
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.SchedulerConfig
 import docspell.store.JdbcConfig
-import docspell.extract.ocr.{OcrConfig => OcrConfig}
 import docspell.convert.ConvertConfig
+import docspell.extract.ExtractConfig

 case class Config(
-    appId: Ident,
-    baseUrl: LenientUri,
-    bind: Config.Bind,
-    jdbc: JdbcConfig,
-    scheduler: SchedulerConfig,
-    extraction: OcrConfig,
-    convert: ConvertConfig
+                   appId: Ident,
+                   baseUrl: LenientUri,
+                   bind: Config.Bind,
+                   jdbc: JdbcConfig,
+                   scheduler: SchedulerConfig,
+                   extraction: ExtractConfig,
+                   convert: ConvertConfig
 )

 object Config {
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@@ -1,15 +1,17 @@
 package docspell.joex.process

-import bitpeace.Mimetype
+import bitpeace.{Mimetype, MimetypeHint, RangeDef}
+import cats.implicits._
 import cats.Functor
 import cats.implicits._
 import cats.effect._
-import cats.data.OptionT
-
+import cats.data.{Kleisli, OptionT}
+import fs2.Stream
 import docspell.common._
 import docspell.convert._
 import docspell.joex.scheduler._
 import docspell.store.records._
+import docspell.convert.ConversionResult.Handler

 /** Goes through all attachments and creates a PDF version of it where
  * supported.
@@ -32,32 +34,92 @@ object ConvertPdf {
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
-      // get mimetype
-      //   try to convert
-      //   save to db
-      //   update file_id of RAttachment
-
      def convert(ra: RAttachment) =
-        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
+        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))

      for {
        ras <- item.attachments.traverse(convert)
-      } yield item.copy(attachments = ras)
+        nra = ras.map(_._1)
+        nma = ras.flatMap(_._2)
+      } yield item.copy(attachments = nra, metas = nma)

    }

-  def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] =
+  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)
      .getOrElse(Mimetype.`application/octet-stream`)

  def convertSafe[F[_]: Sync: ContextShift](
      cfg: ConvertConfig,
-      ctx: Context[F, ProcessItemArgs]
-  )(ra: RAttachment, mime: Mimetype): F[RAttachment] =
-    Conversion.create[F](cfg, ctx.blocker,ctx.logger).use { conv =>
-      ctx.logger
-        .info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
-        .map(_ => ra)
+      ctx: Context[F, ProcessItemArgs],
+      item: ItemData
+  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
+    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
+      mime match {
+        case Mimetype.`application/pdf` =>
+          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
+            (ra, None: Option[RAttachmentMeta]).pure[F]
+
+        case _ =>
+          val data = ctx.store.bitpeace
+            .get(ra.fileId.id)
+            .unNoneTerminate
+            .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+          val handler = conversionHandler[F](ctx, cfg, ra, item)
+          ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
+            conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data)
+      }
    }
+
+  /** Builds the handler that maps each `ConversionResult` to the
+    * (possibly updated) attachment plus an optional attachment meta.
+    *
+    * Only the two success cases store a new file via `storePDF`; when
+    * the conversion also yields text, that text is put into the meta
+    * of the attachment if it has no content yet. Every other outcome
+    * is logged and the attachment is returned as it was passed in.
+    */
+  private def conversionHandler[F[_]: Sync](
+      ctx: Context[F, ProcessItemArgs],
+      cfg: ConvertConfig,
+      ra: RAttachment,
+      item: ItemData
+  ): Handler[F, (RAttachment, Option[RAttachmentMeta])] = {
+    // Result for every case in which no PDF could be produced.
+    val unchanged: F[(RAttachment, Option[RAttachmentMeta])] =
+      (ra, Option.empty[RAttachmentMeta]).pure[F]
+
+    Kleisli {
+      case ConversionResult.SuccessPdf(pdf) =>
+        ctx.logger.info("Conversion to pdf successful. Saving file.") *>
+          storePDF(ctx, cfg, ra, pdf)
+            .map(stored => (stored, Option.empty[RAttachmentMeta]))
+
+      case ConversionResult.SuccessPdfTxt(pdf, txt) =>
+        ctx.logger.info("Conversion to pdf+txt successful. Saving file.") *>
+          storePDF(ctx, cfg, ra, pdf).flatMap { stored =>
+            txt.map { text =>
+              (stored, item.changeMeta(ra.id, _.setContentIfEmpty(text.some)).some)
+            }
+          }
+
+      case ConversionResult.UnsupportedFormat(mt) =>
+        ctx.logger.info(s"PDF conversion for type ${mt.asString} not supported!") *>
+          unchanged
+
+      case ConversionResult.InputMalformed(mt, reason) =>
+        ctx.logger.info(
+          s"PDF conversion from type ${mt.asString} reported malformed input: $reason."
+        ) *>
+          unchanged
+
+      case ConversionResult.Failure(ex) =>
+        ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
+          unchanged
+    }
+  }
+
+  /** Saves the given PDF stream as a new file and re-points the
+    * attachment to it.
+    *
+    * The new attachment name is the old one with `.pdf` appended (it
+    * stays empty if there was no name). File id and name are updated
+    * through `RAttachment.updateFileIdAndName` before the changed
+    * attachment is returned.
+    */
+  private def storePDF[F[_]: Sync](
+      ctx: Context[F, ProcessItemArgs],
+      cfg: ConvertConfig,
+      ra: RAttachment,
+      pdf: Stream[F, Byte]
+  ) = {
+    val pdfName = ra.name.map(n => s"$n.pdf")
+    val hint    = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
+    val saved = ctx.store.bitpeace
+      .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
+      .compile
+      .lastOrError
+
+    for {
+      fileMeta  <- saved
+      newFileId = Ident.unsafe(fileMeta.id)
+      _         <- ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, newFileId, pdfName))
+    } yield ra.copy(fileId = newFileId, name = pdfName)
+  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -1,5 +1,6 @@
 package docspell.joex.process

+import bitpeace.FileMeta
 import cats.implicits._
 import cats.effect.Sync
 import cats.data.OptionT
@@ -22,13 +23,15 @@ object CreateItem {

  def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
-      val validFiles = ctx.args.meta.validFileTypes.map(_.asString).toSet
+      // Allowed mime types, computed once instead of once per file. An
+      // empty set means no restriction is configured, so every file is
+      // accepted.
+      val allowedTypes = ctx.args.meta.validFileTypes.map(_.asString).toSet
+
+      def isValidFile(fm: FileMeta) =
+        allowedTypes.isEmpty || allowedTypes.contains(fm.mimetype.baseType)

      def fileMetas(itemId: Ident, now: Timestamp) =
        Stream
          .emits(ctx.args.files)
          .flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm)))
-          .collect({ case (f, Some(fm)) if validFiles.contains(fm.mimetype.baseType) => f })
+          .collect({ case (f, Some(fm)) if isValidFile(fm) => f })
          .zipWithIndex
          .evalMap({
            case (f, index) =>
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -9,7 +9,7 @@ case class ItemData(
    attachments: Vector[RAttachment],
    metas: Vector[RAttachmentMeta],
    dateLabels: Vector[AttachmentDates],
-    originFile: Map[Ident, Ident]
+    originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id
 ) {

  def findMeta(attachId: Ident): Option[RAttachmentMeta] =
@@ -17,6 +17,21 @@ case class ItemData(

  def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] =
    dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty)
+
+  /** Returns a copy of this item in which the meta record of the given
+    * attachment has been transformed by `f`.
+    *
+    * If the attachment has no meta record yet, `f` is applied to an
+    * empty one and the result is appended, so the change is not lost.
+    */
+  def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = {
+    val changed = changeMeta(attachId, f)
+    val next =
+      if (metas.exists(_.id == attachId)) metas.map(a => if (a.id == attachId) changed else a)
+      else metas :+ changed
+    copy(metas = next)
+  }
+
+  /** Applies `f` to the meta record of the given attachment, starting
+    * from an empty record if none exists, and returns the result. The
+    * `metas` of this item are left as they are.
+    */
+  def changeMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): RAttachmentMeta = {
+    val current = findOrCreate(attachId)
+    f(current)
+  }
+
+  /** Looks up the meta record of the given attachment in `metas`;
+    * yields a fresh empty record for that attachment if there is none.
+    */
+  def findOrCreate(attachId: Ident): RAttachmentMeta =
+    metas.find(m => m.id == attachId) match {
+      case Some(existing) => existing
+      case None           => RAttachmentMeta.empty(attachId)
+    }
+
 }

 object ItemData {
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -1,25 +1,25 @@
 package docspell.joex.process

-import bitpeace.RangeDef
+import bitpeace.{Mimetype, RangeDef}
+import cats.data.OptionT
 import cats.implicits._
-import cats.effect.{Blocker, ContextShift, Sync}
+import cats.effect.{ContextShift, Sync}
 import docspell.common._
+import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
 import docspell.joex.scheduler.{Context, Task}
-import docspell.store.Store
-import docspell.store.records.{RAttachment, RAttachmentMeta}
-import docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig}
+import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}

 object TextExtraction {

  def apply[F[_]: Sync: ContextShift](
-      cfg: OcrConfig,
-      item: ItemData
+                                       cfg: ExtractConfig,
+                                       item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _     <- ctx.logger.info("Starting text extraction")
        start <- Duration.stopTime[F]
-        txt   <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item))
+        txt   <- item.attachments.traverse(extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item))
        _     <- ctx.logger.debug("Storing extracted texts")
        _     <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
        dur   <- start
@@ -27,53 +27,84 @@ object TextExtraction {
      } yield item.copy(metas = txt)
    }

+  /** Returns the attachment's meta record as is when it already
+    * carries text content; otherwise runs the text extraction for the
+    * attachment via `extractTextToMeta`.
+    */
+  def extractTextIfEmpty[F[_]: Sync: ContextShift](
+      ctx: Context[F, _],
+      cfg: ExtractConfig,
+      lang: Language,
+      item: ItemData
+  )(ra: RAttachment): F[RAttachmentMeta] = {
+    val existing = item.findOrCreate(ra.id)
+    if (existing.content.isDefined)
+      ctx.logger.info("TextExtraction skipped, since text is already available.") *>
+        existing.pure[F]
+    else
+      extractTextToMeta[F](ctx, cfg, lang, item)(ra)
+  }
+
  def extractTextToMeta[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
-      cfg: OcrConfig,
-    lang: Language,
-    item: ItemData
+      cfg: ExtractConfig,
+      lang: Language,
+      item: ItemData
  )(ra: RAttachment): F[RAttachmentMeta] =
    for {
-      _    <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}")
+      _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
      dst  <- Duration.stopTime[F]
-      txt  <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra))
-      meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty))
+      txt  <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
+      meta = item.changeMeta(ra.id, rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)))
      est  <- dst
      _ <- ctx.logger.debug(
-            s"Extracting text for attachment ${ra.name} finished in ${est.formatExact}"
+            s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
          )
    } yield meta

  def extractText[F[_]: Sync: ContextShift](
-      ocrConfig: OcrConfig,
-      lang: Language,
-      store: Store[F],
-      blocker: Blocker,
-      logger: Logger[F]
-  )(fileId: Ident): F[Option[String]] = {
-    val data = store.bitpeace
+      ctx: Context[F, _],
+      extr: Extraction[F],
+      lang: Language
+  )(fileId: Ident): F[ExtractResult] = {
+    val data = ctx.store.bitpeace
      .get(fileId.id)
      .unNoneTerminate
-      .through(store.bitpeace.fetchData2(RangeDef.all))
+      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

-    TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last
+    def findMime: F[Mimetype] =
+      OptionT(ctx.store.transact(RFileMeta.findById(fileId)))
+        .map(_.mimetype)
+        .getOrElse(Mimetype.`application/octet-stream`)
+
+    findMime
+        .flatMap(mt =>
+          extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
  }

  private def extractTextFallback[F[_]: Sync: ContextShift](
-      ctx: Context[F, _],
-      ocrConfig: OcrConfig,
-      lang: Language,
+                                                             ctx: Context[F, _],
+                                                             cfg: ExtractConfig,
+                                                             ra: RAttachment,
+                                                             lang: Language,
  )(fileIds: List[Ident]): F[Option[String]] = {
    fileIds match {
      case Nil =>
        ctx.logger.error(s"Cannot extract text").map(_ => None)

      case id :: rest =>
-        extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id).
-          recoverWith({
-            case ex =>
+        val extr = Extraction.create[F](ctx.blocker, ctx.logger, cfg)
+
+        extractText[F](ctx, extr, lang)(id)
+          .flatMap({
+            case ExtractResult.Success(txt) =>
+              txt.some.pure[F]
+
+            case ExtractResult.UnsupportedFormat(mt) =>
+              ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file.").
+                flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
+
+            case ExtractResult.Failure(ex) =>
              ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
-                flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest))
+                flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
          })
    }
  }
@@ -86,4 +117,9 @@ object TextExtraction {
      case Some(sid) => List(sid, ra.fileId).distinct
      case None => List(ra.fileId)
    }
+
+  /** Name of the attachment for log messages. A trailing `.pdf` is
+    * removed when the name contains more than one dot (e.g.
+    * `letter.docx.pdf` becomes `letter.docx`); attachments without a
+    * name are shown as `<no-name>`.
+    */
+  private def stripAttachmentName(ra: RAttachment): String =
+    ra.name.fold("<no-name>") { name =>
+      val pdfSuffix   = ".pdf"
+      val hasInnerDot = name.count(_ == '.') > 1
+      if (hasInnerDot && name.endsWith(pdfSuffix)) name.dropRight(pdfSuffix.length)
+      else name
+    }
 }