Starting to support more file types

First, files are converted to PDF for archiving. This also makes it easier to create a preview. The conversion is done via the `ConvertPdf` processing task (which is not yet implemented). Text extraction then first tries the original file. If that fails, OCR is done on the (potentially) converted PDF file. To not lose information from the original file, it is saved using the table `attachment_source`. If the original file is already a PDF, or the conversion did not succeed, the `attachment` and `attachment_source` records point to the same file.
2025-09-15 21:46:53 +00:00 · 2020-02-09 19:42:49 +01:00
parent 57ec8eec53
commit ba3865ef5e
11 changed files with 220 additions and 19 deletions
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@@ -0,0 +1,60 @@
+package docspell.joex.process
+
+import bitpeace.Mimetype
+import cats.Functor
+import cats.implicits._
+import cats.effect._
+import cats.data.OptionT
+
+import docspell.common._
+import docspell.joex.scheduler._
+import docspell.store.records._
+
+/** Goes through all attachments and creates a PDF version of each
+  * one where supported.
+  *
+  * The `attachment` record is updated with the PDF version while the
+  * original file has been stored in the `attachment_source` record.
+  *
+  * If pdf conversion is not possible or if the input is already a
+  * pdf, both files are identical. That is, the `file_id`s point to
+  * the same file. Since the name of an attachment may be changed by
+  * the user, the `attachment_source` record keeps that, too.
+  *
+  * This step assumes an existing premature item, it traverses its
+  * attachments.
+  */
+object ConvertPdf {
+
+  /** Builds the task that visits every attachment of `item`, looks up
+    * its mimetype and hands both to `convertSafe`. The attachments
+    * returned from there replace the ones in the resulting `ItemData`.
+    *
+    * Planned steps per attachment (noted by the author as a plan):
+    *   - get mimetype
+    *   - try to convert
+    *   - save to db
+    *   - update file_id of RAttachment
+    */
+  def apply[F[_]: Sync: ContextShift](
+      item: ItemData
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      def convert(ra: RAttachment) =
+        for {
+          mime <- findMime(ctx)(ra)
+          next <- convertSafe(ctx)(ra, mime)
+        } yield next
+
+      item.attachments
+        .traverse(convert)
+        .map(converted => item.copy(attachments = converted))
+    }
+
+  /** Looks up the file metadata for the attachment's file and yields
+    * its mimetype. If no metadata record is found,
+    * `application/octet-stream` is returned instead.
+    */
+  def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] = {
+    val knownMime =
+      OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId))).map(_.mimetype)
+
+    knownMime.getOrElse(Mimetype.`application/octet-stream`)
+  }
+
+  /** Currently only logs the attachment name together with its
+    * mimetype and returns the attachment unchanged.
+    */
+  def convertSafe[F[_]: Sync](
+      ctx: Context[F, ProcessItemArgs]
+  )(ra: RAttachment, mime: Mimetype): F[RAttachment] =
+    for {
+      _ <- ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}")
+    } yield ra
+}
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -2,11 +2,12 @@ package docspell.joex.process

 import cats.implicits._
 import cats.effect.Sync
+import cats.data.OptionT
 import fs2.Stream
 import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.queries.QItem
-import docspell.store.records.{RAttachment, RItem}
+import docspell.store.records.{RAttachment, RAttachmentSource, RItem}

 /**
  * Task that creates the item.
@@ -53,13 +54,21 @@ object CreateItem {
        n    <- ctx.store.transact(RItem.insert(it))
        _    <- if (n != 1) storeItemError[F](ctx) else ().pure[F]
        fm   <- fileMetas(it.id, it.created)
-        k    <- fm.traverse(a => ctx.store.transact(RAttachment.insert(a)))
+        k    <- fm.traverse(insertAttachment(ctx))
        _    <- logDifferences(ctx, fm, k.sum)
        dur  <- time
        _    <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}")
-      } yield ItemData(it, fm, Vector.empty, Vector.empty)
+      } yield ItemData(it, fm, Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap)
    }

+  /** Inserts the attachment and the source record derived from it via
+    * `RAttachmentSource.of`. Both inserts are combined into one
+    * program that is passed to a single `transact` call. Yields the
+    * update count of the attachment insert.
+    */
+  def insertAttachment[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Int] = {
+    val source = RAttachmentSource.of(ra)
+    val insertBoth =
+      RAttachment
+        .insert(ra)
+        .flatMap(count => RAttachmentSource.insert(source).map(_ => count))
+
+    ctx.store.transact(insertBoth)
+  }
+
  def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] =
    Task { ctx =>
      for {
@@ -69,12 +78,18 @@ object CreateItem {
        ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
        _ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
            else ().pure[F]
-        rms <- cand.headOption.traverse(ri =>
-                ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
-              )
-      } yield cand.headOption.map(ri =>
-        ItemData(ri, rms.getOrElse(Vector.empty), Vector.empty, Vector.empty)
-      )
+        rms <- OptionT(
+                cand.headOption.traverse(ri =>
+                  ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
+                )
+              ).getOrElse(Vector.empty)
+        orig <- rms.traverse(a =>
+                 ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
+               )
+        origMap = orig
+          .map(originFileTuple)
+          .toMap
+      } yield cand.headOption.map(ri => ItemData(ri, rms, Vector.empty, Vector.empty, origMap))
    }

  private def logDifferences[F[_]: Sync](
@@ -94,4 +109,8 @@ object CreateItem {
    val msg = "Inserting item failed. DB returned 0 update count!"
    ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg))
  }
+
+  //TODO if no source is present, it must be saved!
+  /** Turns an attachment and its optional source record into a pair
+    * of ids. With a source record the pair is the source's `id` and
+    * `fileId`; without one it is the attachment's own `id` and
+    * `fileId`.
+    */
+  private def originFileTuple(t: (RAttachment, Option[RAttachmentSource])): (Ident, Ident) =
+    t match {
+      case (_, Some(source)) => source.id -> source.fileId
+      case (attach, None)    => attach.id -> attach.fileId
+    }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -8,7 +8,8 @@ case class ItemData(
    item: RItem,
    attachments: Vector[RAttachment],
    metas: Vector[RAttachmentMeta],
-    dateLabels: Vector[AttachmentDates]
+    dateLabels: Vector[AttachmentDates],
+    originFile: Map[Ident, Ident]
 ) {

  def findMeta(attachId: Ident): Option[RAttachmentMeta] =
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -10,7 +10,8 @@ object ProcessItem {
  def apply[F[_]: Sync: ContextShift](
      cfg: OcrConfig
  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextExtraction(cfg, item)
+    ConvertPdf(item)
+      .flatMap(TextExtraction(cfg, _))
      .flatMap(Task.setProgress(25))
      .flatMap(TextAnalysis[F])
      .flatMap(Task.setProgress(50))
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -3,7 +3,7 @@ package docspell.joex.process
 import bitpeace.RangeDef
 import cats.implicits._
 import cats.effect.{Blocker, ContextShift, Sync}
-import docspell.common.{Duration, Language, ProcessItemArgs}
+import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.Store
 import docspell.store.records.{RAttachment, RAttachmentMeta}
@@ -19,7 +19,7 @@ object TextExtraction {
      for {
        _     <- ctx.logger.info("Starting text extraction")
        start <- Duration.stopTime[F]
-        txt   <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language))
+        txt   <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item))
        _     <- ctx.logger.debug("Storing extracted texts")
        _     <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
        dur   <- start
@@ -30,12 +30,13 @@ object TextExtraction {
  def extractTextToMeta[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
      cfg: OcrConfig,
-      lang: Language
+    lang: Language,
+    item: ItemData
  )(ra: RAttachment): F[RAttachmentMeta] =
    for {
      _    <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}")
      dst  <- Duration.stopTime[F]
-      txt  <- extractText(cfg, lang, ctx.store, ctx.blocker)(ra)
+      txt  <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra))
      meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty))
      est  <- dst
      _ <- ctx.logger.debug(
@@ -48,12 +49,40 @@ object TextExtraction {
      lang: Language,
      store: Store[F],
      blocker: Blocker
-  )(ra: RAttachment): F[Option[String]] = {
+  )(fileId: Ident): F[Option[String]] = {
    val data = store.bitpeace
-      .get(ra.fileId.id)
+      .get(fileId.id)
      .unNoneTerminate
      .through(store.bitpeace.fetchData2(RangeDef.all))

    TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last
  }
+
+  /** Runs `extractText` on the given file ids one after another and
+    * yields the result of the first attempt that does not fail. A
+    * failed attempt is logged as a warning before the remaining ids
+    * are tried. When no ids are left, an error is logged and `None`
+    * is returned.
+    */
+  private def extractTextFallback[F[_]: Sync: ContextShift](
+      ctx: Context[F, _],
+      ocrConfig: OcrConfig,
+      lang: Language,
+  )(fileIds: List[Ident]): F[Option[String]] =
+    fileIds match {
+      case current :: remaining =>
+        val attempt = extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(current)
+
+        // any error of this attempt leads to trying the remaining ids
+        attempt.handleErrorWith { ex =>
+          for {
+            _   <- ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file")
+            txt <- extractTextFallback[F](ctx, ocrConfig, lang)(remaining)
+          } yield txt
+        }
+
+      case Nil =>
+        ctx.logger.error(s"Cannot extract text").map(_ => None)
+    }
+
+  /** Lists the file ids to try for text extraction, in order. If the
+    * item's `originFile` map has an entry for the attachment, that
+    * source file comes first, followed by the attachment's own file;
+    * equal ids are listed only once. Without such an entry only the
+    * attachment's file is returned.
+    */
+  private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] = {
+    val sourceFirst = item.originFile.get(ra.id).toList
+    (sourceFirst :+ ra.fileId).distinct
+  }
 }