Skip pdf conversion if a converted file exists

For images, the conversion also returns the extracted text. If that text failed to be saved, it is extracted again in the following text-extraction step.
2025-08-05 02:24:52 +00:00 · 2020-10-02 00:20:30 +02:00
parent b6f23b038a
commit d4354b8b49
4 changed files with 40 additions and 4 deletions
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -38,10 +38,20 @@ object ConvertPdf {
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
-      def convert(ra: RAttachment) =
-        findMime(ctx)(ra).flatMap(m =>
-          convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
-        )
+      def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] = // converts one attachment, skipping work already done
+        isConverted(ctx)(ra).flatMap { // branch on the result of the isConverted store lookup
+          case true => // already converted: log it and reuse whatever metadata is stored
+            ctx.logger.info(
+              s"Conversion to pdf already done for attachment ${ra.name}."
+            ) *>
+              ctx.store
+                .transact(RAttachmentMeta.findById(ra.id)) // Option: metadata may be absent for this attachment id
+                .map(rmOpt => (ra, rmOpt)) // attachment is returned unchanged, paired with the optional metadata
+          case false => // not converted yet: look up the mime type, then run the regular conversion
+            findMime(ctx)(ra).flatMap(m =>
+              convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
+            )
+        }

      for {
        ras <- item.attachments.traverse(convert)
@ -51,6 +61,11 @@ object ConvertPdf {

    }

+  def isConverted[F[_]: Sync](ctx: Context[F, ProcessItemArgs])( // used by convert to decide whether pdf conversion can be skipped
+      ra: RAttachment // the attachment whose id is checked
+  ): F[Boolean] =
+    ctx.store.transact(RAttachmentSource.isConverted(ra.id)) // runs the RAttachmentSource.isConverted query for ra.id through the store
+
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)