Redo pdf conversion and text extraction on reprocess

When processing a new file, conversion and text extraction are skipped if they are detected to be already done. This prevents running expensive tasks again after restarting/retrying. When explicitly reprocessing a file, these tasks should run again and replace the existing results.
2025-07-04 16:48:26 +00:00 · 2021-03-12 00:45:28 +01:00
parent a7ee0aa08b
commit f8bd42e5bd
2 changed files with 4 additions and 4 deletions
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -40,14 +40,14 @@ object ConvertPdf {
    Task { ctx =>
      def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] =
        isConverted(ctx)(ra).flatMap {
-          case true =>
+          case true if ctx.args.isNormalProcessing =>
            ctx.logger.info(
              s"Conversion to pdf already done for attachment ${ra.name}."
            ) *>
              ctx.store
                .transact(RAttachmentMeta.findById(ra.id))
                .map(rmOpt => (ra, rmOpt))
-          case false =>
+          case _ =>
            findMime(ctx)(ra).flatMap(m =>
              convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
            )
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@ -84,10 +84,10 @@ object TextExtraction {

    val rm = item.findOrCreate(ra.id, lang)
    rm.content match {
-      case Some(_) =>
+      case Some(_) if ctx.args.isNormalProcessing =>
        ctx.logger.info("TextExtraction skipped, since text is already available.") *>
          makeTextData((rm, Nil)).pure[F]
-      case None =>
+      case _ =>
        extractTextToMeta[F](ctx, cfg, lang, item)(ra)
          .map(makeTextData)
    }