From f8bd42e5bd977e0b6ebc6d3366a2525391360a11 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 12 Mar 2021 00:45:28 +0100 Subject: [PATCH] Redo pdf conversion and text extraction on reprocess When processing a new file, conversion and text extraction are skipped if detected to be already done. This prevents running expensive tasks again after restarting/retrying. When explicitly reprocessing a file, these tasks should run again and replace the existing results. --- .../src/main/scala/docspell/joex/process/ConvertPdf.scala | 4 ++-- .../src/main/scala/docspell/joex/process/TextExtraction.scala | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 57292563..84828e19 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -40,14 +40,14 @@ object ConvertPdf { Task { ctx => def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] = isConverted(ctx)(ra).flatMap { - case true => + case true if ctx.args.isNormalProcessing => ctx.logger.info( s"Conversion to pdf already done for attachment ${ra.name}." 
) *> ctx.store .transact(RAttachmentMeta.findById(ra.id)) .map(rmOpt => (ra, rmOpt)) - case false => + case _ => findMime(ctx)(ra).flatMap(m => convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index fcdd6f98..2dcc4d31 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -84,10 +84,10 @@ object TextExtraction { val rm = item.findOrCreate(ra.id, lang) rm.content match { - case Some(_) => + case Some(_) if ctx.args.isNormalProcessing => ctx.logger.info("TextExtraction skipped, since text is already available.") *> makeTextData((rm, Nil)).pure[F] - case None => + case _ => extractTextToMeta[F](ctx, cfg, lang, item)(ra) .map(makeTextData) }