Redo pdf conversion and text extraction on reprocess

When processing a new file, conversion and text extraction are skipped
if they are detected to be already done. This prevents running expensive
tasks again after restarting/retrying. When explicitly reprocessing a file,
these tasks should run again and replace the existing results.
This commit is contained in:
Eike Kettner 2021-03-12 00:45:28 +01:00
parent a7ee0aa08b
commit f8bd42e5bd
2 changed files with 4 additions and 4 deletions

View File

@ -40,14 +40,14 @@ object ConvertPdf {
Task { ctx =>
def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] =
isConverted(ctx)(ra).flatMap {
case true =>
case true if ctx.args.isNormalProcessing =>
ctx.logger.info(
s"Conversion to pdf already done for attachment ${ra.name}."
) *>
ctx.store
.transact(RAttachmentMeta.findById(ra.id))
.map(rmOpt => (ra, rmOpt))
case false =>
case _ =>
findMime(ctx)(ra).flatMap(m =>
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
)

View File

@ -84,10 +84,10 @@ object TextExtraction {
val rm = item.findOrCreate(ra.id, lang)
rm.content match {
case Some(_) =>
case Some(_) if ctx.args.isNormalProcessing =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
makeTextData((rm, Nil)).pure[F]
case None =>
case _ =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData)
}