mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 17:50:11 +00:00 
			
		
		
		
	Redo pdf conversion and text extraction on reprocess
When processing a new file, conversion and text extraction are skipped if they are detected to be already done. This prevents running expensive tasks again after restarting/retrying. When explicitly reprocessing a file, these tasks should run again and replace the existing results.
This commit is contained in:
		| @@ -40,14 +40,14 @@ object ConvertPdf { | ||||
|     Task { ctx => | ||||
|       def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] = | ||||
|         isConverted(ctx)(ra).flatMap { | ||||
|           case true => | ||||
|           case true if ctx.args.isNormalProcessing => | ||||
|             ctx.logger.info( | ||||
|               s"Conversion to pdf already done for attachment ${ra.name}." | ||||
|             ) *> | ||||
|               ctx.store | ||||
|                 .transact(RAttachmentMeta.findById(ra.id)) | ||||
|                 .map(rmOpt => (ra, rmOpt)) | ||||
|           case false => | ||||
|           case _ => | ||||
|             findMime(ctx)(ra).flatMap(m => | ||||
|               convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m) | ||||
|             ) | ||||
|   | ||||
| @@ -84,10 +84,10 @@ object TextExtraction { | ||||
|  | ||||
|     val rm = item.findOrCreate(ra.id, lang) | ||||
|     rm.content match { | ||||
|       case Some(_) => | ||||
|       case Some(_) if ctx.args.isNormalProcessing => | ||||
|         ctx.logger.info("TextExtraction skipped, since text is already available.") *> | ||||
|           makeTextData((rm, Nil)).pure[F] | ||||
|       case None => | ||||
|       case _ => | ||||
|         extractTextToMeta[F](ctx, cfg, lang, item)(ra) | ||||
|           .map(makeTextData) | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user