Skip pdf conversion if a converted file exists

For images, the conversion also returns the extracted text. If saving
that text failed, the text is extracted again in the subsequent
text-extraction step.
This commit is contained in:
Eike Kettner
2020-10-02 00:20:30 +02:00
parent b6f23b038a
commit d4354b8b49
4 changed files with 40 additions and 4 deletions

View File

@ -38,10 +38,20 @@ object ConvertPdf {
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
// Converts one attachment: first resolves its mimetype via `findMime`,
// then passes the attachment and mimetype on to `convertSafe`.
def convert(ra: RAttachment) =
  for {
    mime <- findMime(ctx)(ra)
    converted <- convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, mime)
  } yield converted
// Converts one attachment unless `isConverted` reports that this has
// already happened. In that case only the stored attachment metadata is
// looked up (it may be absent) and returned next to the unchanged attachment.
def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] = {
  // Already converted: log it and reuse whatever metadata is stored.
  def reuseExisting: F[(RAttachment, Option[RAttachmentMeta])] =
    for {
      _ <- ctx.logger.info(
        s"Conversion to pdf already done for attachment ${ra.name}."
      )
      metaOpt <- ctx.store.transact(RAttachmentMeta.findById(ra.id))
    } yield (ra, metaOpt)

  // Not converted yet: resolve the mimetype, then run the conversion.
  def convertNow: F[(RAttachment, Option[RAttachmentMeta])] =
    findMime(ctx)(ra).flatMap { mime =>
      convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, mime)
    }

  isConverted(ctx)(ra).flatMap { alreadyDone =>
    if (alreadyDone) reuseExisting else convertNow
  }
}
for {
ras <- item.attachments.traverse(convert)
@ -51,6 +61,11 @@ object ConvertPdf {
}
/** Runs the `RAttachmentSource.isConverted` query for the id of the given
  * attachment against the store of `ctx`.
  *
  * @param ctx the processing context whose store executes the query
  * @param ra  the attachment to check
  * @return the boolean result of the query, inside `F`
  */
def isConverted[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(
    ra: RAttachment
): F[Boolean] = {
  val convertedQuery = RAttachmentSource.isConverted(ra.id)
  ctx.store.transact(convertedQuery)
}
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)