Skip pdf conversion if a converted file exists

For images, the conversion also returns the extracted text. If that
text could not be saved, it is extracted in the subsequent
text-extraction step.
This commit is contained in:
Eike Kettner 2020-10-02 00:20:30 +02:00
parent b6f23b038a
commit d4354b8b49
4 changed files with 40 additions and 4 deletions

View File

@ -38,10 +38,20 @@ object ConvertPdf {
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
def convert(ra: RAttachment) =
findMime(ctx)(ra).flatMap(m =>
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
)
/** Produces the pdf-conversion result for a single attachment.
  *
  * If `isConverted` reports `true`, the conversion is skipped: an info
  * message is logged and the attachment is returned unchanged, paired with
  * whatever `RAttachmentMeta.findById` yields for its id (possibly `None`).
  * Otherwise the mime type is looked up via `findMime` and the attachment
  * is handed to `convertSafe`.
  */
def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] = {
  // Declared as `def` so neither branch is built before it is selected.
  def reuseExisting =
    ctx.logger.info(
      s"Conversion to pdf already done for attachment ${ra.name}."
    ) *>
      ctx.store
        .transact(RAttachmentMeta.findById(ra.id))
        .map(storedMeta => (ra, storedMeta))

  def runConversion =
    findMime(ctx)(ra).flatMap { mime =>
      convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, mime)
    }

  isConverted(ctx)(ra).flatMap { alreadyDone =>
    if (alreadyDone) reuseExisting else runConversion
  }
}
for {
ras <- item.attachments.traverse(convert)
@ -51,6 +61,11 @@ object ConvertPdf {
}
/** Checks whether the given attachment has already been converted.
  *
  * Runs `RAttachmentSource.isConverted` for the attachment's id through
  * the store of the given context.
  *
  * @param ctx the task context providing the store
  * @param ra  the attachment to check
  * @return the boolean result of the database check, inside `F`
  */
def isConverted[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Boolean] = {
  val convertedCheck = RAttachmentSource.isConverted(ra.id)
  ctx.store.transact(convertedCheck)
}
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)

View File

@ -44,6 +44,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
/** Builds an inequality comparison of this column against a value.
  *
  * Appends `<> value` (with `value` interpolated through `fr`) to `f`,
  * which is defined outside this excerpt.
  */
def isNot[A: Put](value: A): Fragment = {
  val notEqualValue = fr"<> $value"
  f ++ notEqualValue
}
/** Builds an inequality comparison of this column against another column.
  *
  * Concatenates `f`, the `<>` operator and the other column's `f`, in
  * that order.
  */
def isNot(c: Column): Fragment = {
  val notEqual = fr"<>"
  f ++ notEqual ++ c.f
}
/** Builds an `is null` test by appending that predicate to `f`. */
def isNull: Fragment = {
  val nullCheck = fr"is null"
  f ++ nullCheck
}

View File

@ -46,6 +46,9 @@ object RAttachmentMeta {
/** Tells whether a row with the given attachment id is present.
  *
  * @param attachId the attachment id to look for
  * @return `true` when the count obtained via `selectCount` is greater
  *         than zero
  */
def exists(attachId: Ident): ConnectionIO[Boolean] = {
  val countQuery = selectCount(id, table, id.is(attachId)).query[Int]
  countQuery.unique.map(rowCount => rowCount > 0)
}
/** Looks up the metadata record for the given attachment id.
  *
  * @param attachId the attachment id to look for
  * @return the matching `RAttachmentMeta`, or `None` if the query yields
  *         no row
  */
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentMeta]] = {
  val byId = id.is(attachId)
  selectSimple(all, table, byId).query[RAttachmentMeta].option
}
def upsert(v: RAttachmentMeta): ConnectionIO[Int] =
for {
n0 <- update(v)

View File

@ -48,6 +48,21 @@ object RAttachmentSource {
.unique
.map(_ > 0)
/** Tells whether the attachment with the given id has been converted.
  *
  * Joins this table (alias `s`) with the attachment table (alias `a`) on
  * equal ids and counts the rows for `attachId` whose attachment file id
  * differs from the source file id.
  *
  * @param attachId the attachment id to check
  * @return `true` when that count is greater than zero
  */
def isConverted(attachId: Ident): ConnectionIO[Boolean] = {
  val sourceId     = Columns.id.prefix("s")
  val sourceFile   = Columns.fileId.prefix("s")
  val attachmentId = RAttachment.Columns.id.prefix("a")
  val attachedFile = RAttachment.Columns.fileId.prefix("a")

  val joined = table ++ fr"s INNER JOIN" ++
    RAttachment.table ++ fr"a ON" ++ attachmentId.is(sourceId)

  // The attachment must match the id and point to a different file than its source.
  val condition = and(attachmentId.is(attachId), attachedFile.isNot(sourceFile))

  for {
    hits <- selectCount(attachmentId, joined, condition).query[Int].unique
  } yield hits > 0
}
/** Deletes the row with the given attachment id from this table.
  *
  * @param attachId the attachment id whose row is removed
  * @return the `Int` produced by running the update (per doobie, the
  *         number of affected rows)
  */
def delete(attachId: Ident): ConnectionIO[Int] = {
  val deleteStatement = deleteFrom(table, id.is(attachId))
  deleteStatement.update.run
}