Fix finding attachments for retries

When a task is retried, the attachments to process again must also be
searched for among the attachment sources and archives.
This commit is contained in:
Eike Kettner
2020-10-02 00:19:25 +02:00
parent 5e21552358
commit b6f23b038a
3 changed files with 55 additions and 10 deletions

View File

@ -1,5 +1,6 @@
package docspell.joex.process package docspell.joex.process
import cats.data.NonEmptyList
import cats.data.OptionT import cats.data.OptionT
import cats.effect.Sync import cats.effect.Sync
import cats.implicits._ import cats.implicits._
@ -125,21 +126,31 @@ object CreateItem {
for { for {
cand <- ctx.store.transact(QItem.findByFileIds(fileMetaIds.toSeq)) cand <- ctx.store.transact(QItem.findByFileIds(fileMetaIds.toSeq))
_ <- _ <-
if (cand.nonEmpty) ctx.logger.warn("Found existing item with these files.") if (cand.nonEmpty)
ctx.logger.warn(s"Found ${cand.size} existing item with these files.")
else ().pure[F] else ().pure[F]
ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid)) ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
_ <- _ <-
if (ht.sum > 0) if (ht.sum > 0)
ctx.logger.warn(s"Removed ${ht.sum} items with same attachments") ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
else ().pure[F] else ().pure[F]
rms <- OptionT( rms <- OptionT
.fromOption[F](NonEmptyList.fromList(fileMetaIds.toList))
.flatMap(fids =>
OptionT(
//load attachments but only those mentioned in the task's arguments //load attachments but only those mentioned in the task's arguments
cand.headOption.traverse(ri => cand.headOption.traverse(ri =>
ctx.store ctx.store
.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) .transact(RAttachment.findByItemCollectiveSource(ri.id, ri.cid, fids))
.map(_.filter(r => fileMetaIds.contains(r.fileId))) .flatTap(ats =>
ctx.logger.debug(
s"Found ${ats.size} attachments. Use only those from task args: ${fileMetaIds}"
) )
).getOrElse(Vector.empty) )
)
)
)
.getOrElse(Vector.empty)
orig <- rms.traverse(a => orig <- rms.traverse(a =>
ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s)) ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
) )

View File

@ -20,7 +20,9 @@ object TextExtraction {
): Task[F, ProcessItemArgs, ItemData] = ): Task[F, ProcessItemArgs, ItemData] =
Task { ctx => Task { ctx =>
for { for {
_ <- ctx.logger.info("Starting text extraction") _ <- ctx.logger.info(
s"Starting text extraction for ${item.attachments.size} files"
)
start <- Duration.stopTime[F] start <- Duration.stopTime[F]
txt <- item.attachments.traverse( txt <- item.attachments.traverse(
extractTextIfEmpty( extractTextIfEmpty(
@ -31,9 +33,10 @@ object TextExtraction {
item item
) )
) )
_ <- ctx.logger.debug("Storing extracted texts") _ <- ctx.logger.debug("Storing extracted texts")
_ <- _ <-
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am))) txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
_ <- ctx.logger.debug(s"Extracted text stored.")
idxItem = TextData.item( idxItem = TextData.item(
item.item.id, item.item.id,
ctx.args.meta.collective, ctx.args.meta.collective,

View File

@ -1,5 +1,6 @@
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._ import cats.implicits._
import fs2.Stream import fs2.Stream
@ -158,6 +159,36 @@ object RAttachment {
q.query[RAttachment].to[Vector] q.query[RAttachment].to[Vector]
} }
/** Selects attachments of one item that are related to any of the given
  * file ids.
  *
  * An attachment row is returned when its own file id, the file id of its
  * left-joined source row, or the file id of its left-joined archive row
  * is contained in `fileIds`. Source and archive rows are joined on the
  * attachment id.
  *
  * @param id      the item id the attachments must reference
  * @param coll    value matched against the item's `cid` column
  * @param fileIds non-empty list of file ids to look for
  * @return all matching attachment rows
  */
def findByItemCollectiveSource(
    id: Ident,
    coll: Ident,
    fileIds: NonEmptyList[Ident]
): ConnectionIO[Vector[RAttachment]] = {
  // item columns, table alias "i"
  val itemId   = RItem.Columns.id.prefix("i")
  val itemColl = RItem.Columns.cid.prefix("i")

  // attachment columns, table alias "a"
  val attachItem = Columns.itemId.prefix("a")
  val attachId   = Columns.id.prefix("a")
  val attachFile = Columns.fileId.prefix("a")

  // attachment source columns, table alias "s"
  val sourceId   = RAttachmentSource.Columns.id.prefix("s")
  val sourceFile = RAttachmentSource.Columns.fileId.prefix("s")

  // attachment archive columns, table alias "r"
  val archiveId   = RAttachmentArchive.Columns.id.prefix("r")
  val archiveFile = RAttachmentArchive.Columns.fileId.prefix("r")

  // each joined table together with its ON condition
  val itemJoin    = RItem.table ++ fr"i ON" ++ itemId.is(attachItem)
  val sourceJoin  = RAttachmentSource.table ++ fr"s ON" ++ sourceId.is(attachId)
  val archiveJoin = RAttachmentArchive.table ++ fr"r ON" ++ archiveId.is(attachId)

  // sources and archives are left-joined so that attachments lacking
  // such rows are not dropped by the join itself
  val joined =
    table ++ fr"a INNER JOIN" ++ itemJoin ++
      fr"LEFT JOIN" ++ sourceJoin ++
      fr"LEFT JOIN" ++ archiveJoin

  // a given file id may refer to the attachment, its source or its archive
  val anyFileMatches = or(
    attachFile.isIn(fileIds),
    sourceFile.isIn(fileIds),
    archiveFile.isIn(fileIds)
  )
  val where = and(itemId.is(id), itemColl.is(coll), anyFileMatches)

  val attachColumns = all.map(_.prefix("a"))
  selectSimple(attachColumns, joined, where).query[RAttachment].to[Vector]
}
def findByItemAndCollectiveWithMeta( def findByItemAndCollectiveWithMeta(
id: Ident, id: Ident,
coll: Ident coll: Ident