Add a task to re-process files of an item

This commit is contained in:
Eike Kettner
2020-08-12 22:26:44 +02:00
parent 8e15478e3c
commit 07e9a9767e
13 changed files with 350 additions and 5 deletions

View File

@ -14,6 +14,7 @@ import docspell.joex.fts.{MigrationTask, ReIndexTask}
import docspell.joex.hk._
import docspell.joex.notify._
import docspell.joex.process.ItemHandler
import docspell.joex.process.ReProcessItem
import docspell.joex.scanmailbox._
import docspell.joex.scheduler._
import docspell.joexapi.client.JoexClient
@ -96,6 +97,13 @@ object JoexAppImpl {
ItemHandler.onCancel[F]
)
)
.withTask(
JobTask.json(
ReProcessItemArgs.taskName,
ReProcessItem[F](cfg, fts),
ReProcessItem.onCancel[F]
)
)
.withTask(
JobTask.json(
NotifyDueItemsArgs.taskName,

View File

@ -126,11 +126,46 @@ object ConvertPdf {
.compile
.lastOrError
.map(fm => Ident.unsafe(fm.id))
.flatMap(fmId =>
ctx.store
.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
.map(_ => fmId)
)
.flatMap(fmId => updateAttachment[F](ctx, ra, fmId, newName).map(_ => fmId))
.map(fmId => ra.copy(fileId = fmId, name = newName))
}
/** Points the attachment `ra` at the file `fmId` (and at `newName`) and
  * then cleans up the file the stored attachment referenced before.
  *
  * The previous file is deleted unless `RAttachmentSource.isSameFile`
  * reports it for this attachment (presumably meaning it is still the
  * original source file -- confirm against `RAttachmentSource`). A failing
  * delete is logged as an error and does not fail the surrounding effect.
  */
private def updateAttachment[F[_]: Sync](
    ctx: Context[F, _],
    ra: RAttachment,
    fmId: Ident,
    newName: Option[String]
): F[Unit] = {
  // Deletes the file of `prev`; errors of the delete itself are only logged.
  def removeFile(prev: RAttachment): F[Unit] =
    ctx.logger.info("Deleting previous attachment file") *>
      ctx.store.bitpeace
        .delete(prev.fileId.id)
        .compile
        .drain
        .handleErrorWith(ex =>
          ctx.logger.error(ex)(s"Cannot delete previous attachment file: ${prev}")
        )

  // Deletes the file of `prev` only when the isSameFile check is negative.
  def cleanup(prev: RAttachment): F[Unit] =
    ctx.store
      .transact(RAttachmentSource.isSameFile(ra.id, prev.fileId))
      .flatMap(same => if (same) ().pure[F] else removeFile(prev))

  for {
    // The stored record is read *before* the update so that its old file id
    // is still known afterwards.
    previous <- ctx.store.transact(RAttachment.findById(ra.id))
    _        <- ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
    _ <- previous match {
      case None       => ().pure[F]
      case Some(prev) => cleanup(prev)
    }
  } yield ()
}
}

View File

@ -27,6 +27,17 @@ object ProcessItem {
.flatMap(SetGivenData[F](itemOps))
.flatMap(Task.setProgress(99))
/** Runs the attachment related stages on `item`: `ConvertPdf`, then
  * `TextExtraction`, then [[analysisOnly]]. The task progress is set to
  * 30, 60 and 90 after the respective stage.
  */
def processAttachments[F[_]: ConcurrentEffect: ContextShift](
    cfg: Config,
    fts: FtsClient[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] = {
  val converted =
    ConvertPdf(cfg.convert, item)
      .flatMap(Task.setProgress(30))

  val extracted =
    converted
      .flatMap(TextExtraction(cfg.extraction, fts))
      .flatMap(Task.setProgress(60))

  extracted
    .flatMap(analysisOnly[F](cfg))
    .flatMap(Task.setProgress(90))
}
def analysisOnly[F[_]: Sync](
cfg: Config
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =

View File

@ -0,0 +1,131 @@
package docspell.joex.process
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentSource
import docspell.store.records.RCollective
import docspell.store.records.RItem
object ReProcessItem {
type Args = ReProcessItemArgs
/** The re-process task: loads the item with [[loadItem]], runs
  * [[safeProcess]] on it and discards the resulting item data.
  */
def apply[F[_]: ConcurrentEffect: ContextShift](
    cfg: Config,
    fts: FtsClient[F]
): Task[F, Args, Unit] =
  for {
    data <- loadItem[F]
    _    <- safeProcess[F](cfg, fts)(data)
  } yield ()
/** Task to run on cancellation; it does nothing but log a warning. */
def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
  Task(ctx => ctx.logger.warn("Now cancelling re-processing."))
// --- Helpers
/** Predicate telling whether an attachment is part of the job's selection.
  *
  * An empty `attachments` list in the arguments selects every attachment;
  * otherwise only attachments whose id is listed are accepted.
  */
private def contains[F[_]](ctx: Context[F, Args]): RAttachment => Boolean = {
  val wanted = ctx.args.attachments.toSet
  attachment => wanted.isEmpty || wanted.contains(attachment.id)
}
/** Loads the item named in the job arguments together with its attachments
  * and attachment sources and assembles the `ItemData` to process.
  *
  * Only attachments accepted by [[contains]] that also have a source record
  * are included. The task fails with an `Exception` if the item does not
  * exist.
  */
def loadItem[F[_]: Sync]: Task[F, Args, ItemData] =
  Task { ctx =>
    val itemId = ctx.args.itemId

    def itemData(item: RItem): F[ItemData] =
      for {
        attachments <- ctx.store.transact(RAttachment.findByItem(item.id))
        sources     <- ctx.store.transact(RAttachmentSource.findByItem(itemId))
      } yield {
        val sourceById = sources.map(src => src.id -> src).toMap
        // Let every selected attachment point at its original file again, so
        // the default processing can run on it. Processing only touches the
        // RAttachments, never the original files.
        val fromSources =
          attachments
            .filter(contains(ctx))
            .flatMap(ra =>
              sourceById
                .get(ra.id)
                .map(src => ra.copy(fileId = src.fileId, name = src.name))
            )
        ItemData(
          item,
          fromSources,
          Vector.empty,
          Vector.empty,
          sourceById.view.mapValues(_.fileId).toMap,
          MetaProposalList.empty,
          Nil
        )
      }

    ctx.store.transact(RItem.findById(itemId)).flatMap {
      case Some(item) =>
        itemData(item)
      case None =>
        Sync[F].raiseError[ItemData](new Exception(s"Item not found: ${itemId.id}"))
    }
  }
/** Runs `ProcessItem.processAttachments` on `data` from within a
  * re-process job.
  *
  * The wrapped task expects `ProcessItemArgs`; these are derived from the
  * re-process arguments, the collective id of the item and the language
  * found by [[getLanguage]]. All remaining meta fields are left empty.
  */
def processFiles[F[_]: ConcurrentEffect: ContextShift](
    cfg: Config,
    fts: FtsClient[F],
    data: ItemData
): Task[F, Args, ItemData] = {
  // Translates the arguments of this job into those of the processing task.
  def processArgs(lang: Language)(args: Args): F[ProcessItemArgs] = {
    val meta = ProcessItemArgs.ProcessMeta(
      data.item.cid,
      args.itemId.some,
      lang,
      None, //direction
      "", //source-id
      None, //folder
      Seq.empty
    )
    ProcessItemArgs(meta, Nil).pure[F]
  }

  getLanguage[F].flatMap(lang =>
    ProcessItem
      .processAttachments[F](cfg, fts)(data)
      .contramap[Args](processArgs(lang))
  )
}
/** Returns the language of the collective found for the job's item via
  * `RCollective.findByItem`, or `Language.German` if none is found.
  */
def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
  Task { ctx =>
    ctx.store
      .transact(RCollective.findByItem(ctx.args.itemId))
      .map(collective => collective.map(_.language).getOrElse(Language.German))
  }
/** Exposes the `isLastRetry` value of the job context as a task. */
def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =
  Task(ctx => ctx.isLastRetry)
/** Runs [[processFiles]] on `data`.
  *
  * When the context reports the last retry, a failure is additionally
  * logged as a warning before the original error is raised again. On any
  * other attempt errors propagate without extra logging.
  */
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
    cfg: Config,
    fts: FtsClient[F]
)(data: ItemData): Task[F, Args, ItemData] = {
  def process: Task[F, Args, ItemData] =
    processFiles[F](cfg, fts, data)

  // Passes a successful result on; logs and re-raises a failure.
  def warnOnFailure(outcome: Either[Throwable, ItemData]): Task[F, Args, ItemData] =
    outcome match {
      case Left(ex) =>
        logWarn[F](
          "Processing failed on last retry."
        ).andThen(_ => Sync[F].raiseError(ex))
      case Right(result) =>
        Task.pure(result)
    }

  isLastRetry[F].flatMap { lastRetry =>
    if (lastRetry) process.attempt.flatMap(warnOnFailure)
    else process
  }
}
/** A task that writes `msg` as a warning to the job's logger. */
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
  Task(ctx => ctx.logger.warn(msg))
}