Merge pull request #228 from eikek/reprocess-item-files

Reprocess item files
2025-08-05 02:24:52 +00:00 · 2020-08-13 20:25:22 +00:00
parent ed0df2b1ec f1288d384e
commit d6c370c3b1
26 changed files with 882 additions and 33 deletions
--- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
+++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
@ -61,7 +61,7 @@ object BackendApp {
      uploadImpl     <- OUpload(store, queue, cfg.files, joexImpl)
      nodeImpl       <- ONode(store)
      jobImpl        <- OJob(store, joexImpl)
-      itemImpl       <- OItem(store, ftsClient)
+      itemImpl       <- OItem(store, ftsClient, queue, joexImpl)
      itemSearchImpl <- OItemSearch(store)
      fulltextImpl   <- OFulltext(itemSearchImpl, ftsClient, store, queue, joexImpl)
      javaEmil =
--- a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala
+++ b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala
@ -8,6 +8,50 @@ import docspell.store.records.RJob

 object JobFactory {

+  def convertAllPdfs[F[_]: Sync](
+      collective: Option[Ident],
+      account: AccountId,
+      prio: Priority
+  ): F[RJob] =
+    for {
+      id  <- Ident.randomId[F]
+      now <- Timestamp.current[F]
+      job = RJob.newJob(
+        id,
+        ConvertAllPdfArgs.taskName,
+        account.collective,
+        ConvertAllPdfArgs(collective),
+        s"Convert all pdfs not yet converted",
+        now,
+        account.user,
+        prio,
+        collective
+          .map(c => c / ConvertAllPdfArgs.taskName)
+          .orElse(ConvertAllPdfArgs.taskName.some)
+      )
+    } yield job
+
+  def reprocessItem[F[_]: Sync](
+      args: ReProcessItemArgs,
+      account: AccountId,
+      prio: Priority
+  ): F[RJob] =
+    for {
+      id  <- Ident.randomId[F]
+      now <- Timestamp.current[F]
+      job = RJob.newJob(
+        id,
+        ReProcessItemArgs.taskName,
+        account.collective,
+        args,
+        s"Re-process files of item ${args.itemId.id}",
+        now,
+        account.user,
+        prio,
+        Some(ReProcessItemArgs.taskName / args.itemId)
+      )
+    } yield job
+
  def processItem[F[_]: Sync](
      args: ProcessItemArgs,
      account: AccountId,
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
@ -4,10 +4,12 @@ import cats.data.OptionT
 import cats.effect.{Effect, Resource}
 import cats.implicits._

+import docspell.backend.JobFactory
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.store.UpdateResult
 import docspell.store.queries.{QAttachment, QItem}
+import docspell.store.queue.JobQueue
 import docspell.store.records._
 import docspell.store.{AddResult, Store}

@ -76,11 +78,38 @@ trait OItem[F[_]] {
      name: Option[String],
      collective: Ident
  ): F[AddResult]
+
+  /** Submits the item for re-processing. The list of attachment ids can
+    * be used to only re-process a subset of the item's attachments.
+    * If this list is empty, all attachments are reprocessed. This
+    * call only submits the job into the queue.
+    */
+  def reprocess(
+      item: Ident,
+      attachments: List[Ident],
+      account: AccountId,
+      notifyJoex: Boolean
+  ): F[UpdateResult]
+
+  /** Submits a task that finds all non-converted pdfs and triggers
+    * converting them using ocrmypdf. Each file is converted by a
+    * separate task.
+    */
+  def convertAllPdf(
+      collective: Option[Ident],
+      account: AccountId,
+      notifyJoex: Boolean
+  ): F[UpdateResult]
 }

 object OItem {

-  def apply[F[_]: Effect](store: Store[F], fts: FtsClient[F]): Resource[F, OItem[F]] =
+  def apply[F[_]: Effect](
+      store: Store[F],
+      fts: FtsClient[F],
+      queue: JobQueue[F],
+      joex: OJoex[F]
+  ): Resource[F, OItem[F]] =
    for {
      otag   <- OTag(store)
      oorg   <- OOrganization(store)
@ -400,6 +429,35 @@ object OItem {
              )
            )

+        def reprocess(
+            item: Ident,
+            attachments: List[Ident],
+            account: AccountId,
+            notifyJoex: Boolean
+        ): F[UpdateResult] =
+          (for {
+            _ <- OptionT(
+              store.transact(RItem.findByIdAndCollective(item, account.collective))
+            )
+            args = ReProcessItemArgs(item, attachments)
+            job <- OptionT.liftF(
+              JobFactory.reprocessItem[F](args, account, Priority.Low)
+            )
+            _ <- OptionT.liftF(queue.insertIfNew(job))
+            _ <- OptionT.liftF(if (notifyJoex) joex.notifyAllNodes else ().pure[F])
+          } yield UpdateResult.success).getOrElse(UpdateResult.notFound)
+
+        def convertAllPdf(
+            collective: Option[Ident],
+            account: AccountId,
+            notifyJoex: Boolean
+        ): F[UpdateResult] =
+          for {
+            job <- JobFactory.convertAllPdfs[F](collective, account, Priority.Low)
+            _   <- queue.insertIfNew(job)
+            _   <- if (notifyJoex) joex.notifyAllNodes else ().pure[F]
+          } yield UpdateResult.success
+
        private def onSuccessIgnoreError(update: F[Unit])(ar: AddResult): F[Unit] =
          ar match {
            case AddResult.Success =>
--- a/modules/backend/src/main/scala/docspell/backend/ops/OJob.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OJob.scala
@ -48,7 +48,9 @@ object OJob {

      def queueState(collective: Ident, maxResults: Int): F[CollectiveQueueState] =
        store
-          .transact(QJob.queueStateSnapshot(collective).take(maxResults.toLong))
+          .transact(
+            QJob.queueStateSnapshot(collective, maxResults.toLong)
+          )
          .map(t => JobDetail(t._1, t._2))
          .compile
          .toVector
--- a/modules/common/src/main/scala/docspell/common/ConvertAllPdfArgs.scala
+++ b/modules/common/src/main/scala/docspell/common/ConvertAllPdfArgs.scala
@ -0,0 +1,26 @@
+package docspell.common
+
+import io.circe._
+import io.circe.generic.semiauto._
+
+/** Arguments for the task that finds all pdf files that have not been
+  * converted and submits for each a job that will convert the file
+  * using ocrmypdf.
+  *
+  * If the `collective` argument is present, then this task and the
+  * ones that are submitted by this task run in the realm of the
+  * collective (and only their files are considered). If it is empty,
+  * it is a system task and all files are considered.
+  */
+case class ConvertAllPdfArgs(collective: Option[Ident])
+
+object ConvertAllPdfArgs {
+
+  val taskName = Ident.unsafe("submit-pdf-migration-tasks")
+
+  implicit val jsonDecoder: Decoder[ConvertAllPdfArgs] =
+    deriveDecoder[ConvertAllPdfArgs]
+
+  implicit val jsonEncoder: Encoder[ConvertAllPdfArgs] =
+    deriveEncoder[ConvertAllPdfArgs]
+}
--- a/modules/common/src/main/scala/docspell/common/ReProcessItemArgs.scala
+++ b/modules/common/src/main/scala/docspell/common/ReProcessItemArgs.scala
@ -0,0 +1,24 @@
+package docspell.common
+
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+/** Arguments when re-processing an item.
+  *
+  * The `itemId` must exist and point to some item. If the attachment
+  * list is non-empty, only those attachments are re-processed. They
+  * must belong to the given item. If the list is empty, then all
+  * attachments are re-processed.
+  */
+case class ReProcessItemArgs(itemId: Ident, attachments: List[Ident])
+
+object ReProcessItemArgs {
+
+  val taskName: Ident = Ident.unsafe("re-process-item")
+
+  implicit val jsonEncoder: Encoder[ReProcessItemArgs] =
+    deriveEncoder[ReProcessItemArgs]
+
+  implicit val jsonDecoder: Decoder[ReProcessItemArgs] =
+    deriveDecoder[ReProcessItemArgs]
+}
--- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
+++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
@ -13,7 +13,10 @@ import docspell.ftssolr.SolrFtsClient
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
+import docspell.joex.pdfconv.ConvertAllPdfTask
+import docspell.joex.pdfconv.PdfConvTask
 import docspell.joex.process.ItemHandler
+import docspell.joex.process.ReProcessItem
 import docspell.joex.scanmailbox._
 import docspell.joex.scheduler._
 import docspell.joexapi.client.JoexClient
@ -84,7 +87,7 @@ object JoexAppImpl {
      joex    <- OJoex(client, store)
      upload  <- OUpload(store, queue, cfg.files, joex)
      fts     <- createFtsClient(cfg)(httpClient)
-      itemOps <- OItem(store, fts)
+      itemOps <- OItem(store, fts, queue, joex)
      javaEmil =
        JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
      sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@ -96,6 +99,13 @@ object JoexAppImpl {
            ItemHandler.onCancel[F]
          )
        )
+        .withTask(
+          JobTask.json(
+            ReProcessItemArgs.taskName,
+            ReProcessItem[F](cfg, fts),
+            ReProcessItem.onCancel[F]
+          )
+        )
        .withTask(
          JobTask.json(
            NotifyDueItemsArgs.taskName,
@ -131,6 +141,20 @@ object JoexAppImpl {
            HouseKeepingTask.onCancel[F]
          )
        )
+        .withTask(
+          JobTask.json(
+            PdfConvTask.taskName,
+            PdfConvTask[F](cfg),
+            PdfConvTask.onCancel[F]
+          )
+        )
+        .withTask(
+          JobTask.json(
+            ConvertAllPdfArgs.taskName,
+            ConvertAllPdfTask[F](queue, joex),
+            ConvertAllPdfTask.onCancel[F]
+          )
+        )
        .resource
      psch <- PeriodicScheduler.create(
        cfg.periodicScheduler,
--- a/modules/joex/src/main/scala/docspell/joex/pdfconv/ConvertAllPdfTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/pdfconv/ConvertAllPdfTask.scala
@ -0,0 +1,72 @@
+package docspell.joex.pdfconv
+
+import cats.effect._
+import cats.implicits._
+import fs2.{Chunk, Stream}
+
+import docspell.backend.ops.OJoex
+import docspell.common._
+import docspell.joex.scheduler.{Context, Task}
+import docspell.store.queue.JobQueue
+import docspell.store.records.RAttachment
+import docspell.store.records._
+
+/* A task to find all non-converted pdf files (of a collective, or
+ * all) and converting them using ocrmypdf by submitting a job for
+ * each found file.
+ */
+object ConvertAllPdfTask {
+  type Args = ConvertAllPdfArgs
+
+  def apply[F[_]: Sync](queue: JobQueue[F], joex: OJoex[F]): Task[F, Args, Unit] =
+    Task { ctx =>
+      for {
+        _ <- ctx.logger.info("Converting pdfs using ocrmypdf")
+        n <- submitConversionJobs(ctx, queue)
+        _ <- ctx.logger.info(s"Submitted $n file conversion jobs")
+        _ <- joex.notifyAllNodes
+      } yield ()
+    }
+
+  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+    Task.log(_.warn("Cancelling convert-old-pdf task"))
+
+  def submitConversionJobs[F[_]: Sync](
+      ctx: Context[F, Args],
+      queue: JobQueue[F]
+  ): F[Int] =
+    ctx.store
+      .transact(RAttachment.findNonConvertedPdf(ctx.args.collective, 50))
+      .chunks
+      .flatMap(createJobs[F](ctx))
+      .chunks
+      .evalMap(jobs => queue.insertAllIfNew(jobs.toVector).map(_ => jobs.size))
+      .evalTap(n => ctx.logger.debug(s"Submitted $n jobs …"))
+      .compile
+      .foldMonoid
+
+  private def createJobs[F[_]: Sync](
+      ctx: Context[F, Args]
+  )(ras: Chunk[RAttachment]): Stream[F, RJob] = {
+    val collectiveOrSystem = ctx.args.collective.getOrElse(DocspellSystem.taskGroup)
+
+    def mkJob(ra: RAttachment): F[RJob] =
+      for {
+        id  <- Ident.randomId[F]
+        now <- Timestamp.current[F]
+      } yield RJob.newJob(
+        id,
+        PdfConvTask.taskName,
+        collectiveOrSystem,
+        PdfConvTask.Args(ra.id),
+        s"Convert pdf ${ra.id.id}/${ra.name.getOrElse("-")}",
+        now,
+        collectiveOrSystem,
+        Priority.Low,
+        Some(PdfConvTask.taskName / ra.id)
+      )
+
+    val jobs = ras.traverse(mkJob)
+    Stream.evalUnChunk(jobs)
+  }
+}
--- a/modules/joex/src/main/scala/docspell/joex/pdfconv/PdfConvTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/pdfconv/PdfConvTask.scala
@ -0,0 +1,155 @@
+package docspell.joex.pdfconv
+
+import cats.data.Kleisli
+import cats.data.OptionT
+import cats.effect._
+import cats.implicits._
+import fs2.Stream
+
+import docspell.common._
+import docspell.convert.ConversionResult
+import docspell.convert.extern.OcrMyPdf
+import docspell.joex.Config
+import docspell.joex.scheduler.{Context, Task}
+import docspell.store.records._
+
+import bitpeace.FileMeta
+import bitpeace.Mimetype
+import bitpeace.MimetypeHint
+import bitpeace.RangeDef
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+/** Converts the given attachment file using ocrmypdf if it is a pdf
+  * and has not already been converted (the source file is the same as
+  * in the attachment).
+  */
+object PdfConvTask {
+  case class Args(attachId: Ident)
+  object Args {
+    implicit val jsonDecoder: Decoder[Args] =
+      deriveDecoder[Args]
+    implicit val jsonEncoder: Encoder[Args] =
+      deriveEncoder[Args]
+  }
+
+  val taskName = Ident.unsafe("pdf-files-migration")
+
+  def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, Args, Unit] =
+    Task { ctx =>
+      for {
+        _    <- ctx.logger.info(s"Converting pdf file ${ctx.args} using ocrmypdf")
+        meta <- checkInputs(cfg, ctx)
+        _    <- meta.traverse(fm => convert(cfg, ctx, fm))
+      } yield ()
+    }
+
+  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+    Task.log(_.warn("Cancelling pdfconv task"))
+
+  // --- Helper
+
+  // check if file exists and if it is pdf and if source id is the same and if ocrmypdf is enabled
+  def checkInputs[F[_]: Sync](cfg: Config, ctx: Context[F, Args]): F[Option[FileMeta]] = {
+    val none: Option[FileMeta] = None
+    val checkSameFiles =
+      (for {
+        ra <- OptionT(ctx.store.transact(RAttachment.findById(ctx.args.attachId)))
+        isSame <- OptionT.liftF(
+          ctx.store.transact(RAttachmentSource.isSameFile(ra.id, ra.fileId))
+        )
+      } yield isSame).getOrElse(false)
+    val existsPdf =
+      for {
+        meta <- ctx.store.transact(RAttachment.findMeta(ctx.args.attachId))
+        res = meta.filter(_.mimetype.matches(Mimetype.`application/pdf`))
+        _ <-
+          if (res.isEmpty)
+            ctx.logger.info(
+              s"The attachment ${ctx.args.attachId} doesn't exist or is no pdf: $meta"
+            )
+          else ().pure[F]
+      } yield res
+
+    if (cfg.convert.ocrmypdf.enabled)
+      checkSameFiles.flatMap {
+        case true => existsPdf
+        case false =>
+          ctx.logger.info(
+            s"The attachment ${ctx.args.attachId} already has been converted. Skipping."
+          ) *>
+            none.pure[F]
+      }
+    else none.pure[F]
+  }
+
+  def convert[F[_]: Sync: ContextShift](
+      cfg: Config,
+      ctx: Context[F, Args],
+      in: FileMeta
+  ): F[Unit] = {
+    val bp = ctx.store.bitpeace
+    val data = Stream
+      .emit(in)
+      .through(bp.fetchData2(RangeDef.all))
+
+    val storeResult: ConversionResult.Handler[F, Unit] =
+      Kleisli({
+        case ConversionResult.SuccessPdf(file) =>
+          storeToAttachment(ctx, in, file)
+
+        case ConversionResult.SuccessPdfTxt(file, _) =>
+          storeToAttachment(ctx, in, file)
+
+        case ConversionResult.UnsupportedFormat(mime) =>
+          ctx.logger.warn(
+            s"Unable to convert '${mime}' file ${ctx.args}: unsupported format."
+          )
+
+        case ConversionResult.InputMalformed(mime, reason) =>
+          ctx.logger.warn(s"Unable to convert '${mime}' file ${ctx.args}: $reason")
+
+        case ConversionResult.Failure(ex) =>
+          ctx.logger.error(ex)(s"Failure converting file ${ctx.args}: ${ex.getMessage}")
+      })
+
+    def ocrMyPdf(lang: Language): F[Unit] =
+      OcrMyPdf.toPDF[F, Unit](
+        cfg.convert.ocrmypdf,
+        lang,
+        in.chunksize,
+        ctx.blocker,
+        ctx.logger
+      )(data, storeResult)
+
+    for {
+      lang <- getLanguage(ctx)
+      _    <- ocrMyPdf(lang)
+    } yield ()
+  }
+
+  def getLanguage[F[_]: Sync](ctx: Context[F, Args]): F[Language] =
+    (for {
+      coll <- OptionT(ctx.store.transact(RCollective.findByAttachment(ctx.args.attachId)))
+      lang = coll.language
+    } yield lang).getOrElse(Language.German)
+
+  def storeToAttachment[F[_]: Sync](
+      ctx: Context[F, Args],
+      meta: FileMeta,
+      newFile: Stream[F, Byte]
+  ): F[Unit] = {
+    val mimeHint = MimetypeHint.advertised(meta.mimetype.asString)
+    for {
+      time <- Timestamp.current[F]
+      fid  <- Ident.randomId[F]
+      _ <-
+        ctx.store.bitpeace
+          .saveNew(newFile, meta.chunksize, mimeHint, Some(fid.id), time.value)
+          .compile
+          .lastOrError
+      _ <- ctx.store.transact(RAttachment.updateFileId(ctx.args.attachId, fid))
+    } yield ()
+  }
+
+}
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -126,11 +126,46 @@ object ConvertPdf {
      .compile
      .lastOrError
      .map(fm => Ident.unsafe(fm.id))
-      .flatMap(fmId =>
-        ctx.store
-          .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
-          .map(_ => fmId)
-      )
+      .flatMap(fmId => updateAttachment[F](ctx, ra, fmId, newName).map(_ => fmId))
      .map(fmId => ra.copy(fileId = fmId, name = newName))
  }
+
+  private def updateAttachment[F[_]: Sync](
+      ctx: Context[F, _],
+      ra: RAttachment,
+      fmId: Ident,
+      newName: Option[String]
+  ): F[Unit] =
+    for {
+      oldFile <- ctx.store.transact(RAttachment.findById(ra.id))
+      _ <-
+        ctx.store
+          .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
+      _ <- oldFile match {
+        case Some(raPrev) =>
+          for {
+            sameFile <-
+              ctx.store
+                .transact(RAttachmentSource.isSameFile(ra.id, raPrev.fileId))
+            _ <-
+              if (sameFile) ().pure[F]
+              else
+                ctx.logger.info("Deleting previous attachment file") *>
+                  ctx.store.bitpeace
+                    .delete(raPrev.fileId.id)
+                    .compile
+                    .drain
+                    .attempt
+                    .flatMap {
+                      case Right(_) => ().pure[F]
+                      case Left(ex) =>
+                        ctx.logger
+                          .error(ex)(s"Cannot delete previous attachment file: ${raPrev}")
+
+                    }
+          } yield ()
+        case None =>
+          ().pure[F]
+      }
+    } yield ()
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@ -17,16 +17,17 @@ object ProcessItem {
  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
    ExtractArchive(item)
      .flatMap(Task.setProgress(20))
-      .flatMap(ConvertPdf(cfg.convert, _))
-      .flatMap(Task.setProgress(40))
-      .flatMap(TextExtraction(cfg.extraction, fts))
-      .flatMap(Task.setProgress(60))
-      .flatMap(analysisOnly[F](cfg))
-      .flatMap(Task.setProgress(80))
+      .flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
      .flatMap(LinkProposal[F])
      .flatMap(SetGivenData[F](itemOps))
      .flatMap(Task.setProgress(99))

+  def processAttachments[F[_]: ConcurrentEffect: ContextShift](
+      cfg: Config,
+      fts: FtsClient[F]
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    processAttachments0[F](cfg, fts, (30, 60, 90))(item)
+
  def analysisOnly[F[_]: Sync](
      cfg: Config
  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
@ -34,4 +35,16 @@ object ProcessItem {
      .flatMap(FindProposal[F](cfg.processing))
      .flatMap(EvalProposals[F])
      .flatMap(SaveProposals[F])
+
+  private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
+      cfg: Config,
+      fts: FtsClient[F],
+      progress: (Int, Int, Int)
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    ConvertPdf(cfg.convert, item)
+      .flatMap(Task.setProgress(progress._1))
+      .flatMap(TextExtraction(cfg.extraction, fts))
+      .flatMap(Task.setProgress(progress._2))
+      .flatMap(analysisOnly[F](cfg))
+      .flatMap(Task.setProgress(progress._3))
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
@ -0,0 +1,131 @@
+package docspell.joex.process
+
+import cats.data.OptionT
+import cats.effect._
+import cats.implicits._
+
+import docspell.common._
+import docspell.ftsclient.FtsClient
+import docspell.joex.Config
+import docspell.joex.scheduler.Context
+import docspell.joex.scheduler.Task
+import docspell.store.records.RAttachment
+import docspell.store.records.RAttachmentSource
+import docspell.store.records.RCollective
+import docspell.store.records.RItem
+
+object ReProcessItem {
+  type Args = ReProcessItemArgs
+
+  def apply[F[_]: ConcurrentEffect: ContextShift](
+      cfg: Config,
+      fts: FtsClient[F]
+  ): Task[F, Args, Unit] =
+    loadItem[F]
+      .flatMap(safeProcess[F](cfg, fts))
+      .map(_ => ())
+
+  def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
+    logWarn("Now cancelling re-processing.")
+
+  // --- Helpers
+
+  private def contains[F[_]](ctx: Context[F, Args]): RAttachment => Boolean = {
+    val selection = ctx.args.attachments.toSet
+    if (selection.isEmpty) (_ => true)
+    else ra => selection.contains(ra.id)
+  }
+
+  def loadItem[F[_]: Sync]: Task[F, Args, ItemData] =
+    Task { ctx =>
+      (for {
+        item   <- OptionT(ctx.store.transact(RItem.findById(ctx.args.itemId)))
+        attach <- OptionT.liftF(ctx.store.transact(RAttachment.findByItem(item.id)))
+        asrc <-
+          OptionT.liftF(ctx.store.transact(RAttachmentSource.findByItem(ctx.args.itemId)))
+        asrcMap = asrc.map(s => s.id -> s).toMap
+        // copy the original files over to attachments to run the default processing task
+        // the processing doesn't touch the original files, only RAttachments
+        attachSrc =
+          attach
+            .filter(contains(ctx))
+            .flatMap(a =>
+              asrcMap.get(a.id).map { src =>
+                a.copy(fileId = src.fileId, name = src.name)
+              }
+            )
+      } yield ItemData(
+        item,
+        attachSrc,
+        Vector.empty,
+        Vector.empty,
+        asrcMap.view.mapValues(_.fileId).toMap,
+        MetaProposalList.empty,
+        Nil
+      )).getOrElseF(
+        Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
+      )
+    }
+
+  def processFiles[F[_]: ConcurrentEffect: ContextShift](
+      cfg: Config,
+      fts: FtsClient[F],
+      data: ItemData
+  ): Task[F, Args, ItemData] = {
+
+    val convertArgs: Language => Args => F[ProcessItemArgs] =
+      lang =>
+        args =>
+          ProcessItemArgs(
+            ProcessItemArgs.ProcessMeta(
+              data.item.cid,
+              args.itemId.some,
+              lang,
+              None, //direction
+              "",   //source-id
+              None, //folder
+              Seq.empty
+            ),
+            Nil
+          ).pure[F]
+
+    getLanguage[F].flatMap { lang =>
+      ProcessItem
+        .processAttachments[F](cfg, fts)(data)
+        .contramap[Args](convertArgs(lang))
+    }
+  }
+
+  def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
+    Task { ctx =>
+      (for {
+        coll <- OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
+        lang = coll.language
+      } yield lang).getOrElse(Language.German)
+    }
+
+  def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =
+    Task(_.isLastRetry)
+
+  def safeProcess[F[_]: ConcurrentEffect: ContextShift](
+      cfg: Config,
+      fts: FtsClient[F]
+  )(data: ItemData): Task[F, Args, ItemData] =
+    isLastRetry[F].flatMap {
+      case true =>
+        processFiles[F](cfg, fts, data).attempt
+          .flatMap({
+            case Right(d) =>
+              Task.pure(d)
+            case Left(ex) =>
+              logWarn[F](
+                "Processing failed on last retry."
+              ).andThen(_ => Sync[F].raiseError(ex))
+          })
+      case false =>
+        processFiles[F](cfg, fts, data)
+    }
+
+  private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
+    Task(_.logger.warn(msg))
+}
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@ -1213,6 +1213,33 @@ paths:
              schema:
                $ref: "#/components/schemas/BasicResult"

+  /sec/item/convertallpdfs:
+    post:
+      tags: [ Item ]
+      summary: Convert all non-converted pdfs.
+      description: |
+        Submits a job that will find all pdf files that have not been
+        converted and converts them using the ocrmypdf tool (if
+        enabled). This tool has been added in version 0.9.0 and so
+        older files can be "migrated" this way, or maybe after
+        enabling the tool.
+
+        The task finds all files of the current collective and submits
+        task for each file to convert. These tasks are submitted with
+        a low priority so that normal processing can still proceed.
+
+        The body of the request should be empty.
+      security:
+        - authTokenHeader: []
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"
+
+
  /sec/item/search:
    post:
      tags: [ Item ]
@ -1796,6 +1823,31 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ItemProposals"
+  /sec/item/{itemId}/reprocess:
+    post:
+      tags: [ Item ]
+      summary: Start reprocessing the files of the item.
+      description: |
+        This submits a job that will re-process the files (either all
+        or the ones specified) of the item and replace the metadata.
+      security:
+        - authTokenHeader: []
+      parameters:
+        - $ref: "#/components/parameters/id"
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/IdList"
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"
+
+
  /sec/item/{itemId}/attachment/movebefore:
    post:
      tags: [ Item ]
@ -2604,6 +2656,17 @@ paths:

 components:
  schemas:
+    IdList:
+      description:
+        A list of identifiers.
+      required:
+        - ids
+      properties:
+        ids:
+          type: array
+          items:
+            type: string
+            format: ident
    StringList:
      description: |
        A simple list of strings.
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala
@ -31,6 +31,13 @@ object ItemRoutes {
    import dsl._

    HttpRoutes.of {
+      case POST -> Root / "convertallpdfs" =>
+        for {
+          res <-
+            backend.item.convertAllPdf(user.account.collective.some, user.account, true)
+          resp <- Ok(Conversions.basicResult(res, "Task submitted"))
+        } yield resp
+
      case req @ POST -> Root / "search" =>
        for {
          mask <- req.as[ItemSearch]
@ -279,6 +286,15 @@ object ItemRoutes {
          resp <- Ok(Conversions.basicResult(res, "Attachment moved."))
        } yield resp

+      case req @ POST -> Root / Ident(id) / "reprocess" =>
+        for {
+          data <- req.as[IdList]
+          ids = data.ids.flatMap(s => Ident.fromString(s).toOption)
+          _    <- logger.fdebug(s"Re-process item ${id.id}")
+          res  <- backend.item.reprocess(id, ids, user.account, true)
+          resp <- Ok(Conversions.basicResult(res, "Re-process task submitted."))
+        } yield resp
+
      case DELETE -> Root / Ident(id) =>
        for {
          n <- backend.item.deleteItem(id, user.account.collective)
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/JobQueueRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/JobQueueRoutes.scala
@ -21,7 +21,7 @@ object JobQueueRoutes {
    HttpRoutes.of {
      case GET -> Root / "state" =>
        for {
-          js <- backend.job.queueState(user.account.collective, 200)
+          js <- backend.job.queueState(user.account.collective, 40)
          res = Conversions.mkJobQueueState(js)
          resp <- Ok(res)
        } yield resp
--- a/modules/store/src/main/scala/docspell/store/queries/QJob.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QJob.scala
@ -209,7 +209,8 @@ object QJob {
    store.transact(RJob.findFromIds(ids))

  def queueStateSnapshot(
-      collective: Ident
+      collective: Ident,
+      max: Long
  ): Stream[ConnectionIO, (RJob, Vector[RJobLog])] = {
    val JC                     = RJob.Columns
    val waiting: Set[JobState] = Set(JobState.Waiting, JobState.Stuck, JobState.Scheduled)
@ -218,18 +219,34 @@ object QJob {

    def selectJobs(now: Timestamp): Stream[ConnectionIO, RJob] = {
      val refDate = now.minusHours(24)
-      val sql = selectSimple(
+
+      val runningJobs = (selectSimple(
+        JC.all,
+        RJob.table,
+        and(JC.group.is(collective), JC.state.isOneOf(running.toSeq))
+      ) ++ orderBy(JC.submitted.desc)).query[RJob].stream
+
+      val waitingJobs = (selectSimple(
        JC.all,
        RJob.table,
        and(
          JC.group.is(collective),
-          or(
-            and(JC.state.isOneOf(done.toSeq), JC.submitted.isGt(refDate)),
-            JC.state.isOneOf((running ++ waiting).toSeq)
-          )
+          JC.state.isOneOf(waiting.toSeq),
+          JC.submitted.isGt(refDate)
        )
-      )
-      (sql ++ orderBy(JC.submitted.desc)).query[RJob].stream
+      ) ++ orderBy(JC.submitted.desc)).query[RJob].stream.take(max)
+
+      val doneJobs = (selectSimple(
+        JC.all,
+        RJob.table,
+        and(
+          JC.group.is(collective),
+          JC.state.isOneOf(done.toSeq),
+          JC.submitted.isGt(refDate)
+        )
+      ) ++ orderBy(JC.submitted.desc)).query[RJob].stream.take(max)
+
+      runningJobs ++ waitingJobs ++ doneJobs
    }

    def selectLogs(job: RJob): ConnectionIO[Vector[RJobLog]] =
--- a/modules/store/src/main/scala/docspell/store/queue/JobQueue.scala
+++ b/modules/store/src/main/scala/docspell/store/queue/JobQueue.scala
@ -28,6 +28,8 @@ trait JobQueue[F[_]] {

  def insertAll(jobs: Seq[RJob]): F[Unit]

+  def insertAllIfNew(jobs: Seq[RJob]): F[Unit]
+
  def nextJob(
      prio: Ident => F[Priority],
      worker: Ident,
@ -81,5 +83,13 @@ object JobQueue {
              logger.error(ex)("Could not insert job. Skipping it.")
          })

+      def insertAllIfNew(jobs: Seq[RJob]): F[Unit] =
+        jobs.toList
+          .traverse(j => insertIfNew(j).attempt)
+          .map(_.foreach {
+            case Right(()) =>
+            case Left(ex) =>
+              logger.error(ex)("Could not insert job. Skipping it.")
+          })
    })
 }
--- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala
@ -1,6 +1,7 @@
 package docspell.store.records

 import cats.implicits._
+import fs2.Stream

 import docspell.common._
 import docspell.store.impl.Implicits._
@ -71,6 +72,16 @@ object RAttachment {
      commas(fileId.setTo(fId), name.setTo(fname))
    ).update.run

+  def updateFileId(
+      attachId: Ident,
+      fId: Ident
+  ): ConnectionIO[Int] =
+    updateRow(
+      table,
+      id.is(attachId),
+      fileId.setTo(fId)
+    ).update.run
+
  def updatePosition(attachId: Ident, pos: Int): ConnectionIO[Int] =
    updateRow(table, id.is(attachId), position.setTo(pos)).update.run

@ -187,4 +198,32 @@ object RAttachment {

  def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] =
    selectSimple(Seq(itemId), table, id.is(attachId)).query[Ident].option
+
+  def findNonConvertedPdf(
+      coll: Option[Ident],
+      chunkSize: Int
+  ): Stream[ConnectionIO, RAttachment] = {
+    val aId     = Columns.id.prefix("a")
+    val aItem   = Columns.itemId.prefix("a")
+    val aFile   = Columns.fileId.prefix("a")
+    val sId     = RAttachmentSource.Columns.id.prefix("s")
+    val sFile   = RAttachmentSource.Columns.fileId.prefix("s")
+    val iId     = RItem.Columns.id.prefix("i")
+    val iColl   = RItem.Columns.cid.prefix("i")
+    val mId     = RFileMeta.Columns.id.prefix("m")
+    val mType   = RFileMeta.Columns.mimetype.prefix("m")
+    val pdfType = "application/pdf%"
+
+    val from = table ++ fr"a INNER JOIN" ++
+      RAttachmentSource.table ++ fr"s ON" ++ sId.is(aId) ++ fr"INNER JOIN" ++
+      RItem.table ++ fr"i ON" ++ iId.is(aItem) ++ fr"INNER JOIN" ++
+      RFileMeta.table ++ fr"m ON" ++ aFile.is(mId)
+    val where = coll match {
+      case Some(cid) => and(iColl.is(cid), aFile.is(sFile), mType.lowerLike(pdfType))
+      case None      => and(aFile.is(sFile), mType.lowerLike(pdfType))
+    }
+    selectSimple(all.map(_.prefix("a")), from, where)
+      .query[RAttachment]
+      .streamWithChunkSize(chunkSize)
+  }
 }
--- a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala
@ -42,6 +42,12 @@ object RAttachmentSource {
  def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
    selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option

+  def isSameFile(attachId: Ident, file: Ident): ConnectionIO[Boolean] =
+    selectCount(id, table, and(id.is(attachId), fileId.is(file)))
+      .query[Int]
+      .unique
+      .map(_ > 0)
+
  def delete(attachId: Ident): ConnectionIO[Int] =
    deleteFrom(table, id.is(attachId)).update.run

@ -64,6 +70,17 @@ object RAttachmentSource {
    selectSimple(all.map(_.prefix("a")), from, where).query[RAttachmentSource].option
  }

+  def findByItem(itemId: Ident): ConnectionIO[Vector[RAttachmentSource]] = {
+    val sId   = Columns.id.prefix("s")
+    val aId   = RAttachment.Columns.id.prefix("a")
+    val aItem = RAttachment.Columns.itemId.prefix("a")
+
+    val from = table ++ fr"s INNER JOIN" ++ RAttachment.table ++ fr"a ON" ++ sId.is(aId)
+    selectSimple(all.map(_.prefix("s")), from, aItem.is(itemId))
+      .query[RAttachmentSource]
+      .to[Vector]
+  }
+
  def findByItemWithMeta(
      id: Ident
  ): ConnectionIO[Vector[(RAttachmentSource, FileMeta)]] = {
--- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala
@ -75,6 +75,14 @@ object RCollective {
    sql.query[RCollective].option
  }

+  def findByItem(itemId: Ident): ConnectionIO[Option[RCollective]] = {
+    val iColl = RItem.Columns.cid.prefix("i")
+    val iId   = RItem.Columns.id.prefix("i")
+    val cId   = id.prefix("c")
+    val from  = RItem.table ++ fr"i INNER JOIN" ++ table ++ fr"c ON" ++ iColl.is(cId)
+    selectSimple(all.map(_.prefix("c")), from, iId.is(itemId)).query[RCollective].option
+  }
+
  def existsById(cid: Ident): ConnectionIO[Boolean] = {
    val sql = selectCount(id, table, id.is(cid))
    sql.query[Int].unique.map(_ > 0)
@ -90,5 +98,19 @@ object RCollective {
    sql.query[RCollective].stream
  }

+  def findByAttachment(attachId: Ident): ConnectionIO[Option[RCollective]] = {
+    val iColl = RItem.Columns.cid.prefix("i")
+    val iId   = RItem.Columns.id.prefix("i")
+    val aItem = RAttachment.Columns.itemId.prefix("a")
+    val aId   = RAttachment.Columns.id.prefix("a")
+    val cId   = Columns.id.prefix("c")
+
+    val from = table ++ fr"c INNER JOIN" ++
+      RItem.table ++ fr"i ON" ++ cId.is(iColl) ++ fr"INNER JOIN" ++
+      RAttachment.table ++ fr"a ON" ++ aItem.is(iId)
+
+    selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
+  }
+
  case class Settings(language: Language, integrationEnabled: Boolean)
 }
--- a/modules/store/src/main/scala/docspell/store/records/RItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala
@ -314,6 +314,9 @@ object RItem {
  def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
    selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option

+  def findById(itemId: Ident): ConnectionIO[Option[RItem]] =
+    selectSimple(all, table, id.is(itemId)).query[RItem].option
+
  def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] =
    selectSimple(Seq(id), table, and(id.is(itemId), cid.is(coll))).query[Ident].option

--- a/tools/convert-all-pdfs.sh
+++ b/tools/convert-all-pdfs.sh
@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# Simple script to authenticate with docspell and trigger the "convert
+# all pdf" route that submits a task to convert all pdf files using
+# ocrmypdf.
+
+set -e
+
+BASE_URL="${1:-http://localhost:7880}"
+LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
+TRIGGER_URL="$BASE_URL/api/v1/sec/item/convertallpdfs"
+
+echo "Login to trigger converting all pdfs."
+echo "Using url: $BASE_URL"
+echo -n "Account: "
+read USER
+echo -n "Password: "
+read -s PASS
+echo
+
+auth=$(curl --fail -XPOST --silent --data-binary "{\"account\":\"$USER\", \"password\":\"$PASS\"}" "$LOGIN_URL")
+
+if [ "$(echo $auth | jq .success)" == "true" ]; then
+    echo "Login successful"
+    auth_token=$(echo $auth | jq -r .token)
+    curl --fail -XPOST -H "X-Docspell-Auth: $auth_token" "$TRIGGER_URL"
+else
+    echo "Login failed."
+fi
--- a/website/site/content/docs/joex/_index.md
+++ b/website/site/content/docs/joex/_index.md
@ -67,7 +67,7 @@ logged in.
 The relevant part of the config file regarding the scheduler is shown
 below with some explanations.

-```
+``` conf
 docspell.joex {
  # other settings left out for brevity

--- a/website/site/content/docs/tools/convert-all-pdf.md
+++ b/website/site/content/docs/tools/convert-all-pdf.md
@ -0,0 +1,46 @@
+++
+title = "Convert All PDFs"
+description = "Convert all PDF files using OcrMyPdf."
+weight = 60
+++
+
+# convert-all-pdf.sh
+
+With version 0.9.0 there was support added for another external tool,
+[OCRMyPdf](https://github.com/jbarlow83/OCRmyPDF), that can convert
+PDF files such that they contain the OCR-ed text layer. This tool is
+optional and can be disabled.
+
+In order to convert all previously processed files with this tool,
+there is an
+[endpoint](/openapi/docspell-openapi.html#api-Item-secItemConvertallpdfsPost)
+that submits a task to convert all PDF files not already converted for
+your collective.
+
+There is no UI part to trigger this route, so you need to use curl or
+the script `convert-all-pdfs.sh` in the `tools/` directory.
+
+
+# Usage
+
+```
+./convert-all-pdfs.sh [docspell-base-url]
+```
+
+For example, if docspell is at `http://localhost:7880`:
+
+```
+./convert-all-pdfs.sh http://localhost:7880
+```
+
+The script asks for your account name and password. It then logs in
+and triggers the said endpoint. After this you should see a few tasks
+running.
+
+There will be one task per file to convert. All these tasks are
+submitted with a low priority. So files uploaded through the webapp or
+a [source](@/docs/webapp/uploading.md#anonymous-upload) with a high
+priority, will be preferred as [configured in the job
+executor](@/docs/joex/_index.md#scheduler-config). This is to not
+disturb normal processing when many conversion tasks are being
+executed.
--- a/website/site/content/docs/webapp/sources-edit.png
+++ b/website/site/content/docs/webapp/sources-edit.png
--- a/website/site/content/docs/webapp/uploading.md
+++ b/website/site/content/docs/webapp/uploading.md
@ -29,6 +29,8 @@ scripts. For this the next variant exists.

 It is also possible to upload files without authentication. This
 should make tools that interact with docspell much easier to write.
+The [Android Client App](@/docs/tools/android.md) uses these urls to
+upload files.

 Go to "Collective Settings" and then to the "Source" tab. A *Source*
 identifies an endpoint where files can be uploaded anonymously.
@ -41,7 +43,7 @@ username is not visible.

 Example screenshot:

-{{ figure(file="sources-form.png") }}
+{{ figure(file="sources-edit.png") }}

 This example shows a source with name "test". Besides a description
 and a name that is only used for displaying purposes, a priority and a
@ -58,25 +60,26 @@ The source endpoint defines two urls:
 - `/app/upload/<id>`
 - `/api/v1/open/upload/item/<id>`

+{{ figure(file="sources-form.png") }}
+
 The first points to a web page where everyone could upload files into
 your account. You could give this url to people for sending files
 directly into your docspell.

 The second url is the API url, which accepts the requests to upload
-files (it is used by the upload page, the first url).
+files. This second url can be used with the [Android Client
+App](@/docs/tools/android.md) to upload files.

-For example, the api url can be used to upload files with curl:
+Another example is to use curl for uploading files from the command
+line::

 ``` bash
 $ curl -XPOST -F file=@test.pdf http://192.168.1.95:7880/api/v1/open/upload/item/3H7hvJcDJuk-NrAW4zxsdfj-K6TMPyb6BGP-xKptVxUdqWa
 {"success":true,"message":"Files submitted."}
 ```

-You could add more `-F file=@/path/to/your/file.pdf` to upload
-multiple files (note, the `@` is required by curl, so it knows that
-the following is a file). There is a [script
-provided](@/docs/tools/ds.md) that uses this to upload files from the
-command line.
+There is a [script provided](@/docs/tools/ds.md) that uses curl to
+upload files from the command line more conveniently.

 When files are uploaded to an source endpoint, the items resulting
 from this uploads are marked with the name of the source. So you know