diff --git a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala index bc05a188..fdb0d860 100644 --- a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala +++ b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala @@ -8,6 +8,45 @@ import docspell.store.records.RJob object JobFactory { + def makePreview[F[_]: Sync]( + args: MakePreviewArgs, + account: Option[AccountId] + ): F[RJob] = + for { + id <- Ident.randomId[F] + now <- Timestamp.current[F] + job = RJob.newJob( + id, + MakePreviewArgs.taskName, + account.map(_.collective).getOrElse(DocspellSystem.taskGroup), + args, + s"Generate preview image", + now, + account.map(_.user).getOrElse(DocspellSystem.user), + Priority.Low, + Some(MakePreviewArgs.taskName / args.attachment) + ) + } yield job + + def allPreviews[F[_]: Sync]( + args: AllPreviewsArgs, + submitter: Option[Ident] + ): F[RJob] = + for { + id <- Ident.randomId[F] + now <- Timestamp.current[F] + } yield RJob.newJob( + id, + AllPreviewsArgs.taskName, + args.collective.getOrElse(DocspellSystem.taskGroup), + args, + "Create preview images", + now, + submitter.getOrElse(DocspellSystem.taskGroup), + Priority.Low, + Some(DocspellSystem.allPreviewTaskTracker) + ) + def convertAllPdfs[F[_]: Sync]( collective: Option[Ident], account: AccountId, diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala index 5e9b5aaf..a4f06986 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala @@ -4,9 +4,11 @@ import cats.effect.{Effect, Resource} import cats.implicits._ import fs2.Stream +import docspell.backend.JobFactory import docspell.backend.PasswordCrypt import docspell.backend.ops.OCollective._ import docspell.common._ +import docspell.store.UpdateResult import 
docspell.store.queries.QCollective import docspell.store.queue.JobQueue import docspell.store.records._ @@ -51,6 +53,15 @@ trait OCollective[F[_]] { def findEnabledSource(sourceId: Ident): F[Option[RSource]] def startLearnClassifier(collective: Ident): F[Unit] + + /** Submits a task that (re)generates the preview images for all + * attachments of the given collective. + */ + def generatePreviews( + storeMode: MakePreviewArgs.StoreMode, + account: AccountId, + notifyJoex: Boolean + ): F[UpdateResult] } object OCollective { @@ -210,5 +221,20 @@ object OCollective { def findEnabledSource(sourceId: Ident): F[Option[RSource]] = store.transact(RSource.findEnabled(sourceId)) + + def generatePreviews( + storeMode: MakePreviewArgs.StoreMode, + account: AccountId, + notifyJoex: Boolean + ): F[UpdateResult] = + for { + job <- JobFactory.allPreviews[F]( + AllPreviewsArgs(Some(account.collective), storeMode), + Some(account.user) + ) + _ <- queue.insertIfNew(job) + _ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F] + } yield UpdateResult.success + }) } diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index 492d613a..13ee91c7 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -175,6 +175,15 @@ trait OItem[F[_]] { account: AccountId, notifyJoex: Boolean ): F[UpdateResult] + + /** Submits a task that (re)generates the preview image for an + * attachment. 
+ */ + def generatePreview( + args: MakePreviewArgs, + account: AccountId, + notifyJoex: Boolean + ): F[UpdateResult] } object OItem { @@ -656,6 +665,17 @@ object OItem { _ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F] } yield UpdateResult.success + def generatePreview( + args: MakePreviewArgs, + account: AccountId, + notifyJoex: Boolean + ): F[UpdateResult] = + for { + job <- JobFactory.makePreview[F](args, account.some) + _ <- queue.insertIfNew(job) + _ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F] + } yield UpdateResult.success + private def onSuccessIgnoreError(update: F[Unit])(ar: UpdateResult): F[Unit] = ar match { case UpdateResult.Success => diff --git a/modules/common/src/main/scala/docspell/common/MakePreviewArgs.scala b/modules/common/src/main/scala/docspell/common/MakePreviewArgs.scala index 711c3fea..ebe94107 100644 --- a/modules/common/src/main/scala/docspell/common/MakePreviewArgs.scala +++ b/modules/common/src/main/scala/docspell/common/MakePreviewArgs.scala @@ -18,6 +18,12 @@ object MakePreviewArgs { val taskName = Ident.unsafe("make-preview") + def replace(attach: Ident): MakePreviewArgs = + MakePreviewArgs(attach, StoreMode.Replace) + + def whenMissing(attach: Ident): MakePreviewArgs = + MakePreviewArgs(attach, StoreMode.WhenMissing) + sealed trait StoreMode extends Product { final def name: String = productPrefix.toLowerCase() diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala index b4951686..e283b720 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -1,5 +1,6 @@ package docspell.extract import docspell.extract.ocr.OcrConfig +import docspell.extract.pdfbox.PreviewConfig -case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) +case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig, preview: PreviewConfig) diff --git 
a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala index 9b7225e8..226c6e82 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala @@ -21,11 +21,13 @@ trait PdfboxPreview[F[_]] { object PdfboxPreview { - def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] = + def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] = Sync[F].pure(new PdfboxPreview[F] { def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] = - PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi))) + PdfLoader.withDocumentStream(pdf)(doc => + Sync[F].delay(getPageImage(doc, 0, cfg.dpi)) + ) def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] = previewImage(pdf).map(_.map(pngStream[F])) diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PreviewConfig.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PreviewConfig.scala new file mode 100644 index 00000000..db3bc56b --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PreviewConfig.scala @@ -0,0 +1,3 @@ +package docspell.extract.pdfbox + +case class PreviewConfig(dpi: Float) diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala index 031cf3ad..c07c4c64 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala @@ -21,7 +21,7 @@ object PdfboxPreviewTest extends SimpleTestSuite { val data = file.readURL[IO](8192, blocker) val sha256out = Stream - .eval(PdfboxPreview[IO](48)) + .eval(PdfboxPreview[IO](PreviewConfig(48))) .evalMap(_.previewPNG(data)) .flatMap(_.get) .through(fs2.hash.sha256) diff 
--git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 51ad7d04..f8deb8e7 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -172,6 +172,18 @@ docspell.joex { min-text-len = 500 } + preview { + # When rendering a pdf page, use this dpi. This results in + # scaling the image. A standard A4 page rendered at 96dpi + # results in roughly 790x1100px image. Using 32 results in + # roughly 200x300px image. + # + # Note, when this is changed, you might want to re-generate + # preview images. Check the api for this, there is an endpoint + # to regenerate all for a collective. + dpi = 32 + } + # Extracting text using OCR works for image and pdf files. It will # first run ghostscript to create a gray image from a pdf. Then # unpaper is run to optimize the image for the upcoming ocr, which diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 4362d93a..2b9b96c5 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -174,7 +174,7 @@ object JoexAppImpl { .withTask( JobTask.json( MakePreviewArgs.taskName, - MakePreviewTask[F](cfg.convert), + MakePreviewTask[F](cfg.convert, cfg.extraction.preview), MakePreviewTask.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/preview/AllPreviewsTask.scala b/modules/joex/src/main/scala/docspell/joex/preview/AllPreviewsTask.scala index 31e6d636..70d87fdb 100644 --- a/modules/joex/src/main/scala/docspell/joex/preview/AllPreviewsTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/preview/AllPreviewsTask.scala @@ -1,13 +1,16 @@ package docspell.joex.preview -import fs2.{Chunk, Stream} -import docspell.common._ import cats.effect._ import cats.implicits._ -import docspell.store.queue.JobQueue +import fs2.{Chunk, Stream} + +import 
docspell.backend.JobFactory import docspell.backend.ops.OJoex -import docspell.joex.scheduler.Task +import docspell.common.MakePreviewArgs.StoreMode +import docspell.common._ import docspell.joex.scheduler.Context +import docspell.joex.scheduler.Task +import docspell.store.queue.JobQueue import docspell.store.records.RAttachment import docspell.store.records.RJob @@ -33,7 +36,7 @@ object AllPreviewsTask { queue: JobQueue[F] ): F[Int] = ctx.store - .transact(RAttachment.findWithoutPreview(ctx.args.collective, 50)) + .transact(findAttachments(ctx)) .chunks .flatMap(createJobs[F](ctx)) .chunks @@ -42,6 +45,14 @@ object AllPreviewsTask { .compile .foldMonoid + private def findAttachments[F[_]](ctx: Context[F, Args]) = + ctx.args.storeMode match { + case StoreMode.Replace => + RAttachment.findAll(ctx.args.collective, 50) + case StoreMode.WhenMissing => + RAttachment.findWithoutPreview(ctx.args.collective, 50) + } + private def createJobs[F[_]: Sync]( ctx: Context[F, Args] )(ras: Chunk[RAttachment]): Stream[F, RJob] = { @@ -68,19 +79,6 @@ object AllPreviewsTask { } def job[F[_]: Sync](storeMode: MakePreviewArgs.StoreMode, cid: Option[Ident]): F[RJob] = - for { - id <- Ident.randomId[F] - now <- Timestamp.current[F] - } yield RJob.newJob( - id, - AllPreviewsArgs.taskName, - cid.getOrElse(DocspellSystem.taskGroup), - AllPreviewsArgs(cid, storeMode), - "Create preview images", - now, - DocspellSystem.taskGroup, - Priority.Low, - Some(DocspellSystem.allPreviewTaskTracker) - ) + JobFactory.allPreviews(AllPreviewsArgs(cid, storeMode), None) } diff --git a/modules/joex/src/main/scala/docspell/joex/preview/MakePreviewTask.scala b/modules/joex/src/main/scala/docspell/joex/preview/MakePreviewTask.scala index 9da04e33..ba9671f5 100644 --- a/modules/joex/src/main/scala/docspell/joex/preview/MakePreviewTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/preview/MakePreviewTask.scala @@ -1,25 +1,27 @@ package docspell.joex.preview -import cats.implicits._ import cats.effect._ 
+import cats.implicits._ + import docspell.common._ -import docspell.joex.scheduler.Task -import docspell.store.records.RAttachmentPreview -import docspell.joex.scheduler.Context -import docspell.joex.process.AttachmentPreview import docspell.convert.ConvertConfig import docspell.extract.pdfbox.PdfboxPreview +import docspell.extract.pdfbox.PreviewConfig +import docspell.joex.process.AttachmentPreview +import docspell.joex.scheduler.Context +import docspell.joex.scheduler.Task import docspell.store.records.RAttachment +import docspell.store.records.RAttachmentPreview object MakePreviewTask { type Args = MakePreviewArgs - def apply[F[_]: Sync](cfg: ConvertConfig): Task[F, Args, Unit] = + def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] = Task { ctx => for { exists <- previewExists(ctx) - preview <- PdfboxPreview(30) + preview <- PdfboxPreview(pcfg) _ <- if (exists) ctx.logger.info( @@ -44,7 +46,9 @@ object MakePreviewTask { ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment)) _ <- ra .map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize)) - .getOrElse(().pure[F]) + .getOrElse( + ctx.logger.warn(s"No attachment found with id: ${ctx.args.attachment}") + ) } yield () private def previewExists[F[_]: Sync](ctx: Context[F, Args]): F[Boolean] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPreview.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPreview.scala index cbdf5de5..e42e67ab 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPreview.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPreview.scala @@ -9,13 +9,14 @@ import fs2.Stream import docspell.common._ import docspell.convert._ import docspell.extract.pdfbox.PdfboxPreview +import docspell.extract.pdfbox.PreviewConfig import docspell.joex.scheduler._ +import docspell.store.queries.QAttachment import docspell.store.records.RAttachment import docspell.store.records._ 
import docspell.store.syntax.MimeTypes._ import bitpeace.{Mimetype, MimetypeHint, RangeDef} -import docspell.store.queries.QAttachment /** Goes through all attachments that must be already converted into a * pdf. If it is a pdf, the first page is converted into a small @@ -23,7 +24,7 @@ import docspell.store.queries.QAttachment */ object AttachmentPreview { - def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)( + def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig, pcfg: PreviewConfig)( item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => @@ -31,7 +32,7 @@ object AttachmentPreview { _ <- ctx.logger.info( s"Creating preview images for ${item.attachments.size} files…" ) - preview <- PdfboxPreview(24) + preview <- PdfboxPreview(pcfg) _ <- item.attachments .traverse(createPreview(ctx, preview, cfg.chunkSize)) .attempt diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index d3e7522b..8caf25fb 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -54,7 +54,7 @@ object ProcessItem { ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) - .flatMap(AttachmentPreview(cfg.convert)) + .flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview)) .flatMap(Task.setProgress(progress._2)) .flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(Task.setProgress(progress._3)) diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 4b8c51b7..c9c1a6ed 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -2526,6 +2526,24 @@ paths: schema: type: string format: binary + post: + tags: [ Attachment ] + summary: (Re)generate a preview image. 
+ description: | + Submits a task that generates a preview image for this + attachment. The existing preview will be replaced. + security: + - authTokenHeader: [] + parameters: + - $ref: "#/components/parameters/id" + responses: + 200: + description: Ok + content: + application/json: + schema: + $ref: "#/components/schemas/BasicResult" + /sec/attachment/{id}/meta: get: tags: [ Attachment ] diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala index 1d3ee301..f168c400 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala @@ -7,6 +7,7 @@ import docspell.backend.BackendApp import docspell.backend.auth.AuthToken import docspell.backend.ops._ import docspell.common.Ident +import docspell.common.MakePreviewArgs import docspell.restapi.model._ import docspell.restserver.conv.Conversions import docspell.restserver.http4s.BinaryUtil @@ -129,6 +130,18 @@ object AttachmentRoutes { .getOrElse(NotFound(BasicResult(false, "Not found"))) } yield resp + case POST -> Root / Ident(id) / "preview" => + for { + res <- backend.item.generatePreview( + MakePreviewArgs.replace(id), + user.account, + true + ) + resp <- Ok( + Conversions.basicResult(res, "Generating preview image task submitted.") + ) + } yield resp + case GET -> Root / Ident(id) / "view" => // this route exists to provide a stable url // it redirects currently to viewerjs diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index bf7eaddd..7ecd1e90 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ 
-6,6 +6,7 @@ import cats.implicits._ import docspell.backend.BackendApp import docspell.backend.auth.AuthToken import docspell.backend.ops.OCollective +import docspell.common.MakePreviewArgs import docspell.restapi.model._ import docspell.restserver.conv.Conversions import docspell.restserver.http4s._ @@ -94,6 +95,18 @@ object CollectiveRoutes { resp <- Ok(BasicResult(true, "Task submitted")) } yield resp + case POST -> Root / "previews" => + for { + res <- backend.collective.generatePreviews( + MakePreviewArgs.StoreMode.Replace, + user.account, + true + ) + resp <- Ok( + Conversions.basicResult(res, "Generate all previews task submitted.") + ) + } yield resp + case GET -> Root => for { collDb <- backend.collective.find(user.account.collective) diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index 9fbe7401..86ae26f4 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -26,9 +26,9 @@ object QAttachment { Stream .evalSeq(store.transact(findPreview)) .map(_.fileId.id) + .evalTap(_ => store.transact(RAttachmentPreview.delete(attachId))) .flatMap(store.bitpeace.delete) .map(flag => if (flag) 1 else 0) - .evalMap(_ => store.transact(RAttachmentPreview.delete(attachId))) .compile .foldMonoid } diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index 8be0fdb6..fa1453b6 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -231,6 +231,34 @@ object RAttachment { def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] = selectSimple(Seq(itemId), table, id.is(attachId)).query[Ident].option + /** Streams all attachments, or only those whose item belongs to the + * given collective. The attachment table is always aliased as `a`, + * because the selected columns are prefixed with that alias. + */ + def findAll( + coll: Option[Ident], + chunkSize: Int + ): 
Stream[ConnectionIO, RAttachment] = { + val aItem = Columns.itemId.prefix("a") + val iId = RItem.Columns.id.prefix("i") + val iColl = RItem.Columns.cid.prefix("i") + + val cols = all.map(_.prefix("a")) + + coll match { + case Some(cid) => + val join = table ++ fr"a INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) + val cond = iColl.is(cid) + selectSimple(cols, join, cond) + .query[RAttachment] + .streamWithChunkSize(chunkSize) + case None => + selectSimple(cols, table ++ fr"a", Fragment.empty) + .query[RAttachment] + .streamWithChunkSize(chunkSize) + } + } + def findWithoutPreview( coll: Option[Ident], chunkSize: Int diff --git a/modules/webapp/src/main/webjar/docspell.css b/modules/webapp/src/main/webjar/docspell.css index a6b5c5e6..8dad246b 100644 --- a/modules/webapp/src/main/webjar/docspell.css +++ b/modules/webapp/src/main/webjar/docspell.css @@ -97,7 +97,7 @@ background: #fff; } .default-layout img.preview-image { - max-width: 200px; + max-width: 160px; margin-left: auto; margin-right: auto; }