Provide endpoints to submit tasks to re-generate previews

The scaling factor can be given in the config file. When this changes,
images can be regenerated via POSTing to certain endpoints. It is
possible to regenerate just one attachment preview or all within a
collective.
This commit is contained in:
Eike Kettner 2020-11-09 01:18:48 +01:00
parent 6037b54959
commit f4e50c5229
20 changed files with 218 additions and 38 deletions

View File

@ -8,6 +8,45 @@ import docspell.store.records.RJob
object JobFactory { object JobFactory {
/** Creates a low-priority job that generates the preview image for the
  * attachment referenced by `args`.
  *
  * When an account is given, its collective is used as the job group and
  * its user as the submitter; otherwise the `DocspellSystem` defaults are
  * used. The tracker is derived from the task name and the attachment id.
  */
def makePreview[F[_]: Sync](
    args: MakePreviewArgs,
    account: Option[AccountId]
): F[RJob] = {
  val group     = account.map(_.collective).getOrElse(DocspellSystem.taskGroup)
  val submitter = account.map(_.user).getOrElse(DocspellSystem.user)
  val tracker   = MakePreviewArgs.taskName / args.attachment

  for {
    jobId   <- Ident.randomId[F]
    created <- Timestamp.current[F]
  } yield RJob.newJob(
    jobId,
    MakePreviewArgs.taskName,
    group,
    args,
    "Generate preview image",
    created,
    submitter,
    Priority.Low,
    Some(tracker)
  )
}
/** Creates a low-priority job that generates preview images for many
  * attachments, as described by `args`.
  *
  * The job group is the collective from `args`, falling back to
  * `DocspellSystem.taskGroup`. All such jobs share the tracker
  * `DocspellSystem.allPreviewTaskTracker`.
  */
def allPreviews[F[_]: Sync](
    args: AllPreviewsArgs,
    submitter: Option[Ident]
): F[RJob] = {
  val group = args.collective.getOrElse(DocspellSystem.taskGroup)
  // NOTE(review): makePreview falls back to DocspellSystem.user for the
  // submitter, while this uses DocspellSystem.taskGroup -- confirm intended.
  val submittedBy = submitter.getOrElse(DocspellSystem.taskGroup)

  Ident.randomId[F].flatMap { jobId =>
    Timestamp.current[F].map { created =>
      RJob.newJob(
        jobId,
        AllPreviewsArgs.taskName,
        group,
        args,
        "Create preview images",
        created,
        submittedBy,
        Priority.Low,
        Some(DocspellSystem.allPreviewTaskTracker)
      )
    }
  }
}
def convertAllPdfs[F[_]: Sync]( def convertAllPdfs[F[_]: Sync](
collective: Option[Ident], collective: Option[Ident],
account: AccountId, account: AccountId,

View File

@ -4,9 +4,11 @@ import cats.effect.{Effect, Resource}
import cats.implicits._ import cats.implicits._
import fs2.Stream import fs2.Stream
import docspell.backend.JobFactory
import docspell.backend.PasswordCrypt import docspell.backend.PasswordCrypt
import docspell.backend.ops.OCollective._ import docspell.backend.ops.OCollective._
import docspell.common._ import docspell.common._
import docspell.store.UpdateResult
import docspell.store.queries.QCollective import docspell.store.queries.QCollective
import docspell.store.queue.JobQueue import docspell.store.queue.JobQueue
import docspell.store.records._ import docspell.store.records._
@ -51,6 +53,15 @@ trait OCollective[F[_]] {
def findEnabledSource(sourceId: Ident): F[Option[RSource]] def findEnabledSource(sourceId: Ident): F[Option[RSource]]
def startLearnClassifier(collective: Ident): F[Unit] def startLearnClassifier(collective: Ident): F[Unit]
/** Submits a task that (re)generates the preview images for the
  * attachments of the collective of the given account.
  *
  * @param storeMode passed on to the task; the task selects either all
  *   attachments (`Replace`) or only those without a preview
  *   (`WhenMissing`)
  * @param account the collective to process and the user recorded as
  *   submitter of the job
  * @param notifyJoex whether to notify all joex nodes after the job
  *   has been added to the queue
  * @return the result of submitting the task
  */
def generatePreviews(
storeMode: MakePreviewArgs.StoreMode,
account: AccountId,
notifyJoex: Boolean
): F[UpdateResult]
} }
object OCollective { object OCollective {
@ -210,5 +221,20 @@ object OCollective {
def findEnabledSource(sourceId: Ident): F[Option[RSource]] = def findEnabledSource(sourceId: Ident): F[Option[RSource]] =
store.transact(RSource.findEnabled(sourceId)) store.transact(RSource.findEnabled(sourceId))
// Queues an "all previews" job for the collective of the given account
// (unless the queue already considers it present) and optionally
// notifies the joex nodes.
def generatePreviews(
    storeMode: MakePreviewArgs.StoreMode,
    account: AccountId,
    notifyJoex: Boolean
): F[UpdateResult] = {
  val taskArgs = AllPreviewsArgs(Some(account.collective), storeMode)

  JobFactory
    .allPreviews[F](taskArgs, Some(account.user))
    .flatMap(job => queue.insertIfNew(job))
    .flatMap(_ => if (notifyJoex) joex.notifyAllNodes else ().pure[F])
    .map(_ => UpdateResult.success)
}
}) })
} }

View File

@ -175,6 +175,15 @@ trait OItem[F[_]] {
account: AccountId, account: AccountId,
notifyJoex: Boolean notifyJoex: Boolean
): F[UpdateResult] ): F[UpdateResult]
/** Submits a task that (re)generates the preview image for an
  * attachment.
  *
  * @param args identifies the attachment and carries the store mode
  *   handed to the task
  * @param account its collective is used as the job group and its
  *   user as the submitter
  * @param notifyJoex whether to notify all joex nodes after the job
  *   has been added to the queue
  * @return the result of submitting the task
  */
def generatePreview(
args: MakePreviewArgs,
account: AccountId,
notifyJoex: Boolean
): F[UpdateResult]
} }
object OItem { object OItem {
@ -656,6 +665,17 @@ object OItem {
_ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F] _ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F]
} yield UpdateResult.success } yield UpdateResult.success
// Queues a preview job for a single attachment (unless the queue already
// considers it present) and optionally notifies the joex nodes.
def generatePreview(
    args: MakePreviewArgs,
    account: AccountId,
    notifyJoex: Boolean
): F[UpdateResult] =
  JobFactory
    .makePreview[F](args, account.some)
    .flatMap(job => queue.insertIfNew(job))
    .flatMap(_ => if (notifyJoex) joex.notifyAllNodes else ().pure[F])
    .map(_ => UpdateResult.success)
private def onSuccessIgnoreError(update: F[Unit])(ar: UpdateResult): F[Unit] = private def onSuccessIgnoreError(update: F[Unit])(ar: UpdateResult): F[Unit] =
ar match { ar match {
case UpdateResult.Success => case UpdateResult.Success =>

View File

@ -18,6 +18,12 @@ object MakePreviewArgs {
val taskName = Ident.unsafe("make-preview") val taskName = Ident.unsafe("make-preview")
/** Arguments for the given attachment using `StoreMode.Replace`. */
def replace(attach: Ident): MakePreviewArgs = {
  val mode = StoreMode.Replace
  MakePreviewArgs(attach, mode)
}
/** Arguments for the given attachment using `StoreMode.WhenMissing`. */
def whenMissing(attach: Ident): MakePreviewArgs = {
  val mode = StoreMode.WhenMissing
  MakePreviewArgs(attach, mode)
}
sealed trait StoreMode extends Product { sealed trait StoreMode extends Product {
final def name: String = final def name: String =
productPrefix.toLowerCase() productPrefix.toLowerCase()

View File

@ -1,5 +1,6 @@
package docspell.extract package docspell.extract
import docspell.extract.ocr.OcrConfig import docspell.extract.ocr.OcrConfig
import docspell.extract.pdfbox.PreviewConfig
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig, preview: PreviewConfig)

View File

@ -21,11 +21,13 @@ trait PdfboxPreview[F[_]] {
object PdfboxPreview { object PdfboxPreview {
def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] = def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] =
Sync[F].pure(new PdfboxPreview[F] { Sync[F].pure(new PdfboxPreview[F] {
def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] = def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] =
PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi))) PdfLoader.withDocumentStream(pdf)(doc =>
Sync[F].delay(getPageImage(doc, 0, cfg.dpi))
)
def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] = def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] =
previewImage(pdf).map(_.map(pngStream[F])) previewImage(pdf).map(_.map(pngStream[F]))

View File

@ -0,0 +1,3 @@
package docspell.extract.pdfbox
/** Settings for creating preview images.
  *
  * @param dpi the resolution used when rendering a pdf page into the
  *   preview image; higher values result in larger images
  */
case class PreviewConfig(dpi: Float)

View File

@ -21,7 +21,7 @@ object PdfboxPreviewTest extends SimpleTestSuite {
val data = file.readURL[IO](8192, blocker) val data = file.readURL[IO](8192, blocker)
val sha256out = val sha256out =
Stream Stream
.eval(PdfboxPreview[IO](48)) .eval(PdfboxPreview[IO](PreviewConfig(48)))
.evalMap(_.previewPNG(data)) .evalMap(_.previewPNG(data))
.flatMap(_.get) .flatMap(_.get)
.through(fs2.hash.sha256) .through(fs2.hash.sha256)

View File

@ -172,6 +172,18 @@ docspell.joex {
min-text-len = 500 min-text-len = 500
} }
preview {
# When rendering a pdf page, use this dpi. This results in
# scaling the image. A standard A4 page rendered at 96dpi
# results in an image of roughly 790x1100px. Using 32 results
# in an image of roughly 265x375px.
#
# Note: when this value is changed, you might want to re-generate
# the preview images. Check the REST api for this; there is an
# endpoint to regenerate all previews of a collective.
dpi = 32
}
# Extracting text using OCR works for image and pdf files. It will # Extracting text using OCR works for image and pdf files. It will
# first run ghostscript to create a gray image from a pdf. Then # first run ghostscript to create a gray image from a pdf. Then
# unpaper is run to optimize the image for the upcoming ocr, which # unpaper is run to optimize the image for the upcoming ocr, which

View File

@ -174,7 +174,7 @@ object JoexAppImpl {
.withTask( .withTask(
JobTask.json( JobTask.json(
MakePreviewArgs.taskName, MakePreviewArgs.taskName,
MakePreviewTask[F](cfg.convert), MakePreviewTask[F](cfg.convert, cfg.extraction.preview),
MakePreviewTask.onCancel[F] MakePreviewTask.onCancel[F]
) )
) )

View File

@ -1,13 +1,16 @@
package docspell.joex.preview package docspell.joex.preview
import fs2.{Chunk, Stream}
import docspell.common._
import cats.effect._ import cats.effect._
import cats.implicits._ import cats.implicits._
import docspell.store.queue.JobQueue import fs2.{Chunk, Stream}
import docspell.backend.JobFactory
import docspell.backend.ops.OJoex import docspell.backend.ops.OJoex
import docspell.joex.scheduler.Task import docspell.common.MakePreviewArgs.StoreMode
import docspell.common._
import docspell.joex.scheduler.Context import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queue.JobQueue
import docspell.store.records.RAttachment import docspell.store.records.RAttachment
import docspell.store.records.RJob import docspell.store.records.RJob
@ -33,7 +36,7 @@ object AllPreviewsTask {
queue: JobQueue[F] queue: JobQueue[F]
): F[Int] = ): F[Int] =
ctx.store ctx.store
.transact(RAttachment.findWithoutPreview(ctx.args.collective, 50)) .transact(findAttachments(ctx))
.chunks .chunks
.flatMap(createJobs[F](ctx)) .flatMap(createJobs[F](ctx))
.chunks .chunks
@ -42,6 +45,14 @@ object AllPreviewsTask {
.compile .compile
.foldMonoid .foldMonoid
// Selects the attachments to create preview jobs for, depending on the
// store mode of the task arguments: all attachments when previews are to
// be replaced, otherwise only those that have no preview.
private def findAttachments[F[_]](ctx: Context[F, Args]) = {
  val collective = ctx.args.collective
  val chunkSize  = 50

  ctx.args.storeMode match {
    case StoreMode.WhenMissing =>
      RAttachment.findWithoutPreview(collective, chunkSize)
    case StoreMode.Replace =>
      RAttachment.findAll(collective, chunkSize)
  }
}
private def createJobs[F[_]: Sync]( private def createJobs[F[_]: Sync](
ctx: Context[F, Args] ctx: Context[F, Args]
)(ras: Chunk[RAttachment]): Stream[F, RJob] = { )(ras: Chunk[RAttachment]): Stream[F, RJob] = {
@ -68,19 +79,6 @@ object AllPreviewsTask {
} }
def job[F[_]: Sync](storeMode: MakePreviewArgs.StoreMode, cid: Option[Ident]): F[RJob] = def job[F[_]: Sync](storeMode: MakePreviewArgs.StoreMode, cid: Option[Ident]): F[RJob] =
for { JobFactory.allPreviews(AllPreviewsArgs(cid, storeMode), None)
id <- Ident.randomId[F]
now <- Timestamp.current[F]
} yield RJob.newJob(
id,
AllPreviewsArgs.taskName,
cid.getOrElse(DocspellSystem.taskGroup),
AllPreviewsArgs(cid, storeMode),
"Create preview images",
now,
DocspellSystem.taskGroup,
Priority.Low,
Some(DocspellSystem.allPreviewTaskTracker)
)
} }

View File

@ -1,25 +1,27 @@
package docspell.joex.preview package docspell.joex.preview
import cats.implicits._
import cats.effect._ import cats.effect._
import cats.implicits._
import docspell.common._ import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.process.AttachmentPreview
import docspell.convert.ConvertConfig import docspell.convert.ConvertConfig
import docspell.extract.pdfbox.PdfboxPreview import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.process.AttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentPreview
object MakePreviewTask { object MakePreviewTask {
type Args = MakePreviewArgs type Args = MakePreviewArgs
def apply[F[_]: Sync](cfg: ConvertConfig): Task[F, Args, Unit] = def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] =
Task { ctx => Task { ctx =>
for { for {
exists <- previewExists(ctx) exists <- previewExists(ctx)
preview <- PdfboxPreview(30) preview <- PdfboxPreview(pcfg)
_ <- _ <-
if (exists) if (exists)
ctx.logger.info( ctx.logger.info(
@ -44,7 +46,9 @@ object MakePreviewTask {
ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment)) ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment))
_ <- ra _ <- ra
.map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize)) .map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize))
.getOrElse(().pure[F]) .getOrElse(
ctx.logger.warn(s"No attachment found with id: ${ctx.args.attachment}")
)
} yield () } yield ()
private def previewExists[F[_]: Sync](ctx: Context[F, Args]): F[Boolean] = private def previewExists[F[_]: Sync](ctx: Context[F, Args]): F[Boolean] =

View File

@ -9,13 +9,14 @@ import fs2.Stream
import docspell.common._ import docspell.common._
import docspell.convert._ import docspell.convert._
import docspell.extract.pdfbox.PdfboxPreview import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.scheduler._ import docspell.joex.scheduler._
import docspell.store.queries.QAttachment
import docspell.store.records.RAttachment import docspell.store.records.RAttachment
import docspell.store.records._ import docspell.store.records._
import docspell.store.syntax.MimeTypes._ import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef} import bitpeace.{Mimetype, MimetypeHint, RangeDef}
import docspell.store.queries.QAttachment
/** Goes through all attachments that must be already converted into a /** Goes through all attachments that must be already converted into a
* pdf. If it is a pdf, the first page is converted into a small * pdf. If it is a pdf, the first page is converted into a small
@ -23,7 +24,7 @@ import docspell.store.queries.QAttachment
*/ */
object AttachmentPreview { object AttachmentPreview {
def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)( def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig, pcfg: PreviewConfig)(
item: ItemData item: ItemData
): Task[F, ProcessItemArgs, ItemData] = ): Task[F, ProcessItemArgs, ItemData] =
Task { ctx => Task { ctx =>
@ -31,7 +32,7 @@ object AttachmentPreview {
_ <- ctx.logger.info( _ <- ctx.logger.info(
s"Creating preview images for ${item.attachments.size} files…" s"Creating preview images for ${item.attachments.size} files…"
) )
preview <- PdfboxPreview(24) preview <- PdfboxPreview(pcfg)
_ <- item.attachments _ <- item.attachments
.traverse(createPreview(ctx, preview, cfg.chunkSize)) .traverse(createPreview(ctx, preview, cfg.chunkSize))
.attempt .attempt

View File

@ -54,7 +54,7 @@ object ProcessItem {
ConvertPdf(cfg.convert, item) ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1)) .flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert)) .flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(Task.setProgress(progress._2)) .flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3)) .flatMap(Task.setProgress(progress._3))

View File

@ -2526,6 +2526,24 @@ paths:
schema: schema:
type: string type: string
format: binary format: binary
post:
tags: [ Attachment ]
summary: (Re)generate a preview image.
description: |
Submits a task that generates a preview image for this
attachment. The existing preview will be replaced.
security:
- authTokenHeader: []
parameters:
- $ref: "#/components/parameters/id"
responses:
200:
description: Ok
content:
application/json:
schema:
$ref: "#/components/schemas/BasicResult"
/sec/attachment/{id}/meta: /sec/attachment/{id}/meta:
get: get:
tags: [ Attachment ] tags: [ Attachment ]

View File

@ -7,6 +7,7 @@ import docspell.backend.BackendApp
import docspell.backend.auth.AuthToken import docspell.backend.auth.AuthToken
import docspell.backend.ops._ import docspell.backend.ops._
import docspell.common.Ident import docspell.common.Ident
import docspell.common.MakePreviewArgs
import docspell.restapi.model._ import docspell.restapi.model._
import docspell.restserver.conv.Conversions import docspell.restserver.conv.Conversions
import docspell.restserver.http4s.BinaryUtil import docspell.restserver.http4s.BinaryUtil
@ -129,6 +130,18 @@ object AttachmentRoutes {
.getOrElse(NotFound(BasicResult(false, "Not found"))) .getOrElse(NotFound(BasicResult(false, "Not found")))
} yield resp } yield resp
case POST -> Root / Ident(id) / "preview" =>
for {
res <- backend.item.generatePreview(
MakePreviewArgs.replace(id),
user.account,
true
)
resp <- Ok(
Conversions.basicResult(res, "Generating preview image task submitted.")
)
} yield resp
case GET -> Root / Ident(id) / "view" => case GET -> Root / Ident(id) / "view" =>
// this route exists to provide a stable url // this route exists to provide a stable url
// it redirects currently to viewerjs // it redirects currently to viewerjs

View File

@ -6,6 +6,7 @@ import cats.implicits._
import docspell.backend.BackendApp import docspell.backend.BackendApp
import docspell.backend.auth.AuthToken import docspell.backend.auth.AuthToken
import docspell.backend.ops.OCollective import docspell.backend.ops.OCollective
import docspell.common.MakePreviewArgs
import docspell.restapi.model._ import docspell.restapi.model._
import docspell.restserver.conv.Conversions import docspell.restserver.conv.Conversions
import docspell.restserver.http4s._ import docspell.restserver.http4s._
@ -94,6 +95,18 @@ object CollectiveRoutes {
resp <- Ok(BasicResult(true, "Task submitted")) resp <- Ok(BasicResult(true, "Task submitted"))
} yield resp } yield resp
case POST -> Root / "previews" =>
for {
res <- backend.collective.generatePreviews(
MakePreviewArgs.StoreMode.Replace,
user.account,
true
)
resp <- Ok(
Conversions.basicResult(res, "Generate all previews task submitted.")
)
} yield resp
case GET -> Root => case GET -> Root =>
for { for {
collDb <- backend.collective.find(user.account.collective) collDb <- backend.collective.find(user.account.collective)

View File

@ -26,9 +26,9 @@ object QAttachment {
Stream Stream
.evalSeq(store.transact(findPreview)) .evalSeq(store.transact(findPreview))
.map(_.fileId.id) .map(_.fileId.id)
.evalTap(_ => store.transact(RAttachmentPreview.delete(attachId)))
.flatMap(store.bitpeace.delete) .flatMap(store.bitpeace.delete)
.map(flag => if (flag) 1 else 0) .map(flag => if (flag) 1 else 0)
.evalMap(_ => store.transact(RAttachmentPreview.delete(attachId)))
.compile .compile
.foldMonoid .foldMonoid
} }

View File

@ -231,6 +231,30 @@ object RAttachment {
def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] = def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] =
selectSimple(Seq(itemId), table, id.is(attachId)).query[Ident].option selectSimple(Seq(itemId), table, id.is(attachId)).query[Ident].option
/** Streams attachments, optionally restricted to those whose item
  * belongs to the given collective.
  *
  * @param coll if defined, only attachments of items of this collective
  *   are returned; otherwise all attachments are returned
  * @param chunkSize the chunk size used for streaming the result
  */
def findAll(
    coll: Option[Ident],
    chunkSize: Int
): Stream[ConnectionIO, RAttachment] = {
  val aItem = Columns.itemId.prefix("a")
  val iId   = RItem.Columns.id.prefix("i")
  val iColl = RItem.Columns.cid.prefix("i")

  // All selected columns are qualified with the alias `a`, so the FROM
  // clause must introduce that alias in both branches. Previously the
  // branch without a collective selected from the bare table.
  val cols        = all.map(_.prefix("a"))
  val attachments = table ++ fr"a"

  val select = coll match {
    case Some(cid) =>
      val join =
        attachments ++ fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem)
      selectSimple(cols, join, iColl.is(cid))
    case None =>
      selectSimple(cols, attachments, Fragment.empty)
  }

  select
    .query[RAttachment]
    .streamWithChunkSize(chunkSize)
}
def findWithoutPreview( def findWithoutPreview(
coll: Option[Ident], coll: Option[Ident],
chunkSize: Int chunkSize: Int

View File

@ -97,7 +97,7 @@
background: #fff; background: #fff;
} }
.default-layout img.preview-image { .default-layout img.preview-image {
max-width: 200px; max-width: 160px;
margin-left: auto; margin-left: auto;
margin-right: auto; margin-right: auto;
} }