Provide endpoints to submit tasks to re-generate previews

The scaling factor can be given in the config file. When this changes,
images can be regenerated via POSTing to certain endpoints. It is
possible to regenerate just one attachment preview or all within a
collective.
This commit is contained in:
Eike Kettner 2020-11-09 01:18:48 +01:00
parent 6037b54959
commit f4e50c5229
20 changed files with 218 additions and 38 deletions

View File

@ -8,6 +8,45 @@ import docspell.store.records.RJob
object JobFactory {
/** Builds a low-priority job that generates the preview image of a
  * single attachment.
  *
  * The job is only constructed here; inserting it into a queue is
  * left to the caller.
  *
  * @param args    task arguments naming the attachment to process
  * @param account the submitting account; when absent, the job is
  *                filed under `DocspellSystem.taskGroup` and
  *                submitted by `DocspellSystem.user`
  */
def makePreview[F[_]: Sync](
    args: MakePreviewArgs,
    account: Option[AccountId]
): F[RJob] =
  for {
    id  <- Ident.randomId[F]
    now <- Timestamp.current[F]
  } yield RJob.newJob(
    id,
    MakePreviewArgs.taskName,
    account.map(_.collective).getOrElse(DocspellSystem.taskGroup),
    args,
    // constant text, no interpolator required
    "Generate preview image",
    now,
    account.map(_.user).getOrElse(DocspellSystem.user),
    Priority.Low,
    // presumably the tracker id; it is unique per attachment
    Some(MakePreviewArgs.taskName / args.attachment)
  )
/** Builds a low-priority job that creates preview images for many
  * attachments at once.
  *
  * The job is only constructed here; inserting it into a queue is
  * left to the caller.
  *
  * @param args      task arguments; `args.collective` selects the job
  *                  group and falls back to `DocspellSystem.taskGroup`
  * @param submitter the user submitting the job; when absent, the
  *                  system user is recorded
  */
def allPreviews[F[_]: Sync](
    args: AllPreviewsArgs,
    submitter: Option[Ident]
): F[RJob] =
  for {
    id  <- Ident.randomId[F]
    now <- Timestamp.current[F]
  } yield RJob.newJob(
    id,
    AllPreviewsArgs.taskName,
    args.collective.getOrElse(DocspellSystem.taskGroup),
    args,
    "Create preview images",
    now,
    // the submitter slot takes a user id; fall back to the system
    // user (as `makePreview` does) instead of the task group id
    submitter.getOrElse(DocspellSystem.user),
    Priority.Low,
    Some(DocspellSystem.allPreviewTaskTracker)
  )
def convertAllPdfs[F[_]: Sync](
collective: Option[Ident],
account: AccountId,

View File

@ -4,9 +4,11 @@ import cats.effect.{Effect, Resource}
import cats.implicits._
import fs2.Stream
import docspell.backend.JobFactory
import docspell.backend.PasswordCrypt
import docspell.backend.ops.OCollective._
import docspell.common._
import docspell.store.UpdateResult
import docspell.store.queries.QCollective
import docspell.store.queue.JobQueue
import docspell.store.records._
@ -51,6 +53,15 @@ trait OCollective[F[_]] {
def findEnabledSource(sourceId: Ident): F[Option[RSource]]
def startLearnClassifier(collective: Ident): F[Unit]
/** Submits a task that (re)generates the preview images for all
* attachments of the given collective.
*
* @param storeMode passed on to the task arguments; either
*   `Replace` or `WhenMissing`
* @param account supplies the collective whose attachments are
*   processed and the user recorded as submitter
* @param notifyJoex when true, all joex nodes are notified once the
*   job has been queued
*/
def generatePreviews(
storeMode: MakePreviewArgs.StoreMode,
account: AccountId,
notifyJoex: Boolean
): F[UpdateResult]
}
object OCollective {
@ -210,5 +221,20 @@ object OCollective {
/** Runs `RSource.findEnabled` for the given source id inside a store
* transaction.
*/
def findEnabledSource(sourceId: Ident): F[Option[RSource]] =
store.transact(RSource.findEnabled(sourceId))
/** Queues the "all previews" job for the collective of `account`,
  * submitted by the account's user, and optionally notifies the joex
  * nodes afterwards.
  *
  * The job is inserted with `insertIfNew`; the result is always
  * `UpdateResult.success` once the effects have run.
  */
def generatePreviews(
    storeMode: MakePreviewArgs.StoreMode,
    account: AccountId,
    notifyJoex: Boolean
): F[UpdateResult] =
  JobFactory
    .allPreviews[F](
      AllPreviewsArgs(account.collective.some, storeMode),
      account.user.some
    )
    .flatMap(previewsJob => queue.insertIfNew(previewsJob))
    .flatMap(_ => if (notifyJoex) joex.notifyAllNodes else ().pure[F])
    .map(_ => UpdateResult.success)
})
}

View File

@ -175,6 +175,15 @@ trait OItem[F[_]] {
account: AccountId,
notifyJoex: Boolean
): F[UpdateResult]
/** Submits a task that (re)generates the preview image for an
* attachment.
*
* @param args names the attachment and the store mode to use
* @param account the account the job is created for
* @param notifyJoex when true, all joex nodes are notified once the
*   job has been queued
*/
def generatePreview(
args: MakePreviewArgs,
account: AccountId,
notifyJoex: Boolean
): F[UpdateResult]
}
object OItem {
@ -656,6 +665,17 @@ object OItem {
_ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F]
} yield UpdateResult.success
/** Queues a preview job for the attachment named in `args` on behalf
  * of `account` and optionally notifies the joex nodes afterwards.
  *
  * The job is inserted with `insertIfNew`; the result is always
  * `UpdateResult.success` once the effects have run.
  */
def generatePreview(
    args: MakePreviewArgs,
    account: AccountId,
    notifyJoex: Boolean
): F[UpdateResult] =
  JobFactory
    .makePreview[F](args, Some(account))
    .flatMap(previewJob => queue.insertIfNew(previewJob))
    .flatMap(_ => if (notifyJoex) joex.notifyAllNodes else ().pure[F])
    .map(_ => UpdateResult.success)
private def onSuccessIgnoreError(update: F[Unit])(ar: UpdateResult): F[Unit] =
ar match {
case UpdateResult.Success =>

View File

@ -18,6 +18,12 @@ object MakePreviewArgs {
val taskName = Ident.unsafe("make-preview")
/** Arguments for the given attachment using `StoreMode.Replace`. */
def replace(attach: Ident): MakePreviewArgs =
MakePreviewArgs(attach, StoreMode.Replace)
/** Arguments for the given attachment using `StoreMode.WhenMissing`. */
def whenMissing(attach: Ident): MakePreviewArgs =
MakePreviewArgs(attach, StoreMode.WhenMissing)
sealed trait StoreMode extends Product {
final def name: String =
productPrefix.toLowerCase()

View File

@ -1,5 +1,6 @@
package docspell.extract
import docspell.extract.ocr.OcrConfig
import docspell.extract.pdfbox.PreviewConfig
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig, preview: PreviewConfig)

View File

@ -21,11 +21,13 @@ trait PdfboxPreview[F[_]] {
object PdfboxPreview {
def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] =
def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] =
Sync[F].pure(new PdfboxPreview[F] {
def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] =
PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi)))
PdfLoader.withDocumentStream(pdf)(doc =>
Sync[F].delay(getPageImage(doc, 0, cfg.dpi))
)
def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] =
previewImage(pdf).map(_.map(pngStream[F]))

View File

@ -0,0 +1,3 @@
package docspell.extract.pdfbox
/** Settings for creating preview images.
*
* @param dpi the resolution handed to the pdf page rendering
*/
case class PreviewConfig(dpi: Float)

View File

@ -21,7 +21,7 @@ object PdfboxPreviewTest extends SimpleTestSuite {
val data = file.readURL[IO](8192, blocker)
val sha256out =
Stream
.eval(PdfboxPreview[IO](48))
.eval(PdfboxPreview[IO](PreviewConfig(48)))
.evalMap(_.previewPNG(data))
.flatMap(_.get)
.through(fs2.hash.sha256)

View File

@ -172,6 +172,18 @@ docspell.joex {
min-text-len = 500
}
preview {
# The dpi to use when rendering a pdf page; it determines the size
# of the resulting image. A standard A4 page rendered at 96dpi
# results in an image of roughly 790x1100px. Using 32 results in
# an image of roughly 265x370px.
#
# Note: when this value is changed, you may want to re-generate
# the existing preview images. The REST api provides an endpoint
# that regenerates all previews of a collective.
dpi = 32
}
# Extracting text using OCR works for image and pdf files. It will
# first run ghostscript to create a gray image from a pdf. Then
# unpaper is run to optimize the image for the upcoming ocr, which

View File

@ -174,7 +174,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
MakePreviewArgs.taskName,
MakePreviewTask[F](cfg.convert),
MakePreviewTask[F](cfg.convert, cfg.extraction.preview),
MakePreviewTask.onCancel[F]
)
)

View File

@ -1,13 +1,16 @@
package docspell.joex.preview
import fs2.{Chunk, Stream}
import docspell.common._
import cats.effect._
import cats.implicits._
import docspell.store.queue.JobQueue
import fs2.{Chunk, Stream}
import docspell.backend.JobFactory
import docspell.backend.ops.OJoex
import docspell.joex.scheduler.Task
import docspell.common.MakePreviewArgs.StoreMode
import docspell.common._
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queue.JobQueue
import docspell.store.records.RAttachment
import docspell.store.records.RJob
@ -33,7 +36,7 @@ object AllPreviewsTask {
queue: JobQueue[F]
): F[Int] =
ctx.store
.transact(RAttachment.findWithoutPreview(ctx.args.collective, 50))
.transact(findAttachments(ctx))
.chunks
.flatMap(createJobs[F](ctx))
.chunks
@ -42,6 +45,14 @@ object AllPreviewsTask {
.compile
.foldMonoid
/** Selects the attachments to create jobs for, depending on the
  * store mode of the task arguments: every attachment for `Replace`,
  * only those without a preview for `WhenMissing`. Results are
  * streamed in chunks of 50.
  */
private def findAttachments[F[_]](ctx: Context[F, Args]) = {
  val collective = ctx.args.collective
  val chunkSize  = 50

  ctx.args.storeMode match {
    case StoreMode.WhenMissing =>
      RAttachment.findWithoutPreview(collective, chunkSize)
    case StoreMode.Replace =>
      RAttachment.findAll(collective, chunkSize)
  }
}
private def createJobs[F[_]: Sync](
ctx: Context[F, Args]
)(ras: Chunk[RAttachment]): Stream[F, RJob] = {
@ -68,19 +79,6 @@ object AllPreviewsTask {
}
def job[F[_]: Sync](storeMode: MakePreviewArgs.StoreMode, cid: Option[Ident]): F[RJob] =
for {
id <- Ident.randomId[F]
now <- Timestamp.current[F]
} yield RJob.newJob(
id,
AllPreviewsArgs.taskName,
cid.getOrElse(DocspellSystem.taskGroup),
AllPreviewsArgs(cid, storeMode),
"Create preview images",
now,
DocspellSystem.taskGroup,
Priority.Low,
Some(DocspellSystem.allPreviewTaskTracker)
)
JobFactory.allPreviews(AllPreviewsArgs(cid, storeMode), None)
}

View File

@ -1,25 +1,27 @@
package docspell.joex.preview
import cats.implicits._
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.process.AttachmentPreview
import docspell.convert.ConvertConfig
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.process.AttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentPreview
object MakePreviewTask {
type Args = MakePreviewArgs
def apply[F[_]: Sync](cfg: ConvertConfig): Task[F, Args, Unit] =
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] =
Task { ctx =>
for {
exists <- previewExists(ctx)
preview <- PdfboxPreview(30)
preview <- PdfboxPreview(pcfg)
_ <-
if (exists)
ctx.logger.info(
@ -44,7 +46,9 @@ object MakePreviewTask {
ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment))
_ <- ra
.map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize))
.getOrElse(().pure[F])
.getOrElse(
ctx.logger.warn(s"No attachment found with id: ${ctx.args.attachment}")
)
} yield ()
private def previewExists[F[_]: Sync](ctx: Context[F, Args]): F[Boolean] =

View File

@ -9,13 +9,14 @@ import fs2.Stream
import docspell.common._
import docspell.convert._
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.scheduler._
import docspell.store.queries.QAttachment
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
import docspell.store.queries.QAttachment
/** Goes through all attachments that must be already converted into a
* pdf. If it is a pdf, the first page is converted into a small
@ -23,7 +24,7 @@ import docspell.store.queries.QAttachment
*/
object AttachmentPreview {
def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)(
def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig, pcfg: PreviewConfig)(
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
@ -31,7 +32,7 @@ object AttachmentPreview {
_ <- ctx.logger.info(
s"Creating preview images for ${item.attachments.size} files…"
)
preview <- PdfboxPreview(24)
preview <- PdfboxPreview(pcfg)
_ <- item.attachments
.traverse(createPreview(ctx, preview, cfg.chunkSize))
.attempt

View File

@ -54,7 +54,7 @@ object ProcessItem {
ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert))
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3))

View File

@ -2526,6 +2526,24 @@ paths:
schema:
type: string
format: binary
post:
tags: [ Attachment ]
summary: (Re)generate a preview image.
description: |
Submits a task that generates a preview image for this
attachment. The existing preview will be replaced.
security:
- authTokenHeader: []
parameters:
- $ref: "#/components/parameters/id"
responses:
200:
description: Ok
content:
application/json:
schema:
$ref: "#/components/schemas/BasicResult"
/sec/attachment/{id}/meta:
get:
tags: [ Attachment ]

View File

@ -7,6 +7,7 @@ import docspell.backend.BackendApp
import docspell.backend.auth.AuthToken
import docspell.backend.ops._
import docspell.common.Ident
import docspell.common.MakePreviewArgs
import docspell.restapi.model._
import docspell.restserver.conv.Conversions
import docspell.restserver.http4s.BinaryUtil
@ -129,6 +130,18 @@ object AttachmentRoutes {
.getOrElse(NotFound(BasicResult(false, "Not found")))
} yield resp
// Submits a job that regenerates the preview of one attachment,
// replacing an existing one, and notifies the joex nodes.
case POST -> Root / Ident(id) / "preview" =>
  backend.item
    .generatePreview(
      MakePreviewArgs.replace(id),
      user.account,
      notifyJoex = true
    )
    .flatMap(outcome =>
      Ok(Conversions.basicResult(outcome, "Generating preview image task submitted."))
    )
case GET -> Root / Ident(id) / "view" =>
// this route exists to provide a stable url
// it redirects currently to viewerjs

View File

@ -6,6 +6,7 @@ import cats.implicits._
import docspell.backend.BackendApp
import docspell.backend.auth.AuthToken
import docspell.backend.ops.OCollective
import docspell.common.MakePreviewArgs
import docspell.restapi.model._
import docspell.restserver.conv.Conversions
import docspell.restserver.http4s._
@ -94,6 +95,18 @@ object CollectiveRoutes {
resp <- Ok(BasicResult(true, "Task submitted"))
} yield resp
// Submits a job that regenerates the previews of all attachments of
// the user's collective, replacing existing ones, and notifies the
// joex nodes.
case POST -> Root / "previews" =>
  backend.collective
    .generatePreviews(
      MakePreviewArgs.StoreMode.Replace,
      user.account,
      notifyJoex = true
    )
    .flatMap(outcome =>
      Ok(Conversions.basicResult(outcome, "Generate all previews task submitted."))
    )
case GET -> Root =>
for {
collDb <- backend.collective.find(user.account.collective)

View File

@ -26,9 +26,9 @@ object QAttachment {
Stream
.evalSeq(store.transact(findPreview))
.map(_.fileId.id)
.evalTap(_ => store.transact(RAttachmentPreview.delete(attachId)))
.flatMap(store.bitpeace.delete)
.map(flag => if (flag) 1 else 0)
.evalMap(_ => store.transact(RAttachmentPreview.delete(attachId)))
.compile
.foldMonoid
}

View File

@ -231,6 +231,30 @@ object RAttachment {
/** Selects the item id column of the attachment with the given id;
* yields `None` when no row matches.
*/
def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] =
selectSimple(Seq(itemId), table, id.is(attachId)).query[Ident].option
/** Streams attachments, either those belonging to items of the given
  * collective or, when no collective is given, all of them.
  *
  * @param coll      restricts the result to attachments whose item
  *                  belongs to this collective; `None` means no
  *                  restriction
  * @param chunkSize the chunk size used when streaming the result
  */
def findAll(
    coll: Option[Ident],
    chunkSize: Int
): Stream[ConnectionIO, RAttachment] = {
  val aItem = Columns.itemId.prefix("a")
  val iId   = RItem.Columns.id.prefix("i")
  val iColl = RItem.Columns.cid.prefix("i")

  // All selected columns are qualified with alias "a", so every FROM
  // clause below must declare that alias.
  val cols = all.map(_.prefix("a"))

  coll match {
    case Some(cid) =>
      val join = table ++ fr"a INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem)
      val cond = iColl.is(cid)
      selectSimple(cols, join, cond)
        .query[RAttachment]
        .streamWithChunkSize(chunkSize)
    case None =>
      // Without the alias the query would select `a.<col>` from an
      // unaliased table, which is not valid SQL.
      selectSimple(cols, table ++ fr"a", Fragment.empty)
        .query[RAttachment]
        .streamWithChunkSize(chunkSize)
  }
}
def findWithoutPreview(
coll: Option[Ident],
chunkSize: Int

View File

@ -97,7 +97,7 @@
background: #fff;
}
.default-layout img.preview-image {
max-width: 200px;
max-width: 160px;
margin-left: auto;
margin-right: auto;
}