Provide endpoints to submit tasks to re-generate previews

The scaling factor (dpi) for preview images can be set in the config
file. When it changes, existing previews can be regenerated by POSTing
to dedicated endpoints. It is possible to regenerate the preview of a
single attachment or the previews of all attachments within a
collective.
This commit is contained in:
Eike Kettner
2020-11-09 01:18:48 +01:00
parent 6037b54959
commit f4e50c5229
20 changed files with 218 additions and 38 deletions

View File

@ -172,6 +172,18 @@ docspell.joex {
min-text-len = 500
}
preview {
# When rendering a pdf page, use this dpi. This results in
# scaling the image. A standard A4 page rendered at 96dpi
# results in a roughly 790x1100px image. Using 32 results in
# a roughly 265x375px image.
#
# Note: when this is changed, you might want to re-generate the
# preview images. Check the API for this; there is an endpoint
# to regenerate all previews for a collective.
dpi = 32
}
# Extracting text using OCR works for image and pdf files. It will
# first run ghostscript to create a gray image from a pdf. Then
# unpaper is run to optimize the image for the upcoming ocr, which

View File

@ -174,7 +174,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
MakePreviewArgs.taskName,
MakePreviewTask[F](cfg.convert),
MakePreviewTask[F](cfg.convert, cfg.extraction.preview),
MakePreviewTask.onCancel[F]
)
)

View File

@ -1,13 +1,16 @@
package docspell.joex.preview
import fs2.{Chunk, Stream}
import docspell.common._
import cats.effect._
import cats.implicits._
import docspell.store.queue.JobQueue
import fs2.{Chunk, Stream}
import docspell.backend.JobFactory
import docspell.backend.ops.OJoex
import docspell.joex.scheduler.Task
import docspell.common.MakePreviewArgs.StoreMode
import docspell.common._
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queue.JobQueue
import docspell.store.records.RAttachment
import docspell.store.records.RJob
@ -33,7 +36,7 @@ object AllPreviewsTask {
queue: JobQueue[F]
): F[Int] =
ctx.store
.transact(RAttachment.findWithoutPreview(ctx.args.collective, 50))
.transact(findAttachments(ctx))
.chunks
.flatMap(createJobs[F](ctx))
.chunks
@ -42,6 +45,14 @@ object AllPreviewsTask {
.compile
.foldMonoid
/** Picks the attachment query that matches the task's store mode.
  *
  * With `StoreMode.Replace` the query comes from `RAttachment.findAll`,
  * with `StoreMode.WhenMissing` from `RAttachment.findWithoutPreview`.
  * Both are called with the collective taken from the task arguments
  * and a fixed second argument of 50 (presumably a chunk size; confirm
  * against `RAttachment`).
  */
private def findAttachments[F[_]](ctx: Context[F, Args]) = {
  val args = ctx.args
  args.storeMode match {
    case StoreMode.WhenMissing =>
      RAttachment.findWithoutPreview(args.collective, 50)
    case StoreMode.Replace =>
      RAttachment.findAll(args.collective, 50)
  }
}
private def createJobs[F[_]: Sync](
ctx: Context[F, Args]
)(ras: Chunk[RAttachment]): Stream[F, RJob] = {
@ -68,19 +79,6 @@ object AllPreviewsTask {
}
def job[F[_]: Sync](storeMode: MakePreviewArgs.StoreMode, cid: Option[Ident]): F[RJob] =
for {
id <- Ident.randomId[F]
now <- Timestamp.current[F]
} yield RJob.newJob(
id,
AllPreviewsArgs.taskName,
cid.getOrElse(DocspellSystem.taskGroup),
AllPreviewsArgs(cid, storeMode),
"Create preview images",
now,
DocspellSystem.taskGroup,
Priority.Low,
Some(DocspellSystem.allPreviewTaskTracker)
)
JobFactory.allPreviews(AllPreviewsArgs(cid, storeMode), None)
}

View File

@ -1,25 +1,27 @@
package docspell.joex.preview
import cats.implicits._
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.process.AttachmentPreview
import docspell.convert.ConvertConfig
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.process.AttachmentPreview
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentPreview
object MakePreviewTask {
type Args = MakePreviewArgs
def apply[F[_]: Sync](cfg: ConvertConfig): Task[F, Args, Unit] =
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] =
Task { ctx =>
for {
exists <- previewExists(ctx)
preview <- PdfboxPreview(30)
preview <- PdfboxPreview(pcfg)
_ <-
if (exists)
ctx.logger.info(
@ -44,7 +46,9 @@ object MakePreviewTask {
ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment))
_ <- ra
.map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize))
.getOrElse(().pure[F])
.getOrElse(
ctx.logger.warn(s"No attachment found with id: ${ctx.args.attachment}")
)
} yield ()
private def previewExists[F[_]: Sync](ctx: Context[F, Args]): F[Boolean] =

View File

@ -9,13 +9,14 @@ import fs2.Stream
import docspell.common._
import docspell.convert._
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.scheduler._
import docspell.store.queries.QAttachment
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
import docspell.store.queries.QAttachment
/** Goes through all attachments that must be already converted into a
* pdf. If it is a pdf, the first page is converted into a small
@ -23,7 +24,7 @@ import docspell.store.queries.QAttachment
*/
object AttachmentPreview {
def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)(
def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig, pcfg: PreviewConfig)(
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
@ -31,7 +32,7 @@ object AttachmentPreview {
_ <- ctx.logger.info(
s"Creating preview images for ${item.attachments.size} files…"
)
preview <- PdfboxPreview(24)
preview <- PdfboxPreview(pcfg)
_ <- item.attachments
.traverse(createPreview(ctx, preview, cfg.chunkSize))
.attempt

View File

@ -54,7 +54,7 @@ object ProcessItem {
ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert))
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3))