Merge pull request #1282 from eikek/fix/high-dpi

Allow subsampling when generating preview images
This commit is contained in:
mergify[bot] 2022-01-12 23:22:00 +00:00 committed by GitHub
commit 147b6f9023
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -6,10 +6,13 @@
package docspell.extract.pdfbox package docspell.extract.pdfbox
import java.awt.RenderingHints
import java.awt.image.BufferedImage import java.awt.image.BufferedImage
import java.awt.image.RenderedImage import java.awt.image.RenderedImage
import javax.imageio.ImageIO import javax.imageio.ImageIO
import scala.jdk.CollectionConverters._
import cats.effect._ import cats.effect._
import cats.implicits._ import cats.implicits._
import fs2.Chunk import fs2.Chunk
@ -17,6 +20,8 @@ import fs2.Stream
import org.apache.commons.io.output.ByteArrayOutputStream import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import org.apache.pdfbox.rendering.PDFRenderer import org.apache.pdfbox.rendering.PDFRenderer
import scodec.bits.ByteVector import scodec.bits.ByteVector
@ -27,6 +32,7 @@ trait PdfboxPreview[F[_]] {
} }
object PdfboxPreview { object PdfboxPreview {
private[this] val logger = org.log4s.getLogger
def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] = def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] =
Sync[F].pure(new PdfboxPreview[F] { Sync[F].pure(new PdfboxPreview[F] {
@ -50,14 +56,52 @@ object PdfboxPreview {
if (count <= 0 || page < 0 || count <= page) None if (count <= 0 || page < 0 || count <= page) None
else { else {
val renderer = new PDFRenderer(pdoc) val renderer = new PDFRenderer(pdoc)
renderer.setImageDownscalingOptimizationThreshold(0.85f)
val hints = new RenderingHints(
RenderingHints.KEY_RENDERING,
RenderingHints.VALUE_RENDER_QUALITY
)
hints.put(
RenderingHints.KEY_COLOR_RENDERING,
RenderingHints.VALUE_COLOR_RENDER_QUALITY
)
hints.put(
RenderingHints.KEY_INTERPOLATION,
RenderingHints.VALUE_INTERPOLATION_BICUBIC
)
hints.put(
RenderingHints.KEY_TEXT_ANTIALIASING,
RenderingHints.VALUE_TEXT_ANTIALIAS_ON
)
hints.put(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
renderer.setRenderingHints(hints)
renderer.setSubsamplingAllowed(enableSubsampling(pdoc.getPage(page)))
logger.debug(s"Converting pdf page $page to image at dpi $dpi")
Option(renderer.renderImageWithDPI(page, dpi)) Option(renderer.renderImageWithDPI(page, dpi))
} }
} }
private def enableSubsampling(page: PDPage): Boolean = {
val res = page.getResources()
val largestImage =
res
.getXObjectNames()
.asScala
.map(name => res.getXObject(name))
.collect { case xobj: PDImageXObject => xobj }
.map { imgobj =>
val w = imgobj.getWidth()
val h = imgobj.getHeight()
logger.debug(s"Found image in pdf of size ${w}x${h} (${w * h}px)")
w * h
}
.maxOption
largestImage.exists(_ > 10 * 1024 * 1024)
}
private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = { private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
val out = new ByteArrayOutputStream() val out = new ByteArrayOutputStream()
ImageIO.write(img, "PNG", out) ImageIO.write(img, "PNG", out)
Stream.chunk(Chunk.byteVector(ByteVector.view(out.toByteArray()))) Stream.chunk(Chunk.byteVector(ByteVector.view(out.toByteArray())))
} }
} }