Allow subsampling when generating preview images

This cuts down memory usage considerably when PDFs contain high-dpi images.
The test file, scanned at 600dpi and resulting in a 5.4M pdf file,
contains a 9900x13800 image. PDFBox loads this image into memory in order
to scale it down, which easily results in out-of-memory
errors (this image alone already requires ~400M). With subsampling the size
is reduced by a factor of at most 8. It is still recommended to avoid
high-dpi image-only scans for text-based documents, or to increase the heap
size for joex.
This commit is contained in:
eikek 2022-01-12 22:43:08 +01:00
parent c9eabd087a
commit bc1ec90b6e

View File

@ -6,10 +6,13 @@
package docspell.extract.pdfbox
import java.awt.RenderingHints
import java.awt.image.BufferedImage
import java.awt.image.RenderedImage
import javax.imageio.ImageIO
import scala.jdk.CollectionConverters._
import cats.effect._
import cats.implicits._
import fs2.Chunk
@ -17,6 +20,8 @@ import fs2.Stream
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import org.apache.pdfbox.rendering.PDFRenderer
import scodec.bits.ByteVector
@ -27,6 +32,7 @@ trait PdfboxPreview[F[_]] {
}
object PdfboxPreview {
private[this] val logger = org.log4s.getLogger
def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] =
Sync[F].pure(new PdfboxPreview[F] {
@ -50,14 +56,52 @@ object PdfboxPreview {
if (count <= 0 || page < 0 || count <= page) None
else {
val renderer = new PDFRenderer(pdoc)
renderer.setImageDownscalingOptimizationThreshold(0.85f)
val hints = new RenderingHints(
RenderingHints.KEY_RENDERING,
RenderingHints.VALUE_RENDER_QUALITY
)
hints.put(
RenderingHints.KEY_COLOR_RENDERING,
RenderingHints.VALUE_COLOR_RENDER_QUALITY
)
hints.put(
RenderingHints.KEY_INTERPOLATION,
RenderingHints.VALUE_INTERPOLATION_BICUBIC
)
hints.put(
RenderingHints.KEY_TEXT_ANTIALIASING,
RenderingHints.VALUE_TEXT_ANTIALIAS_ON
)
hints.put(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
renderer.setRenderingHints(hints)
renderer.setSubsamplingAllowed(enableSubsampling(pdoc.getPage(page)))
logger.debug(s"Converting pdf page $page to image at dpi $dpi")
Option(renderer.renderImageWithDPI(page, dpi))
}
}
/** Decides whether the renderer may subsample images on the given page.
  *
  * Looks at every image XObject referenced directly by the page's
  * resources and returns `true` when the largest one has more than
  * 10 * 1024 * 1024 pixels. Pages without resources or without any image
  * XObject yield `false`.
  *
  * @param page the pdf page about to be rendered
  * @return whether subsampling should be allowed for this page
  */
private def enableSubsampling(page: PDPage): Boolean = {
  // Kept as Long so the comparison below is done on un-truncated pixel counts.
  val maxPixels = 10L * 1024 * 1024

  // PDFBox may hand back null when a page carries no resource dictionary;
  // treat that like "no images" instead of failing with an NPE.
  Option(page.getResources()).exists { res =>
    res
      .getXObjectNames()
      .asScala
      .map(name => res.getXObject(name))
      .collect { case xobj: PDImageXObject => xobj }
      .map { imgobj =>
        val w = imgobj.getWidth()
        val h = imgobj.getHeight()
        // Widen before multiplying: an Int product overflows for very large
        // images and would wrongly turn subsampling off for them.
        val pixels = w.toLong * h.toLong
        logger.debug(s"Found image in pdf of size ${w}x${h} (${pixels}px)")
        pixels
      }
      .maxOption
      .exists(_ > maxPixels)
  }
}
/** Encodes the given image as PNG into an in-memory buffer and exposes
  * the resulting bytes as a single-chunk stream.
  *
  * The encoding runs eagerly when this method is called, not when the
  * returned stream is evaluated.
  */
private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
  val buffer = new ByteArrayOutputStream()
  ImageIO.write(img, "PNG", buffer)
  val encoded = ByteVector.view(buffer.toByteArray())
  val chunk = Chunk.byteVector(encoded)
  Stream.chunk(chunk)
}
}