mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-02 09:05:08 +00:00
Allow subsampling when generating preview images
This cuts down memory usage considerably when high-dpi images are provided in pdfs. The test file, scanned at 600dpi and resulting in a 5.4M pdf file, contains a 9900x13800 image. PDFBox loads this image into memory in order to scale it down, which easily results in out-of-memory errors (this image alone already requires ~400M). With subsampling, the size is reduced by a factor of at most 8. It is still recommended to avoid high-dpi image-only scans for text-based documents, or to increase the heap size for joex.
This commit is contained in:
parent
c9eabd087a
commit
bc1ec90b6e
@ -6,10 +6,13 @@
|
||||
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import java.awt.RenderingHints
|
||||
import java.awt.image.BufferedImage
|
||||
import java.awt.image.RenderedImage
|
||||
import javax.imageio.ImageIO
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.Chunk
|
||||
@ -17,6 +20,8 @@ import fs2.Stream
|
||||
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.pdmodel.PDPage
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
|
||||
import org.apache.pdfbox.rendering.PDFRenderer
|
||||
import scodec.bits.ByteVector
|
||||
|
||||
@ -27,6 +32,7 @@ trait PdfboxPreview[F[_]] {
|
||||
}
|
||||
|
||||
object PdfboxPreview {
|
||||
private[this] val logger = org.log4s.getLogger
|
||||
|
||||
def apply[F[_]: Sync](cfg: PreviewConfig): F[PdfboxPreview[F]] =
|
||||
Sync[F].pure(new PdfboxPreview[F] {
|
||||
@ -50,14 +56,52 @@ object PdfboxPreview {
|
||||
if (count <= 0 || page < 0 || count <= page) None
|
||||
else {
|
||||
val renderer = new PDFRenderer(pdoc)
|
||||
renderer.setImageDownscalingOptimizationThreshold(0.85f)
|
||||
val hints = new RenderingHints(
|
||||
RenderingHints.KEY_RENDERING,
|
||||
RenderingHints.VALUE_RENDER_QUALITY
|
||||
)
|
||||
hints.put(
|
||||
RenderingHints.KEY_COLOR_RENDERING,
|
||||
RenderingHints.VALUE_COLOR_RENDER_QUALITY
|
||||
)
|
||||
hints.put(
|
||||
RenderingHints.KEY_INTERPOLATION,
|
||||
RenderingHints.VALUE_INTERPOLATION_BICUBIC
|
||||
)
|
||||
hints.put(
|
||||
RenderingHints.KEY_TEXT_ANTIALIASING,
|
||||
RenderingHints.VALUE_TEXT_ANTIALIAS_ON
|
||||
)
|
||||
hints.put(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
|
||||
renderer.setRenderingHints(hints)
|
||||
renderer.setSubsamplingAllowed(enableSubsampling(pdoc.getPage(page)))
|
||||
logger.debug(s"Converting pdf page $page to image at dpi $dpi")
|
||||
Option(renderer.renderImageWithDPI(page, dpi))
|
||||
}
|
||||
}
|
||||
|
||||
/** Decides whether PDFBox may subsample images while rendering `page`.
  *
  * Subsampling is switched on when the largest image XObject listed in the
  * page's resources has more than 10 * 1024 * 1024 pixels. The size of every
  * image that is found is logged at debug level.
  *
  * @param page
  *   the page that is about to be rendered
  * @return
  *   `true` if at least one image on the page exceeds the pixel threshold,
  *   `false` otherwise (including pages without resources or images)
  */
private def enableSubsampling(page: PDPage): Boolean = {
  // Same threshold as before, but as a Long so the comparison below happens
  // in 64-bit arithmetic.
  val maxPixels = 10L * 1024 * 1024

  // getResources() can be null when the page defines no resource dictionary
  // (see the PDFBox API docs); such a page has no images to subsample.
  Option(page.getResources()).exists { res =>
    res
      .getXObjectNames()
      .asScala
      .iterator
      .map(name => res.getXObject(name))
      .collect { case imgobj: PDImageXObject => imgobj }
      .map { imgobj =>
        val w = imgobj.getWidth()
        val h = imgobj.getHeight()
        // Widen before multiplying: an Int product wraps around for very
        // large images (e.g. 50000x50000), which would hide exactly the
        // images that need subsampling the most.
        val pixels = w.toLong * h.toLong
        logger.debug(s"Found image in pdf of size ${w}x${h} (${pixels}px)")
        pixels
      }
      .maxOption
      .exists(_ > maxPixels)
  }
}
|
||||
|
||||
/** Encodes `img` as PNG into an in-memory buffer and exposes the resulting
  * bytes as a single-chunk stream.
  *
  * Note that the encoding runs when this method is called, not when the
  * returned stream is evaluated.
  *
  * @param img
  *   the image to encode
  * @return
  *   a stream emitting the PNG bytes
  */
private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
  val buffer = new ByteArrayOutputStream()
  ImageIO.write(img, "PNG", buffer)
  // view() wraps the array without copying it again.
  val pngBytes = ByteVector.view(buffer.toByteArray())
  Stream.chunk(Chunk.byteVector(pngBytes))
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user