Add simple pdf page preview function

This commit is contained in:
Eike Kettner 2020-11-07 23:27:31 +01:00
parent 36317a3a95
commit 350a271b22
3 changed files with 124 additions and 0 deletions

View File

@ -0,0 +1,24 @@
package docspell.extract.pdfbox
import cats.effect._
import cats.implicits._
import fs2.Stream
import org.apache.pdfbox.pdmodel.PDDocument
/** Helpers for running a computation against a PDFBox document that is
  * guaranteed to be closed afterwards.
  */
object PdfLoader {

  /** Loads a `PDDocument` from the given bytes, applies `f` to it and closes
    * the document once `f` has completed, whether it succeeded or failed.
    *
    * @param pdf the complete PDF file contents
    * @param f   the computation to run with the opened document
    */
  def withDocumentBytes[F[_]: Sync, A](pdf: Array[Byte])(f: PDDocument => F[A]): F[A] = {
    val acquire: F[PDDocument]         = Sync[F].delay(PDDocument.load(pdf))
    val release: PDDocument => F[Unit] = doc => Sync[F].delay(doc.close())
    Sync[F].bracket(acquire)(f)(release)
  }

  /** Same as [[withDocumentBytes]], but reads the PDF from a byte stream.
    *
    * The stream is fully collected into an in-memory array before the
    * document is loaded.
    *
    * @param pdf the PDF file contents as a stream of bytes
    * @param f   the computation to run with the opened document
    */
  def withDocumentStream[F[_]: Sync, A](pdf: Stream[F, Byte])(
      f: PDDocument => F[A]
  ): F[A] =
    for {
      allBytes <- pdf.compile.to(Array)
      result   <- withDocumentBytes(allBytes)(f)
    } yield result
}

View File

@ -0,0 +1,54 @@
package docspell.extract.pdfbox
import java.awt.image.BufferedImage
import java.awt.image.RenderedImage
import javax.imageio.ImageIO
import cats.effect._
import cats.implicits._
import fs2.Chunk
import fs2.Stream
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.PDFRenderer
/** Creates a preview image of a PDF, encoded as PNG.
  *
  * The implementation provided by the companion object renders only the
  * first page of the document.
  */
trait PdfboxPreview[F[_]] {
/** Produces the PNG bytes of a preview image for the given PDF.
  *
  * @param pdf the PDF file contents as a stream of bytes
  * @return the PNG data, or `None` if no image could be produced (the
  *         companion object's implementation returns `None` when the
  *         document has no pages)
  */
def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]]
}
object PdfboxPreview {

  /** Creates a [[PdfboxPreview]] that renders the first page of a PDF.
    *
    * @param dpi the resolution handed to the PDFBox renderer
    */
  def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] =
    Sync[F].pure(new PdfboxPreview[F] {

      /** Renders the first page (index 0) of the PDF into an image. */
      def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] =
        PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi)))

      def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] =
        // PNG encoding is side-effecting and may throw, so it is suspended
        // in `delay` instead of being executed inside a plain `map`.
        previewImage(pdf).flatMap(_.traverse(img => Sync[F].delay(pngStream[F](img))))
    })

  /** Renders a single page of the document.
    *
    * @param pdoc the opened document
    * @param page zero-based index of the page to render
    * @param dpi  the resolution to render with
    * @return the rendered image, or `None` if `page` is not a valid page
    *         index of the document (which includes an empty document) or
    *         the renderer returned `null`
    */
  private def getPageImage(
      pdoc: PDDocument,
      page: Int,
      dpi: Float
  ): Option[BufferedImage] = {
    val count = pdoc.getNumberOfPages
    // A non-positive page count is covered by this check as well: no
    // non-negative index is smaller than it.
    if (page < 0 || page >= count) None
    else {
      val renderer = new PDFRenderer(pdoc)
      Option(renderer.renderImageWithDPI(page, dpi))
    }
  }

  /** Encodes the image as PNG and returns the bytes as a single-chunk stream.
    *
    * The encoding happens eagerly when this method is called; callers must
    * suspend the call in an effect.
    *
    * Throws `java.io.IOException` if the image cannot be written or no PNG
    * writer is available.
    */
  private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
    val out = new ByteArrayOutputStream()
    // ImageIO.write signals a missing writer by returning false rather than
    // throwing; without this check the result would be an empty stream.
    if (!ImageIO.write(img, "PNG", out))
      throw new java.io.IOException("No ImageIO writer available for format PNG")
    Stream.chunk(Chunk.bytes(out.toByteArray()))
  }
}

View File

@ -0,0 +1,46 @@
package docspell.extract.pdfbox
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import java.nio.file.Path
import fs2.Stream
/** Checks the PNG preview of the first page of some example PDFs against
  * known SHA-256 checksums.
  */
object PdfboxPreviewTest extends SimpleTestSuite {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  // Example files paired with the expected SHA-256 (hex) of their preview PNG.
  val testPDFs = List(
    ExampleFiles.letter_de_pdf     -> "83bdb379fe9ce86e830adfbe11238808bed9da6e31c1b66687d70b6b59a0d815",
    ExampleFiles.letter_en_pdf     -> "699655a162c0c21dd9f19d8638f4e03811c6626a52bb30a1ac733d7fa5638932",
    ExampleFiles.scanner_pdf13_pdf -> "a1680b80b42d8e04365ffd1e806ea2a8adb0492104cc41d8b40435b0fe4d4e65"
  )

  test("extract first page image from PDFs") {
    for ((file, checksum) <- testPDFs) {
      val data = file.readURL[IO](8192, blocker)

      val sha256out =
        for {
          preview <- PdfboxPreview[IO](48)
          pngOpt  <- preview.previewPNG(data)
          // `.get` is intentional: a missing preview must fail the test.
          digest <- pngOpt.get
            .through(fs2.hash.sha256)
            .chunks
            .map(_.toByteVector)
            .fold1(_ ++ _)
            .compile
            .lastOrError
        } yield digest.toHex.toLowerCase

      assertEquals(sha256out.unsafeRunSync(), checksum)
    }
  }

  /** Writes the given bytes to `file`; not called by the test in this file. */
  def writeToFile(data: Stream[IO, Byte], file: Path): IO[Unit] = {
    val sink = fs2.io.file.writeAll[IO](file, blocker)
    data.through(sink).compile.drain
  }
}