package docspell.extract.pdfbox

import cats.effect._
import cats.implicits._
import fs2.Stream

import org.apache.pdfbox.pdmodel.PDDocument

/** Helpers that load a PDF into a PDFBox `PDDocument`, hand it to a
  * caller-supplied function and close the document afterwards.
  */
object PdfLoader {

  /** Loads `pdf` as a `PDDocument`, applies `f` to it and closes the
    * document again.
    *
    * Loading and closing are both suspended in `F`. They are tied
    * together with `bracket`, so the close action is registered as the
    * release step for the loaded document.
    *
    * @param pdf the complete PDF file contents
    * @param f   the computation to run against the opened document
    * @return the result of `f`
    */
  def withDocumentBytes[F[_]: Sync, A](pdf: Array[Byte])(f: PDDocument => F[A]): F[A] = {
    val F       = Sync[F]
    val acquire = F.delay(PDDocument.load(pdf))
    F.bracket(acquire)(f)(doc => F.delay(doc.close()))
  }

  /** Same as [[withDocumentBytes]], but reads the PDF from a byte stream.
    *
    * The stream is compiled into a single in-memory array before the
    * document is loaded.
    *
    * @param pdf the PDF file contents as a stream of bytes
    * @param f   the computation to run against the opened document
    * @return the result of `f`
    */
  def withDocumentStream[F[_]: Sync, A](pdf: Stream[F, Byte])(
      f: PDDocument => F[A]
  ): F[A] =
    for {
      bytes  <- pdf.compile.to(Array)
      result <- withDocumentBytes[F, A](bytes)(f)
    } yield result
}
package docspell.extract.pdfbox

import java.awt.image.BufferedImage
import java.awt.image.RenderedImage
import java.io.IOException
import javax.imageio.ImageIO

import cats.effect._
import cats.implicits._
import fs2.Chunk
import fs2.Stream

import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.PDFRenderer

/** Creates a preview image for a PDF document. */
trait PdfboxPreview[F[_]] {

  /** Renders a preview of the given PDF and encodes it as PNG.
    *
    * @param pdf the PDF file contents as a stream of bytes
    * @return the PNG bytes, or `None` if no page image could be produced
    */
  def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]]

}

object PdfboxPreview {

  /** Creates a [[PdfboxPreview]] that renders the first page (index 0)
    * of a PDF.
    *
    * @param dpi the resolution passed to the PDFBox renderer
    */
  def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] =
    Sync[F].pure(new PdfboxPreview[F] {

      def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] =
        PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi)))

      def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] =
        // Encoding writes into a buffer and may fail, so it is sequenced as
        // an explicit effect rather than being run inside a pure `map`.
        previewImage(pdf).flatMap(_.traverse(img => pngStream[F](img)))

    })

  /** Renders a single page of `pdoc` into an image.
    *
    * @param pdoc the opened document
    * @param page zero-based page index
    * @param dpi  the resolution passed to the renderer
    * @return `None` if `page` is outside the document's page range
    *         (which covers documents without pages) or if the renderer
    *         returns `null`
    */
  private def getPageImage(
      pdoc: PDDocument,
      page: Int,
      dpi: Float
  ): Option[BufferedImage] = {
    val count = pdoc.getNumberOfPages
    if (count <= 0 || page < 0 || count <= page) None
    else {
      val renderer = new PDFRenderer(pdoc)
      Option(renderer.renderImageWithDPI(page, dpi))
    }
  }

  /** Encodes `img` as PNG into memory and exposes the bytes as a
    * single-chunk stream.
    *
    * The encoding is suspended in `F`. `ImageIO.write` reports a missing
    * writer by returning `false`; that case is turned into an
    * `IOException` instead of silently producing an empty stream.
    */
  private def pngStream[F[_]: Sync](img: RenderedImage): F[Stream[F, Byte]] =
    Sync[F].delay {
      val out = new ByteArrayOutputStream()
      if (!ImageIO.write(img, "PNG", out))
        throw new IOException("No ImageIO writer available for format PNG")
      Stream.chunk(Chunk.bytes(out.toByteArray()))
    }

}
package docspell.extract.pdfbox

import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import java.nio.file.Path
import fs2.Stream

/** Checks that the PNG preview rendered for a set of example PDFs
  * matches a recorded SHA-256 checksum.
  */
object PdfboxPreviewTest extends SimpleTestSuite {
  val blocker = TestFiles.blocker
  implicit val CS = TestFiles.CS

  // Example PDF paired with the expected lower-case hex SHA-256 of its
  // PNG preview.
  val testPDFs = List(
    ExampleFiles.letter_de_pdf     -> "83bdb379fe9ce86e830adfbe11238808bed9da6e31c1b66687d70b6b59a0d815",
    ExampleFiles.letter_en_pdf     -> "699655a162c0c21dd9f19d8638f4e03811c6626a52bb30a1ac733d7fa5638932",
    ExampleFiles.scanner_pdf13_pdf -> "a1680b80b42d8e04365ffd1e806ea2a8adb0492104cc41d8b40435b0fe4d4e65"
  )

  test("extract first page image from PDFs") {
    testPDFs.foreach { case (file, checksum) =>
      val data = file.readURL[IO](8192, blocker)
      val sha256out =
        Stream
          .eval(PdfboxPreview[IO](48))
          .evalMap(_.previewPNG(data))
          .flatMap(requirePreview(s"$file", _))
          .through(fs2.hash.sha256)
          .chunks
          .map(_.toByteVector)
          .fold1(_ ++ _)
          .compile
          .lastOrError
          .map(_.toHex.toLowerCase)

      assertEquals(sha256out.unsafeRunSync(), checksum)
    }
  }

  /** Unwraps the preview stream, or fails the stream with an error that
    * names the PDF when no preview was produced.
    */
  private def requirePreview(
      label: String,
      preview: Option[Stream[IO, Byte]]
  ): Stream[IO, Byte] =
    preview.getOrElse(
      Stream.raiseError[IO](
        new NoSuchElementException(s"No preview image was produced for $label")
      )
    )

  /** Writes `data` to `file`.
    *
    * Not called by the test visible here; presumably kept for dumping a
    * rendered preview while debugging (confirm before removing).
    */
  def writeToFile(data: Stream[IO, Byte], file: Path): IO[Unit] =
    data
      .through(
        fs2.io.file.writeAll(file, blocker)
      )
      .compile
      .drain
}