mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-25 16:45:05 +00:00
Add simple pdf page preview function
This commit is contained in:
parent
36317a3a95
commit
350a271b22
@ -0,0 +1,24 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
|
||||
/** Helpers for opening a PDFBox `PDDocument`, running an effectful function on it
  * and closing it again afterwards.
  */
object PdfLoader {

  /** Parses the given bytes into a `PDDocument`; the parsing is suspended in `F`. */
  private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] =
    Sync[F].delay {
      PDDocument.load(bytes)
    }

  /** Closes the given document; the call is suspended in `F`. */
  private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] =
    Sync[F].delay {
      pd.close()
    }

  /** Loads a document from `pdf`, applies `f` to it and closes the document again.
    *
    * `bracket` is used so that the document is closed whether `f` succeeds or fails.
    *
    * @param pdf the complete PDF file contents
    * @param f   the function to run while the document is open
    * @return the result of `f`
    */
  def withDocumentBytes[F[_]: Sync, A](pdf: Array[Byte])(f: PDDocument => F[A]): F[A] = {
    val open: F[PDDocument] = readBytes1[F](pdf)
    Sync[F].bracket(open)(f)(doc => closePDDocument(doc))
  }

  /** Same as [[withDocumentBytes]], but reads the PDF from a byte stream.
    *
    * The stream is first collected completely into a byte array before the
    * document is loaded.
    *
    * @param pdf the PDF file contents as a stream
    * @param f   the function to run while the document is open
    * @return the result of `f`
    */
  def withDocumentStream[F[_]: Sync, A](pdf: Stream[F, Byte])(
      f: PDDocument => F[A]
  ): F[A] =
    for {
      allBytes <- pdf.compile.to(Array)
      result   <- withDocumentBytes(allBytes)(f)
    } yield result
}
|
@ -0,0 +1,54 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import java.awt.image.BufferedImage
|
||||
import java.awt.image.RenderedImage
|
||||
import javax.imageio.ImageIO
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.Chunk
|
||||
import fs2.Stream
|
||||
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.rendering.PDFRenderer
|
||||
|
||||
/** Creates a preview image of a PDF file.
  *
  * The implementation provided by the companion object renders the first page
  * of the document and encodes it as PNG.
  */
trait PdfboxPreview[F[_]] {

  /** Produces the preview for the given PDF.
    *
    * @param pdf the PDF file contents
    * @return the encoded image bytes, or `None` if no preview image could be
    *         obtained from the document
    */
  def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]]

}
|
||||
|
||||
object PdfboxPreview {

  /** Creates a [[PdfboxPreview]] that renders the first page of a PDF.
    *
    * @param dpi the resolution passed to the PDFBox renderer
    */
  def apply[F[_]: Sync](dpi: Float): F[PdfboxPreview[F]] =
    Sync[F].pure(new PdfboxPreview[F] {

      /** Renders the first page (index 0) of the given PDF into an image. The
        * result is `None` if the document has no pages.
        */
      def previewImage(pdf: Stream[F, Byte]): F[Option[BufferedImage]] =
        PdfLoader.withDocumentStream(pdf)(doc => Sync[F].delay(getPageImage(doc, 0, dpi)))

      def previewPNG(pdf: Stream[F, Byte]): F[Option[Stream[F, Byte]]] =
        // Encoding the image is side-effecting and may throw, so it is suspended
        // explicitly in `F` instead of being run inside a plain `map`.
        previewImage(pdf).flatMap(_.traverse(img => Sync[F].delay(pngStream[F](img))))

    })

  /** Renders a single page of the document.
    *
    * @param pdoc the opened document
    * @param page the zero-based page index
    * @param dpi  the resolution used for rendering
    * @return the rendered image, or `None` if `page` is not a valid index for
    *         this document or the renderer returned `null`
    */
  private def getPageImage(
      pdoc: PDDocument,
      page: Int,
      dpi: Float
  ): Option[BufferedImage] = {
    val count = pdoc.getNumberOfPages
    // This also covers documents without pages, since then no index is valid.
    if (page < 0 || page >= count) None
    else {
      val renderer = new PDFRenderer(pdoc)
      Option(renderer.renderImageWithDPI(page, dpi))
    }
  }

  /** Encodes the image as PNG into memory and returns the bytes as a single
    * chunk stream.
    *
    * The encoding happens when this method is called, not when the stream is
    * run; callers should therefore invoke it inside a suspended effect.
    *
    * @throws java.io.IOException if writing fails or no PNG writer is available
    */
  private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
    val out = new ByteArrayOutputStream()
    // `ImageIO.write` signals a missing writer by returning false; without this
    // check the result would silently be an empty stream.
    if (!ImageIO.write(img, "PNG", out))
      throw new java.io.IOException("No ImageIO writer found for format PNG")
    Stream.chunk(Chunk.bytes(out.toByteArray()))
  }

}
|
@ -0,0 +1,46 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import cats.effect._
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
import java.nio.file.Path
|
||||
import fs2.Stream
|
||||
|
||||
/** Checks the PNG preview of some example PDFs against known SHA-256 checksums. */
object PdfboxPreviewTest extends SimpleTestSuite {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  /** Example files paired with the expected SHA-256 (lowercase hex) of their
    * preview PNG.
    */
  val testPDFs = List(
    ExampleFiles.letter_de_pdf     -> "83bdb379fe9ce86e830adfbe11238808bed9da6e31c1b66687d70b6b59a0d815",
    ExampleFiles.letter_en_pdf     -> "699655a162c0c21dd9f19d8638f4e03811c6626a52bb30a1ac733d7fa5638932",
    ExampleFiles.scanner_pdf13_pdf -> "a1680b80b42d8e04365ffd1e806ea2a8adb0492104cc41d8b40435b0fe4d4e65"
  )

  test("extract first page image from PDFs") {
    testPDFs.foreach { case (file, checksum) =>
      val data = file.readURL[IO](8192, blocker)
      val sha256out: IO[String] =
        for {
          preview <- PdfboxPreview[IO](48)
          pngOpt  <- preview.previewPNG(data)
          // Fail with a message naming the file instead of an anonymous `None.get`.
          png <- pngOpt match {
            case Some(bytes) =>
              IO.pure(bytes)
            case None =>
              IO.raiseError[Stream[IO, Byte]](
                new NoSuchElementException(s"No preview image was produced for $file")
              )
          }
          digest <- png
            .through(fs2.hash.sha256)
            .chunks
            .map(_.toByteVector)
            .fold1(_ ++ _)
            .compile
            .lastOrError
        } yield digest.toHex.toLowerCase

      assertEquals(sha256out.unsafeRunSync(), checksum)
    }
  }

  /** Writes the given bytes to `file`. Not called by the test in this suite. */
  def writeToFile(data: Stream[IO, Byte], file: Path): IO[Unit] =
    data
      .through(
        fs2.io.file.writeAll(file, blocker)
      )
      .compile
      .drain
}
|
Loading…
x
Reference in New Issue
Block a user