From fe4a300b0eb9c5e84af937c659a9a510005821e7 Mon Sep 17 00:00:00 2001 From: eikek Date: Sun, 5 Nov 2023 23:34:51 +0100 Subject: [PATCH] Update pdfbox to 3.0.0 --- .../convert/RemovePdfEncryption.scala | 3 ++- .../scala/docspell/convert/FileChecks.scala | 8 ++++---- .../docspell/extract/pdfbox/PdfLoader.scala | 3 ++- .../extract/pdfbox/PdfboxExtract.scala | 15 ++++---------- .../extract/pdfbox/PdfboxExtractTest.scala | 20 +++++-------------- project/Dependencies.scala | 2 +- 6 files changed, 18 insertions(+), 33 deletions(-) diff --git a/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala index 0ea9232b..8c36a0f1 100644 --- a/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala +++ b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala @@ -16,6 +16,7 @@ import docspell.logging.Logger import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException +import org.apache.pdfbox.{Loader => PdfboxLoader} /** Using PDFBox, the incoming pdf is loaded while trying the given passwords. */ object RemovePdfEncryption { @@ -76,7 +77,7 @@ object RemovePdfEncryption { } private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] = - try Option(PDDocument.load(bytes, pw.pass)) + try Option(PdfboxLoader.loadPDF(bytes, pw.pass)) catch { case _: InvalidPasswordException => None diff --git a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala index ad41c01c..13229a6f 100644 --- a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala +++ b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala @@ -22,8 +22,8 @@ import docspell.common.util.File import docspell.convert.ConversionResult.Handler import docspell.files.TikaMimetype -import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException +import org.apache.pdfbox.{Loader => PdfboxLoader} trait FileChecks { @@ -42,7 +42,7 @@ trait FileChecks { isType(MimeType.text("plain")) def isUnencryptedPDF: Boolean = - Try(PDDocument.load(p.toNioPath.toFile)).map(_.close()).isSuccess + Try(PdfboxLoader.loadPDF(p.toNioPath.toFile)).map(_.close()).isSuccess } implicit class ByteStreamOps(delegate: Stream[IO, Byte]) { @@ -58,14 +58,14 @@ trait FileChecks { def isUnencryptedPDF: IO[Boolean] = delegate.compile .to(Array) - .map(PDDocument.load(_)) + .map(PdfboxLoader.loadPDF) .map(_.close()) .map(_ => true) def isEncryptedPDF: IO[Boolean] = delegate.compile .to(Array) - .map(PDDocument.load(_)) + .map(PdfboxLoader.loadPDF) .attempt .map(e => e.fold( diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfLoader.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfLoader.scala index a429ee5d..2c194ce1 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfLoader.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfLoader.scala @@ -11,11 +11,12 @@ import cats.implicits._ import fs2.Stream import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.{Loader => PdfboxLoader} object PdfLoader { private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] = - Sync[F].delay(PDDocument.load(bytes)) + Sync[F].delay(PdfboxLoader.loadPDF(bytes)) private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] = Sync[F].delay(pd.close()) diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index 6a5590dd..45beb9e1 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -6,8 +6,6 @@ package docspell.extract.pdfbox -import java.io.InputStream - import scala.util.{Try, Using} import cats.effect.Sync @@ -20,6 +18,7 @@ import docspell.extract.internal.Text import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.text.PDFTextStripper +import org.apache.pdfbox.{Loader => PdfboxLoader} object PdfboxExtract { @@ -44,11 +43,8 @@ object PdfboxExtract { .attempt .map(_.flatten) - def getText(is: InputStream): Either[Throwable, Text] = - Using(PDDocument.load(is))(readText).toEither.flatten - def getText(inFile: Path): Either[Throwable, Text] = - Using(PDDocument.load(inFile.toNioPath.toFile))(readText).toEither.flatten + Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readText).toEither.flatten private def readText(doc: PDDocument): Either[Throwable, Text] = Try { @@ -64,11 +60,8 @@ object PdfboxExtract { .attempt .map(_.flatten) - def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = - Using(PDDocument.load(is))(readMetaData).toEither.flatten - def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = - Using(PDDocument.load(inFile.toNioPath.toFile))(readMetaData).toEither.flatten + Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readMetaData).toEither.flatten private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = Try { @@ -83,7 +76,7 @@ object PdfboxExtract { mkValue(info.getKeywords), mkValue(info.getCreator), Option(info.getCreationDate).map(c => Timestamp(c.toInstant)), - doc.getNumberOfPages() + doc.getNumberOfPages ) }.toEither } diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala index db47476c..c8ea0fa6 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -21,16 +21,6 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig { ExampleFiles.letter_en_pdf -> TestFiles.letterENText ) - test("extract text from text PDFs by inputstream") { - textPDFs.foreach { case (file, txt) => - val url = file.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) - val received = removeFormatting(str.value) - val expect = removeFormatting(txt) - assertEquals(received, expect) - } - } - test("extract text from text PDFs via Stream") { textPDFs.foreach { case (file, txt) => val data = file.readURL[IO](8192) @@ -42,18 +32,18 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig { } test("extract text from image PDFs") { - val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) + val pdfData = ExampleFiles.scanner_pdf13_pdf.readURL[IO](8192) - val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) + val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity) assertEquals(str.value, "") } test("extract metadata from pdf") { - val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) + val pdfData = ExampleFiles.keywords_pdf.readURL[IO](8192) + val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity) assert(str.value.startsWith("Keywords in PDF")) - val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) + val md = PdfboxExtract.getMetaData(pdfData).unsafeRunSync().fold(throw _, identity) assertEquals(md.author, Some("E.K.")) assertEquals(md.title, Some("Keywords in PDF")) assertEquals(md.subject, Some("This is a subject")) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 1ad4ce5f..e0a5b56a 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -35,7 +35,7 @@ object Dependencies { val MariaDbVersion = "3.2.0" val MUnitVersion = "0.7.29" val MUnitCatsEffectVersion = "1.0.7" - val PdfboxVersion = "2.0.29" + val PdfboxVersion = "3.0.0" val PdfjsViewerVersion = "2.12.313" val PoiVersion = "4.1.2" val PostgresVersion = "42.6.0"