Update pdfbox to 3.0.0

This commit is contained in:
eikek 2023-11-05 23:34:51 +01:00
parent 84612bc7e7
commit fe4a300b0e
6 changed files with 18 additions and 33 deletions

View File

@@ -16,6 +16,7 @@ import docspell.logging.Logger
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
import org.apache.pdfbox.{Loader => PdfboxLoader}
/** Using PDFBox, the incoming pdf is loaded while trying the given passwords. */ /** Using PDFBox, the incoming pdf is loaded while trying the given passwords. */
object RemovePdfEncryption { object RemovePdfEncryption {
@@ -76,7 +77,7 @@ object RemovePdfEncryption {
} }
private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] = private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] =
try Option(PDDocument.load(bytes, pw.pass)) try Option(PdfboxLoader.loadPDF(bytes, pw.pass))
catch { catch {
case _: InvalidPasswordException => case _: InvalidPasswordException =>
None None

View File

@@ -22,8 +22,8 @@ import docspell.common.util.File
import docspell.convert.ConversionResult.Handler import docspell.convert.ConversionResult.Handler
import docspell.files.TikaMimetype import docspell.files.TikaMimetype
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
import org.apache.pdfbox.{Loader => PdfboxLoader}
trait FileChecks { trait FileChecks {
@@ -42,7 +42,7 @@ trait FileChecks {
isType(MimeType.text("plain")) isType(MimeType.text("plain"))
def isUnencryptedPDF: Boolean = def isUnencryptedPDF: Boolean =
Try(PDDocument.load(p.toNioPath.toFile)).map(_.close()).isSuccess Try(PdfboxLoader.loadPDF(p.toNioPath.toFile)).map(_.close()).isSuccess
} }
implicit class ByteStreamOps(delegate: Stream[IO, Byte]) { implicit class ByteStreamOps(delegate: Stream[IO, Byte]) {
@@ -58,14 +58,14 @@ trait FileChecks {
def isUnencryptedPDF: IO[Boolean] = def isUnencryptedPDF: IO[Boolean] =
delegate.compile delegate.compile
.to(Array) .to(Array)
.map(PDDocument.load(_)) .map(PdfboxLoader.loadPDF)
.map(_.close()) .map(_.close())
.map(_ => true) .map(_ => true)
def isEncryptedPDF: IO[Boolean] = def isEncryptedPDF: IO[Boolean] =
delegate.compile delegate.compile
.to(Array) .to(Array)
.map(PDDocument.load(_)) .map(PdfboxLoader.loadPDF)
.attempt .attempt
.map(e => .map(e =>
e.fold( e.fold(

View File

@@ -11,11 +11,12 @@ import cats.implicits._
import fs2.Stream import fs2.Stream
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.{Loader => PdfboxLoader}
object PdfLoader { object PdfLoader {
private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] = private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] =
Sync[F].delay(PDDocument.load(bytes)) Sync[F].delay(PdfboxLoader.loadPDF(bytes))
private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] = private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] =
Sync[F].delay(pd.close()) Sync[F].delay(pd.close())

View File

@@ -6,8 +6,6 @@
package docspell.extract.pdfbox package docspell.extract.pdfbox
import java.io.InputStream
import scala.util.{Try, Using} import scala.util.{Try, Using}
import cats.effect.Sync import cats.effect.Sync
@@ -20,6 +18,7 @@ import docspell.extract.internal.Text
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper import org.apache.pdfbox.text.PDFTextStripper
import org.apache.pdfbox.{Loader => PdfboxLoader}
object PdfboxExtract { object PdfboxExtract {
@@ -44,11 +43,8 @@ object PdfboxExtract {
.attempt .attempt
.map(_.flatten) .map(_.flatten)
def getText(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten
def getText(inFile: Path): Either[Throwable, Text] = def getText(inFile: Path): Either[Throwable, Text] =
Using(PDDocument.load(inFile.toNioPath.toFile))(readText).toEither.flatten Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, Text] = private def readText(doc: PDDocument): Either[Throwable, Text] =
Try { Try {
@@ -64,11 +60,8 @@ object PdfboxExtract {
.attempt .attempt
.map(_.flatten) .map(_.flatten)
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
Using(PDDocument.load(is))(readMetaData).toEither.flatten
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
Using(PDDocument.load(inFile.toNioPath.toFile))(readMetaData).toEither.flatten Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readMetaData).toEither.flatten
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] =
Try { Try {
@@ -83,7 +76,7 @@ object PdfboxExtract {
mkValue(info.getKeywords), mkValue(info.getKeywords),
mkValue(info.getCreator), mkValue(info.getCreator),
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)), Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
doc.getNumberOfPages() doc.getNumberOfPages
) )
}.toEither }.toEither
} }

View File

@@ -21,16 +21,6 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig {
ExampleFiles.letter_en_pdf -> TestFiles.letterENText ExampleFiles.letter_en_pdf -> TestFiles.letterENText
) )
test("extract text from text PDFs by inputstream") {
textPDFs.foreach { case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str.value)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from text PDFs via Stream") { test("extract text from text PDFs via Stream") {
textPDFs.foreach { case (file, txt) => textPDFs.foreach { case (file, txt) =>
val data = file.readURL[IO](8192) val data = file.readURL[IO](8192)
@@ -42,18 +32,18 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig {
} }
test("extract text from image PDFs") { test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) val pdfData = ExampleFiles.scanner_pdf13_pdf.readURL[IO](8192)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity)
assertEquals(str.value, "") assertEquals(str.value, "")
} }
test("extract metadata from pdf") { test("extract metadata from pdf") {
val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) val pdfData = ExampleFiles.keywords_pdf.readURL[IO](8192)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity)
assert(str.value.startsWith("Keywords in PDF")) assert(str.value.startsWith("Keywords in PDF"))
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) val md = PdfboxExtract.getMetaData(pdfData).unsafeRunSync().fold(throw _, identity)
assertEquals(md.author, Some("E.K.")) assertEquals(md.author, Some("E.K."))
assertEquals(md.title, Some("Keywords in PDF")) assertEquals(md.title, Some("Keywords in PDF"))
assertEquals(md.subject, Some("This is a subject")) assertEquals(md.subject, Some("This is a subject"))

View File

@@ -35,7 +35,7 @@ object Dependencies {
val MariaDbVersion = "3.2.0" val MariaDbVersion = "3.2.0"
val MUnitVersion = "0.7.29" val MUnitVersion = "0.7.29"
val MUnitCatsEffectVersion = "1.0.7" val MUnitCatsEffectVersion = "1.0.7"
val PdfboxVersion = "2.0.29" val PdfboxVersion = "3.0.0"
val PdfjsViewerVersion = "2.12.313" val PdfjsViewerVersion = "2.12.313"
val PoiVersion = "4.1.2" val PoiVersion = "4.1.2"
val PostgresVersion = "42.6.0" val PostgresVersion = "42.6.0"