mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-13 01:39:33 +00:00
Update pdfbox to 3.0.0
This commit is contained in:
parent
84612bc7e7
commit
fe4a300b0e
@ -16,6 +16,7 @@ import docspell.logging.Logger
|
|||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
||||||
|
import org.apache.pdfbox.{Loader => PdfboxLoader}
|
||||||
|
|
||||||
/** Using PDFBox, the incoming pdf is loaded while trying the given passwords. */
|
/** Using PDFBox, the incoming pdf is loaded while trying the given passwords. */
|
||||||
object RemovePdfEncryption {
|
object RemovePdfEncryption {
|
||||||
@ -76,7 +77,7 @@ object RemovePdfEncryption {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] =
|
private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] =
|
||||||
try Option(PDDocument.load(bytes, pw.pass))
|
try Option(PdfboxLoader.loadPDF(bytes, pw.pass))
|
||||||
catch {
|
catch {
|
||||||
case _: InvalidPasswordException =>
|
case _: InvalidPasswordException =>
|
||||||
None
|
None
|
||||||
|
@ -22,8 +22,8 @@ import docspell.common.util.File
|
|||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.files.TikaMimetype
|
import docspell.files.TikaMimetype
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
|
||||||
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
||||||
|
import org.apache.pdfbox.{Loader => PdfboxLoader}
|
||||||
|
|
||||||
trait FileChecks {
|
trait FileChecks {
|
||||||
|
|
||||||
@ -42,7 +42,7 @@ trait FileChecks {
|
|||||||
isType(MimeType.text("plain"))
|
isType(MimeType.text("plain"))
|
||||||
|
|
||||||
def isUnencryptedPDF: Boolean =
|
def isUnencryptedPDF: Boolean =
|
||||||
Try(PDDocument.load(p.toNioPath.toFile)).map(_.close()).isSuccess
|
Try(PdfboxLoader.loadPDF(p.toNioPath.toFile)).map(_.close()).isSuccess
|
||||||
}
|
}
|
||||||
|
|
||||||
implicit class ByteStreamOps(delegate: Stream[IO, Byte]) {
|
implicit class ByteStreamOps(delegate: Stream[IO, Byte]) {
|
||||||
@ -58,14 +58,14 @@ trait FileChecks {
|
|||||||
def isUnencryptedPDF: IO[Boolean] =
|
def isUnencryptedPDF: IO[Boolean] =
|
||||||
delegate.compile
|
delegate.compile
|
||||||
.to(Array)
|
.to(Array)
|
||||||
.map(PDDocument.load(_))
|
.map(PdfboxLoader.loadPDF)
|
||||||
.map(_.close())
|
.map(_.close())
|
||||||
.map(_ => true)
|
.map(_ => true)
|
||||||
|
|
||||||
def isEncryptedPDF: IO[Boolean] =
|
def isEncryptedPDF: IO[Boolean] =
|
||||||
delegate.compile
|
delegate.compile
|
||||||
.to(Array)
|
.to(Array)
|
||||||
.map(PDDocument.load(_))
|
.map(PdfboxLoader.loadPDF)
|
||||||
.attempt
|
.attempt
|
||||||
.map(e =>
|
.map(e =>
|
||||||
e.fold(
|
e.fold(
|
||||||
|
@ -11,11 +11,12 @@ import cats.implicits._
|
|||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
|
import org.apache.pdfbox.{Loader => PdfboxLoader}
|
||||||
|
|
||||||
object PdfLoader {
|
object PdfLoader {
|
||||||
|
|
||||||
private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] =
|
private def readBytes1[F[_]: Sync](bytes: Array[Byte]): F[PDDocument] =
|
||||||
Sync[F].delay(PDDocument.load(bytes))
|
Sync[F].delay(PdfboxLoader.loadPDF(bytes))
|
||||||
|
|
||||||
private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] =
|
private def closePDDocument[F[_]: Sync](pd: PDDocument): F[Unit] =
|
||||||
Sync[F].delay(pd.close())
|
Sync[F].delay(pd.close())
|
||||||
|
@ -6,8 +6,6 @@
|
|||||||
|
|
||||||
package docspell.extract.pdfbox
|
package docspell.extract.pdfbox
|
||||||
|
|
||||||
import java.io.InputStream
|
|
||||||
|
|
||||||
import scala.util.{Try, Using}
|
import scala.util.{Try, Using}
|
||||||
|
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
@ -20,6 +18,7 @@ import docspell.extract.internal.Text
|
|||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
import org.apache.pdfbox.text.PDFTextStripper
|
import org.apache.pdfbox.text.PDFTextStripper
|
||||||
|
import org.apache.pdfbox.{Loader => PdfboxLoader}
|
||||||
|
|
||||||
object PdfboxExtract {
|
object PdfboxExtract {
|
||||||
|
|
||||||
@ -44,11 +43,8 @@ object PdfboxExtract {
|
|||||||
.attempt
|
.attempt
|
||||||
.map(_.flatten)
|
.map(_.flatten)
|
||||||
|
|
||||||
def getText(is: InputStream): Either[Throwable, Text] =
|
|
||||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
|
||||||
|
|
||||||
def getText(inFile: Path): Either[Throwable, Text] =
|
def getText(inFile: Path): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(inFile.toNioPath.toFile))(readText).toEither.flatten
|
Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readText).toEither.flatten
|
||||||
|
|
||||||
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
@ -64,11 +60,8 @@ object PdfboxExtract {
|
|||||||
.attempt
|
.attempt
|
||||||
.map(_.flatten)
|
.map(_.flatten)
|
||||||
|
|
||||||
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
|
|
||||||
Using(PDDocument.load(is))(readMetaData).toEither.flatten
|
|
||||||
|
|
||||||
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
|
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
|
||||||
Using(PDDocument.load(inFile.toNioPath.toFile))(readMetaData).toEither.flatten
|
Using(PdfboxLoader.loadPDF(inFile.toNioPath.toFile))(readMetaData).toEither.flatten
|
||||||
|
|
||||||
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] =
|
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] =
|
||||||
Try {
|
Try {
|
||||||
@ -83,7 +76,7 @@ object PdfboxExtract {
|
|||||||
mkValue(info.getKeywords),
|
mkValue(info.getKeywords),
|
||||||
mkValue(info.getCreator),
|
mkValue(info.getCreator),
|
||||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
|
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
|
||||||
doc.getNumberOfPages()
|
doc.getNumberOfPages
|
||||||
)
|
)
|
||||||
}.toEither
|
}.toEither
|
||||||
}
|
}
|
||||||
|
@ -21,16 +21,6 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig {
|
|||||||
ExampleFiles.letter_en_pdf -> TestFiles.letterENText
|
ExampleFiles.letter_en_pdf -> TestFiles.letterENText
|
||||||
)
|
)
|
||||||
|
|
||||||
test("extract text from text PDFs by inputstream") {
|
|
||||||
textPDFs.foreach { case (file, txt) =>
|
|
||||||
val url = file.toJavaUrl.fold(sys.error, identity)
|
|
||||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
|
||||||
val received = removeFormatting(str.value)
|
|
||||||
val expect = removeFormatting(txt)
|
|
||||||
assertEquals(received, expect)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
test("extract text from text PDFs via Stream") {
|
test("extract text from text PDFs via Stream") {
|
||||||
textPDFs.foreach { case (file, txt) =>
|
textPDFs.foreach { case (file, txt) =>
|
||||||
val data = file.readURL[IO](8192)
|
val data = file.readURL[IO](8192)
|
||||||
@ -42,18 +32,18 @@ class PdfboxExtractTest extends FunSuite with TestLoggingConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
test("extract text from image PDFs") {
|
test("extract text from image PDFs") {
|
||||||
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
|
val pdfData = ExampleFiles.scanner_pdf13_pdf.readURL[IO](8192)
|
||||||
|
|
||||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity)
|
||||||
|
|
||||||
assertEquals(str.value, "")
|
assertEquals(str.value, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
test("extract metadata from pdf") {
|
test("extract metadata from pdf") {
|
||||||
val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
|
val pdfData = ExampleFiles.keywords_pdf.readURL[IO](8192)
|
||||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.getText(pdfData).unsafeRunSync().fold(throw _, identity)
|
||||||
assert(str.value.startsWith("Keywords in PDF"))
|
assert(str.value.startsWith("Keywords in PDF"))
|
||||||
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
|
val md = PdfboxExtract.getMetaData(pdfData).unsafeRunSync().fold(throw _, identity)
|
||||||
assertEquals(md.author, Some("E.K."))
|
assertEquals(md.author, Some("E.K."))
|
||||||
assertEquals(md.title, Some("Keywords in PDF"))
|
assertEquals(md.title, Some("Keywords in PDF"))
|
||||||
assertEquals(md.subject, Some("This is a subject"))
|
assertEquals(md.subject, Some("This is a subject"))
|
||||||
|
@ -35,7 +35,7 @@ object Dependencies {
|
|||||||
val MariaDbVersion = "3.2.0"
|
val MariaDbVersion = "3.2.0"
|
||||||
val MUnitVersion = "0.7.29"
|
val MUnitVersion = "0.7.29"
|
||||||
val MUnitCatsEffectVersion = "1.0.7"
|
val MUnitCatsEffectVersion = "1.0.7"
|
||||||
val PdfboxVersion = "2.0.29"
|
val PdfboxVersion = "3.0.0"
|
||||||
val PdfjsViewerVersion = "2.12.313"
|
val PdfjsViewerVersion = "2.12.313"
|
||||||
val PoiVersion = "4.1.2"
|
val PoiVersion = "4.1.2"
|
||||||
val PostgresVersion = "42.6.0"
|
val PostgresVersion = "42.6.0"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user