mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 13:32:51 +00:00
Extract meta data from pdfs using pdfbox
This commit is contained in:
parent
bd20165d1a
commit
da68405f9b
@ -37,7 +37,7 @@ object PdfExtract {
|
||||
for {
|
||||
pdfboxRes <-
|
||||
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
||||
.get[F](in)
|
||||
.getText[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
|
@ -0,0 +1,31 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import docspell.common.Timestamp
|
||||
|
||||
/** Descriptive metadata of a PDF document.
  *
  * Every field is optional, since a PDF may carry none of these values.
  *
  * @param title        the document title
  * @param author       the document author
  * @param subject      the document subject
  * @param keywords     the raw, unsplit keywords string; see [[keywordList]]
  * @param creator      the creator entry of the document
  * @param creationDate the time the document was created
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** `true` when none of the fields holds a value. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** `true` when at least one field holds a value. */
  def nonEmpty: Boolean =
    !isEmpty

  /** The keywords string split on commas and semicolons.
    *
    * Each entry is trimmed and blank entries are dropped, so input such
    * as `"a , b"`, `",a"` or `"a,,b"` never produces keywords that are
    * empty or padded with whitespace. Returns `Nil` when no keywords
    * string is present.
    */
  def keywordList: List[String] =
    keywords.toList
      .flatMap(_.split("[,;]").toList)
      .map(_.trim)
      .filter(_.nonEmpty)
}
|
||||
|
||||
object PdfMetaData {

  /** Metadata with every field unset. */
  val empty: PdfMetaData =
    PdfMetaData(
      title = None,
      author = None,
      subject = None,
      keywords = None,
      creator = None,
      creationDate = None
    )
}
|
@ -13,18 +13,33 @@ import docspell.extract.internal.Text
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.text.PDFTextStripper
|
||||
import docspell.common.Timestamp
|
||||
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
/** Reads the text and the metadata of a PDF in one pass.
  *
  * The whole stream is collected into a byte array, loaded as a single
  * document and closed again once both values have been read. A failure
  * while loading or reading is returned as a `Left`.
  */
def getTextAndMetaData[F[_]: Sync](
    data: Stream[F, Byte]
): F[Either[Throwable, (Text, PdfMetaData)]] =
  data.compile
    .to(Array)
    .map { bytes =>
      Using(PDDocument.load(bytes)) { doc =>
        // Text first, then metadata; the first failure short-circuits.
        readText(doc).flatMap(txt => readMetaData(doc).map(md => (txt, md)))
      }.toEither.flatten
    }
|
||||
|
||||
/** Extracts the text of a PDF given as a byte stream.
  *
  * The whole stream is collected into a byte array before the document
  * is loaded; the document is closed after reading.
  */
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
  data.compile
    .to(Array)
    .map { bytes =>
      Using(PDDocument.load(bytes))(doc => readText(doc)).toEither.flatten
    }
|
||||
|
||||
def get(is: InputStream): Either[Throwable, Text] =
|
||||
/** Extracts the text of the PDF read from the given input stream.
  *
  * The loaded document is closed after reading; a failure while loading
  * or reading is returned as a `Left`.
  */
def getText(is: InputStream): Either[Throwable, Text] =
  Using(PDDocument.load(is))(doc => readText(doc))
    .fold[Either[Throwable, Text]](ex => Left(ex), identity)
|
||||
|
||||
def get(inFile: Path): Either[Throwable, Text] =
|
||||
/** Extracts the text of the PDF stored at the given path.
  *
  * The loaded document is closed after reading; a failure while loading
  * or reading is returned as a `Left`.
  */
def getText(inFile: Path): Either[Throwable, Text] =
  Using(PDDocument.load(inFile.toFile))(doc => readText(doc))
    .fold[Either[Throwable, Text]](ex => Left(ex), identity)
|
||||
|
||||
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||
@ -34,4 +49,31 @@ object PdfboxExtract {
|
||||
stripper.setLineSeparator("\n")
|
||||
Text(Option(stripper.getText(doc)))
|
||||
}.toEither
|
||||
|
||||
/** Reads the metadata of a PDF given as a byte stream.
  *
  * The whole stream is collected into a byte array before the document
  * is loaded; the document is closed after reading.
  */
def getMetaData[F[_]: Sync](
    data: Stream[F, Byte]
): F[Either[Throwable, PdfMetaData]] =
  data.compile
    .to(Array)
    .map { bytes =>
      Using(PDDocument.load(bytes))(doc => readMetaData(doc)).toEither.flatten
    }
|
||||
|
||||
/** Reads the metadata of the PDF read from the given input stream.
  *
  * The loaded document is closed after reading; a failure while loading
  * or reading is returned as a `Left`.
  */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
  Using(PDDocument.load(is))(doc => readMetaData(doc))
    .fold[Either[Throwable, PdfMetaData]](ex => Left(ex), identity)
|
||||
|
||||
/** Reads the metadata of the PDF stored at the given path.
  *
  * The loaded document is closed after reading; a failure while loading
  * or reading is returned as a `Left`.
  */
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
  Using(PDDocument.load(inFile.toFile))(doc => readMetaData(doc))
    .fold[Either[Throwable, PdfMetaData]](ex => Left(ex), identity)
|
||||
|
||||
/** Builds a [[PdfMetaData]] from the document information of `doc`.
  *
  * Any exception thrown while reading is returned as a `Left`.
  */
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
  // Maps null, empty and whitespace-only strings to None.
  def clean(raw: String): Option[String] =
    Option(raw).map(_.trim).filter(_.nonEmpty)

  Try {
    val docInfo = doc.getDocumentInformation
    PdfMetaData(
      title = clean(docInfo.getTitle),
      author = clean(docInfo.getAuthor),
      subject = clean(docInfo.getSubject),
      keywords = clean(docInfo.getKeywords),
      creator = clean(docInfo.getCreator),
      creationDate =
        Option(docInfo.getCreationDate).map(cal => Timestamp(cal.toInstant))
    )
  }.toEither
}
|
||||
}
|
||||
|
@ -0,0 +1,22 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Checks how [[PdfMetaData.keywordList]] splits the raw keywords string. */
object PdfMetaDataTest extends SimpleTestSuite {

  // Empty metadata carrying only the given raw keywords string.
  private def withKeywords(raw: String): PdfMetaData =
    PdfMetaData.empty.copy(keywords = Some(raw))

  test("split keywords on comma") {
    assertEquals(withKeywords("a,b, c").keywordList, List("a", "b", "c"))
  }

  test("split keywords on semicolon") {
    assertEquals(withKeywords("a; b;c").keywordList, List("a", "b", "c"))
  }

  test("split keywords on comma and semicolon") {
    assertEquals(withKeywords("a, b; c").keywordList, List("a", "b", "c"))
  }

}
|
@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
textPDFs.foreach {
|
||||
case (file, txt) =>
|
||||
val url = file.toJavaUrl.fold(sys.error, identity)
|
||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
||||
val received = removeFormatting(str.value)
|
||||
val expect = removeFormatting(txt)
|
||||
assertEquals(received, expect)
|
||||
@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
textPDFs.foreach {
|
||||
case (file, txt) =>
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
|
||||
val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
|
||||
val received = removeFormatting(str.value)
|
||||
val expect = removeFormatting(txt)
|
||||
assertEquals(received, expect)
|
||||
@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
test("extract text from image PDFs") {
|
||||
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
|
||||
|
||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
||||
|
||||
assertEquals(str.value, "")
|
||||
}
|
||||
|
||||
// Reads both the text and the metadata of the keywords example PDF and
// compares them with the values stored in that file.
test("extract metadata from pdf") {
  val pdfUrl = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)

  val text = PdfboxExtract.getText(pdfUrl.openStream()).fold(throw _, identity)
  assert(text.value.startsWith("Keywords in PDF"))

  val meta = PdfboxExtract.getMetaData(pdfUrl.openStream()).fold(throw _, identity)
  assertEquals(meta.author, Some("E.K."))
  assertEquals(meta.title, Some("Keywords in PDF"))
  assertEquals(meta.subject, Some("This is a subject"))
  assertEquals(meta.keywordList, List("Test", "Keywords in PDF", "Todo"))
  assertEquals(meta.creator, Some("Emacs 26.3 (Org mode 9.3)"))
  assert(meta.creationDate.isDefined)
}
|
||||
|
||||
/** Normalises text for comparison: removes every run of whitespace and
  * of the characters `;`, `:`, `.`, `,` and `-`, then lower-cases the rest.
  */
private def removeFormatting(str: String): String = {
  val stripped = str.replaceAll("[\\s;:.,\\-]+", "")
  stripped.toLowerCase
}
|
||||
}
|
||||
|
BIN
modules/files/src/test/resources/keywords.pdf
Normal file
BIN
modules/files/src/test/resources/keywords.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user