Extract meta data from pdfs using pdfbox

This commit is contained in:
Eike Kettner 2020-07-18 23:04:46 +02:00
parent bd20165d1a
commit da68405f9b
6 changed files with 115 additions and 7 deletions

View File

@ -37,7 +37,7 @@ object PdfExtract {
for {
pdfboxRes <-
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
.get[F](in)
.getText[F](in)
res <- pdfboxRes.fold(
ex =>
logger.info(

View File

@ -0,0 +1,31 @@
package docspell.extract.pdfbox
import docspell.common.Timestamp
/** Metadata fields taken from a PDF document.
  *
  * Every field is optional; a value with all fields set to `None` is
  * considered empty.
  *
  * @param title the document title
  * @param author the document author
  * @param subject the document subject
  * @param keywords the raw keywords string, several keywords separated by `,` or `;`
  * @param creator the creator entry of the document
  * @param creationDate the creation date of the document
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** `true` when none of the fields holds a value. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** `true` when at least one field holds a value. */
  def nonEmpty: Boolean =
    !isEmpty

  /** Splits the raw `keywords` string on `,` and `;`.
    *
    * Each entry is trimmed and blank entries are dropped, so inputs such as
    * `"a , b"`, `",a"` or `"a,,b"` never produce keywords that are empty or
    * carry surrounding whitespace. Returns `Nil` when no keywords are set.
    */
  def keywordList: List[String] =
    keywords.fold(List.empty[String]) { raw =>
      raw
        .split("[,;]\\s*")
        .toList
        // the pattern only eats whitespace *after* a separator; trim the rest
        .map(_.trim)
        // leading or doubled separators leave empty strings behind
        .filter(_.nonEmpty)
    }
}
/** Companion providing ready-made [[PdfMetaData]] values. */
object PdfMetaData {

  /** A value with every field set to `None`. */
  val empty: PdfMetaData =
    PdfMetaData(
      title = None,
      author = None,
      subject = None,
      keywords = None,
      creator = None,
      creationDate = None
    )
}

View File

@ -13,18 +13,33 @@ import docspell.extract.internal.Text
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import docspell.common.Timestamp
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
/** Reads the text and the metadata of a PDF in one pass over the document.
  *
  * The byte stream is collected into an array, loaded as a `PDDocument`
  * inside `Using`, and then both the text and the metadata are read from
  * that same document. A failure in loading or in either read step ends up
  * in the `Left`.
  */
def getTextAndMetaData[F[_]: Sync](
    data: Stream[F, Byte]
): F[Either[Throwable, (Text, PdfMetaData)]] =
  data.compile.to(Array).map { pdfBytes =>
    val attempt = Using(PDDocument.load(pdfBytes)) { pdf =>
      // text first, metadata second; the first `Left` wins
      readText(pdf).flatMap(text => readMetaData(pdf).map(meta => (text, meta)))
    }
    attempt.toEither.flatten
  }
/** Reads the text of a PDF given as a byte stream.
  *
  * The stream is collected into an array, loaded as a `PDDocument` inside
  * `Using`, and handed to `readText`. Failures end up in the `Left`.
  */
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
  data.compile.to(Array).map { pdfBytes =>
    val attempt = Using(PDDocument.load(pdfBytes))(pdf => readText(pdf))
    attempt.toEither.flatten
  }
def get(is: InputStream): Either[Throwable, Text] =
/** Reads the text of a PDF from an input stream.
  *
  * NOTE(review): only the `PDDocument` is managed by `Using` here; whether
  * `is` itself gets closed depends on PDFBox — confirm, callers may have to
  * close it.
  */
def getText(is: InputStream): Either[Throwable, Text] = {
  val attempt = Using(PDDocument.load(is))(pdf => readText(pdf))
  attempt.toEither.flatten
}
def get(inFile: Path): Either[Throwable, Text] =
/** Reads the text of the PDF file at `inFile`.
  *
  * The file is loaded as a `PDDocument` inside `Using` and handed to
  * `readText`. Failures end up in the `Left`.
  */
def getText(inFile: Path): Either[Throwable, Text] = {
  val attempt = Using(PDDocument.load(inFile.toFile))(pdf => readText(pdf))
  attempt.toEither.flatten
}
private def readText(doc: PDDocument): Either[Throwable, Text] =
@ -34,4 +49,31 @@ object PdfboxExtract {
stripper.setLineSeparator("\n")
Text(Option(stripper.getText(doc)))
}.toEither
/** Reads the metadata of a PDF given as a byte stream.
  *
  * The stream is collected into an array, loaded as a `PDDocument` inside
  * `Using`, and handed to `readMetaData`. Failures end up in the `Left`.
  */
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
  data.compile.to(Array).map { pdfBytes =>
    val attempt = Using(PDDocument.load(pdfBytes))(pdf => readMetaData(pdf))
    attempt.toEither.flatten
  }
/** Reads the metadata of a PDF from an input stream.
  *
  * NOTE(review): only the `PDDocument` is managed by `Using` here; whether
  * `is` itself gets closed depends on PDFBox — confirm, callers may have to
  * close it.
  */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = {
  val attempt = Using(PDDocument.load(is))(pdf => readMetaData(pdf))
  attempt.toEither.flatten
}
/** Reads the metadata of the PDF file at `inFile`.
  *
  * The file is loaded as a `PDDocument` inside `Using` and handed to
  * `readMetaData`. Failures end up in the `Left`.
  */
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = {
  val attempt = Using(PDDocument.load(inFile.toFile))(pdf => readMetaData(pdf))
  attempt.toEither.flatten
}
/** Builds a [[PdfMetaData]] from the document information of `doc`.
  *
  * String entries are trimmed; `null` and blank entries become `None`.
  * The creation date, when present, is converted to a `Timestamp` via its
  * instant. Exceptions raised while reading are captured by `Try` and
  * returned as `Left`.
  */
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
  // null-safe: wraps the Java return value, trims it, discards blanks
  def clean(raw: String): Option[String] =
    Option(raw).map(_.trim).filterNot(_.isEmpty)

  Try {
    val docInfo = doc.getDocumentInformation
    PdfMetaData(
      title = clean(docInfo.getTitle),
      author = clean(docInfo.getAuthor),
      subject = clean(docInfo.getSubject),
      keywords = clean(docInfo.getKeywords),
      creator = clean(docInfo.getCreator),
      creationDate = Option(docInfo.getCreationDate).map(cal => Timestamp(cal.toInstant))
    )
  }.toEither
}
}

View File

@ -0,0 +1,22 @@
package docspell.extract.pdfbox
import minitest.SimpleTestSuite
/** Checks how `PdfMetaData.keywordList` splits the raw keywords string. */
object PdfMetaDataTest extends SimpleTestSuite {

  /** Keyword list of an otherwise empty metadata value carrying `raw`. */
  private def keywordsOf(raw: String): List[String] =
    PdfMetaData.empty.copy(keywords = Some(raw)).keywordList

  test("split keywords on comma") {
    assertEquals(keywordsOf("a,b, c"), List("a", "b", "c"))
  }

  test("split keywords on semicolon") {
    assertEquals(keywordsOf("a; b;c"), List("a", "b", "c"))
  }

  test("split keywords on comma and semicolon") {
    assertEquals(keywordsOf("a, b; c"), List("a", "b", "c"))
  }
}

View File

@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
textPDFs.foreach {
case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str.value)
val expect = removeFormatting(txt)
assertEquals(received, expect)
@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
textPDFs.foreach {
case (file, txt) =>
val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str.value)
val expect = removeFormatting(txt)
assertEquals(received, expect)
@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite {
// Extracting text from the `scanner_pdf13_pdf` example file is expected to
// succeed and to produce an empty string.
// NOTE(review): the two `val str` lines below look like the old and the new
// side of a diff hunk — confirm that only the `getText` call is meant to remain.
test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
assertEquals(str.value, "")
}
// Reads both the text and the metadata of the `keywords_pdf` example file
// and compares them against the values expected for that file.
test("extract metadata from pdf") {
  val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)

  // Each stream opened from the URL is closed once its extraction is done,
  // instead of being left open for the rest of the test run.
  val str = scala.util.Using
    .resource(url.openStream())(in => PdfboxExtract.getText(in))
    .fold(throw _, identity)
  assert(str.value.startsWith("Keywords in PDF"))

  val md = scala.util.Using
    .resource(url.openStream())(in => PdfboxExtract.getMetaData(in))
    .fold(throw _, identity)
  assertEquals(md.author, Some("E.K."))
  assertEquals(md.title, Some("Keywords in PDF"))
  assertEquals(md.subject, Some("This is a subject"))
  assertEquals(md.keywordList, List("Test", "Keywords in PDF", "Todo"))
  assertEquals(md.creator, Some("Emacs 26.3 (Org mode 9.3)"))
  assert(md.creationDate.isDefined)
}
/** Normalizes text for comparison: removes every run of whitespace and of
  * the characters `;`, `:`, `.`, `,`, `-`, then lower-cases the remainder.
  */
private def removeFormatting(str: String): String = {
  val withoutSeparators = str.replaceAll("[\\s;:.,\\-]+", "")
  withoutSeparators.toLowerCase
}
}

Binary file not shown.