Extract meta data from pdfs using pdfbox

2025-09-28 07:38:23 +00:00 · 2020-07-18 23:04:46 +02:00
parent bd20165d1a
commit da68405f9b
6 changed files with 115 additions and 7 deletions
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -37,7 +37,7 @@ object PdfExtract {
    for {
      pdfboxRes <-
        logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
-          .get[F](in)
+          .getText[F](in)
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala
@@ -0,0 +1,31 @@
+package docspell.extract.pdfbox
+
+import docspell.common.Timestamp
+
+/** Metadata values read from a PDF document; every field is optional. */
+final case class PdfMetaData(
+    title: Option[String],
+    author: Option[String],
+    subject: Option[String],
+    keywords: Option[String],
+    creator: Option[String],
+    creationDate: Option[Timestamp]
+) {
+
+  /** True if no field holds a value. */
+  def isEmpty: Boolean =
+    List(title, author, subject, keywords, creator, creationDate).forall(_.isEmpty)
+
+  /** True if at least one field holds a value. */
+  def nonEmpty: Boolean = !isEmpty
+
+  /** `keywords` split at `,` or `;`: entries are trimmed, blank entries are dropped. */
+  def keywordList: List[String] =
+    keywords.toList
+      .flatMap(kws => kws.split("[,;]").map(_.trim).filter(_.nonEmpty).toList)
+}
+
+object PdfMetaData {
+  /** An instance without any value. */
+  val empty: PdfMetaData = PdfMetaData(None, None, None, None, None, None)
+}
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -13,18 +13,33 @@ import docspell.extract.internal.Text

 import org.apache.pdfbox.pdmodel.PDDocument
 import org.apache.pdfbox.text.PDFTextStripper
+import docspell.common.Timestamp

 object PdfboxExtract {

-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
+  /** Loads the PDF from all bytes of `data` a single time and reads its text, then its
+    * metadata. A failure while loading or reading the document is returned as `Left`.
+    */
+  def getTextAndMetaData[F[_]: Sync](
+      data: Stream[F, Byte]
+  ): F[Either[Throwable, (Text, PdfMetaData)]] =
+    data.compile
+      .to(Array)
+      .map { raw =>
+        Using(PDDocument.load(raw)) { pdf =>
+          readText(pdf).flatMap(text => readMetaData(pdf).map(meta => (text, meta)))
+        }.toEither.flatten
+      }
+
+  def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile
      .to(Array)
      .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)

-  def get(is: InputStream): Either[Throwable, Text] =
+  def getText(is: InputStream): Either[Throwable, Text] =
    Using(PDDocument.load(is))(readText).toEither.flatten

-  def get(inFile: Path): Either[Throwable, Text] =
+  def getText(inFile: Path): Either[Throwable, Text] =
    Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten

  private def readText(doc: PDDocument): Either[Throwable, Text] =
@@ -34,4 +49,31 @@ object PdfboxExtract {
      stripper.setLineSeparator("\n")
      Text(Option(stripper.getText(doc)))
    }.toEither
+
+  /** Reads the metadata of the PDF given by all bytes of `data`. */
+  def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
+    data.compile.to(Array)
+      .map(raw => Using(PDDocument.load(raw))(readMetaData).toEither.flatMap(identity))
+
+  def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
+    Using(PDDocument.load(is))(readMetaData).toEither.flatMap(identity)
+
+  def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
+    Using(PDDocument.load(inFile.toFile))(readMetaData).toEither.flatMap(identity)
+
+  /** Copies the document-information entries of `doc` into a [[PdfMetaData]]. */
+  private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
+    // Null or blank strings become `None`; other values are kept trimmed.
+    def clean(raw: String): Option[String] = Option(raw).map(_.trim).filter(_.nonEmpty)
+    Try(doc.getDocumentInformation).map { di =>
+      PdfMetaData(
+        title = clean(di.getTitle),
+        author = clean(di.getAuthor),
+        subject = clean(di.getSubject),
+        keywords = clean(di.getKeywords),
+        creator = clean(di.getCreator),
+        creationDate = Option(di.getCreationDate).map(cal => Timestamp(cal.toInstant))
+      )
+    }.toEither
+  }
 }