mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Extract meta data from pdfs using pdfbox
This commit is contained in:
@ -37,7 +37,7 @@ object PdfExtract {
|
||||
for {
|
||||
pdfboxRes <-
|
||||
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
||||
.get[F](in)
|
||||
.getText[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
|
@ -0,0 +1,31 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import docspell.common.Timestamp
|
||||
|
||||
/** Descriptive metadata of a PDF document.
  *
  * Every field is optional; a field is `None` when no value is available.
  *
  * @param title the document title
  * @param author the document author
  * @param subject the document subject
  * @param keywords the raw keyword string, possibly holding several keywords
  *   separated by `,` or `;` (see [[keywordList]])
  * @param creator the document creator
  * @param creationDate the creation time of the document
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** `true` if none of the fields holds a value. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** `true` if at least one field holds a value. */
  def nonEmpty: Boolean =
    !isEmpty

  /** Splits the raw `keywords` string at `,` and `;` into single keywords.
    *
    * Each keyword is trimmed and empty entries are dropped, so input like
    * `"a , b;;c"` yields `List("a", "b", "c")`. Returns `Nil` when `keywords`
    * is `None`.
    */
  def keywordList: List[String] =
    keywords
      .map(kws =>
        // Trim per entry (not only after the delimiter) and discard blanks
        // produced by leading or repeated delimiters.
        kws.split("[,;]").toList.map(_.trim).filter(_.nonEmpty)
      )
      .getOrElse(Nil)
}
|
||||
|
||||
object PdfMetaData {

  /** A metadata value with every field absent. */
  val empty: PdfMetaData =
    PdfMetaData(
      title = None,
      author = None,
      subject = None,
      keywords = None,
      creator = None,
      creationDate = None
    )
}
|
@ -13,18 +13,33 @@ import docspell.extract.internal.Text
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.text.PDFTextStripper
|
||||
import docspell.common.Timestamp
|
||||
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
/** Reads both the text and the metadata of a PDF given as a byte stream.
  *
  * The stream is collected into a single byte array, which is then loaded as
  * one `PDDocument`; text and metadata are read from that same document.
  * Failures are returned as a `Left`.
  */
def getTextAndMetaData[F[_]: Sync](
    data: Stream[F, Byte]
): F[Either[Throwable, (Text, PdfMetaData)]] = {
  // `Using` closes the document once both reads are done.
  def extract(pdf: Array[Byte]): Either[Throwable, (Text, PdfMetaData)] =
    Using(PDDocument.load(pdf)) { doc =>
      readText(doc).flatMap(text => readMetaData(doc).map(meta => (text, meta)))
    }.toEither.flatten

  data.compile.to(Array).map(extract)
}
|
||||
|
||||
/** Reads the text of a PDF given as a byte stream.
  *
  * The stream is collected into a single byte array before the document is
  * loaded. Failures are returned as a `Left`.
  */
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = {
  val allBytes = data.compile.to(Array)
  allBytes.map { pdf =>
    // `Using` closes the document after the text has been read.
    Using(PDDocument.load(pdf))(doc => readText(doc)).toEither.flatMap(identity)
  }
}
|
||||
|
||||
def get(is: InputStream): Either[Throwable, Text] =
|
||||
/** Reads the text of a PDF loaded from the given input stream.
  *
  * Failures while loading or reading are returned as a `Left`.
  */
def getText(is: InputStream): Either[Throwable, Text] = {
  val attempt = Using(PDDocument.load(is))(doc => readText(doc))
  attempt.toEither.flatMap(identity)
}
|
||||
|
||||
def get(inFile: Path): Either[Throwable, Text] =
|
||||
/** Reads the text of the PDF file at the given path.
  *
  * Failures while loading or reading are returned as a `Left`.
  */
def getText(inFile: Path): Either[Throwable, Text] = {
  val attempt = Using(PDDocument.load(inFile.toFile))(doc => readText(doc))
  attempt.toEither.flatMap(identity)
}
|
||||
|
||||
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||
@ -34,4 +49,31 @@ object PdfboxExtract {
|
||||
stripper.setLineSeparator("\n")
|
||||
Text(Option(stripper.getText(doc)))
|
||||
}.toEither
|
||||
|
||||
/** Reads the metadata of a PDF given as a byte stream.
  *
  * The stream is collected into a single byte array before the document is
  * loaded. Failures are returned as a `Left`.
  */
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = {
  // `Using` closes the document after the metadata has been read.
  def fromBytes(pdf: Array[Byte]): Either[Throwable, PdfMetaData] =
    Using(PDDocument.load(pdf))(doc => readMetaData(doc)).toEither.flatMap(identity)

  data.compile.to(Array).map(fromBytes)
}
|
||||
|
||||
/** Reads the metadata of a PDF loaded from the given input stream.
  *
  * Failures while loading or reading are returned as a `Left`.
  */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = {
  val attempt = Using(PDDocument.load(is))(doc => readMetaData(doc))
  attempt.toEither.flatMap(identity)
}
|
||||
|
||||
/** Reads the metadata of the PDF file at the given path.
  *
  * Failures while loading or reading are returned as a `Left`.
  */
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = {
  val attempt = Using(PDDocument.load(inFile.toFile))(doc => readMetaData(doc))
  attempt.toEither.flatMap(identity)
}
|
||||
|
||||
/** Builds a `PdfMetaData` from the document information of `doc`.
  *
  * String values are trimmed; `null` or blank strings become `None`. Any
  * exception thrown while reading is returned as a `Left`.
  */
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
  // Treats `null` and whitespace-only strings as absent.
  def nonBlank(raw: String): Option[String] =
    Option(raw).map(_.trim).filter(_.nonEmpty)

  Try {
    val info = doc.getDocumentInformation
    PdfMetaData(
      title = nonBlank(info.getTitle),
      author = nonBlank(info.getAuthor),
      subject = nonBlank(info.getSubject),
      keywords = nonBlank(info.getKeywords),
      creator = nonBlank(info.getCreator),
      // The creation date may be null, hence the `Option` wrapper.
      creationDate = Option(info.getCreationDate).map(cal => Timestamp(cal.toInstant))
    )
  }.toEither
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user