Add a processing step to retrieve page counts

This commit is contained in:
Eike Kettner
2020-11-09 11:07:47 +01:00
parent 5f217e6a76
commit a77f34b7ba
8 changed files with 128 additions and 24 deletions

View File

@ -8,7 +8,8 @@ final case class PdfMetaData(
subject: Option[String],
keywords: Option[String],
creator: Option[String],
creationDate: Option[Timestamp]
creationDate: Option[Timestamp],
pageCount: Int
) {
def isEmpty: Boolean =
@ -17,7 +18,8 @@ final case class PdfMetaData(
subject.isEmpty &&
keywords.isEmpty &&
creator.isEmpty &&
creationDate.isEmpty
creationDate.isEmpty &&
pageCount <= 0
def nonEmpty: Boolean =
!isEmpty
@ -36,5 +38,5 @@ final case class PdfMetaData(
}
/** Companion of the `PdfMetaData` case class; provides a shared "nothing extracted" value. */
object PdfMetaData {
// NOTE(review): the two `empty` definitions below appear to be the removed and the added
// side of a diff hunk. Only one of them can exist in the real file — confirm against the
// repository (the seven-argument form matches the `pageCount: Int` field).
val empty = PdfMetaData(None, None, None, None, None, None)
// Six `None` values followed by a page count of 0.
val empty = PdfMetaData(None, None, None, None, None, None, 0)
}

View File

@ -20,21 +20,23 @@ object PdfboxExtract {
/** Reads both the text and the metadata of the PDF contained in `data`.
  *
  * The result pairs the text with the metadata wrapped in an `Option`; the metadata is
  * dropped (`None`) when its `nonEmpty` check is false. Failures are returned as `Left`
  * inside `F`.
  *
  * NOTE(review): two bodies are shown below — one built on `data.compile` / `Using`, one
  * built on `PdfLoader.withDocumentStream`. This looks like the removed and the added side
  * of a diff hunk; confirm against the repository which one is current.
  */
def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte]
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
// Variant 1: collect the stream into a byte array, then load the document from it.
data.compile
.to(Array)
.map(bytes =>
Using(PDDocument.load(bytes)) { doc =>
for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, Some(md).filter(_.nonEmpty))
}.toEither.flatten
)
// Variant 2: hand the stream to `PdfLoader`, lift the `Either` into `F`, then merge the
// outer `attempt` result with the inner `Either` via `flatten`.
PdfLoader
.withDocumentStream(data) { doc =>
(for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
}
.attempt
.map(_.flatten)
/** Reads the text of the PDF contained in `data`; failures are returned as `Left` inside `F`.
  *
  * NOTE(review): two bodies are shown below — one built on `data.compile` / `Using`, one
  * built on `PdfLoader.withDocumentStream`. This looks like the removed and the added side
  * of a diff hunk; confirm against the repository which one is current.
  */
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
// Variant 1: collect the stream into a byte array, then load the document from it.
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
// Variant 2: hand the stream to `PdfLoader`; `attempt` plus `flatten` merges the outer
// and the inner `Either` into one.
PdfLoader
.withDocumentStream(data) { doc =>
readText(doc).pure[F]
}
.attempt
.map(_.flatten)
/** Reads the text of the PDF available from the input stream `is`.
  *
  * The document is loaded with `PDDocument.load` and managed by `Using`. An exception
  * captured by `Using` and a `Left` produced by `readText` both end up as the `Left`
  * of the result.
  */
def getText(is: InputStream): Either[Throwable, Text] = {
  val attempt = Using(PDDocument.load(is))(doc => readText(doc))
  // Merge the outer (loading) and the inner (reading) result into a single Either.
  attempt.toEither.flatMap(inner => inner)
}
@ -51,9 +53,10 @@ object PdfboxExtract {
}.toEither
/** Reads the metadata of the PDF contained in `data`; failures are returned as `Left`
  * inside `F`.
  *
  * NOTE(review): two bodies are shown below — one built on `data.compile` / `Using`, one
  * built on `PdfLoader.withDocumentStream`. This looks like the removed and the added side
  * of a diff hunk; confirm against the repository which one is current.
  */
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
// Variant 1: collect the stream into a byte array, then load the document from it.
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten)
// Variant 2: hand the stream to `PdfLoader`; `attempt` plus `flatten` merges the outer
// and the inner `Either` into one.
PdfLoader
.withDocumentStream(data)(doc => readMetaData(doc).pure[F])
.attempt
.map(_.flatten)
/** Reads the metadata of the PDF available from the input stream `is`.
  *
  * The document is loaded with `PDDocument.load` and managed by `Using`. An exception
  * captured by `Using` and a `Left` produced by `readMetaData` both end up as the `Left`
  * of the result.
  */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = {
  val attempt = Using(PDDocument.load(is))(doc => readMetaData(doc))
  // Merge the outer (loading) and the inner (reading) result into a single Either.
  attempt.toEither.flatMap(inner => inner)
}
@ -73,7 +76,8 @@ object PdfboxExtract {
mkValue(info.getSubject),
mkValue(info.getKeywords),
mkValue(info.getCreator),
Option(info.getCreationDate).map(c => Timestamp(c.toInstant))
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
doc.getNumberOfPages()
)
}.toEither
}