mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Add a processing step to retrieve page counts
This commit is contained in:
@ -8,7 +8,8 @@ final case class PdfMetaData(
|
||||
subject: Option[String],
|
||||
keywords: Option[String],
|
||||
creator: Option[String],
|
||||
creationDate: Option[Timestamp]
|
||||
creationDate: Option[Timestamp],
|
||||
pageCount: Int
|
||||
) {
|
||||
|
||||
def isEmpty: Boolean =
|
||||
@ -17,7 +18,8 @@ final case class PdfMetaData(
|
||||
subject.isEmpty &&
|
||||
keywords.isEmpty &&
|
||||
creator.isEmpty &&
|
||||
creationDate.isEmpty
|
||||
creationDate.isEmpty &&
|
||||
pageCount <= 0
|
||||
|
||||
def nonEmpty: Boolean =
|
||||
!isEmpty
|
||||
@ -36,5 +38,5 @@ final case class PdfMetaData(
|
||||
}
|
||||
|
||||
object PdfMetaData {
|
||||
val empty = PdfMetaData(None, None, None, None, None, None)
|
||||
val empty = PdfMetaData(None, None, None, None, None, None, 0)
|
||||
}
|
||||
|
@ -20,21 +20,23 @@ object PdfboxExtract {
|
||||
def getTextAndMetaData[F[_]: Sync](
|
||||
data: Stream[F, Byte]
|
||||
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes =>
|
||||
Using(PDDocument.load(bytes)) { doc =>
|
||||
for {
|
||||
txt <- readText(doc)
|
||||
md <- readMetaData(doc)
|
||||
} yield (txt, Some(md).filter(_.nonEmpty))
|
||||
}.toEither.flatten
|
||||
)
|
||||
PdfLoader
|
||||
.withDocumentStream(data) { doc =>
|
||||
(for {
|
||||
txt <- readText(doc)
|
||||
md <- readMetaData(doc)
|
||||
} yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
|
||||
}
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
PdfLoader
|
||||
.withDocumentStream(data) { doc =>
|
||||
readText(doc).pure[F]
|
||||
}
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getText(is: InputStream): Either[Throwable, Text] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
@ -51,9 +53,10 @@ object PdfboxExtract {
|
||||
}.toEither
|
||||
|
||||
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten)
|
||||
PdfLoader
|
||||
.withDocumentStream(data)(doc => readMetaData(doc).pure[F])
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
|
||||
Using(PDDocument.load(is))(readMetaData).toEither.flatten
|
||||
@ -73,7 +76,8 @@ object PdfboxExtract {
|
||||
mkValue(info.getSubject),
|
||||
mkValue(info.getKeywords),
|
||||
mkValue(info.getCreator),
|
||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant))
|
||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
|
||||
doc.getNumberOfPages()
|
||||
)
|
||||
}.toEither
|
||||
}
|
||||
|
Reference in New Issue
Block a user