Add a processing step to retrieve page counts

This commit is contained in:
Eike Kettner 2020-11-09 11:07:47 +01:00
parent 5f217e6a76
commit a77f34b7ba
8 changed files with 128 additions and 24 deletions

View File

@ -8,7 +8,8 @@ final case class PdfMetaData(
subject: Option[String],
keywords: Option[String],
creator: Option[String],
creationDate: Option[Timestamp]
creationDate: Option[Timestamp],
pageCount: Int
) {
def isEmpty: Boolean =
@ -17,7 +18,8 @@ final case class PdfMetaData(
subject.isEmpty &&
keywords.isEmpty &&
creator.isEmpty &&
creationDate.isEmpty
creationDate.isEmpty &&
pageCount <= 0
def nonEmpty: Boolean =
!isEmpty
@ -36,5 +38,5 @@ final case class PdfMetaData(
}
object PdfMetaData {
val empty = PdfMetaData(None, None, None, None, None, None)
val empty = PdfMetaData(None, None, None, None, None, None, 0)
}

View File

@ -20,21 +20,23 @@ object PdfboxExtract {
def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte]
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
data.compile
.to(Array)
.map(bytes =>
Using(PDDocument.load(bytes)) { doc =>
for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, Some(md).filter(_.nonEmpty))
}.toEither.flatten
)
PdfLoader
.withDocumentStream(data) { doc =>
(for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
}
.attempt
.map(_.flatten)
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
PdfLoader
.withDocumentStream(data) { doc =>
readText(doc).pure[F]
}
.attempt
.map(_.flatten)
def getText(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten
@ -51,9 +53,10 @@ object PdfboxExtract {
}.toEither
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten)
PdfLoader
.withDocumentStream(data)(doc => readMetaData(doc).pure[F])
.attempt
.map(_.flatten)
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
Using(PDDocument.load(is))(readMetaData).toEither.flatten
@ -73,7 +76,8 @@ object PdfboxExtract {
mkValue(info.getSubject),
mkValue(info.getKeywords),
mkValue(info.getCreator),
Option(info.getCreationDate).map(c => Timestamp(c.toInstant))
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
doc.getNumberOfPages()
)
}.toEither
}

View File

@ -0,0 +1,83 @@
package docspell.joex.process
import cats.Functor
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.common._
import docspell.extract.pdfbox.PdfMetaData
import docspell.extract.pdfbox.PdfboxExtract
import docspell.joex.scheduler._
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, RangeDef}
/** Goes through all attachments, which must already be converted
  * into a pdf. If an attachment is a pdf, its number of pages is
  * retrieved and stored in the attachment metadata.
  */
object AttachmentPageCount {

  /** Creates the processing task that determines the page count of
    * every attachment of the given item.
    *
    * Any error raised while going through the attachments is logged
    * and does not fail the task; the item is always returned
    * unchanged.
    */
  def apply[F[_]: Sync: ContextShift]()(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info(
          s"Retrieving page count for ${item.attachments.size} files…"
        )
        _ <- item.attachments
          .traverse(createPageCount(ctx))
          .attempt
          .flatMap {
            case Right(_) => ().pure[F]
            case Left(ex) =>
              ctx.logger.error(ex)(
                "Retrieving page counts failed, continuing without it."
              )
          }
      } yield item
    }

  /** Reads the pdf metadata of a single attachment and stores its
    * page count.
    *
    * Returns the metadata if the file's mime type matches pdf and the
    * metadata could be read. Returns `None` for other mime types or
    * if reading the metadata yields an error (which is logged as a
    * warning).
    */
  def createPageCount[F[_]: Sync](
      ctx: Context[F, _]
  )(ra: RAttachment): F[Option[PdfMetaData]] =
    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) =>
        PdfboxExtract.getMetaData(loadFile(ctx)(ra)).flatMap {
          case Right(md) =>
            updatePageCount(ctx, md, ra).map(_.some)
          case Left(ex) =>
            ctx.logger.warn(s"Error obtaining pages count: ${ex.getMessage}") *>
              Option.empty[PdfMetaData].pure[F]
        }
      case _ =>
        Option.empty[PdfMetaData].pure[F]
    }

  /** Writes the page count of `md` to the attachment metadata record
    * of `ra` and returns `md`.
    *
    * The update statement reports how many rows it touched. If that
    * is 0, nothing was stored (presumably because no metadata record
    * exists for the attachment yet), so a warning is logged instead
    * of silently dropping the value.
    */
  private def updatePageCount[F[_]: Sync](
      ctx: Context[F, _],
      md: PdfMetaData,
      ra: RAttachment
  ): F[PdfMetaData] =
    for {
      n <- ctx.store.transact(
        RAttachmentMeta.updatePageCount(ra.id, md.pageCount.some)
      )
      _ <-
        if (n == 0)
          ctx.logger.warn(
            s"No attachment metadata found for attachment ${ra.id.id}; page count was not stored."
          )
        else ().pure[F]
    } yield md

  /** Looks up the mime type of the attachment's file. Falls back to
    * `application/octet-stream` if no file metadata is found.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)
      .getOrElse(Mimetype.`application/octet-stream`)
      .map(_.toLocal)

  /** Streams the complete contents (`RangeDef.all`) of the
    * attachment's file from the bitpeace store.
    */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
    ctx.store.bitpeace
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
}

View File

@ -55,6 +55,7 @@ object ProcessItem {
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(AttachmentPageCount())
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3))

View File

@ -0,0 +1,2 @@
ALTER TABLE "attachmentmeta"
ADD COLUMN "page_count" smallint;

View File

@ -0,0 +1,2 @@
ALTER TABLE `attachmentmeta`
ADD COLUMN (`page_count` SMALLINT);

View File

@ -0,0 +1,2 @@
ALTER TABLE "attachmentmeta"
ADD COLUMN "page_count" smallint;

View File

@ -13,17 +13,21 @@ case class RAttachmentMeta(
id: Ident, //same as RAttachment.id
content: Option[String],
nerlabels: List[NerLabel],
proposals: MetaProposalList
proposals: MetaProposalList,
pages: Option[Int]
) {
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
if (content.forall(_.trim.isEmpty)) copy(content = txt)
else this
/** Returns a copy of this record whose `pages` field is set to the given count. */
def withPageCount(count: Option[Int]): RAttachmentMeta =
copy(pages = count)
}
object RAttachmentMeta {
def empty(attachId: Ident) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty)
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None)
val table = fr"attachmentmeta"
@ -32,7 +36,8 @@ object RAttachmentMeta {
val content = Column("content")
val nerlabels = Column("nerlabels")
val proposals = Column("itemproposals")
val all = List(id, content, nerlabels, proposals)
val pages = Column("page_count")
val all = List(id, content, nerlabels, proposals, pages)
}
import Columns._
@ -40,7 +45,7 @@ object RAttachmentMeta {
insertRow(
table,
all,
fr"${v.id},${v.content},${v.nerlabels},${v.proposals}"
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}"
).update.run
def exists(attachId: Ident): ConnectionIO[Boolean] =
@ -84,6 +89,9 @@ object RAttachmentMeta {
)
).update.run
/** Sets the `pages` column to `pageCount` for the row whose id equals `mid`.
  *
  * NOTE(review): the resulting Int is presumably the number of updated
  * rows, i.e. 0 when no row with this id exists — callers should check it.
  */
def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] =
updateRow(table, id.is(mid), pages.setTo(pageCount)).update.run
/** Deletes the row whose id equals `attachId`. */
def delete(attachId: Ident): ConnectionIO[Int] =
deleteFrom(table, id.is(attachId)).update.run
}