Add a processing step to retrieve page counts

This commit is contained in:
Eike Kettner 2020-11-09 11:07:47 +01:00
parent 5f217e6a76
commit a77f34b7ba
8 changed files with 128 additions and 24 deletions

View File

@ -8,7 +8,8 @@ final case class PdfMetaData(
subject: Option[String], subject: Option[String],
keywords: Option[String], keywords: Option[String],
creator: Option[String], creator: Option[String],
creationDate: Option[Timestamp] creationDate: Option[Timestamp],
pageCount: Int
) { ) {
def isEmpty: Boolean = def isEmpty: Boolean =
@ -17,7 +18,8 @@ final case class PdfMetaData(
subject.isEmpty && subject.isEmpty &&
keywords.isEmpty && keywords.isEmpty &&
creator.isEmpty && creator.isEmpty &&
creationDate.isEmpty creationDate.isEmpty &&
pageCount <= 0
def nonEmpty: Boolean = def nonEmpty: Boolean =
!isEmpty !isEmpty
@ -36,5 +38,5 @@ final case class PdfMetaData(
} }
object PdfMetaData { object PdfMetaData {
val empty = PdfMetaData(None, None, None, None, None, None) val empty = PdfMetaData(None, None, None, None, None, None, 0)
} }

View File

@ -20,21 +20,23 @@ object PdfboxExtract {
def getTextAndMetaData[F[_]: Sync]( def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte] data: Stream[F, Byte]
): F[Either[Throwable, (Text, Option[PdfMetaData])]] = ): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
data.compile PdfLoader
.to(Array) .withDocumentStream(data) { doc =>
.map(bytes => (for {
Using(PDDocument.load(bytes)) { doc => txt <- readText(doc)
for { md <- readMetaData(doc)
txt <- readText(doc) } yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
md <- readMetaData(doc) }
} yield (txt, Some(md).filter(_.nonEmpty)) .attempt
}.toEither.flatten .map(_.flatten)
)
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile PdfLoader
.to(Array) .withDocumentStream(data) { doc =>
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) readText(doc).pure[F]
}
.attempt
.map(_.flatten)
def getText(is: InputStream): Either[Throwable, Text] = def getText(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten Using(PDDocument.load(is))(readText).toEither.flatten
@ -51,9 +53,10 @@ object PdfboxExtract {
}.toEither }.toEither
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
data.compile PdfLoader
.to(Array) .withDocumentStream(data)(doc => readMetaData(doc).pure[F])
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten) .attempt
.map(_.flatten)
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
Using(PDDocument.load(is))(readMetaData).toEither.flatten Using(PDDocument.load(is))(readMetaData).toEither.flatten
@ -73,7 +76,8 @@ object PdfboxExtract {
mkValue(info.getSubject), mkValue(info.getSubject),
mkValue(info.getKeywords), mkValue(info.getKeywords),
mkValue(info.getCreator), mkValue(info.getCreator),
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)) Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
doc.getNumberOfPages()
) )
}.toEither }.toEither
} }

View File

@ -0,0 +1,83 @@
package docspell.joex.process
import cats.Functor
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.Stream
import docspell.common._
import docspell.extract.pdfbox.PdfMetaData
import docspell.extract.pdfbox.PdfboxExtract
import docspell.joex.scheduler._
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, RangeDef}
/** Processing step that goes through all attachments of an item, which
  * must already be converted into a pdf. If an attachment is a pdf, its
  * number of pages is retrieved and stored in the attachment metadata.
  *
  * Errors raised while processing the attachments are logged and do not
  * fail the task; the item is always returned unchanged.
  */
object AttachmentPageCount {

  /** Creates the task that retrieves and stores the page count for
    * every attachment of `item`.
    *
    * @param item the item whose attachments are inspected
    * @return a task yielding the unmodified `item`
    */
  def apply[F[_]: Sync: ContextShift]()(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info(
          s"Retrieving page count for ${item.attachments.size} files…"
        )
        _ <- item.attachments
          .traverse(createPageCount(ctx))
          .attempt
          .flatMap {
            case Right(_) => ().pure[F]
            case Left(ex) =>
              // The page count is optional information: log and carry on.
              ctx.logger.error(ex)(
                "Retrieving page counts failed, continuing without it."
              )
          }
      } yield item
    }

  /** Reads the pdf metadata of a single attachment and stores its page
    * count.
    *
    * @param ctx the task context providing the store and the logger
    * @param ra the attachment to inspect
    * @return the extracted metadata, or `None` if the file is not a pdf
    *         or the metadata could not be read (a warning is logged in
    *         the latter case)
    */
  def createPageCount[F[_]: Sync](
      ctx: Context[F, _]
  )(ra: RAttachment): F[Option[PdfMetaData]] =
    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) =>
        PdfboxExtract.getMetaData(loadFile(ctx)(ra)).flatMap {
          case Right(md) =>
            updatePageCount(ctx, md, ra).map(_.some)
          case Left(ex) =>
            ctx.logger.warn(s"Error obtaining pages count: ${ex.getMessage}") *>
              (None: Option[PdfMetaData]).pure[F]
        }
      case _ =>
        (None: Option[PdfMetaData]).pure[F]
    }

  /** Writes the page count of `md` to the attachment metadata of `ra`.
    *
    * The number of affected rows is checked: if the update did not
    * touch any row, a warning is logged so the missing page count does
    * not go unnoticed.
    *
    * @return the given metadata `md`
    */
  private def updatePageCount[F[_]: Sync](
      ctx: Context[F, _],
      md: PdfMetaData,
      ra: RAttachment
  ): F[PdfMetaData] =
    for {
      n <- ctx.store.transact(
        RAttachmentMeta.updatePageCount(ra.id, md.pageCount.some)
      )
      _ <-
        if (n == 0)
          ctx.logger.warn(
            s"No attachment metadata updated for attachment ${ra.id.id}, page count was not stored."
          )
        else ().pure[F]
    } yield md

  /** Looks up the mime type of the attachment's file.
    *
    * Falls back to `application/octet-stream` if no file metadata is
    * found for the attachment's file id.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)
      .getOrElse(Mimetype.`application/octet-stream`)
      .map(_.toLocal)

  /** Streams the complete binary content of the attachment's file from
    * the store. The stream ends without emitting data if the file is
    * not found.
    */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
    ctx.store.bitpeace
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
}

View File

@ -55,6 +55,7 @@ object ProcessItem {
.flatMap(Task.setProgress(progress._1)) .flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview)) .flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(AttachmentPageCount())
.flatMap(Task.setProgress(progress._2)) .flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(analysisOnly[F](cfg, analyser, regexNer))
.flatMap(Task.setProgress(progress._3)) .flatMap(Task.setProgress(progress._3))

View File

@ -0,0 +1,2 @@
-- Adds the nullable "page_count" column to the "attachmentmeta" table.
-- NOTE(review): the column is a smallint while the application code
-- handles the page count as an Int -- confirm the range is sufficient.
ALTER TABLE "attachmentmeta"
ADD COLUMN "page_count" smallint;

View File

@ -0,0 +1,2 @@
-- Adds the nullable `page_count` column to the `attachmentmeta` table.
-- NOTE(review): the column is a SMALLINT while the application code
-- handles the page count as an Int -- confirm the range is sufficient.
ALTER TABLE `attachmentmeta`
ADD COLUMN (`page_count` SMALLINT);

View File

@ -0,0 +1,2 @@
-- Adds the nullable "page_count" column to the "attachmentmeta" table.
-- NOTE(review): the column is a smallint while the application code
-- handles the page count as an Int -- confirm the range is sufficient.
ALTER TABLE "attachmentmeta"
ADD COLUMN "page_count" smallint;

View File

@ -13,17 +13,21 @@ case class RAttachmentMeta(
id: Ident, //same as RAttachment.id id: Ident, //same as RAttachment.id
content: Option[String], content: Option[String],
nerlabels: List[NerLabel], nerlabels: List[NerLabel],
proposals: MetaProposalList proposals: MetaProposalList,
pages: Option[Int]
) { ) {
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
if (content.forall(_.trim.isEmpty)) copy(content = txt) if (content.forall(_.trim.isEmpty)) copy(content = txt)
else this else this
/** Returns a copy of this record whose `pages` value is replaced by
  * the given `count`; all other fields are kept.
  */
def withPageCount(count: Option[Int]): RAttachmentMeta = {
  val updated = this.copy(pages = count)
  updated
}
} }
object RAttachmentMeta { object RAttachmentMeta {
def empty(attachId: Ident) = def empty(attachId: Ident) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty) RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None)
val table = fr"attachmentmeta" val table = fr"attachmentmeta"
@ -32,7 +36,8 @@ object RAttachmentMeta {
val content = Column("content") val content = Column("content")
val nerlabels = Column("nerlabels") val nerlabels = Column("nerlabels")
val proposals = Column("itemproposals") val proposals = Column("itemproposals")
val all = List(id, content, nerlabels, proposals) val pages = Column("page_count")
val all = List(id, content, nerlabels, proposals, pages)
} }
import Columns._ import Columns._
@ -40,7 +45,7 @@ object RAttachmentMeta {
insertRow( insertRow(
table, table,
all, all,
fr"${v.id},${v.content},${v.nerlabels},${v.proposals}" fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}"
).update.run ).update.run
def exists(attachId: Ident): ConnectionIO[Boolean] = def exists(attachId: Ident): ConnectionIO[Boolean] =
@ -84,6 +89,9 @@ object RAttachmentMeta {
) )
).update.run ).update.run
/** Sets the page count column of the row whose id equals `mid`.
  *
  * @param mid the id of the attachment metadata row to update
  * @param pageCount the new page count value
  * @return the result of running the update statement
  */
def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] = {
  val matchesRow   = id.is(mid)
  val newPageCount = pages.setTo(pageCount)
  val statement    = updateRow(table, matchesRow, newPageCount)
  statement.update.run
}
def delete(attachId: Ident): ConnectionIO[Int] = def delete(attachId: Ident): ConnectionIO[Int] =
deleteFrom(table, id.is(attachId)).update.run deleteFrom(table, id.is(attachId)).update.run
} }