diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala index 4663d1c8..eb450ae9 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala @@ -8,7 +8,8 @@ final case class PdfMetaData( subject: Option[String], keywords: Option[String], creator: Option[String], - creationDate: Option[Timestamp] + creationDate: Option[Timestamp], + pageCount: Int ) { def isEmpty: Boolean = @@ -17,7 +18,8 @@ final case class PdfMetaData( subject.isEmpty && keywords.isEmpty && creator.isEmpty && - creationDate.isEmpty + creationDate.isEmpty && + pageCount <= 0 def nonEmpty: Boolean = !isEmpty @@ -36,5 +38,5 @@ final case class PdfMetaData( } object PdfMetaData { - val empty = PdfMetaData(None, None, None, None, None, None) + val empty = PdfMetaData(None, None, None, None, None, None, 0) } diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index def9c8ee..d3267503 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -20,21 +20,23 @@ object PdfboxExtract { def getTextAndMetaData[F[_]: Sync]( data: Stream[F, Byte] ): F[Either[Throwable, (Text, Option[PdfMetaData])]] = - data.compile - .to(Array) - .map(bytes => - Using(PDDocument.load(bytes)) { doc => - for { - txt <- readText(doc) - md <- readMetaData(doc) - } yield (txt, Some(md).filter(_.nonEmpty)) - }.toEither.flatten - ) + PdfLoader + .withDocumentStream(data) { doc => + (for { + txt <- readText(doc) + md <- readMetaData(doc) + } yield (txt, Some(md).filter(_.nonEmpty))).pure[F] + } + .attempt + .map(_.flatten) def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = - 
data.compile - .to(Array) - .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) + PdfLoader + .withDocumentStream(data) { doc => + readText(doc).pure[F] + } + .attempt + .map(_.flatten) def getText(is: InputStream): Either[Throwable, Text] = Using(PDDocument.load(is))(readText).toEither.flatten @@ -51,9 +53,10 @@ object PdfboxExtract { }.toEither def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = - data.compile - .to(Array) - .map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten) + PdfLoader + .withDocumentStream(data)(doc => readMetaData(doc).pure[F]) + .attempt + .map(_.flatten) def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = Using(PDDocument.load(is))(readMetaData).toEither.flatten @@ -73,7 +76,8 @@ object PdfboxExtract { mkValue(info.getSubject), mkValue(info.getKeywords), mkValue(info.getCreator), - Option(info.getCreationDate).map(c => Timestamp(c.toInstant)) + Option(info.getCreationDate).map(c => Timestamp(c.toInstant)), + doc.getNumberOfPages() ) }.toEither } diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala new file mode 100644 index 00000000..c1dbe7e4 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -0,0 +1,83 @@ +package docspell.joex.process + +import cats.Functor +import cats.data.OptionT +import cats.effect._ +import cats.implicits._ +import fs2.Stream + +import docspell.common._ +import docspell.extract.pdfbox.PdfMetaData +import docspell.extract.pdfbox.PdfboxExtract +import docspell.joex.scheduler._ +import docspell.store.records.RAttachment +import docspell.store.records._ +import docspell.store.syntax.MimeTypes._ + +import bitpeace.{Mimetype, RangeDef} + +/** Goes through all attachments that must be already converted into a + * pdf. 
If it is a pdf, the number of pages is retrieved and stored + * in the attachment metadata. + */ +object AttachmentPageCount { + + def apply[F[_]: Sync: ContextShift]()( + item: ItemData + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + for { + _ <- ctx.logger.info( + s"Retrieving page count for ${item.attachments.size} files…" + ) + _ <- item.attachments + .traverse(createPageCount(ctx)) + .attempt + .flatMap { + case Right(_) => ().pure[F] + case Left(ex) => + ctx.logger.error(ex)( + s"Retrieving page counts failed, continuing without it." + ) + } + } yield item + } + + def createPageCount[F[_]: Sync]( + ctx: Context[F, _] + )(ra: RAttachment): F[Option[PdfMetaData]] = + findMime[F](ctx)(ra).flatMap { + case MimeType.PdfMatch(_) => + PdfboxExtract.getMetaData(loadFile(ctx)(ra)).flatMap { + case Right(md) => + updatePageCount(ctx, md, ra).map(_.some) + case Left(ex) => + ctx.logger.warn(s"Error obtaining pages count: ${ex.getMessage}") *> + (None: Option[PdfMetaData]).pure[F] + } + + case _ => + (None: Option[PdfMetaData]).pure[F] + } + + private def updatePageCount[F[_]: Sync]( + ctx: Context[F, _], + md: PdfMetaData, + ra: RAttachment + ): F[PdfMetaData] = + ctx.store.transact(RAttachmentMeta.updatePageCount(ra.id, md.pageCount.some)) *> md + .pure[F] + + def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] = + OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId))) + .map(_.mimetype) + .getOrElse(Mimetype.`application/octet-stream`) + .map(_.toLocal) + + def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] = + ctx.store.bitpeace + .get(ra.fileId.id) + .unNoneTerminate + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 8caf25fb..56f3cd33 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ 
b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -55,6 +55,7 @@ object ProcessItem { .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview)) + .flatMap(AttachmentPageCount()) .flatMap(Task.setProgress(progress._2)) .flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(Task.setProgress(progress._3)) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.11.0__pdf_pages.sql b/modules/store/src/main/resources/db/migration/h2/V1.11.0__pdf_pages.sql new file mode 100644 index 00000000..ca347ea6 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.11.0__pdf_pages.sql @@ -0,0 +1,2 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "page_count" smallint; diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.11.0__pdf_pages.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.11.0__pdf_pages.sql new file mode 100644 index 00000000..fd580127 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.11.0__pdf_pages.sql @@ -0,0 +1,2 @@ +ALTER TABLE `attachmentmeta` +ADD COLUMN (`page_count` SMALLINT); diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.11.0__pdf_pages.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.11.0__pdf_pages.sql new file mode 100644 index 00000000..ca347ea6 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.11.0__pdf_pages.sql @@ -0,0 +1,2 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "page_count" smallint; diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index d1cb79ea..833bfeca 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -13,17 +13,21 @@ case class 
RAttachmentMeta( id: Ident, //same as RAttachment.id content: Option[String], nerlabels: List[NerLabel], - proposals: MetaProposalList + proposals: MetaProposalList, + pages: Option[Int] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = if (content.forall(_.trim.isEmpty)) copy(content = txt) else this + + def withPageCount(count: Option[Int]): RAttachmentMeta = + copy(pages = count) } object RAttachmentMeta { def empty(attachId: Ident) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty) + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None) val table = fr"attachmentmeta" @@ -32,7 +36,8 @@ object RAttachmentMeta { val content = Column("content") val nerlabels = Column("nerlabels") val proposals = Column("itemproposals") - val all = List(id, content, nerlabels, proposals) + val pages = Column("page_count") + val all = List(id, content, nerlabels, proposals, pages) } import Columns._ @@ -40,7 +45,7 @@ object RAttachmentMeta { insertRow( table, all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}" ).update.run def exists(attachId: Ident): ConnectionIO[Boolean] = @@ -84,6 +89,9 @@ object RAttachmentMeta { ) ).update.run + def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] = + updateRow(table, id.is(mid), pages.setTo(pageCount)).update.run + def delete(attachId: Ident): ConnectionIO[Int] = deleteFrom(table, id.is(attachId)).update.run }