mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-25 16:45:05 +00:00
Add a processing step to retrieve page counts
This commit is contained in:
parent
5f217e6a76
commit
a77f34b7ba
@ -8,7 +8,8 @@ final case class PdfMetaData(
|
||||
subject: Option[String],
|
||||
keywords: Option[String],
|
||||
creator: Option[String],
|
||||
creationDate: Option[Timestamp]
|
||||
creationDate: Option[Timestamp],
|
||||
pageCount: Int
|
||||
) {
|
||||
|
||||
def isEmpty: Boolean =
|
||||
@ -17,7 +18,8 @@ final case class PdfMetaData(
|
||||
subject.isEmpty &&
|
||||
keywords.isEmpty &&
|
||||
creator.isEmpty &&
|
||||
creationDate.isEmpty
|
||||
creationDate.isEmpty &&
|
||||
pageCount <= 0
|
||||
|
||||
def nonEmpty: Boolean =
|
||||
!isEmpty
|
||||
@ -36,5 +38,5 @@ final case class PdfMetaData(
|
||||
}
|
||||
|
||||
object PdfMetaData {
|
||||
val empty = PdfMetaData(None, None, None, None, None, None)
|
||||
val empty = PdfMetaData(None, None, None, None, None, None, 0)
|
||||
}
|
||||
|
@ -20,21 +20,23 @@ object PdfboxExtract {
|
||||
def getTextAndMetaData[F[_]: Sync](
|
||||
data: Stream[F, Byte]
|
||||
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes =>
|
||||
Using(PDDocument.load(bytes)) { doc =>
|
||||
for {
|
||||
txt <- readText(doc)
|
||||
md <- readMetaData(doc)
|
||||
} yield (txt, Some(md).filter(_.nonEmpty))
|
||||
}.toEither.flatten
|
||||
)
|
||||
PdfLoader
|
||||
.withDocumentStream(data) { doc =>
|
||||
(for {
|
||||
txt <- readText(doc)
|
||||
md <- readMetaData(doc)
|
||||
} yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
|
||||
}
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
PdfLoader
|
||||
.withDocumentStream(data) { doc =>
|
||||
readText(doc).pure[F]
|
||||
}
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getText(is: InputStream): Either[Throwable, Text] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
@ -51,9 +53,10 @@ object PdfboxExtract {
|
||||
}.toEither
|
||||
|
||||
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten)
|
||||
PdfLoader
|
||||
.withDocumentStream(data)(doc => readMetaData(doc).pure[F])
|
||||
.attempt
|
||||
.map(_.flatten)
|
||||
|
||||
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
|
||||
Using(PDDocument.load(is))(readMetaData).toEither.flatten
|
||||
@ -73,7 +76,8 @@ object PdfboxExtract {
|
||||
mkValue(info.getSubject),
|
||||
mkValue(info.getKeywords),
|
||||
mkValue(info.getCreator),
|
||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant))
|
||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
|
||||
doc.getNumberOfPages()
|
||||
)
|
||||
}.toEither
|
||||
}
|
||||
|
@ -0,0 +1,83 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.Functor
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
import docspell.extract.pdfbox.PdfMetaData
|
||||
import docspell.extract.pdfbox.PdfboxExtract
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records.RAttachment
|
||||
import docspell.store.records._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, RangeDef}
|
||||
|
||||
/** Goes through all attachments, which must already have been
  * converted into a pdf. For every attachment that is a pdf, the number
  * of pages is retrieved and stored in the attachment metadata.
  */
object AttachmentPageCount {

  /** Creates the task that determines the page count for every
    * attachment of the given item.
    *
    * Errors raised while processing the attachments are logged and do
    * not fail the task; the given `item` is returned as is.
    *
    * @param item the item whose attachments are inspected
    * @return a task yielding the unchanged `item`
    */
  def apply[F[_]: Sync: ContextShift]()(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info(
          s"Retrieving page count for ${item.attachments.size} files…"
        )
        // The attempt wraps the whole traversal: an error raised for one
        // attachment is logged once and the task continues with `item`.
        _ <- item.attachments
          .traverse(createPageCount(ctx))
          .attempt
          .flatMap {
            case Right(_) => ().pure[F]
            case Left(ex) =>
              ctx.logger.error(ex)(
                "Retrieving page counts failed, continuing without it."
              )
          }
      } yield item
    }

  /** Reads the pdf metadata of a single attachment and stores its page
    * count.
    *
    * @param ctx the task context providing the store and the logger
    * @param ra the attachment to inspect
    * @return the extracted metadata, or `None` if the attachment is not
    *         a pdf or its metadata could not be read
    */
  def createPageCount[F[_]: Sync](
      ctx: Context[F, _]
  )(ra: RAttachment): F[Option[PdfMetaData]] =
    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) =>
        PdfboxExtract.getMetaData(loadFile(ctx)(ra)).flatMap {
          case Right(md) =>
            updatePageCount(ctx, md, ra).map(_.some)
          case Left(ex) =>
            // Extraction problems are reported, but do not raise an error.
            ctx.logger.warn(s"Error obtaining pages count: ${ex.getMessage}") *>
              Option.empty[PdfMetaData].pure[F]
        }

      case _ =>
        // Not a pdf: nothing to count.
        Option.empty[PdfMetaData].pure[F]
    }

  /** Writes the page count of `md` into the metadata record of the
    * attachment `ra` and returns `md`.
    *
    * The update statement only touches an existing record. If it reports
    * that no row was changed, the page count has not been stored; this is
    * logged as a warning instead of being silently ignored.
    */
  private def updatePageCount[F[_]: Sync](
      ctx: Context[F, _],
      md: PdfMetaData,
      ra: RAttachment
  ): F[PdfMetaData] =
    for {
      n <- ctx.store.transact(
        RAttachmentMeta.updatePageCount(ra.id, md.pageCount.some)
      )
      _ <-
        if (n == 0)
          // NOTE(review): consider creating the missing metadata record
          // here instead of only reporting it.
          ctx.logger.warn(
            s"No attachmentmeta record exists for ${ra.id.id}, page count was not stored."
          )
        else ().pure[F]
    } yield md

  /** Looks up the mime type of the file that belongs to the attachment.
    * If no file metadata is found, `application/octet-stream` is used.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)
      .getOrElse(Mimetype.`application/octet-stream`)
      .map(_.toLocal)

  /** Returns the complete contents of the attachment's file as a byte
    * stream, fetched from the file store.
    */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
    ctx.store.bitpeace
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

}
|
@ -55,6 +55,7 @@ object ProcessItem {
|
||||
.flatMap(Task.setProgress(progress._1))
|
||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
|
||||
.flatMap(AttachmentPageCount())
|
||||
.flatMap(Task.setProgress(progress._2))
|
||||
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
||||
.flatMap(Task.setProgress(progress._3))
|
||||
|
@ -0,0 +1,2 @@
|
||||
-- Adds the nullable "page_count" column to the "attachmentmeta" table.
-- NOTE(review): SMALLINT usually tops out at 32767 while the application
-- maps this column to an Int; confirm that range is sufficient.
ALTER TABLE "attachmentmeta" ADD COLUMN "page_count" SMALLINT;
|
@ -0,0 +1,2 @@
|
||||
-- Adds the nullable `page_count` column to the `attachmentmeta` table.
-- NOTE(review): SMALLINT usually tops out at 32767 while the application
-- maps this column to an Int; confirm that range is sufficient.
ALTER TABLE `attachmentmeta` ADD COLUMN (`page_count` SMALLINT);
|
@ -0,0 +1,2 @@
|
||||
-- Adds the nullable "page_count" column to the "attachmentmeta" table.
-- NOTE(review): SMALLINT usually tops out at 32767 while the application
-- maps this column to an Int; confirm that range is sufficient.
ALTER TABLE "attachmentmeta" ADD COLUMN "page_count" SMALLINT;
|
@ -13,17 +13,21 @@ case class RAttachmentMeta(
|
||||
id: Ident, //same as RAttachment.id
|
||||
content: Option[String],
|
||||
nerlabels: List[NerLabel],
|
||||
proposals: MetaProposalList
|
||||
proposals: MetaProposalList,
|
||||
pages: Option[Int]
|
||||
) {
|
||||
|
||||
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
||||
if (content.forall(_.trim.isEmpty)) copy(content = txt)
|
||||
else this
|
||||
|
||||
def withPageCount(count: Option[Int]): RAttachmentMeta =
|
||||
copy(pages = count)
|
||||
}
|
||||
|
||||
object RAttachmentMeta {
|
||||
def empty(attachId: Ident) =
|
||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty)
|
||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None)
|
||||
|
||||
val table = fr"attachmentmeta"
|
||||
|
||||
@ -32,7 +36,8 @@ object RAttachmentMeta {
|
||||
val content = Column("content")
|
||||
val nerlabels = Column("nerlabels")
|
||||
val proposals = Column("itemproposals")
|
||||
val all = List(id, content, nerlabels, proposals)
|
||||
val pages = Column("page_count")
|
||||
val all = List(id, content, nerlabels, proposals, pages)
|
||||
}
|
||||
import Columns._
|
||||
|
||||
@ -40,7 +45,7 @@ object RAttachmentMeta {
|
||||
insertRow(
|
||||
table,
|
||||
all,
|
||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals}"
|
||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}"
|
||||
).update.run
|
||||
|
||||
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
||||
@ -84,6 +89,9 @@ object RAttachmentMeta {
|
||||
)
|
||||
).update.run
|
||||
|
||||
def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] =
|
||||
updateRow(table, id.is(mid), pages.setTo(pageCount)).update.run
|
||||
|
||||
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||
deleteFrom(table, id.is(attachId)).update.run
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user