mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-28 17:55:06 +00:00
Add a processing step to retrieve page counts
This commit is contained in:
parent
5f217e6a76
commit
a77f34b7ba
@ -8,7 +8,8 @@ final case class PdfMetaData(
|
|||||||
subject: Option[String],
|
subject: Option[String],
|
||||||
keywords: Option[String],
|
keywords: Option[String],
|
||||||
creator: Option[String],
|
creator: Option[String],
|
||||||
creationDate: Option[Timestamp]
|
creationDate: Option[Timestamp],
|
||||||
|
pageCount: Int
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def isEmpty: Boolean =
|
def isEmpty: Boolean =
|
||||||
@ -17,7 +18,8 @@ final case class PdfMetaData(
|
|||||||
subject.isEmpty &&
|
subject.isEmpty &&
|
||||||
keywords.isEmpty &&
|
keywords.isEmpty &&
|
||||||
creator.isEmpty &&
|
creator.isEmpty &&
|
||||||
creationDate.isEmpty
|
creationDate.isEmpty &&
|
||||||
|
pageCount <= 0
|
||||||
|
|
||||||
def nonEmpty: Boolean =
|
def nonEmpty: Boolean =
|
||||||
!isEmpty
|
!isEmpty
|
||||||
@ -36,5 +38,5 @@ final case class PdfMetaData(
|
|||||||
}
|
}
|
||||||
|
|
||||||
object PdfMetaData {
|
object PdfMetaData {
|
||||||
val empty = PdfMetaData(None, None, None, None, None, None)
|
val empty = PdfMetaData(None, None, None, None, None, None, 0)
|
||||||
}
|
}
|
||||||
|
@ -20,21 +20,23 @@ object PdfboxExtract {
|
|||||||
def getTextAndMetaData[F[_]: Sync](
|
def getTextAndMetaData[F[_]: Sync](
|
||||||
data: Stream[F, Byte]
|
data: Stream[F, Byte]
|
||||||
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
||||||
data.compile
|
PdfLoader
|
||||||
.to(Array)
|
.withDocumentStream(data) { doc =>
|
||||||
.map(bytes =>
|
(for {
|
||||||
Using(PDDocument.load(bytes)) { doc =>
|
txt <- readText(doc)
|
||||||
for {
|
md <- readMetaData(doc)
|
||||||
txt <- readText(doc)
|
} yield (txt, Some(md).filter(_.nonEmpty))).pure[F]
|
||||||
md <- readMetaData(doc)
|
}
|
||||||
} yield (txt, Some(md).filter(_.nonEmpty))
|
.attempt
|
||||||
}.toEither.flatten
|
.map(_.flatten)
|
||||||
)
|
|
||||||
|
|
||||||
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile
|
PdfLoader
|
||||||
.to(Array)
|
.withDocumentStream(data) { doc =>
|
||||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
readText(doc).pure[F]
|
||||||
|
}
|
||||||
|
.attempt
|
||||||
|
.map(_.flatten)
|
||||||
|
|
||||||
def getText(is: InputStream): Either[Throwable, Text] =
|
def getText(is: InputStream): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||||
@ -51,9 +53,10 @@ object PdfboxExtract {
|
|||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
|
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
|
||||||
data.compile
|
PdfLoader
|
||||||
.to(Array)
|
.withDocumentStream(data)(doc => readMetaData(doc).pure[F])
|
||||||
.map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten)
|
.attempt
|
||||||
|
.map(_.flatten)
|
||||||
|
|
||||||
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
|
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
|
||||||
Using(PDDocument.load(is))(readMetaData).toEither.flatten
|
Using(PDDocument.load(is))(readMetaData).toEither.flatten
|
||||||
@ -73,7 +76,8 @@ object PdfboxExtract {
|
|||||||
mkValue(info.getSubject),
|
mkValue(info.getSubject),
|
||||||
mkValue(info.getKeywords),
|
mkValue(info.getKeywords),
|
||||||
mkValue(info.getCreator),
|
mkValue(info.getCreator),
|
||||||
Option(info.getCreationDate).map(c => Timestamp(c.toInstant))
|
Option(info.getCreationDate).map(c => Timestamp(c.toInstant)),
|
||||||
|
doc.getNumberOfPages()
|
||||||
)
|
)
|
||||||
}.toEither
|
}.toEither
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,83 @@
|
|||||||
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import cats.Functor
|
||||||
|
import cats.data.OptionT
|
||||||
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.extract.pdfbox.PdfMetaData
|
||||||
|
import docspell.extract.pdfbox.PdfboxExtract
|
||||||
|
import docspell.joex.scheduler._
|
||||||
|
import docspell.store.records.RAttachment
|
||||||
|
import docspell.store.records._
|
||||||
|
import docspell.store.syntax.MimeTypes._
|
||||||
|
|
||||||
|
import bitpeace.{Mimetype, RangeDef}
|
||||||
|
|
||||||
|
/** Goes through all attachments that must be already converted into a
  * pdf. If it is a pdf, the number of pages are retrieved and stored
  * in the attachment metadata.
  *
  * This step is best-effort: a failure while counting pages is logged
  * and the item is passed on unchanged.
  */
object AttachmentPageCount {

  /** Creates the processing task for the given item.
    *
    * Every attachment of `item` is handed to [[createPageCount]]. Any
    * error raised while traversing the attachments is caught, logged
    * at error level and otherwise ignored.
    *
    * @param item the item whose attachments are inspected
    * @return a task that always yields the unmodified `item`
    */
  def apply[F[_]: Sync: ContextShift]()(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info(
          s"Retrieving page count for ${item.attachments.size} files…"
        )
        _ <- item.attachments
          .traverse(createPageCount(ctx))
          .attempt
          .flatMap {
            case Right(_) => ().pure[F]
            case Left(ex) =>
              ctx.logger.error(ex)(
                "Retrieving page counts failed, continuing without it."
              )
          }
      } yield item
    }

  /** Reads the pdf metadata of one attachment and stores its page count.
    *
    * @param ctx the task context providing the store and the logger
    * @param ra  the attachment to inspect
    * @return the extracted metadata, or `None` if the attachment's mime
    *         type does not match pdf or the metadata could not be read
    *         (the latter is logged as a warning)
    */
  def createPageCount[F[_]: Sync](
      ctx: Context[F, _]
  )(ra: RAttachment): F[Option[PdfMetaData]] =
    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) =>
        PdfboxExtract.getMetaData(loadFile(ctx)(ra)).flatMap {
          case Right(md) =>
            updatePageCount(ctx, md, ra).map(_.some)
          case Left(ex) =>
            ctx.logger.warn(s"Error obtaining pages count: ${ex.getMessage}") *>
              Option.empty[PdfMetaData].pure[F]
        }

      case _ =>
        Option.empty[PdfMetaData].pure[F]
    }

  /** Writes `md.pageCount` to the attachment's metadata record.
    *
    * The update only touches an existing record. If the statement
    * reports that nothing was updated, the page count would be lost
    * without any trace, so a warning is logged in that case.
    *
    * @return `md` unchanged, so callers can keep chaining on it
    */
  private def updatePageCount[F[_]: Sync](
      ctx: Context[F, _],
      md: PdfMetaData,
      ra: RAttachment
  ): F[PdfMetaData] =
    for {
      updated <- ctx.store.transact(
        RAttachmentMeta.updatePageCount(ra.id, md.pageCount.some)
      )
      _ <-
        if (updated == 0)
          ctx.logger.warn(
            s"No attachmentmeta record found for attachment ${ra.id.id}, " +
              s"page count ${md.pageCount} was not stored."
          )
        else ().pure[F]
    } yield md

  /** Looks up the mime type recorded for the attachment's file.
    *
    * Falls back to `application/octet-stream` when no file metadata is
    * found for `ra.fileId`.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype)
      .getOrElse(Mimetype.`application/octet-stream`)
      .map(_.toLocal)

  /** Streams the bytes of the attachment's file from the binary store.
    *
    * The stream is empty if the store has no entry for `ra.fileId`.
    */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
    ctx.store.bitpeace
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(ctx.store.bitpeace.fetchData2(RangeDef.all))

}
|
@ -55,6 +55,7 @@ object ProcessItem {
|
|||||||
.flatMap(Task.setProgress(progress._1))
|
.flatMap(Task.setProgress(progress._1))
|
||||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||||
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
|
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
|
||||||
|
.flatMap(AttachmentPageCount())
|
||||||
.flatMap(Task.setProgress(progress._2))
|
.flatMap(Task.setProgress(progress._2))
|
||||||
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
||||||
.flatMap(Task.setProgress(progress._3))
|
.flatMap(Task.setProgress(progress._3))
|
||||||
|
@ -0,0 +1,2 @@
|
|||||||
|
-- Adds the "page_count" column to the "attachmentmeta" table. No NOT NULL
-- constraint is given, so existing rows simply have no value for it.
-- NOTE(review): the Scala record maps this column to Option[Int]; smallint is
-- usually a 16-bit type, so confirm that its range is sufficient for page counts.
ALTER TABLE "attachmentmeta"
|
||||||
|
ADD COLUMN "page_count" smallint;
|
@ -0,0 +1,2 @@
|
|||||||
|
-- Adds the `page_count` column to the `attachmentmeta` table (backtick-quoted
-- variant of this migration). No NOT NULL constraint is given, so existing rows
-- simply have no value for it.
-- NOTE(review): the Scala record maps this column to Option[Int]; SMALLINT is
-- usually a 16-bit type, so confirm that its range is sufficient for page counts.
ALTER TABLE `attachmentmeta`
|
||||||
|
ADD COLUMN (`page_count` SMALLINT);
|
@ -0,0 +1,2 @@
|
|||||||
|
-- Adds the "page_count" column to the "attachmentmeta" table. No NOT NULL
-- constraint is given, so existing rows simply have no value for it.
-- NOTE(review): the Scala record maps this column to Option[Int]; smallint is
-- usually a 16-bit type, so confirm that its range is sufficient for page counts.
ALTER TABLE "attachmentmeta"
|
||||||
|
ADD COLUMN "page_count" smallint;
|
@ -13,17 +13,21 @@ case class RAttachmentMeta(
|
|||||||
id: Ident, //same as RAttachment.id
|
id: Ident, //same as RAttachment.id
|
||||||
content: Option[String],
|
content: Option[String],
|
||||||
nerlabels: List[NerLabel],
|
nerlabels: List[NerLabel],
|
||||||
proposals: MetaProposalList
|
proposals: MetaProposalList,
|
||||||
|
pages: Option[Int]
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
||||||
if (content.forall(_.trim.isEmpty)) copy(content = txt)
|
if (content.forall(_.trim.isEmpty)) copy(content = txt)
|
||||||
else this
|
else this
|
||||||
|
|
||||||
|
/** Returns a copy of this record with `pages` replaced by `count`;
  * all other fields are carried over unchanged.
  */
def withPageCount(count: Option[Int]): RAttachmentMeta =
|
||||||
|
copy(pages = count)
|
||||||
}
|
}
|
||||||
|
|
||||||
object RAttachmentMeta {
|
object RAttachmentMeta {
|
||||||
def empty(attachId: Ident) =
|
def empty(attachId: Ident) =
|
||||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty)
|
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None)
|
||||||
|
|
||||||
val table = fr"attachmentmeta"
|
val table = fr"attachmentmeta"
|
||||||
|
|
||||||
@ -32,7 +36,8 @@ object RAttachmentMeta {
|
|||||||
val content = Column("content")
|
val content = Column("content")
|
||||||
val nerlabels = Column("nerlabels")
|
val nerlabels = Column("nerlabels")
|
||||||
val proposals = Column("itemproposals")
|
val proposals = Column("itemproposals")
|
||||||
val all = List(id, content, nerlabels, proposals)
|
val pages = Column("page_count")
|
||||||
|
val all = List(id, content, nerlabels, proposals, pages)
|
||||||
}
|
}
|
||||||
import Columns._
|
import Columns._
|
||||||
|
|
||||||
@ -40,7 +45,7 @@ object RAttachmentMeta {
|
|||||||
insertRow(
|
insertRow(
|
||||||
table,
|
table,
|
||||||
all,
|
all,
|
||||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals}"
|
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}"
|
||||||
).update.run
|
).update.run
|
||||||
|
|
||||||
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
||||||
@ -84,6 +89,9 @@ object RAttachmentMeta {
|
|||||||
)
|
)
|
||||||
).update.run
|
).update.run
|
||||||
|
|
||||||
|
/** Sets the `page_count` column to `pageCount` on the `attachmentmeta`
  * row whose id equals `mid`.
  *
  * NOTE(review): the resulting `Int` is presumably the number of rows
  * the statement affected (0 when no row with that id exists) — confirm
  * against the `updateRow` helper, which is defined outside this view.
  */
def updatePageCount(mid: Ident, pageCount: Option[Int]): ConnectionIO[Int] =
|
||||||
|
updateRow(table, id.is(mid), pages.setTo(pageCount)).update.run
|
||||||
|
|
||||||
def delete(attachId: Ident): ConnectionIO[Int] =
|
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||||
deleteFrom(table, id.is(attachId)).update.run
|
deleteFrom(table, id.is(attachId)).update.run
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user