// modules/common/src/main/scala/docspell/common/FileName.scala
package docspell.common

/** A file name split into a base name and an optional extension.
  *
  * The extension is the text after the last dot. A dot at the very
  * start of the name (as in `.bashrc`) does not start an extension;
  * such a name keeps its complete text as base name. Without this
  * rule the base name of a dotfile would be empty and derived names
  * like `.png` or `_preview.bashrc` would lose the original name.
  *
  * The companion's `apply` trims its input and substitutes
  * `unknown-file` for `null` or blank input.
  */
case class FileName private (name: String) {

  private[this] val (base, ext) =
    name.lastIndexOf('.') match {
      // n == 0 is a leading dot: splitting there would leave an empty
      // base name, so it is handled like a name without extension
      case n if n > 0 => (name.take(n), Some(name.drop(n + 1)))
      case _          => (name, None)
    }

  /** Returns the name part without the extension. If there is no
    * extension, it is the same as `fullName`.
    */
  def baseName: String =
    base

  /** Returns the extension part if available without the dot. */
  def extension: Option[String] =
    ext

  /** Returns the complete name, including the extension if present. */
  def fullName: String =
    name

  /** Creates a new name where `part` is spliced into the name before
    * the extension, separated by `sep`. An empty `part` returns this
    * name unchanged.
    */
  def withPart(part: String, sep: Char): FileName =
    if (part.isEmpty()) this
    else
      ext
        .map(e => new FileName(s"${base}${sep}${part}.${e}"))
        .getOrElse(new FileName(s"${base}${sep}${part}"))

  /** Creates a new name using the given extension in place of the
    * current one. An empty `newExt` removes the extension.
    */
  def withExtension(newExt: String): FileName =
    if (newExt.isEmpty()) new FileName(base)
    else new FileName(s"${base}.${newExt}")

}

object FileName {

  /** Creates a file name from the given string. The input is trimmed;
    * `null` and blank input result in the name `unknown-file`.
    */
  def apply(name: String): FileName =
    Option(name)
      .map(_.trim)
      .filter(_.nonEmpty)
      .map(n => new FileName(n))
      .getOrElse(new FileName("unknown-file"))
}

// modules/common/src/test/scala/docspell/common/FileNameTest.scala
import minitest._

object FileNameTest extends SimpleTestSuite {

  test("make filename") {
    val data = List(
      (FileName("test"), "test", None),
      (FileName("test.pdf"), "test", Some("pdf")),
      (FileName("bla.xml.gz"), "bla.xml", Some("gz")),
      (FileName(""), "unknown-file", None)
    )

    data.foreach { case (fn, base, ext) =>
      assertEquals(fn.baseName, base)
      assertEquals(fn.extension, ext)
    }
  }

  test("with part") {
    assertEquals(
      FileName("test.pdf").withPart("converted", '_'),
      FileName("test_converted.pdf")
    )
    assertEquals(
      FileName("bla.xml.gz").withPart("converted", '_'),
      FileName("bla.xml_converted.gz")
    )
    assertEquals(
      FileName("test").withPart("converted", '_'),
      FileName("test_converted")
    )
    assertEquals(
      FileName("test").withPart("", '_'),
      FileName("test")
    )
  }

  test("with extension") {
    assertEquals(
      FileName("test.pdf").withExtension("xml"),
      FileName("test.xml")
    )
    assertEquals(
      FileName("test").withExtension("xml"),
      FileName("test.xml")
    )
    assertEquals(
      FileName("test.pdf.gz").withExtension("xml"),
      FileName("test.pdf.xml")
    )
    assertEquals(
      FileName("test.pdf.gz").withExtension(""),
      FileName("test.pdf")
    )
  }

  test("leading dot does not start an extension") {
    val dotfile = FileName(".bashrc")
    assertEquals(dotfile.baseName, ".bashrc")
    assertEquals(dotfile.extension, None)
    assertEquals(dotfile.withPart("preview", '_'), FileName(".bashrc_preview"))
    assertEquals(dotfile.withExtension("png"), FileName(".bashrc.png"))
    assertEquals(dotfile.withExtension(""), FileName(".bashrc"))

    val hiddenPdf = FileName(".hidden.pdf")
    assertEquals(hiddenPdf.baseName, ".hidden")
    assertEquals(hiddenPdf.extension, Some("pdf"))
    assertEquals(
      hiddenPdf.withPart("preview", '_').withExtension("png"),
      FileName(".hidden_preview.png")
    )
  }
}
// modules/joex/src/main/scala/docspell/joex/process/AttachmentPreview.scala
package docspell.joex.process

import cats.Functor
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.Stream

import docspell.common._
import docspell.convert._
import docspell.extract.pdfbox.PdfboxPreview
import docspell.joex.scheduler._
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._

import bitpeace.{Mimetype, MimetypeHint, RangeDef}

/** Processing step that walks over the attachments of an item, which
  * are expected to be converted to pdf already. For every attachment
  * whose stored file is a pdf, the first page is rendered into a small
  * png preview that is stored and linked to the attachment. Other
  * attachments are skipped.
  *
  * `ProcessItem` chains this step in with
  * `.flatMap(AttachmentPreview(cfg.convert))`, placed between the
  * `TextExtraction` step and `Task.setProgress(progress._2)`.
  */
object AttachmentPreview {

  /** Builds the task: logs how many files are looked at, runs
    * `createPreview` for each attachment and returns `item` unchanged.
    */
  def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      val announce = ctx.logger.info(
        s"Creating preview images for ${item.attachments.size} files…"
      )
      announce >> item.attachments.traverse(createPreview(ctx, cfg)).as(item)
    }

  /** Creates and stores the preview of a single attachment.
    *
    * Yields `None` when the attachment's file is not a pdf or when no
    * image came back from the renderer; otherwise the inserted record.
    */
  def createPreview[F[_]: Sync](ctx: Context[F, _], cfg: ConvertConfig)(
      ra: RAttachment
  ): F[Option[RAttachmentPreview]] =
    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) =>
        for {
          renderer <- PdfboxPreview(48)
          image    <- renderer.previewPNG(loadFile(ctx)(ra))
          // an absent image stays None, a present one is persisted
          preview <- image.traverse(out => createRecord(ctx, out, ra, cfg.chunkSize))
        } yield preview

      case _ =>
        Option.empty[RAttachmentPreview].pure[F]
    }

  /** Saves the png bytes as a new file and inserts the
    * `RAttachmentPreview` row that links it to the attachment. If the
    * attachment has a name, the preview name is derived from it by
    * adding `_preview` and switching the extension to `png`.
    */
  def createRecord[F[_]: Sync](
      ctx: Context[F, _],
      png: Stream[F, Byte],
      ra: RAttachment,
      chunkSize: Int
  ): F[RAttachmentPreview] = {
    val previewName: Option[String] =
      ra.name.map(n => FileName(n).withPart("preview", '_').withExtension("png").fullName)

    val stored =
      ctx.store.bitpeace
        .saveNew(png, chunkSize, MimetypeHint(previewName, Some("image/png")))
        .compile
        .lastOrError

    stored.flatMap { fileMeta =>
      Timestamp.current[F].flatMap { now =>
        val rp = RAttachmentPreview(ra.id, Ident.unsafe(fileMeta.id), previewName, now)
        ctx.store.transact(RAttachmentPreview.insert(rp)).as(rp)
      }
    }
  }

  /** Looks up the mimetype recorded for the attachment's file. When no
    * file metadata is found, `application/octet-stream` is used.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
      .map(_.mimetype.toLocal)
      .getOrElse(Mimetype.`application/octet-stream`.toLocal)

  /** Streams the complete content of the attachment's file. */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] = {
    val files = ctx.store.bitpeace
    files
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(files.fetchData2(RangeDef.all))
  }

}