mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-04 06:05:59 +00:00
Create a preview image of all files during processing
This commit is contained in:
parent
0841a33ae3
commit
ef7cb4e779
48
modules/common/src/main/scala/docspell/common/FileName.scala
Normal file
48
modules/common/src/main/scala/docspell/common/FileName.scala
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
/** A file name that is split at its last dot into a base name and an
  * optional extension.
  *
  * A dot at the very first position (as in `.bashrc`) is not treated
  * as an extension separator; such a name has no extension and is its
  * own base name. A trailing dot (as in `test.`) yields an empty
  * extension, `Some("")`.
  *
  * @param name the complete file name
  */
case class FileName private (name: String) {

  // Split at the last dot. A match at index 0 would produce an empty
  // base name, which makes `withPart` and `withExtension` drop the
  // original name entirely, so it is handled like "no dot at all".
  private[this] val (base, ext) =
    name.lastIndexOf('.') match {
      case n if n <= 0 => (name, None)
      case n           => (name.take(n), Some(name.drop(n + 1)))
    }

  /** Returns the name part without the extension. If there is no
    * extension, it is the same as fullname.
    */
  def baseName: String =
    base

  /** Returns the extension part if available without the dot. */
  def extension: Option[String] =
    ext

  /** Returns the complete name, including the extension if present. */
  def fullName: String =
    name

  /** Creates a new name where part is spliced into the name before the
    * extension, separated by separator.
    *
    * @param part the text to insert; if empty, this name is returned unchanged
    * @param sep the character placed between the base name and `part`
    */
  def withPart(part: String, sep: Char): FileName =
    if (part.isEmpty()) this
    else
      ext
        .map(e => new FileName(s"${base}${sep}${part}.${e}"))
        .getOrElse(new FileName(s"${base}${sep}${part}"))

  /** Create a new name using the given extension.
    *
    * @param newExt the new extension without the dot; if empty, the
    *   result is the base name without any extension
    */
  def withExtension(newExt: String): FileName =
    if (newExt.isEmpty()) new FileName(base)
    else new FileName(s"${base}.${newExt}")

}
|
||||||
|
object FileName {

  /** Creates a file name from the given string after trimming
    * surrounding whitespace. A `null`, empty or whitespace-only input
    * results in the placeholder name `unknown-file`.
    */
  def apply(name: String): FileName =
    Option(name).map(_.trim) match {
      case Some(trimmed) if trimmed.nonEmpty => new FileName(trimmed)
      case _                                 => new FileName("unknown-file")
    }
}
|
@ -0,0 +1,58 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
import minitest._
|
||||||
|
|
||||||
|
object FileNameTest extends SimpleTestSuite {

  test("make filename") {
    // Checks how a raw input string is split into base name and extension.
    def check(input: String, base: String, ext: Option[String]): Unit = {
      val fn = FileName(input)
      assertEquals(fn.baseName, base)
      assertEquals(fn.extension, ext)
    }

    check("test", "test", None)
    check("test.pdf", "test", Some("pdf"))
    check("bla.xml.gz", "bla.xml", Some("gz"))
    // an empty input falls back to the placeholder name
    check("", "unknown-file", None)
  }

  test("with part") {
    // input name -> expected name after inserting "converted"
    val cases = List(
      "test.pdf"   -> "test_converted.pdf",
      "bla.xml.gz" -> "bla.xml_converted.gz",
      "test"       -> "test_converted"
    )
    cases.foreach { case (input, expected) =>
      assertEquals(FileName(input).withPart("converted", '_'), FileName(expected))
    }

    // an empty part leaves the name untouched
    assertEquals(FileName("test").withPart("", '_'), FileName("test"))
  }

  test("with extension") {
    // (input name, new extension) -> expected name
    val cases = List(
      ("test.pdf", "xml", "test.xml"),
      ("test", "xml", "test.xml"),
      ("test.pdf.gz", "xml", "test.pdf.xml"),
      ("test.pdf.gz", "", "test.pdf")
    )
    cases.foreach { case (input, newExt, expected) =>
      assertEquals(FileName(input).withExtension(newExt), FileName(expected))
    }
  }
}
|
@ -0,0 +1,89 @@
|
|||||||
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import cats.Functor
|
||||||
|
import cats.data.OptionT
|
||||||
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.convert._
|
||||||
|
import docspell.extract.pdfbox.PdfboxPreview
|
||||||
|
import docspell.joex.scheduler._
|
||||||
|
import docspell.store.records.RAttachment
|
||||||
|
import docspell.store.records._
|
||||||
|
import docspell.store.syntax.MimeTypes._
|
||||||
|
|
||||||
|
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||||
|
|
||||||
|
/** Creates preview images for the attachments of an item.
  *
  * Each attachment whose stored file has a pdf mime type is handed to
  * `PdfboxPreview`; if that yields a png, the image is stored and
  * linked to the attachment through a `RAttachmentPreview` record.
  * Attachments with any other mime type are skipped.
  */
object AttachmentPreview {

  /** Task that logs the number of attachments, tries to create a
    * preview for each of them and returns the item unchanged.
    */
  def apply[F[_]: Sync: ContextShift](cfg: ConvertConfig)(
      item: ItemData
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      ctx.logger.info(
        s"Creating preview images for ${item.attachments.size} files…"
      ) >> item.attachments.traverse(createPreview(ctx, cfg)).as(item)
    }

  /** Creates the preview for a single attachment. Returns `None` if
    * the file is not a pdf or if no preview image was produced.
    */
  def createPreview[F[_]: Sync](ctx: Context[F, _], cfg: ConvertConfig)(
      ra: RAttachment
  ): F[Option[RAttachmentPreview]] = {
    val noPreview: F[Option[RAttachmentPreview]] =
      (None: Option[RAttachmentPreview]).pure[F]

    // Renders the file and stores the image if one was created.
    def render: F[Option[RAttachmentPreview]] =
      for {
        preview <- PdfboxPreview(48)
        image   <- preview.previewPNG(loadFile(ctx)(ra))
        result <- image match {
          case Some(png) => createRecord(ctx, png, ra, cfg.chunkSize).map(_.some)
          case None      => noPreview
        }
      } yield result

    findMime[F](ctx)(ra).flatMap {
      case MimeType.PdfMatch(_) => render
      case _                    => noPreview
    }
  }

  /** Stores the png data as a new file and inserts a preview record
    * that links it to the attachment. If the attachment has a name,
    * the preview name is derived from it by adding a `_preview` part
    * and using the `png` extension.
    */
  def createRecord[F[_]: Sync](
      ctx: Context[F, _],
      png: Stream[F, Byte],
      ra: RAttachment,
      chunkSize: Int
  ): F[RAttachmentPreview] = {
    val previewName =
      ra.name.map(n => FileName(n).withPart("preview", '_').withExtension("png").fullName)
    val hint = MimetypeHint(previewName, Some("image/png"))
    val saved = ctx.store.bitpeace.saveNew(png, chunkSize, hint).compile.lastOrError

    saved.flatMap { fileMeta =>
      Timestamp.current[F].flatMap { now =>
        val rp = RAttachmentPreview(ra.id, Ident.unsafe(fileMeta.id), previewName, now)
        ctx.store.transact(RAttachmentPreview.insert(rp)).map(_ => rp)
      }
    }
  }

  /** Looks up the mime type of the attachment's file; falls back to
    * `application/octet-stream` if no file metadata is found.
    */
  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
    ctx.store
      .transact(RFileMeta.findById(ra.fileId))
      .map(_.map(_.mimetype).getOrElse(Mimetype.`application/octet-stream`).toLocal)

  /** Streams the bytes of the attachment's file from the store. */
  def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] = {
    val files = ctx.store.bitpeace
    files
      .get(ra.fileId.id)
      .unNoneTerminate
      .through(files.fetchData2(RangeDef.all))
  }

}
|
@ -54,6 +54,7 @@ object ProcessItem {
|
|||||||
ConvertPdf(cfg.convert, item)
|
ConvertPdf(cfg.convert, item)
|
||||||
.flatMap(Task.setProgress(progress._1))
|
.flatMap(Task.setProgress(progress._1))
|
||||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||||
|
.flatMap(AttachmentPreview(cfg.convert))
|
||||||
.flatMap(Task.setProgress(progress._2))
|
.flatMap(Task.setProgress(progress._2))
|
||||||
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
||||||
.flatMap(Task.setProgress(progress._3))
|
.flatMap(Task.setProgress(progress._3))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user