From 848f2658c4c7a20ea07cf80406d038f2325b1de2 Mon Sep 17 00:00:00 2001
From: eikek
Date: Mon, 29 Jan 2024 13:09:11 +0100
Subject: [PATCH] Apply `flattenArchives` to email files

Refs: #2063
---
 .../multiupload/MultiUploadArchiveTask.scala | 86 ++++++++++++++-----
 website/site/content/docs/api/upload.md      | 21 ++---
 2 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala b/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
index 61a6537a..35c1b3d1 100644
--- a/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
@@ -7,19 +7,20 @@ package docspell.joex.multiupload
 
 import cats.Monoid
-import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
-import fs2.Stream
 import fs2.io.file.Files
+import fs2.{Pipe, Stream}
 
 import docspell.backend.JobFactory
 import docspell.common._
 import docspell.common.util.Zip
+import docspell.joex.mail.ReadMail
 import docspell.logging.Logger
 import docspell.scheduler._
 import docspell.scheduler.usertask.UserTaskScope
 import docspell.store.Store
+import docspell.store.file.FileMetadata
 
 /** Task to submit multiple files at once. By default, one file in an upload results in
   * one item. Zip files are extracted, but its inner files are considered to be one item
@@ -43,8 +44,8 @@ object MultiUploadArchiveTask {
     Task { ctx =>
       ctx.args.files
         .traverse { file =>
-          isZipFile(store)(file).flatMap {
-            case true =>
+          store.fileRepo.findMeta(file.fileMetaId).map(ArchiveType.from).flatMap {
+            case ArchiveType.Zip =>
               ctx.logger.info(s"Extracting zip file ${file.name}") *>
                 extractZip(store, ctx.args)(file)
                   .evalTap(entry =>
@@ -57,7 +58,20 @@ object MultiUploadArchiveTask {
                   .toList
                   .map(Jobs.extracted(file))
 
-            case false =>
+            case ArchiveType.Email =>
+              ctx.logger.info(s"Extracting email file ${file.name}") *>
+                extractMail(store, ctx)(file)
+                  .evalTap(entry =>
+                    ctx.logger.debug(
+                      s"Create job for entry: ${entry.files.flatMap(_.name).mkString(", ")}"
+                    )
+                  )
+                  .evalMap(makeJob[F](ctx, jobStore))
+                  .compile
+                  .toList
+                  .map(Jobs.extracted(file))
+
+            case _ =>
               makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal)
           }
         }
@@ -101,13 +115,6 @@ object MultiUploadArchiveTask {
       )
     } yield job.encode
 
-  private def isZipFile[F[_]: Sync](
-      store: Store[F]
-  )(file: ProcessItemArgs.File): F[Boolean] =
-    OptionT(store.fileRepo.findMeta(file.fileMetaId))
-      .map(_.mimetype.matches(MimeType.zip))
-      .getOrElse(false)
-
   private def extractZip[F[_]: Async: Files](
       store: Store[F],
       args: Args
@@ -116,17 +123,54 @@ object MultiUploadArchiveTask {
       .getBytes(file.fileMetaId)
      .through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all)))
      .through(Binary.toBinary[F])
-      .flatMap { entry =>
-        val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
-        entry.data
-          .through(
-            store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
-          )
-          .map(key =>
-            args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil)
-          )
-      }
+      .through(entryToArgs(store, args))
+
+  private def extractMail[F[_]: Async](
+      store: Store[F],
+      ctx: Context[F, Args]
+  )(file: ProcessItemArgs.File): Stream[F, ProcessItemArgs] = {
+    val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
+    val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
+    store.fileRepo
+      .getBytes(file.fileMetaId)
+      .through(ReadMail.bytesToMail(ctx.logger))
+      .flatMap(
+        ReadMail
+          .mailToEntries(ctx.logger, glob, attachOnly)
+      )
+      .through(entryToArgs(store, ctx.args))
+  }
+
+  private def entryToArgs[F[_]](
+      store: Store[F],
+      args: Args
+  ): Pipe[F, Binary[F], ProcessItemArgs] =
+    _.flatMap { entry =>
+      val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
+      entry.data
+        .through(
+          store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
+        )
+        .map(key => args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil))
+    }
+
+  sealed trait ArchiveType
+  object ArchiveType {
+    case object Email extends ArchiveType
+    case object Zip extends ArchiveType
+    case object NoArchive extends ArchiveType
+
+    def from(fm: FileMetadata): ArchiveType =
+      fm.mimetype match {
+        case MimeType.ZipMatch(_) => Zip
+        case MimeType.EmailMatch(_) => Email
+        case _ => NoArchive
+      }
+
+    def from(fm: Option[FileMetadata]): ArchiveType =
+      fm.map(from).getOrElse(NoArchive)
+  }
+
   case class Jobs(
       result: Result,
       jobs: List[Job[String]],
diff --git a/website/site/content/docs/api/upload.md b/website/site/content/docs/api/upload.md
index 6e38c168..b02bcca7 100644
--- a/website/site/content/docs/api/upload.md
+++ b/website/site/content/docs/api/upload.md
@@ -96,16 +96,17 @@ specified via a JSON structure in a part with name `meta`:
   `*.eml`). If this is `true`, then the e-mail body is discarded and
   only the attachments are imported. An e-mail without any attachments
   is therefore skipped.
-- `flattenArchives` is flag to control how zip files are treated. When
-  this is `false` (the default), then one zip file results in one item
-  and its contents are the attachments. If you rather want the
-  contents to be treated as independent files, then set this to
-  `true`. This will submit each entry in the zip file as a separate
-  processing job. Note: when this is `true` the zip file is just a
-  container and doesn't contain other useful information and therefore
-  is *NOT* kept in docspell, only its contents are. Also note that
-  only the uploaded zip files are extracted once (not recursively), so
-  if it contains other zip files, they are treated as normal.
+- `flattenArchives` is a flag to control how `zip` and `eml` files
+  are treated. When this is `false` (the default), one `zip` or `eml`
+  file results in one item and its contents become the attachments.
+  If you would rather have the contents treated as independent files,
+  set this to `true`. This will submit each entry in the archive file
+  as a separate processing job. Note: when this is `true` the archive
+  file is assumed to be just a container without other useful
+  information. It is therefore *NOT* kept in docspell, only its
+  contents are. Also note that uploaded archive files are extracted
+  only once (not recursively), so any archive files they contain are
+  treated as normal files.
 
 # Endpoints
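
Illustrative note (not part of the patch): the core of this change is replacing the boolean isZipFile check with an ArchiveType dispatch, so that a file's MIME type selects between zip extraction, email extraction, or normal single-item processing. The following is a minimal, self-contained Scala sketch of that dispatch under stated assumptions: it uses plain MIME strings and stand-in names (ArchiveTypeSketch, detect) instead of docspell's MimeType.ZipMatch/EmailMatch extractors and FileMetadata, so the concrete matching rules shown here are assumptions for illustration only.

// Illustrative sketch only (not docspell code): mirrors the ArchiveType
// dispatch added by this patch, using plain MIME strings instead of
// docspell's MimeType/FileMetadata types.
object ArchiveTypeSketch {
  sealed trait ArchiveType
  case object Zip extends ArchiveType
  case object Email extends ArchiveType
  case object NoArchive extends ArchiveType

  // Assumed MIME strings for illustration; the real code matches via
  // the MimeType.ZipMatch and MimeType.EmailMatch extractors.
  def detect(mimetype: Option[String]): ArchiveType =
    mimetype match {
      case Some("application/zip") => Zip
      case Some("message/rfc822")  => Email
      case _                       => NoArchive
    }

  def main(args: Array[String]): Unit = {
    // With flattenArchives=true, Zip and Email fan out into one job per
    // entry, while NoArchive stays a single processing job.
    assert(detect(Some("application/zip")) == Zip)
    assert(detect(Some("message/rfc822")) == Email)
    assert(detect(Some("application/pdf")) == NoArchive)
    assert(detect(None) == NoArchive)
    println("archive dispatch: ok")
  }
}

The Option handling mirrors the patch's second `from` overload: when no file metadata is found, the file falls back to normal single-item processing instead of failing.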