From 848f2658c4c7a20ea07cf80406d038f2325b1de2 Mon Sep 17 00:00:00 2001
From: eikek
Date: Mon, 29 Jan 2024 13:09:11 +0100
Subject: [PATCH] Apply `flattenArchives` to email files

Refs: #2063
---
 .../multiupload/MultiUploadArchiveTask.scala | 86 ++++++++++++++-----
 website/site/content/docs/api/upload.md      | 21 ++---
 2 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala b/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
index 61a6537a..35c1b3d1 100644
--- a/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/multiupload/MultiUploadArchiveTask.scala
@@ -7,19 +7,20 @@ package docspell.joex.multiupload
 
 import cats.Monoid
-import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
-import fs2.Stream
 import fs2.io.file.Files
+import fs2.{Pipe, Stream}
 
 import docspell.backend.JobFactory
 import docspell.common._
 import docspell.common.util.Zip
+import docspell.joex.mail.ReadMail
 import docspell.logging.Logger
 import docspell.scheduler._
 import docspell.scheduler.usertask.UserTaskScope
 import docspell.store.Store
+import docspell.store.file.FileMetadata
 
 /** Task to submit multiple files at once. By default, one file in an upload results in
   * one item. Zip files are extracted, but its inner files are considered to be one item
@@ -43,8 +44,8 @@ object MultiUploadArchiveTask {
     Task { ctx =>
       ctx.args.files
         .traverse { file =>
-          isZipFile(store)(file).flatMap {
-            case true =>
+          store.fileRepo.findMeta(file.fileMetaId).map(ArchiveType.from).flatMap {
+            case ArchiveType.Zip =>
               ctx.logger.info(s"Extracting zip file ${file.name}") *>
                 extractZip(store, ctx.args)(file)
                   .evalTap(entry =>
@@ -57,7 +58,20 @@ object MultiUploadArchiveTask {
                   .toList
                   .map(Jobs.extracted(file))
 
-            case false =>
+            case ArchiveType.Email =>
+              ctx.logger.info(s"Extracting email file ${file.name}") *>
+                extractMail(store, ctx)(file)
+                  .evalTap(entry =>
+                    ctx.logger.debug(
+                      s"Create job for entry: ${entry.files.flatMap(_.name).mkString(", ")}"
+                    )
+                  )
+                  .evalMap(makeJob[F](ctx, jobStore))
+                  .compile
+                  .toList
+                  .map(Jobs.extracted(file))
+
+            case _ =>
               makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal)
           }
         }
@@ -101,13 +115,6 @@ object MultiUploadArchiveTask {
       )
     } yield job.encode
 
-  private def isZipFile[F[_]: Sync](
-      store: Store[F]
-  )(file: ProcessItemArgs.File): F[Boolean] =
-    OptionT(store.fileRepo.findMeta(file.fileMetaId))
-      .map(_.mimetype.matches(MimeType.zip))
-      .getOrElse(false)
-
   private def extractZip[F[_]: Async: Files](
       store: Store[F],
       args: Args
@@ -116,17 +123,54 @@ object MultiUploadArchiveTask {
       .getBytes(file.fileMetaId)
      .through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all)))
      .through(Binary.toBinary[F])
-      .flatMap { entry =>
-        val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
-        entry.data
-          .through(
-            store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
-          )
-          .map(key =>
-            args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil)
-          )
-      }
+      .through(entryToArgs(store, args))
+
+  private def extractMail[F[_]: Async](
+      store: Store[F],
+      ctx: Context[F, Args]
+  )(file: ProcessItemArgs.File): Stream[F, ProcessItemArgs] = {
+    val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
+    val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
+    store.fileRepo
+      .getBytes(file.fileMetaId)
+      .through(ReadMail.bytesToMail(ctx.logger))
+      .flatMap(
+        ReadMail
+          .mailToEntries(ctx.logger, glob, attachOnly)
+      )
+      .through(entryToArgs(store, ctx.args))
+  }
+
+  private def entryToArgs[F[_]](
+      store: Store[F],
+      args: Args
+  ): Pipe[F, Binary[F], ProcessItemArgs] =
+    _.flatMap { entry =>
+      val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
+      entry.data
+        .through(
+          store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
+        )
+        .map(key => args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil))
+    }
+
+  sealed trait ArchiveType
+  object ArchiveType {
+    case object Email extends ArchiveType
+    case object Zip extends ArchiveType
+    case object NoArchive extends ArchiveType
+
+    def from(fm: FileMetadata): ArchiveType =
+      fm.mimetype match {
+        case MimeType.ZipMatch(_) => Zip
+        case MimeType.EmailMatch(_) => Email
+        case _ => NoArchive
+      }
+
+    def from(fm: Option[FileMetadata]): ArchiveType =
+      fm.map(from).getOrElse(NoArchive)
+  }
+
   case class Jobs(
       result: Result,
       jobs: List[Job[String]],
diff --git a/website/site/content/docs/api/upload.md b/website/site/content/docs/api/upload.md
index 6e38c168..b02bcca7 100644
--- a/website/site/content/docs/api/upload.md
+++ b/website/site/content/docs/api/upload.md
@@ -96,16 +96,17 @@ specified via a JSON structure in a part with name `meta`:
   `*.eml`). If this is `true`, then the e-mail body is discarded and
   only the attachments are imported. An e-mail without any attachments
   is therefore skipped.
-- `flattenArchives` is flag to control how zip files are treated. When
-  this is `false` (the default), then one zip file results in one item
-  and its contents are the attachments. If you rather want the
-  contents to be treated as independent files, then set this to
-  `true`. This will submit each entry in the zip file as a separate
-  processing job. Note: when this is `true` the zip file is just a
-  container and doesn't contain other useful information and therefore
-  is *NOT* kept in docspell, only its contents are. Also note that
-  only the uploaded zip files are extracted once (not recursively), so
-  if it contains other zip files, they are treated as normal.
+- `flattenArchives` is a flag to control how `zip` and `eml` files
+  are treated. When this is `false` (the default), one `zip` or `eml`
+  file results in one item and its contents become the attachments.
+  If you would rather have the contents treated as independent files,
+  set this to `true`. This will submit each entry in the archive file
+  as a separate processing job. Note: when this is `true` the archive
+  file is assumed to be just a container without other useful
+  information. It is therefore *NOT* kept in docspell, only its
+  contents are. Also note that uploaded archive files are extracted
+  only once (not recursively), so any archive files they contain are
+  treated as normal files.
 
 # Endpoints
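
Illustrative note (not part of the patch): the core of this change is replacing the boolean isZipFile check with an ArchiveType dispatch, so that a file's MIME type selects between zip extraction, email extraction, or normal single-item processing. The following is a minimal, self-contained Scala sketch of that dispatch under stated assumptions: it uses plain MIME strings and stand-in names (ArchiveTypeSketch, detect) instead of docspell's MimeType.ZipMatch/EmailMatch extractors and FileMetadata, so the concrete matching rules shown here are assumptions for illustration only.

// Illustrative sketch only (not docspell code): mirrors the ArchiveType
// dispatch added by this patch, using plain MIME strings instead of
// docspell's MimeType/FileMetadata types.
object ArchiveTypeSketch {
  sealed trait ArchiveType
  case object Zip extends ArchiveType
  case object Email extends ArchiveType
  case object NoArchive extends ArchiveType

  // Assumed MIME strings for illustration; the real code matches via
  // the MimeType.ZipMatch and MimeType.EmailMatch extractors.
  def detect(mimetype: Option[String]): ArchiveType =
    mimetype match {
      case Some("application/zip") => Zip
      case Some("message/rfc822")  => Email
      case _                       => NoArchive
    }

  def main(args: Array[String]): Unit = {
    // With flattenArchives=true, Zip and Email fan out into one job per
    // entry, while NoArchive stays a single processing job.
    assert(detect(Some("application/zip")) == Zip)
    assert(detect(Some("message/rfc822")) == Email)
    assert(detect(Some("application/pdf")) == NoArchive)
    assert(detect(None) == NoArchive)
    println("archive dispatch: ok")
  }
}

The Option handling mirrors the patch's second `from` overload: when no file metadata is found, the file falls back to normal single-item processing instead of failing.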