Apply flattenArchives to email files

Refs: #2063
This commit is contained in:
eikek
2024-01-29 13:09:11 +01:00
parent b5ebe73730
commit 848f2658c4
2 changed files with 76 additions and 31 deletions

View File

@ -7,19 +7,20 @@
package docspell.joex.multiupload
import cats.Monoid
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.Stream
import fs2.io.file.Files
import fs2.{Pipe, Stream}
import docspell.backend.JobFactory
import docspell.common._
import docspell.common.util.Zip
import docspell.joex.mail.ReadMail
import docspell.logging.Logger
import docspell.scheduler._
import docspell.scheduler.usertask.UserTaskScope
import docspell.store.Store
import docspell.store.file.FileMetadata
/** Task to submit multiple files at once. By default, one file in an upload results in
* one item. Zip files are extracted, but its inner files are considered to be one item
@ -43,8 +44,8 @@ object MultiUploadArchiveTask {
Task { ctx =>
ctx.args.files
.traverse { file =>
isZipFile(store)(file).flatMap {
case true =>
store.fileRepo.findMeta(file.fileMetaId).map(ArchiveType.from).flatMap {
case ArchiveType.Zip =>
ctx.logger.info(s"Extracting zip file ${file.name}") *>
extractZip(store, ctx.args)(file)
.evalTap(entry =>
@ -57,7 +58,20 @@ object MultiUploadArchiveTask {
.toList
.map(Jobs.extracted(file))
case false =>
case ArchiveType.Email =>
ctx.logger.info(s"Extracting email file ${file.name}") *>
extractMail(store, ctx)(file)
.evalTap(entry =>
ctx.logger.debug(
s"Create job for entry: ${entry.files.flatMap(_.name).mkString(", ")}"
)
)
.evalMap(makeJob[F](ctx, jobStore))
.compile
.toList
.map(Jobs.extracted(file))
case _ =>
makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal)
}
}
@ -101,13 +115,6 @@ object MultiUploadArchiveTask {
)
} yield job.encode
/** Determines whether the stored file is a zip archive by looking up its
  * persisted metadata and testing the mime type. Absent metadata is treated
  * as "not a zip".
  */
private def isZipFile[F[_]: Sync](
    store: Store[F]
)(file: ProcessItemArgs.File): F[Boolean] =
  store.fileRepo
    .findMeta(file.fileMetaId)
    .map(_.exists(_.mimetype.matches(MimeType.zip)))
private def extractZip[F[_]: Async: Files](
store: Store[F],
args: Args
@ -116,17 +123,54 @@ object MultiUploadArchiveTask {
.getBytes(file.fileMetaId)
.through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all)))
.through(Binary.toBinary[F])
.flatMap { entry =>
val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
entry.data
.through(
store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
)
.map(key =>
args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil)
)
.through(entryToArgs(store, args))
/** Reads an e-mail file from storage and turns every extracted entry into its
  * own [[ProcessItemArgs]], so each part can be submitted as a separate
  * processing job. The upload's file filter and `attachmentsOnly` flag from
  * the request metadata are applied while extracting.
  */
private def extractMail[F[_]: Async](
    store: Store[F],
    ctx: Context[F, Args]
)(file: ProcessItemArgs.File): Stream[F, ProcessItemArgs] = {
  val filter = ctx.args.meta.fileFilter.getOrElse(Glob.all)
  val skipBody = ctx.args.meta.attachmentsOnly.getOrElse(false)
  // parse the raw bytes into a mail value first, then fan out its entries
  val mails = store.fileRepo
    .getBytes(file.fileMetaId)
    .through(ReadMail.bytesToMail(ctx.logger))
  mails
    .flatMap(ReadMail.mailToEntries(ctx.logger, filter, skipBody))
    .through(entryToArgs(store, ctx.args))
}
/** Pipe that persists each extracted binary entry to the file repository and
  * emits a copy of the original args containing just that single stored file,
  * so every entry becomes an independent processing request.
  */
private def entryToArgs[F[_]](
    store: Store[F],
    args: Args
): Pipe[F, Binary[F], ProcessItemArgs] =
  _.flatMap { bin =>
    val typeHint = MimeTypeHint(bin.name.some, bin.mime.asString.some)
    val stored = bin.data.through(
      store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, typeHint)
    )
    stored.map { fileKey =>
      args.copy(files = List(ProcessItemArgs.File(bin.name.some, fileKey)))
    }
  }
/** Classification of an uploaded file: a supported archive container
  * (zip or e-mail) or a plain, non-archive file.
  */
sealed trait ArchiveType
object ArchiveType {
  case object Email extends ArchiveType
  case object Zip extends ArchiveType
  case object NoArchive extends ArchiveType

  /** Classifies a file by the mime type recorded in its metadata. */
  def from(fm: FileMetadata): ArchiveType =
    fm.mimetype match {
      case MimeType.ZipMatch(_)   => Zip
      case MimeType.EmailMatch(_) => Email
      case _                      => NoArchive
    }

  /** Missing metadata is treated as a non-archive file. */
  def from(fm: Option[FileMetadata]): ArchiveType =
    fm.fold[ArchiveType](NoArchive)(from)
}
case class Jobs(
result: Result,
jobs: List[Job[String]],

View File

@ -96,16 +96,17 @@ specified via a JSON structure in a part with name `meta`:
`*.eml`). If this is `true`, then the e-mail body is discarded and
only the attachments are imported. An e-mail without any attachments
is therefore skipped.
- `flattenArchives` is a flag to control how zip files are treated. When
this is `false` (the default), then one zip file results in one item
and its contents are the attachments. If you rather want the
contents to be treated as independent files, then set this to
`true`. This will submit each entry in the zip file as a separate
processing job. Note: when this is `true` the zip file is just a
container and doesn't contain other useful information and therefore
is *NOT* kept in docspell, only its contents are. Also note that
only the uploaded zip files are extracted once (not recursively), so
if it contains other zip files, they are treated as normal.
- `flattenArchives` is a flag to control how `zip` and `eml` files are
treated. When this is `false` (the default), then one `zip` or `eml`
file results in one item and its contents are the attachments. If
you rather want the contents to be treated as independent files,
then set this to `true`. This will submit each entry in the archive
file as a separate processing job. Note: when this is `true` the
archive file is assumed to be just a container and doesn't contain
other useful information. It is therefore *NOT* kept in docspell,
only its contents are. Also note that only the uploaded archive
files are extracted once (not recursively), so if it contains other
archive files, they are treated as normal.
# Endpoints