Apply flattenArchives to email files

Refs: #2063
This commit is contained in:
eikek
2024-01-29 13:09:11 +01:00
parent b5ebe73730
commit 848f2658c4
2 changed files with 76 additions and 31 deletions

View File

@ -7,19 +7,20 @@
package docspell.joex.multiupload
import cats.Monoid
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.Stream
import fs2.io.file.Files
import fs2.{Pipe, Stream}
import docspell.backend.JobFactory
import docspell.common._
import docspell.common.util.Zip
import docspell.joex.mail.ReadMail
import docspell.logging.Logger
import docspell.scheduler._
import docspell.scheduler.usertask.UserTaskScope
import docspell.store.Store
import docspell.store.file.FileMetadata
/** Task to submit multiple files at once. By default, one file in an upload results in
* one item. Zip files are extracted, but its inner files are considered to be one item
@ -43,8 +44,8 @@ object MultiUploadArchiveTask {
Task { ctx =>
ctx.args.files
.traverse { file =>
isZipFile(store)(file).flatMap {
case true =>
store.fileRepo.findMeta(file.fileMetaId).map(ArchiveType.from).flatMap {
case ArchiveType.Zip =>
ctx.logger.info(s"Extracting zip file ${file.name}") *>
extractZip(store, ctx.args)(file)
.evalTap(entry =>
@ -57,7 +58,20 @@ object MultiUploadArchiveTask {
.toList
.map(Jobs.extracted(file))
case false =>
case ArchiveType.Email =>
ctx.logger.info(s"Extracting email file ${file.name}") *>
extractMail(store, ctx)(file)
.evalTap(entry =>
ctx.logger.debug(
s"Create job for entry: ${entry.files.flatMap(_.name).mkString(", ")}"
)
)
.evalMap(makeJob[F](ctx, jobStore))
.compile
.toList
.map(Jobs.extracted(file))
case _ =>
makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal)
}
}
@ -101,13 +115,6 @@ object MultiUploadArchiveTask {
)
} yield job.encode
/** Determines whether the stored file is a zip archive by looking up its
  * persisted metadata and testing the mime type. Absent metadata is treated
  * as "not a zip".
  */
private def isZipFile[F[_]: Sync](
    store: Store[F]
)(file: ProcessItemArgs.File): F[Boolean] =
  store.fileRepo
    .findMeta(file.fileMetaId)
    .map(_.exists(_.mimetype.matches(MimeType.zip)))
private def extractZip[F[_]: Async: Files](
store: Store[F],
args: Args
@ -116,17 +123,54 @@ object MultiUploadArchiveTask {
.getBytes(file.fileMetaId)
.through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all)))
.through(Binary.toBinary[F])
.flatMap { entry =>
val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
entry.data
.through(
store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
)
.map(key =>
args.copy(files = ProcessItemArgs.File(entry.name.some, key) :: Nil)
)
.through(entryToArgs(store, args))
/** Reads an e-mail file from storage and turns every extracted entry into its
  * own [[ProcessItemArgs]], so each part can be submitted as a separate
  * processing job. The upload's file filter and `attachmentsOnly` flag from
  * the request metadata are applied while extracting.
  */
private def extractMail[F[_]: Async](
    store: Store[F],
    ctx: Context[F, Args]
)(file: ProcessItemArgs.File): Stream[F, ProcessItemArgs] = {
  val filter = ctx.args.meta.fileFilter.getOrElse(Glob.all)
  val skipBody = ctx.args.meta.attachmentsOnly.getOrElse(false)
  // parse the raw bytes into a mail value first, then fan out its entries
  val mails = store.fileRepo
    .getBytes(file.fileMetaId)
    .through(ReadMail.bytesToMail(ctx.logger))
  mails
    .flatMap(ReadMail.mailToEntries(ctx.logger, filter, skipBody))
    .through(entryToArgs(store, ctx.args))
}
/** Pipe that persists each extracted binary entry to the file repository and
  * emits a copy of the original args containing just that single stored file,
  * so every entry becomes an independent processing request.
  */
private def entryToArgs[F[_]](
    store: Store[F],
    args: Args
): Pipe[F, Binary[F], ProcessItemArgs] =
  _.flatMap { bin =>
    val typeHint = MimeTypeHint(bin.name.some, bin.mime.asString.some)
    val stored = bin.data.through(
      store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, typeHint)
    )
    stored.map { fileKey =>
      args.copy(files = List(ProcessItemArgs.File(bin.name.some, fileKey)))
    }
  }
/** Classification of an uploaded file: a supported archive container
  * (zip or e-mail) or a plain, non-archive file.
  */
sealed trait ArchiveType
object ArchiveType {
  case object Email extends ArchiveType
  case object Zip extends ArchiveType
  case object NoArchive extends ArchiveType

  /** Classifies a file by the mime type recorded in its metadata. */
  def from(fm: FileMetadata): ArchiveType =
    fm.mimetype match {
      case MimeType.ZipMatch(_)   => Zip
      case MimeType.EmailMatch(_) => Email
      case _                      => NoArchive
    }

  /** Missing metadata is treated as a non-archive file. */
  def from(fm: Option[FileMetadata]): ArchiveType =
    fm.fold[ArchiveType](NoArchive)(from)
}
case class Jobs(
result: Result,
jobs: List[Job[String]],

View File

@ -96,16 +96,17 @@ specified via a JSON structure in a part with name `meta`:
`*.eml`). If this is `true`, then the e-mail body is discarded and
only the attachments are imported. An e-mail without any attachments
is therefore skipped.
- `flattenArchives` is a flag to control how zip files are treated. When
this is `false` (the default), then one zip file results in one item
and its contents are the attachments. If you rather want the
contents to be treated as independent files, then set this to
`true`. This will submit each entry in the zip file as a separate
processing job. Note: when this is `true` the zip file is just a
container and doesn't contain other useful information and therefore
is *NOT* kept in docspell, only its contents are. Also note that
only the uploaded zip files are extracted once (not recursively), so
if it contains other zip files, they are treated as normal.
- `flattenArchives` is a flag to control how `zip` and `eml` files are
treated. When this is `false` (the default), then one `zip` or `eml`
file results in one item and its contents are the attachments. If
you rather want the contents to be treated as independent files,
then set this to `true`. This will submit each entry in the archive
file as a separate processing job. Note: when this is `true` the
archive file is assumed to be just a container and doesn't contain
other useful information. It is therefore *NOT* kept in docspell,
only its contents are. Also note that only the uploaded archive
files are extracted once (not recursively), so if it contains other
archive files, they are treated as normal.
# Endpoints