Merge pull request #2475 from eikek/fix/2063-split-eml

Apply `flattenArchives` to email files
This commit is contained in:
mergify[bot]
2024-01-29 12:37:49 +00:00
committed by GitHub
2 changed files with 76 additions and 31 deletions

View File

@ -7,19 +7,20 @@
package docspell.joex.multiupload package docspell.joex.multiupload
import cats.Monoid import cats.Monoid
import cats.data.OptionT
import cats.effect._ import cats.effect._
import cats.implicits._ import cats.implicits._
import fs2.Stream
import fs2.io.file.Files import fs2.io.file.Files
import fs2.{Pipe, Stream}
import docspell.backend.JobFactory import docspell.backend.JobFactory
import docspell.common._ import docspell.common._
import docspell.common.util.Zip import docspell.common.util.Zip
import docspell.joex.mail.ReadMail
import docspell.logging.Logger import docspell.logging.Logger
import docspell.scheduler._ import docspell.scheduler._
import docspell.scheduler.usertask.UserTaskScope import docspell.scheduler.usertask.UserTaskScope
import docspell.store.Store import docspell.store.Store
import docspell.store.file.FileMetadata
/** Task to submit multiple files at once. By default, one file in an upload results in /** Task to submit multiple files at once. By default, one file in an upload results in
* one item. Zip files are extracted, but its inner files are considered to be one item * one item. Zip files are extracted, but its inner files are considered to be one item
@ -43,8 +44,8 @@ object MultiUploadArchiveTask {
Task { ctx => Task { ctx =>
ctx.args.files ctx.args.files
.traverse { file => .traverse { file =>
isZipFile(store)(file).flatMap { store.fileRepo.findMeta(file.fileMetaId).map(ArchiveType.from).flatMap {
case true => case ArchiveType.Zip =>
ctx.logger.info(s"Extracting zip file ${file.name}") *> ctx.logger.info(s"Extracting zip file ${file.name}") *>
extractZip(store, ctx.args)(file) extractZip(store, ctx.args)(file)
.evalTap(entry => .evalTap(entry =>
@ -57,7 +58,20 @@ object MultiUploadArchiveTask {
.toList .toList
.map(Jobs.extracted(file)) .map(Jobs.extracted(file))
case false => case ArchiveType.Email =>
ctx.logger.info(s"Extracting email file ${file.name}") *>
extractMail(store, ctx)(file)
.evalTap(entry =>
ctx.logger.debug(
s"Create job for entry: ${entry.files.flatMap(_.name).mkString(", ")}"
)
)
.evalMap(makeJob[F](ctx, jobStore))
.compile
.toList
.map(Jobs.extracted(file))
case _ =>
makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal) makeJob(ctx, jobStore)(ctx.args.copy(files = List(file))).map(Jobs.normal)
} }
} }
@ -101,13 +115,6 @@ object MultiUploadArchiveTask {
) )
} yield job.encode } yield job.encode
private def isZipFile[F[_]: Sync](
store: Store[F]
)(file: ProcessItemArgs.File): F[Boolean] =
OptionT(store.fileRepo.findMeta(file.fileMetaId))
.map(_.mimetype.matches(MimeType.zip))
.getOrElse(false)
private def extractZip[F[_]: Async: Files]( private def extractZip[F[_]: Async: Files](
store: Store[F], store: Store[F],
args: Args args: Args
@ -116,15 +123,52 @@ object MultiUploadArchiveTask {
.getBytes(file.fileMetaId) .getBytes(file.fileMetaId)
.through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all))) .through(Zip[F]().unzip(glob = args.meta.fileFilter.getOrElse(Glob.all)))
.through(Binary.toBinary[F]) .through(Binary.toBinary[F])
.flatMap { entry => .through(entryToArgs(store, args))
/** Reads the given e-mail file from the file repository and turns each of its
  * entries into its own `ProcessItemArgs`, so every entry is submitted as a
  * separate processing job. Respects the upload's file filter and the
  * `attachmentsOnly` flag from the task arguments.
  */
private def extractMail[F[_]: Async](
    store: Store[F],
    ctx: Context[F, Args]
)(file: ProcessItemArgs.File): Stream[F, ProcessItemArgs] = {
  // Fall back to accepting every entry / keeping the mail body when unset.
  val fileFilter = ctx.args.meta.fileFilter.getOrElse(Glob.all)
  val attachmentsOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)

  val mailEntries =
    store.fileRepo
      .getBytes(file.fileMetaId)
      .through(ReadMail.bytesToMail(ctx.logger))
      .flatMap(ReadMail.mailToEntries(ctx.logger, fileFilter, attachmentsOnly))

  // Store each entry and emit one ProcessItemArgs per stored entry.
  mailEntries.through(entryToArgs(store, ctx.args))
}
/** Pipe that saves every extracted binary entry as an attachment source in the
  * file repository and emits one `ProcessItemArgs` per entry — each entry thus
  * becomes an independent item to process.
  */
private def entryToArgs[F[_]](
    store: Store[F],
    args: Args
): Pipe[F, Binary[F], ProcessItemArgs] =
  _.flatMap { entry =>
    // Hint the mime-type detection with the entry's name and advertised type.
    val hint = MimeTypeHint(entry.name.some, entry.mime.asString.some)
    val stored =
      entry.data.through(
        store.fileRepo.save(args.meta.collective, FileCategory.AttachmentSource, hint)
      )
    stored.map { key =>
      args.copy(files = List(ProcessItemArgs.File(entry.name.some, key)))
    }
  }
/** Classification of an uploaded file for archive handling: extractable zip,
  * extractable e-mail, or a plain (non-archive) file.
  */
sealed trait ArchiveType
object ArchiveType {
  case object Email extends ArchiveType
  case object Zip extends ArchiveType
  case object NoArchive extends ArchiveType

  /** Derive the archive type from the stored mime type of the file. */
  def from(fm: FileMetadata): ArchiveType =
    fm.mimetype match {
      case MimeType.ZipMatch(_)   => Zip
      case MimeType.EmailMatch(_) => Email
      case _                      => NoArchive
    }

  /** Absent metadata means the file cannot be treated as an archive. */
  def from(fm: Option[FileMetadata]): ArchiveType =
    fm.map(from).getOrElse(NoArchive)
}
case class Jobs( case class Jobs(

View File

@ -96,16 +96,17 @@ specified via a JSON structure in a part with name `meta`:
`*.eml`). If this is `true`, then the e-mail body is discarded and
only the attachments are imported. An e-mail without any attachments
is therefore skipped.
- `flattenArchives` is a flag to control how `zip` and `eml` files are
  treated. When this is `false` (the default), then one `zip` or `eml`
  file results in one item and its contents are the attachments. If
  you rather want the contents to be treated as independent files,
  then set this to `true`. This will submit each entry in the archive
  file as a separate processing job. Note: when this is `true` the
  archive file is assumed to be just a container and doesn't contain
  other useful information. It is therefore *NOT* kept in docspell,
  only its contents are. Also note that only the uploaded archive
  files are extracted once (not recursively), so if it contains other
  archive files, they are treated as normal.
# Endpoints # Endpoints