Add attachments-only filter to uploads

When uploading a file that is an e-mail, this option allows skipping
the mail body when the file is being processed.
This commit is contained in:
eikek 2021-08-21 13:37:17 +02:00
parent bb8a6c054b
commit 751fa3da5a
11 changed files with 84 additions and 31 deletions

View File

@ -73,7 +73,8 @@ object OUpload {
skipDuplicates: Boolean,
fileFilter: Glob,
tags: List[String],
language: Option[Language]
language: Option[Language],
attachmentsOnly: Option[Boolean]
)
case class UploadData[F[_]](
@ -150,7 +151,8 @@ object OUpload {
data.meta.skipDuplicates,
data.meta.fileFilter.some,
data.meta.tags.some,
false
false,
data.meta.attachmentsOnly
)
args =
if (data.multiple) files.map(f => ProcessItemArgs(meta, List(f)))

View File

@ -51,7 +51,8 @@ object ProcessItemArgs {
skipDuplicate: Boolean,
fileFilter: Option[Glob],
tags: Option[List[String]],
reprocess: Boolean
reprocess: Boolean,
attachmentsOnly: Option[Boolean]
)
object ProcessMeta {

View File

@ -44,7 +44,9 @@ case class ScanMailboxArgs(
// the language for extraction and analysis
language: Option[Language],
// apply additional filter to all mails or only imported
postHandleAll: Option[Boolean]
postHandleAll: Option[Boolean],
// Exclude the mail body when importing
attachmentsOnly: Option[Boolean]
)
object ScanMailboxArgs {

View File

@ -23,9 +23,12 @@ object ReadMail {
def readBytesP[F[_]: Async](
logger: Logger[F],
glob: Glob
glob: Glob,
attachmentsOnly: Boolean
): Pipe[F, Byte, Binary[F]] =
_.through(bytesToMail(logger)).flatMap(mailToEntries[F](logger, glob))
_.through(bytesToMail(logger)).flatMap(
mailToEntries[F](logger, glob, attachmentsOnly)
)
def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
s =>
@ -34,10 +37,30 @@ object ReadMail {
def mailToEntries[F[_]: Async](
logger: Logger[F],
glob: Glob
glob: Glob,
attachmentsOnly: Boolean
)(mail: Mail[F]): Stream[F, Binary[F]] =
Stream.eval(
logger.debug(
s"E-mail has ${mail.attachments.size} attachments and ${bodyType(mail.body)}"
)
) >>
(makeBodyEntry(logger, glob, attachmentsOnly)(mail) ++
Stream
.eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all))
.filter(a => a.filename.exists(glob.matches(caseSensitive = false)))
.map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
))
private def makeBodyEntry[F[_]: Async](
logger: Logger[F],
glob: Glob,
attachmentsOnly: Boolean
)(mail: Mail[F]): Stream[F, Binary[F]] = {
val bodyEntry: F[Option[Binary[F]]] =
if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
if (mail.body.isEmpty || attachmentsOnly) (None: Option[Binary[F]]).pure[F]
else {
val markdownCfg = MarkdownConfig.defaultConfig
HtmlBodyView(
@ -49,22 +72,14 @@ object ReadMail {
).map(makeHtmlBinary[F] _).map(b => Some(b))
}
Stream.eval(
logger.debug(
s"E-mail has ${mail.attachments.size} attachments and ${bodyType(mail.body)}"
)
) >>
(Stream
.eval(bodyEntry)
.flatMap(e => Stream.emits(e.toSeq))
.filter(a => glob.matches(caseSensitive = false)(a.name)) ++
for {
_ <- Stream.eval(logger.debug(s"Import attachments only: $attachmentsOnly"))
bin <-
Stream
.eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all))
.filter(a => a.filename.exists(glob.matches(caseSensitive = false)))
.map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
))
.eval(bodyEntry)
.flatMap(e => Stream.emits(e.toSeq))
.filter(a => glob.matches(caseSensitive = false)(a.name))
} yield bin
}
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =

View File

@ -161,7 +161,8 @@ object ExtractArchive {
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
ctx.logger.debug(s"Filtering email attachments with '${glob.asString}'") *>
email
.through(ReadMail.bytesToMail[F](ctx.logger))
@ -174,7 +175,7 @@ object ExtractArchive {
} yield s
ReadMail
.mailToEntries(ctx.logger, glob)(mail)
.mailToEntries(ctx.logger, glob, attachOnly)(mail)
.zipWithIndex
.flatMap(handleEntry(ctx, ra, pos, archive, mId)) ++ Stream.eval(givenMeta)
}

View File

@ -114,7 +114,8 @@ object ReProcessItem {
false,
None,
None,
true
true,
None // attachOnly (not used when reprocessing attachments)
),
Nil
).pure[F]

View File

@ -300,7 +300,8 @@ object ScanMailboxTask {
true,
args.fileFilter.getOrElse(Glob.all),
args.tags.getOrElse(Nil),
args.language
args.language,
args.attachmentsOnly
)
data = OUpload.UploadData(
multiple = false,

View File

@ -4336,6 +4336,10 @@ components:
format: language
postHandleAll:
type: boolean
attachmentsOnly:
type: boolean
description: |
Import only the attachments of e-mails and discard the body
ImapSettingsList:
description: |
@ -5282,6 +5286,14 @@ components:
description: |
The `language` of the document may be specified, otherwise
the one from settings is used.
attachmentsOnly:
type: boolean
default: false
description: |
Only applies to e-mail files. If `true` then only
attachments of the e-mail are imported and the e-mail body
is discarded. E-mails that don't have any attachments are
skipped.
Collective:
description: |

View File

@ -337,7 +337,8 @@ trait Conversions {
m.skipDuplicates.getOrElse(false),
m.fileFilter.getOrElse(Glob.all),
m.tags.map(_.items).getOrElse(Nil),
m.language
m.language,
m.attachmentsOnly
)
)
)
@ -345,7 +346,17 @@ trait Conversions {
.getOrElse(
(
true,
UploadMeta(None, sourceName, None, validFileTypes, false, Glob.all, Nil, None)
UploadMeta(
None,
sourceName,
None,
validFileTypes,
false,
Glob.all,
Nil,
None,
None
)
)
.pure[F]
)

View File

@ -125,7 +125,8 @@ object ScanMailboxRoutes {
settings.tags.map(_.items),
settings.subjectFilter,
settings.language,
settings.postHandleAll
settings.postHandleAll,
settings.attachmentsOnly
)
)
)
@ -159,6 +160,7 @@ object ScanMailboxRoutes {
task.args.fileFilter,
task.args.subjectFilter,
task.args.language,
task.args.postHandleAll
task.args.postHandleAll,
task.args.attachmentsOnly
)
}

View File

@ -52,6 +52,7 @@ specified via a JSON structure in a part with name `meta`:
, tags: Maybe StringList
, fileFilter: Maybe String
, language: Maybe String
, attachmentsOnly: Maybe Bool
}
```
@ -90,6 +91,10 @@ specified via a JSON structure in a part with name `meta`:
- The `language` is used for processing the document(s) contained in
the request. If not specified the collective's default language is
used.
- The `attachmentsOnly` property only applies to e-mail files (usually
`*.eml`). If this is `true`, then the e-mail body is discarded and
only the attachments are imported. An e-mail without any attachments
is therefore skipped.
# Endpoints