Add attachments-only filter to uploads

When uploading a file that is an e-mail, this option allows skipping
the mail body when the file is processed.
This commit is contained in:
eikek 2021-08-21 13:37:17 +02:00
parent bb8a6c054b
commit 751fa3da5a
11 changed files with 84 additions and 31 deletions

View File

@ -73,7 +73,8 @@ object OUpload {
skipDuplicates: Boolean, skipDuplicates: Boolean,
fileFilter: Glob, fileFilter: Glob,
tags: List[String], tags: List[String],
language: Option[Language] language: Option[Language],
attachmentsOnly: Option[Boolean]
) )
case class UploadData[F[_]]( case class UploadData[F[_]](
@ -150,7 +151,8 @@ object OUpload {
data.meta.skipDuplicates, data.meta.skipDuplicates,
data.meta.fileFilter.some, data.meta.fileFilter.some,
data.meta.tags.some, data.meta.tags.some,
false false,
data.meta.attachmentsOnly
) )
args = args =
if (data.multiple) files.map(f => ProcessItemArgs(meta, List(f))) if (data.multiple) files.map(f => ProcessItemArgs(meta, List(f)))

View File

@ -51,7 +51,8 @@ object ProcessItemArgs {
skipDuplicate: Boolean, skipDuplicate: Boolean,
fileFilter: Option[Glob], fileFilter: Option[Glob],
tags: Option[List[String]], tags: Option[List[String]],
reprocess: Boolean reprocess: Boolean,
attachmentsOnly: Option[Boolean]
) )
object ProcessMeta { object ProcessMeta {

View File

@ -44,7 +44,9 @@ case class ScanMailboxArgs(
// the language for extraction and analysis // the language for extraction and analysis
language: Option[Language], language: Option[Language],
// apply additional filter to all mails or only imported // apply additional filter to all mails or only imported
postHandleAll: Option[Boolean] postHandleAll: Option[Boolean],
// Exclude the mail body when importing
attachmentsOnly: Option[Boolean]
) )
object ScanMailboxArgs { object ScanMailboxArgs {

View File

@ -23,9 +23,12 @@ object ReadMail {
def readBytesP[F[_]: Async]( def readBytesP[F[_]: Async](
logger: Logger[F], logger: Logger[F],
glob: Glob glob: Glob,
attachmentsOnly: Boolean
): Pipe[F, Byte, Binary[F]] = ): Pipe[F, Byte, Binary[F]] =
_.through(bytesToMail(logger)).flatMap(mailToEntries[F](logger, glob)) _.through(bytesToMail(logger)).flatMap(
mailToEntries[F](logger, glob, attachmentsOnly)
)
def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] = def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
s => s =>
@ -34,10 +37,30 @@ object ReadMail {
def mailToEntries[F[_]: Async]( def mailToEntries[F[_]: Async](
logger: Logger[F], logger: Logger[F],
glob: Glob glob: Glob,
attachmentsOnly: Boolean
)(mail: Mail[F]): Stream[F, Binary[F]] =
Stream.eval(
logger.debug(
s"E-mail has ${mail.attachments.size} attachments and ${bodyType(mail.body)}"
)
) >>
(makeBodyEntry(logger, glob, attachmentsOnly)(mail) ++
Stream
.eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all))
.filter(a => a.filename.exists(glob.matches(caseSensitive = false)))
.map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
))
private def makeBodyEntry[F[_]: Async](
logger: Logger[F],
glob: Glob,
attachmentsOnly: Boolean
)(mail: Mail[F]): Stream[F, Binary[F]] = { )(mail: Mail[F]): Stream[F, Binary[F]] = {
val bodyEntry: F[Option[Binary[F]]] = val bodyEntry: F[Option[Binary[F]]] =
if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F] if (mail.body.isEmpty || attachmentsOnly) (None: Option[Binary[F]]).pure[F]
else { else {
val markdownCfg = MarkdownConfig.defaultConfig val markdownCfg = MarkdownConfig.defaultConfig
HtmlBodyView( HtmlBodyView(
@ -49,22 +72,14 @@ object ReadMail {
).map(makeHtmlBinary[F] _).map(b => Some(b)) ).map(makeHtmlBinary[F] _).map(b => Some(b))
} }
Stream.eval( for {
logger.debug( _ <- Stream.eval(logger.debug(s"Import attachments only: $attachmentsOnly"))
s"E-mail has ${mail.attachments.size} attachments and ${bodyType(mail.body)}" bin <-
) Stream
) >>
(Stream
.eval(bodyEntry) .eval(bodyEntry)
.flatMap(e => Stream.emits(e.toSeq)) .flatMap(e => Stream.emits(e.toSeq))
.filter(a => glob.matches(caseSensitive = false)(a.name)) ++ .filter(a => glob.matches(caseSensitive = false)(a.name))
Stream } yield bin
.eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all))
.filter(a => a.filename.exists(glob.matches(caseSensitive = false)))
.map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
))
} }
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =

View File

@ -162,6 +162,7 @@ object ExtractArchive {
.through(ctx.store.bitpeace.fetchData2(RangeDef.all)) .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all) val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
ctx.logger.debug(s"Filtering email attachments with '${glob.asString}'") *> ctx.logger.debug(s"Filtering email attachments with '${glob.asString}'") *>
email email
.through(ReadMail.bytesToMail[F](ctx.logger)) .through(ReadMail.bytesToMail[F](ctx.logger))
@ -174,7 +175,7 @@ object ExtractArchive {
} yield s } yield s
ReadMail ReadMail
.mailToEntries(ctx.logger, glob)(mail) .mailToEntries(ctx.logger, glob, attachOnly)(mail)
.zipWithIndex .zipWithIndex
.flatMap(handleEntry(ctx, ra, pos, archive, mId)) ++ Stream.eval(givenMeta) .flatMap(handleEntry(ctx, ra, pos, archive, mId)) ++ Stream.eval(givenMeta)
} }

View File

@ -114,7 +114,8 @@ object ReProcessItem {
false, false,
None, None,
None, None,
true true,
None // attachOnly (not used when reprocessing attachments)
), ),
Nil Nil
).pure[F] ).pure[F]

View File

@ -300,7 +300,8 @@ object ScanMailboxTask {
true, true,
args.fileFilter.getOrElse(Glob.all), args.fileFilter.getOrElse(Glob.all),
args.tags.getOrElse(Nil), args.tags.getOrElse(Nil),
args.language args.language,
args.attachmentsOnly
) )
data = OUpload.UploadData( data = OUpload.UploadData(
multiple = false, multiple = false,

View File

@ -4336,6 +4336,10 @@ components:
format: language format: language
postHandleAll: postHandleAll:
type: boolean type: boolean
attachmentsOnly:
type: boolean
description: |
Import only the attachments of e-mails and discard the body
ImapSettingsList: ImapSettingsList:
description: | description: |
@ -5282,6 +5286,14 @@ components:
description: | description: |
The `language` of the document may be specified, otherwise The `language` of the document may be specified, otherwise
the one from settings is used. the one from settings is used.
attachmentsOnly:
type: boolean
default: false
description: |
Only applies to e-mail files. If `true` then only
attachments of the e-mail are imported and the e-mail body
is discarded. E-mails that don't have any attachments are
skipped.
Collective: Collective:
description: | description: |

View File

@ -337,7 +337,8 @@ trait Conversions {
m.skipDuplicates.getOrElse(false), m.skipDuplicates.getOrElse(false),
m.fileFilter.getOrElse(Glob.all), m.fileFilter.getOrElse(Glob.all),
m.tags.map(_.items).getOrElse(Nil), m.tags.map(_.items).getOrElse(Nil),
m.language m.language,
m.attachmentsOnly
) )
) )
) )
@ -345,7 +346,17 @@ trait Conversions {
.getOrElse( .getOrElse(
( (
true, true,
UploadMeta(None, sourceName, None, validFileTypes, false, Glob.all, Nil, None) UploadMeta(
None,
sourceName,
None,
validFileTypes,
false,
Glob.all,
Nil,
None,
None
)
) )
.pure[F] .pure[F]
) )

View File

@ -125,7 +125,8 @@ object ScanMailboxRoutes {
settings.tags.map(_.items), settings.tags.map(_.items),
settings.subjectFilter, settings.subjectFilter,
settings.language, settings.language,
settings.postHandleAll settings.postHandleAll,
settings.attachmentsOnly
) )
) )
) )
@ -159,6 +160,7 @@ object ScanMailboxRoutes {
task.args.fileFilter, task.args.fileFilter,
task.args.subjectFilter, task.args.subjectFilter,
task.args.language, task.args.language,
task.args.postHandleAll task.args.postHandleAll,
task.args.attachmentsOnly
) )
} }

View File

@ -52,6 +52,7 @@ specified via a JSON structure in a part with name `meta`:
, tags: Maybe StringList , tags: Maybe StringList
, fileFilter: Maybe String , fileFilter: Maybe String
, language: Maybe String , language: Maybe String
, attachmentsOnly: Maybe Bool
} }
``` ```
@ -90,6 +91,10 @@ specified via a JSON structure in a part with name `meta`:
- The `language` is used for processing the document(s) contained in - The `language` is used for processing the document(s) contained in
the request. If not specified the collective's default language is the request. If not specified the collective's default language is
used. used.
- The `attachmentsOnly` property only applies to e-mail files (usually
`*.eml`). If this is `true`, then the e-mail body is discarded and
only the attachments are imported. An e-mail without any attachments
is therefore skipped.
# Endpoints # Endpoints