diff --git a/build.sbt b/build.sbt index b4183a4d..992bbe74 100644 --- a/build.sbt +++ b/build.sbt @@ -204,6 +204,7 @@ val store = project.in(file("modules/store")). Dependencies.flyway ++ Dependencies.loggingApi ++ Dependencies.emil ++ + Dependencies.emilDoobie ++ Dependencies.calev ).dependsOn(common) @@ -293,6 +294,7 @@ val joex = project.in(file("modules/joex")). Dependencies.http4s ++ Dependencies.circe ++ Dependencies.pureconfig ++ + Dependencies.emilTnef ++ Dependencies.loggingApi ++ Dependencies.logging.map(_ % Runtime), addCompilerPlugin(Dependencies.kindProjectorPlugin), diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 85694176..27539543 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -5,17 +5,21 @@ import cats.implicits._ import fs2.{Pipe, Stream} import emil.{MimeType => _, _} import emil.javamail.syntax._ -import cats.Applicative +import emil.tnef.TnefExtract import docspell.common._ import java.nio.charset.StandardCharsets +import java.nio.charset.Charset +import scodec.bits.ByteVector object ReadMail { def read[F[_]: Sync](str: String): F[Mail[F]] = Mail.deserialize(str) - def readBytesP[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Binary[F]] = + def readBytesP[F[_]: ConcurrentEffect: ContextShift]( + logger: Logger[F] + ): Pipe[F, Byte, Binary[F]] = s => Stream.eval(logger.debug(s"Converting e-mail into its parts")) >> bytesToMail(s).flatMap(mailToEntries[F](logger)) @@ -23,14 +27,14 @@ object ReadMail { def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] = data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F]) - def mailToEntries[F[_]: Applicative]( + def mailToEntries[F[_]: ConcurrentEffect: ContextShift]( logger: Logger[F] )(mail: Mail[F]): Stream[F, Binary[F]] = { val bodyEntry: F[Option[Binary[F]]] = mail.body.fold( _ => (None: Option[Binary[F]]).pure[F], txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some), - html => html.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some), - both => both.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some) + html => html.html.map(c => makeHtmlBinary(c).some), + both => both.html.map(c => makeHtmlBinary(c).some) ) Stream.eval( @@ -42,12 +46,33 @@ object ReadMail { .eval(bodyEntry) .flatMap(e => Stream.emits(e.toSeq)) ++ Stream - .emits(mail.attachments.all) + .eval(TnefExtract.replace(mail)) + .flatMap(m => Stream.emits(m.attachments.all)) .map(a => Binary(a.filename.getOrElse("noname"), a.mimeType.toDocspell, a.content) )) } + private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = { + val c = fixHtml(cnt) + Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8) + } + + private def fixHtml(cnt: BodyContent): BodyContent = { + val str = cnt.asString.trim.toLowerCase + val head = htmlHeader(cnt.charsetOrUtf8) + if (str.startsWith(" + BodyContent(head + s + htmlHeaderEnd) + case BodyContent.ByteContent(bv, cs) => + val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8)) + val end = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8)) + BodyContent(begin ++ bv ++ end, cs) + } + } + + implicit class MimeTypeConv(m: emil.MimeType) { def toDocspell: MimeType = MimeType(m.primary, m.sub, m.params) @@ -60,4 +85,16 @@ object ReadMail { _ => "html-body", _ => "text-and-html-body" ) + + private def htmlHeader(cs: Charset): String = + s""" + | + | + | + | + | + """ + + private def htmlHeaderEnd: String = + "" } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index de973c67..3671aa86 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -126,7 +126,7 @@ object ExtractArchive { .lastOrError } - def extractMail[F[_]: Sync]( + def extractMail[F[_]: ConcurrentEffect: ContextShift]( ctx: Context[F, _], archive: Option[RAttachmentArchive] )(ra: RAttachment): F[Extracted] = {