From 0b805726641f91b6305660a749f1aa817e626c44 Mon Sep 17 00:00:00 2001 From: Eike Kettner <eike.kettner@posteo.de> Date: Tue, 24 Mar 2020 23:03:18 +0100 Subject: [PATCH] Fix encodings for mails with non-utf8 html parts --- .../common/src/main/scala/docspell/common/Binary.scala | 9 ++++++++- .../src/main/scala/docspell/joex/mail/ReadMail.scala | 6 +++--- project/Dependencies.scala | 8 +++++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/Binary.scala b/modules/common/src/main/scala/docspell/common/Binary.scala index ec128b66..88bcd99a 100644 --- a/modules/common/src/main/scala/docspell/common/Binary.scala +++ b/modules/common/src/main/scala/docspell/common/Binary.scala @@ -1,8 +1,9 @@ package docspell.common -import fs2.{Pipe, Stream} +import fs2.{Chunk, Pipe, Stream} import java.nio.charset.Charset import java.nio.charset.StandardCharsets +import scodec.bits.ByteVector final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) { @@ -25,9 +26,15 @@ object Binary { def text[F[_]](name: String, content: String): Binary[F] = utf8(name, content).withMime(MimeType.plain.withUtf8Charset) + def text[F[_]](name: String, content: ByteVector, cs: Charset): Binary[F] = + Binary(name, MimeType.plain.withCharset(cs), Stream.chunk(Chunk.byteVector(content))) + def html[F[_]](name: String, content: String): Binary[F] = utf8(name, content).withMime(MimeType.html.withUtf8Charset) + def html[F[_]](name: String, content: ByteVector, cs: Charset): Binary[F] = + Binary(name, MimeType.html.withCharset(cs), Stream.chunk(Chunk.byteVector(content))) + def decode[F[_]](cs: Charset): Pipe[F, Byte, String] = if (cs == StandardCharsets.UTF_8) { fs2.text.utf8Decode diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 3525b9f5..85694176 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -28,9 +28,9 @@ object ReadMail { )(mail: Mail[F]): Stream[F, Binary[F]] = { val bodyEntry: F[Option[Binary[F]]] = mail.body.fold( _ => (None: Option[Binary[F]]).pure[F], - txt => txt.text.map(c => Binary.text[F]("mail.txt", c).some), - html => html.html.map(c => Binary.html[F]("mail.html", c).some), - both => both.html.map(c => Binary.html[F]("mail.html", c).some) + txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some), + html => html.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some), + both => both.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some) ) Stream.eval( diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 2a524df7..f1939a58 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -10,7 +10,7 @@ object Dependencies { val CalevVersion = "0.1.0" val CirceVersion = "0.13.0" val DoobieVersion = "0.8.8" - val EmilVersion = "0.3.0" + val EmilVersion = "0.4.0" val FastparseVersion = "2.1.3" val FlexmarkVersion = "0.60.2" val FlywayVersion = "6.3.2" @@ -89,6 +89,12 @@ object Dependencies { "com.github.eikek" %% "emil-common" % EmilVersion, "com.github.eikek" %% "emil-javamail" % EmilVersion ) + val emilDoobie = Seq( + "com.github.eikek" %% "emil-doobie" % EmilVersion, + ) + val emilTnef = Seq( + "com.github.eikek" %% "emil-tnef" % EmilVersion, + ) val stanfordNlpCore = Seq( "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(