From 1206105f0bcbdd38fbc9b58ba604d916599cbdce Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 7 Apr 2020 22:05:24 +0200
Subject: [PATCH] Fix several bugs with handling e-mail files

- When converting from html->pdf, the wkhtmltopdf program exits with
  errors if the document contains invalid links. The content is now
  cleaned before being handed to wkhtmltopdf.
- Update emil library, which fixes a bug when reading mails without
  explicit transfer encoding (8bit)
- Add an info header to converted mails
---
 build.sbt                                     |  3 +
 .../main/scala/docspell/common/Binary.scala   |  5 ++
 .../scala/docspell/convert/Conversion.scala   |  7 ++-
 .../scala/docspell/convert/SanitizeHtml.scala | 16 +++++
 .../docspell/convert/extern/WkHtmlPdf.scala   | 22 ++++++-
 .../joex/extract/JsoupSanitizer.scala         | 29 +++++++++
 .../scala/docspell/joex/mail/ReadMail.scala   | 61 ++++++-------------
 .../docspell/joex/process/ConvertPdf.scala    |  9 ++-
 project/Dependencies.scala                    | 15 ++++-
 9 files changed, 115 insertions(+), 52 deletions(-)
 create mode 100644 modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
 create mode 100644 modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala

diff --git a/build.sbt b/build.sbt
index aa8eea03..6bb5c1c6 100644
--- a/build.sbt
+++ b/build.sbt
@@ -295,6 +295,9 @@ val joex = project.in(file("modules/joex")).
       Dependencies.circe ++
       Dependencies.pureconfig ++
       Dependencies.emilTnef ++
+      Dependencies.emilMarkdown ++
+      Dependencies.emilJsoup ++
+      Dependencies.jsoup ++
       Dependencies.loggingApi ++
       Dependencies.logging.map(_ % Runtime),
     addCompilerPlugin(Dependencies.kindProjectorPlugin),
diff --git a/modules/common/src/main/scala/docspell/common/Binary.scala b/modules/common/src/main/scala/docspell/common/Binary.scala
index 88bcd99a..14237a1c 100644
--- a/modules/common/src/main/scala/docspell/common/Binary.scala
+++ b/modules/common/src/main/scala/docspell/common/Binary.scala
@@ -1,5 +1,6 @@
 package docspell.common
 
+import cats.effect._
 import fs2.{Chunk, Pipe, Stream}
 import java.nio.charset.Charset
 import java.nio.charset.StandardCharsets
@@ -42,6 +43,9 @@ object Binary {
       util.decode[F](cs)
     }
 
+  def loadAllBytes[F[_]: Sync](data: Stream[F, Byte]): F[ByteVector] =
+    data.chunks.map(_.toByteVector).compile.fold(ByteVector.empty)((r, e) => r ++ e)
+
   // This is a copy from org.http4s.util
   // Http4s is licensed under the Apache License 2.0
   private object util {
@@ -85,5 +89,6 @@ object Binary {
       if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
         chunk.drop(3)
       } else chunk
+
   }
 }
diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
index 85e30c78..1b53513c 100644
--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@@ -23,6 +23,7 @@ object Conversion {
 
   def create[F[_]: Sync: ContextShift](
       cfg: ConvertConfig,
+      sanitizeHtml: SanitizeHtml,
       blocker: Blocker,
       logger: Logger[F]
   ): Resource[F, Conversion[F]] =
@@ -38,7 +39,10 @@ object Conversion {
           case mt @ MimeType(_, "html", _) =>
             val cs = mt.charsetOrUtf8
             WkHtmlPdf
-              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
+              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
+                in,
+                handler
+              )
 
           case mt @ Texts(_) =>
             val cs = mt.charsetOrUtf8
@@ -50,6 +54,7 @@ object Conversion {
                 cfg.wkhtmlpdf,
                 cfg.chunkSize,
                 StandardCharsets.UTF_8,
+                sanitizeHtml,
                 blocker,
                 logger
               )(bytes, handler)
diff --git a/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
new file mode 100644
index 00000000..e119d410
--- /dev/null
+++ b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
@@ -0,0 +1,16 @@
+package docspell.convert
+import scodec.bits.ByteVector
+import java.nio.charset.Charset
+
+@FunctionalInterface
+trait SanitizeHtml {
+
+  /** The given `bytes` are html which can be modified to strip out
+    * unwanted content.
+    *
+    * The result should use the same character encoding as the given
+    * charset implies, or utf8 if not specified.
+    */
+  def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
+
+}
diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
index 8199191e..41c88040 100644
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
@@ -3,9 +3,10 @@ package docspell.convert.extern
 import java.nio.file.Path
 
 import cats.effect._
-import fs2.Stream
+import cats.implicits._
+import fs2.{Chunk, Stream}
 import docspell.common._
-import docspell.convert.ConversionResult
+import docspell.convert.{ConversionResult, SanitizeHtml}
 import docspell.convert.ConversionResult.Handler
 import java.nio.charset.Charset
 
@@ -15,6 +16,7 @@ object WkHtmlPdf {
       cfg: WkHtmlPdfConfig,
       chunkSize: Int,
       charset: Charset,
+      sanitizeHtml: SanitizeHtml,
       blocker: Blocker,
       logger: Logger[F]
   )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
@@ -22,9 +24,23 @@ object WkHtmlPdf {
       ExternConv.readResult[F](blocker, chunkSize, logger)
 
     val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
+
+    // The html sanitizer should (among other things) remove links to
+    // invalid protocols like cid:, which are not supported by further
+    // processing (wkhtmltopdf exits with errors)
+    //
+    // Since jsoup will load everything anyway, a stream-based
+    // conversion to Java's InputStream doesn't make much sense.
+    val inSane = Stream.evalUnChunk(
+      Binary
+        .loadAllBytes(in)
+        .map(bv => sanitizeHtml(bv, charset.some))
+        .map(bv => Chunk.byteVector(bv))
+    )
+
     ExternConv
       .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
-        in,
+        inSane,
         handler
       )
   }
diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
new file mode 100644
index 00000000..dbc1ba66
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
@@ -0,0 +1,29 @@
+package docspell.joex.extract
+
+import org.jsoup.Jsoup
+import org.jsoup.nodes._
+import emil.jsoup._
+import scodec.bits.ByteVector
+import java.io.ByteArrayInputStream
+import java.nio.charset.{Charset, StandardCharsets}
+
+object JsoupSanitizer {
+
+  //BIG NOTE: this changes the input document
+  def apply(doc: Document): Document =
+    BodyClean.whitelistClean(EmailWhitelist.default)(doc)
+
+  def clean(html: String): String = {
+    //note: Jsoup.clean throws away the html head, which removes the
+    //charset if present
+    val doc = Jsoup.parse(html)
+    apply(doc).outerHtml
+  }
+
+  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
+    val in  = new ByteArrayInputStream(html.toArray)
+    val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
+    ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
+  }
+
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
index 4528fa0a..c5b119d0 100644
--- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
+++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
@@ -6,17 +6,14 @@ import fs2.{Pipe, Stream}
 import emil.{MimeType => _, _}
 import emil.javamail.syntax._
 import emil.tnef.TnefExtract
+import emil.markdown._
+import emil.jsoup.HtmlBodyView
 
 import docspell.common._
-import java.nio.charset.StandardCharsets
-import java.nio.charset.Charset
-import scodec.bits.ByteVector
+import docspell.joex.extract.JsoupSanitizer
 
 object ReadMail {
 
-  def read[F[_]: Sync](str: String): F[Mail[F]] =
-    Mail.deserialize(str)
-
   def readBytesP[F[_]: ConcurrentEffect: ContextShift](
       logger: Logger[F]
   ): Pipe[F, Byte, Binary[F]] =
@@ -25,17 +22,22 @@ object ReadMail {
   def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
     s =>
       Stream.eval(logger.debug(s"Converting e-mail file...")) >>
-        s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
+        s.through(Mail.readBytes[F])
 
   def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
       logger: Logger[F]
   )(mail: Mail[F]): Stream[F, Binary[F]] = {
-    val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
-      _ => (None: Option[Binary[F]]).pure[F],
-      txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
-      html => html.html.map(c => makeHtmlBinary(c).some),
-      both => both.html.map(c => makeHtmlBinary(c).some)
-    )
+    val bodyEntry: F[Option[Binary[F]]] =
+      if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
+      else {
+        val markdownCfg = MarkdownConfig.defaultConfig
+        HtmlBodyView(
+          mail.body,
+          Some(mail.header),
+          Some(MarkdownBody.makeHtml(markdownCfg)),
+          Some(JsoupSanitizer.apply)
+        ).map(makeHtmlBinary[F] _).map(b => Some(b))
+      }
 
     Stream.eval(
       logger.debug(
@@ -53,25 +55,8 @@ object ReadMail {
           ))
   }
 
-  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
-    val c = fixHtml(cnt)
-    Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8)
-  }
-
-  private def fixHtml(cnt: BodyContent): BodyContent = {
-    val str  = cnt.asString.trim.toLowerCase
-    val head = htmlHeader(cnt.charsetOrUtf8)
-    if (str.startsWith("<html")) cnt
-    else
-      cnt match {
-        case BodyContent.StringContent(s) =>
-          BodyContent(head + s + htmlHeaderEnd)
-        case BodyContent.ByteContent(bv, cs) =>
-          val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8))
-          val end   = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8))
-          BodyContent(begin ++ bv ++ end, cs)
-      }
-  }
+  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
+    Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)
 
   implicit class MimeTypeConv(m: emil.MimeType) {
     def toDocspell: MimeType =
@@ -85,16 +70,4 @@ object ReadMail {
       _ => "html-body",
       _ => "text-and-html-body"
     )
-
-  private def htmlHeader(cs: Charset): String =
-    s"""<!DOCTYPE html>
-       |<html>
-       |<head>
-       |<meta charset="${cs.name}"/>
-       |</head>
-       |<body>
-       """
-
-  private def htmlHeaderEnd: String =
-    "</body></html>"
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
index f49a4d80..b3a93260 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@@ -12,6 +12,8 @@ import docspell.convert._
 import docspell.joex.scheduler._
 import docspell.store.records._
 import docspell.convert.ConversionResult.Handler
+import docspell.convert.SanitizeHtml
+import docspell.joex.extract.JsoupSanitizer
 
 /** Goes through all attachments and creates a PDF version of it where
   * supported.
@@ -35,7 +37,9 @@ object ConvertPdf {
   ): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       def convert(ra: RAttachment) =
-        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
+        findMime(ctx)(ra).flatMap(m =>
+          convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
+        )
 
       for {
         ras <- item.attachments.traverse(convert)
@@ -52,10 +56,11 @@ object ConvertPdf {
 
   def convertSafe[F[_]: Sync: ContextShift](
       cfg: ConvertConfig,
+      sanitizeHtml: SanitizeHtml,
       ctx: Context[F, ProcessItemArgs],
       item: ItemData
   )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
-    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
+    Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
       mime match {
         case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
           ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index c12926bb..744bb9d4 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -6,11 +6,11 @@ object Dependencies {
 
   val BcryptVersion = "0.4"
   val BetterMonadicForVersion = "0.3.1"
-  val BitpeaceVersion = "0.4.5"
+  val BitpeaceVersion = "0.5.0"
   val CalevVersion = "0.3.0"
   val CirceVersion = "0.13.0"
   val DoobieVersion = "0.9.0"
-  val EmilVersion = "0.4.0"
+  val EmilVersion = "0.5.0"
   val FastparseVersion = "2.1.3"
   val FlexmarkVersion = "0.61.0"
   val FlywayVersion = "6.3.3"
@@ -18,6 +18,7 @@ object Dependencies {
   val H2Version = "1.4.200"
   val Http4sVersion = "0.21.3"
   val Icu4jVersion = "66.1"
+  val JsoupVersion = "1.13.1"
   val KindProjectorVersion = "0.10.3"
   val Log4sVersion = "1.8.2"
   val LogbackVersion = "1.2.3"
@@ -95,6 +96,16 @@ object Dependencies {
   val emilTnef = Seq(
     "com.github.eikek" %% "emil-tnef" % EmilVersion,
   )
+  val emilMarkdown = Seq(
+    "com.github.eikek" %% "emil-markdown" % EmilVersion,
+  )
+  val emilJsoup = Seq(
+    "com.github.eikek" %% "emil-jsoup" % EmilVersion,
+  )
+
+  val jsoup = Seq(
+    "org.jsoup" % "jsoup" % JsoupVersion
+  )
 
   val stanfordNlpCore = Seq(
     "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(