Fix several bugs with handling e-mail files

- When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf.
- Update the emil library, which fixes a bug when reading mails without an explicit transfer encoding (8bit).
- Add an info header to converted mails.
2025-08-05 02:24:52 +00:00 · 2020-04-07 22:05:24 +02:00
parent 12672938a0
commit 1206105f0b
9 changed files with 115 additions and 52 deletions
--- a/build.sbt
+++ b/build.sbt
@ -295,6 +295,9 @@ val joex = project.in(file("modules/joex")).
      Dependencies.circe ++
      Dependencies.pureconfig ++
      Dependencies.emilTnef ++
+      Dependencies.emilMarkdown ++
+      Dependencies.emilJsoup ++
+      Dependencies.jsoup ++
      Dependencies.loggingApi ++
      Dependencies.logging.map(_ % Runtime),
    addCompilerPlugin(Dependencies.kindProjectorPlugin),
--- a/modules/common/src/main/scala/docspell/common/Binary.scala
+++ b/modules/common/src/main/scala/docspell/common/Binary.scala
@ -1,5 +1,6 @@
 package docspell.common

+import cats.effect._
 import fs2.{Chunk, Pipe, Stream}
 import java.nio.charset.Charset
 import java.nio.charset.StandardCharsets
@ -42,6 +43,9 @@ object Binary {
      util.decode[F](cs)
    }

+  def loadAllBytes[F[_]: Sync](data: Stream[F, Byte]): F[ByteVector] =
+    data.chunks.map(_.toByteVector).compile.fold(ByteVector.empty)((r, e) => r ++ e)
+
  // This is a copy from org.http4s.util
  // Http4s is licensed under the Apache License 2.0
  private object util {
@ -85,5 +89,6 @@ object Binary {
      if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
        chunk.drop(3)
      } else chunk
+
  }
 }
--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@ -23,6 +23,7 @@ object Conversion {

  def create[F[_]: Sync: ContextShift](
      cfg: ConvertConfig,
+      sanitizeHtml: SanitizeHtml,
      blocker: Blocker,
      logger: Logger[F]
  ): Resource[F, Conversion[F]] =
@ -38,7 +39,10 @@ object Conversion {
          case mt @ MimeType(_, "html", _) =>
            val cs = mt.charsetOrUtf8
            WkHtmlPdf
-              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
+              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
+                in,
+                handler
+              )

          case mt @ Texts(_) =>
            val cs = mt.charsetOrUtf8
@ -50,6 +54,7 @@ object Conversion {
                cfg.wkhtmlpdf,
                cfg.chunkSize,
                StandardCharsets.UTF_8,
+                sanitizeHtml,
                blocker,
                logger
              )(bytes, handler)
--- a/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
+++ b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
@ -0,0 +1,16 @@
+package docspell.convert
+import scodec.bits.ByteVector
+import java.nio.charset.Charset
+
+@FunctionalInterface
+trait SanitizeHtml {
+
+  /** The given `bytes` are HTML, which can be modified to strip out
+    * unwanted content.
+    *
+    * The result should use the same character encoding as the given
+    * charset implies, or utf8 if not specified.
+    */
+  def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
+
+}
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
@ -3,9 +3,10 @@ package docspell.convert.extern
 import java.nio.file.Path

 import cats.effect._
-import fs2.Stream
+import cats.implicits._
+import fs2.{Chunk, Stream}
 import docspell.common._
-import docspell.convert.ConversionResult
+import docspell.convert.{ConversionResult, SanitizeHtml}
 import docspell.convert.ConversionResult.Handler
 import java.nio.charset.Charset

@ -15,6 +16,7 @@ object WkHtmlPdf {
      cfg: WkHtmlPdfConfig,
      chunkSize: Int,
      charset: Charset,
+      sanitizeHtml: SanitizeHtml,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
@ -22,9 +24,23 @@ object WkHtmlPdf {
      ExternConv.readResult[F](blocker, chunkSize, logger)

    val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
+
+    // HTML sanitizing should (among other things) remove links that
+    // use unsupported protocols like `cid:`, which further processing
+    // cannot handle (wkhtmltopdf exits with errors).
+    //
+    // Since jsoup will load everything anyway, a stream-based
+    // conversion to Java's InputStream doesn't make much sense.
+    val inSane = Stream.evalUnChunk(
+      Binary
+        .loadAllBytes(in)
+        .map(bv => sanitizeHtml(bv, charset.some))
+        .map(bv => Chunk.byteVector(bv))
+    )
+
    ExternConv
      .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
-        in,
+        inSane,
        handler
      )
  }
--- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
+++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
@ -0,0 +1,29 @@
+package docspell.joex.extract
+
+import org.jsoup.Jsoup
+import org.jsoup.nodes._
+import emil.jsoup._
+import scodec.bits.ByteVector
+import java.io.ByteArrayInputStream
+import java.nio.charset.{Charset, StandardCharsets}
+
+object JsoupSanitizer {
+
+  //BIG NOTE: this changes the input document
+  def apply(doc: Document): Document =
+    BodyClean.whitelistClean(EmailWhitelist.default)(doc)
+
+  def clean(html: String): String = {
+    //note: Jsoup.clean throws away the html head, which removes the
+    //charset if present
+    val doc = Jsoup.parse(html)
+    apply(doc).outerHtml
+  }
+
+  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
+    val in  = new ByteArrayInputStream(html.toArray)
+    val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
+    ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
+  }
+
+}
--- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
+++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
@ -6,17 +6,14 @@ import fs2.{Pipe, Stream}
 import emil.{MimeType => _, _}
 import emil.javamail.syntax._
 import emil.tnef.TnefExtract
+import emil.markdown._
+import emil.jsoup.HtmlBodyView

 import docspell.common._
-import java.nio.charset.StandardCharsets
-import java.nio.charset.Charset
-import scodec.bits.ByteVector
+import docspell.joex.extract.JsoupSanitizer

 object ReadMail {

-  def read[F[_]: Sync](str: String): F[Mail[F]] =
-    Mail.deserialize(str)
-
  def readBytesP[F[_]: ConcurrentEffect: ContextShift](
      logger: Logger[F]
  ): Pipe[F, Byte, Binary[F]] =
@ -25,17 +22,22 @@ object ReadMail {
  def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
    s =>
      Stream.eval(logger.debug(s"Converting e-mail file...")) >>
-        s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
+        s.through(Mail.readBytes[F])

  def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
      logger: Logger[F]
  )(mail: Mail[F]): Stream[F, Binary[F]] = {
-    val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
-      _ => (None: Option[Binary[F]]).pure[F],
-      txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
-      html => html.html.map(c => makeHtmlBinary(c).some),
-      both => both.html.map(c => makeHtmlBinary(c).some)
-    )
+    val bodyEntry: F[Option[Binary[F]]] =
+      if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
+      else {
+        val markdownCfg = MarkdownConfig.defaultConfig
+        HtmlBodyView(
+          mail.body,
+          Some(mail.header),
+          Some(MarkdownBody.makeHtml(markdownCfg)),
+          Some(JsoupSanitizer.apply)
+        ).map(makeHtmlBinary[F] _).map(b => Some(b))
+      }

    Stream.eval(
      logger.debug(
@ -53,25 +55,8 @@ object ReadMail {
          ))
  }

-  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
-    val c = fixHtml(cnt)
-    Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8)
-  }
-
-  private def fixHtml(cnt: BodyContent): BodyContent = {
-    val str  = cnt.asString.trim.toLowerCase
-    val head = htmlHeader(cnt.charsetOrUtf8)
-    if (str.startsWith("<html")) cnt
-    else
-      cnt match {
-        case BodyContent.StringContent(s) =>
-          BodyContent(head + s + htmlHeaderEnd)
-        case BodyContent.ByteContent(bv, cs) =>
-          val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8))
-          val end   = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8))
-          BodyContent(begin ++ bv ++ end, cs)
-      }
-  }
+  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
+    Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)

  implicit class MimeTypeConv(m: emil.MimeType) {
    def toDocspell: MimeType =
@ -85,16 +70,4 @@ object ReadMail {
      _ => "html-body",
      _ => "text-and-html-body"
    )
-
-  private def htmlHeader(cs: Charset): String =
-    s"""<!DOCTYPE html>
-       |<html>
-       |<head>
-       |<meta charset="${cs.name}"/>
-       |</head>
-       |<body>
-       """
-
-  private def htmlHeaderEnd: String =
-    "</body></html>"
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@ -12,6 +12,8 @@ import docspell.convert._
 import docspell.joex.scheduler._
 import docspell.store.records._
 import docspell.convert.ConversionResult.Handler
+import docspell.convert.SanitizeHtml
+import docspell.joex.extract.JsoupSanitizer

 /** Goes through all attachments and creates a PDF version of it where
  * supported.
@ -35,7 +37,9 @@ object ConvertPdf {
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      def convert(ra: RAttachment) =
-        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
+        findMime(ctx)(ra).flatMap(m =>
+          convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
+        )

      for {
        ras <- item.attachments.traverse(convert)
@ -52,10 +56,11 @@ object ConvertPdf {

  def convertSafe[F[_]: Sync: ContextShift](
      cfg: ConvertConfig,
+      sanitizeHtml: SanitizeHtml,
      ctx: Context[F, ProcessItemArgs],
      item: ItemData
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
-    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
+    Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
      mime match {
        case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@ -6,11 +6,11 @@ object Dependencies {

  val BcryptVersion = "0.4"
  val BetterMonadicForVersion = "0.3.1"
-  val BitpeaceVersion = "0.4.5"
+  val BitpeaceVersion = "0.5.0"
  val CalevVersion = "0.3.0"
  val CirceVersion = "0.13.0"
  val DoobieVersion = "0.9.0"
-  val EmilVersion = "0.4.0"
+  val EmilVersion = "0.5.0"
  val FastparseVersion = "2.1.3"
  val FlexmarkVersion = "0.61.0"
  val FlywayVersion = "6.3.3"
@ -18,6 +18,7 @@ object Dependencies {
  val H2Version = "1.4.200"
  val Http4sVersion = "0.21.3"
  val Icu4jVersion = "66.1"
+  val JsoupVersion = "1.13.1"
  val KindProjectorVersion = "0.10.3"
  val Log4sVersion = "1.8.2"
  val LogbackVersion = "1.2.3"
@ -95,6 +96,16 @@ object Dependencies {
  val emilTnef = Seq(
    "com.github.eikek" %% "emil-tnef" % EmilVersion,
  )
+  val emilMarkdown = Seq(
+    "com.github.eikek" %% "emil-markdown" % EmilVersion,
+  )
+  val emilJsoup = Seq(
+    "com.github.eikek" %% "emil-jsoup" % EmilVersion,
+  )
+
+  val jsoup = Seq(
+    "org.jsoup" % "jsoup" % JsoupVersion
+  )

  val stanfordNlpCore = Seq(
    "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(