Fix several bugs with handling e-mail files

- When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf.
- Update the emil library, which fixes a bug when reading mails without an explicit transfer encoding (8bit).
- Add an info header to converted mails.
2025-08-05 02:24:52 +00:00 · 2020-04-07 22:05:24 +02:00
parent 12672938a0
commit 1206105f0b
9 changed files with 115 additions and 52 deletions
--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@ -23,6 +23,7 @@ object Conversion {

  def create[F[_]: Sync: ContextShift](
      cfg: ConvertConfig,
+      sanitizeHtml: SanitizeHtml,
      blocker: Blocker,
      logger: Logger[F]
  ): Resource[F, Conversion[F]] =
@ -38,7 +39,10 @@ object Conversion {
          case mt @ MimeType(_, "html", _) =>
            val cs = mt.charsetOrUtf8
            WkHtmlPdf
-              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
+              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
+                in,
+                handler
+              )

          case mt @ Texts(_) =>
            val cs = mt.charsetOrUtf8
@ -50,6 +54,7 @@ object Conversion {
                cfg.wkhtmlpdf,
                cfg.chunkSize,
                StandardCharsets.UTF_8,
+                sanitizeHtml,
                blocker,
                logger
              )(bytes, handler)
--- a/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
+++ b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala
@ -0,0 +1,16 @@
+package docspell.convert
+import scodec.bits.ByteVector
+import java.nio.charset.Charset
+
+@FunctionalInterface
+trait SanitizeHtml {
+
+  /** Takes the html document given in `bytes` and returns it with
+    * unwanted content stripped out.
+    *
+    * The result should use the character encoding given by `charset`,
+    * or utf8 if `charset` is not specified.
+    */
+  def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
+
+}
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
@ -3,9 +3,10 @@ package docspell.convert.extern
 import java.nio.file.Path

 import cats.effect._
-import fs2.Stream
+import cats.implicits._
+import fs2.{Chunk, Stream}
 import docspell.common._
-import docspell.convert.ConversionResult
+import docspell.convert.{ConversionResult, SanitizeHtml}
 import docspell.convert.ConversionResult.Handler
 import java.nio.charset.Charset

@ -15,6 +16,7 @@ object WkHtmlPdf {
      cfg: WkHtmlPdfConfig,
      chunkSize: Int,
      charset: Charset,
+      sanitizeHtml: SanitizeHtml,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
@ -22,9 +24,23 @@ object WkHtmlPdf {
      ExternConv.readResult[F](blocker, chunkSize, logger)

    val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
+
+    // Sanitizing the html should (among other things) remove links
+    // with unsupported protocols like cid:, which further processing
+    // cannot handle (wkhtmltopdf exits with errors).
+    //
+    // Since jsoup will load everything anyway, a stream-based
+    // conversion to Java's InputStream doesn't make much sense.
+    val inSane = Stream.evalUnChunk(
+      Binary
+        .loadAllBytes(in)
+        .map(bv => sanitizeHtml(bv, charset.some))
+        .map(bv => Chunk.byteVector(bv))
+    )
+
    ExternConv
      .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
-        in,
+        inSane,
        handler
      )
  }