mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Fix several bugs with handling e-mail files
- When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf. - Update the emil library, which fixes a bug when reading mails without an explicit transfer encoding (8bit). - Add an info header to converted mails.
This commit is contained in:
		@@ -1,5 +1,6 @@
 | 
			
		||||
package docspell.common
 | 
			
		||||
 | 
			
		||||
import cats.effect._
 | 
			
		||||
import fs2.{Chunk, Pipe, Stream}
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
@@ -42,6 +43,9 @@ object Binary {
 | 
			
		||||
      util.decode[F](cs)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  /** Runs the given byte stream to completion and collects everything
    * it emits into a single `ByteVector`, starting from
    * `ByteVector.empty` and appending chunk by chunk.
    *
    * The complete content is held in memory.
    */
  def loadAllBytes[F[_]: Sync](data: Stream[F, Byte]): F[ByteVector] =
    data.chunks.compile
      .fold(ByteVector.empty)((acc, chunk) => acc ++ chunk.toByteVector)
 | 
			
		||||
 | 
			
		||||
  // This is a copy from org.http4s.util
 | 
			
		||||
  // Http4s is licensed under the Apache License 2.0
 | 
			
		||||
  private object util {
 | 
			
		||||
@@ -85,5 +89,6 @@ object Binary {
 | 
			
		||||
      if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
 | 
			
		||||
        chunk.drop(3)
 | 
			
		||||
      } else chunk
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -23,6 +23,7 @@ object Conversion {
 | 
			
		||||
 | 
			
		||||
  def create[F[_]: Sync: ContextShift](
 | 
			
		||||
      cfg: ConvertConfig,
 | 
			
		||||
      sanitizeHtml: SanitizeHtml,
 | 
			
		||||
      blocker: Blocker,
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  ): Resource[F, Conversion[F]] =
 | 
			
		||||
@@ -38,7 +39,10 @@ object Conversion {
 | 
			
		||||
          case mt @ MimeType(_, "html", _) =>
 | 
			
		||||
            val cs = mt.charsetOrUtf8
 | 
			
		||||
            WkHtmlPdf
 | 
			
		||||
              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
 | 
			
		||||
              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
 | 
			
		||||
                in,
 | 
			
		||||
                handler
 | 
			
		||||
              )
 | 
			
		||||
 | 
			
		||||
          case mt @ Texts(_) =>
 | 
			
		||||
            val cs = mt.charsetOrUtf8
 | 
			
		||||
@@ -50,6 +54,7 @@ object Conversion {
 | 
			
		||||
                cfg.wkhtmlpdf,
 | 
			
		||||
                cfg.chunkSize,
 | 
			
		||||
                StandardCharsets.UTF_8,
 | 
			
		||||
                sanitizeHtml,
 | 
			
		||||
                blocker,
 | 
			
		||||
                logger
 | 
			
		||||
              )(bytes, handler)
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,16 @@
 | 
			
		||||
package docspell.convert
 | 
			
		||||
import scodec.bits.ByteVector
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
@FunctionalInterface
 | 
			
		||||
trait SanitizeHtml {
 | 
			
		||||
 | 
			
		||||
  /** The given `bytes` are HTML which can be modified to strip out
 | 
			
		||||
    * unwanted content.
 | 
			
		||||
    *
 | 
			
		||||
    * The result should use the same character encoding as the given
 | 
			
		||||
    * charset implies, or UTF-8 if not specified.
 | 
			
		||||
    */
 | 
			
		||||
  def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
@@ -3,9 +3,10 @@ package docspell.convert.extern
 | 
			
		||||
import java.nio.file.Path
 | 
			
		||||
 | 
			
		||||
import cats.effect._
 | 
			
		||||
import fs2.Stream
 | 
			
		||||
import cats.implicits._
 | 
			
		||||
import fs2.{Chunk, Stream}
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import docspell.convert.ConversionResult
 | 
			
		||||
import docspell.convert.{ConversionResult, SanitizeHtml}
 | 
			
		||||
import docspell.convert.ConversionResult.Handler
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
@@ -15,6 +16,7 @@ object WkHtmlPdf {
 | 
			
		||||
      cfg: WkHtmlPdfConfig,
 | 
			
		||||
      chunkSize: Int,
 | 
			
		||||
      charset: Charset,
 | 
			
		||||
      sanitizeHtml: SanitizeHtml,
 | 
			
		||||
      blocker: Blocker,
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
 | 
			
		||||
@@ -22,9 +24,23 @@ object WkHtmlPdf {
 | 
			
		||||
      ExternConv.readResult[F](blocker, chunkSize, logger)
 | 
			
		||||
 | 
			
		||||
    val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
 | 
			
		||||
 | 
			
		||||
    // html sanitizing should (among other things) remove links to invalid
 | 
			
		||||
    // protocols like cid: which are not supported by further
 | 
			
		||||
    // processing (wkhtmltopdf errors)
 | 
			
		||||
    //
 | 
			
		||||
    // Since jsoup will load everything anyway, a stream-based
 | 
			
		||||
    // conversion to java's inputstream doesn't make much sense.
 | 
			
		||||
    val inSane = Stream.evalUnChunk(
 | 
			
		||||
      Binary
 | 
			
		||||
        .loadAllBytes(in)
 | 
			
		||||
        .map(bv => sanitizeHtml(bv, charset.some))
 | 
			
		||||
        .map(bv => Chunk.byteVector(bv))
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    ExternConv
 | 
			
		||||
      .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
 | 
			
		||||
        in,
 | 
			
		||||
        inSane,
 | 
			
		||||
        handler
 | 
			
		||||
      )
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,29 @@
 | 
			
		||||
package docspell.joex.extract
 | 
			
		||||
 | 
			
		||||
import org.jsoup.Jsoup
 | 
			
		||||
import org.jsoup.nodes._
 | 
			
		||||
import emil.jsoup._
 | 
			
		||||
import scodec.bits.ByteVector
 | 
			
		||||
import java.io.ByteArrayInputStream
 | 
			
		||||
import java.nio.charset.{Charset, StandardCharsets}
 | 
			
		||||
 | 
			
		||||
/** Cleans HTML documents with jsoup by running emil's
  * `BodyClean.whitelistClean` with `EmailWhitelist.default`.
  */
object JsoupSanitizer {

  /** Applies the whitelist clean to `doc` and returns the result.
    *
    * BIG NOTE: this changes the input document.
    */
  def apply(doc: Document): Document =
    BodyClean.whitelistClean(EmailWhitelist.default)(doc)

  /** Parses the given html string, cleans it and renders the whole
    * document back into a string.
    */
  def clean(html: String): String = {
    //note: Jsoup.clean throws away the html head, which removes the
    //charset if present; therefore the parsed document is cleaned
    //instead
    val doc = Jsoup.parse(html)
    apply(doc).outerHtml
  }

  /** Parses the given html bytes, cleans the document and returns it
    * encoded with `cs`, or with UTF-8 if `cs` is empty.
    *
    * The charset of the cleaned document is set to the charset used
    * for encoding the result, so that jsoup's output settings and the
    * meta charset element of the document agree with the returned
    * bytes.
    */
  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
    val target = cs.getOrElse(StandardCharsets.UTF_8)
    val in     = new ByteArrayInputStream(html.toArray)
    // passing `null` as charset name is jsoup's way of asking it to
    // detect the input encoding itself
    val doc     = Jsoup.parse(in, cs.map(_.name).orNull, "")
    val cleaned = apply(doc)
    // Without this, the document keeps the charset jsoup found while
    // parsing (BOM or meta tag), which may differ from the encoding
    // of the returned bytes.
    cleaned.charset(target)
    ByteVector.view(cleaned.outerHtml.getBytes(target))
  }

}
 | 
			
		||||
@@ -6,17 +6,14 @@ import fs2.{Pipe, Stream}
 | 
			
		||||
import emil.{MimeType => _, _}
 | 
			
		||||
import emil.javamail.syntax._
 | 
			
		||||
import emil.tnef.TnefExtract
 | 
			
		||||
import emil.markdown._
 | 
			
		||||
import emil.jsoup.HtmlBodyView
 | 
			
		||||
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
import scodec.bits.ByteVector
 | 
			
		||||
import docspell.joex.extract.JsoupSanitizer
 | 
			
		||||
 | 
			
		||||
object ReadMail {
 | 
			
		||||
 | 
			
		||||
  def read[F[_]: Sync](str: String): F[Mail[F]] =
 | 
			
		||||
    Mail.deserialize(str)
 | 
			
		||||
 | 
			
		||||
  def readBytesP[F[_]: ConcurrentEffect: ContextShift](
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  ): Pipe[F, Byte, Binary[F]] =
 | 
			
		||||
@@ -25,17 +22,22 @@ object ReadMail {
 | 
			
		||||
  def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
 | 
			
		||||
    s =>
 | 
			
		||||
      Stream.eval(logger.debug(s"Converting e-mail file...")) >>
 | 
			
		||||
        s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
 | 
			
		||||
        s.through(Mail.readBytes[F])
 | 
			
		||||
 | 
			
		||||
  def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  )(mail: Mail[F]): Stream[F, Binary[F]] = {
 | 
			
		||||
    val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
 | 
			
		||||
      _ => (None: Option[Binary[F]]).pure[F],
 | 
			
		||||
      txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
 | 
			
		||||
      html => html.html.map(c => makeHtmlBinary(c).some),
 | 
			
		||||
      both => both.html.map(c => makeHtmlBinary(c).some)
 | 
			
		||||
    )
 | 
			
		||||
    val bodyEntry: F[Option[Binary[F]]] =
 | 
			
		||||
      if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
 | 
			
		||||
      else {
 | 
			
		||||
        val markdownCfg = MarkdownConfig.defaultConfig
 | 
			
		||||
        HtmlBodyView(
 | 
			
		||||
          mail.body,
 | 
			
		||||
          Some(mail.header),
 | 
			
		||||
          Some(MarkdownBody.makeHtml(markdownCfg)),
 | 
			
		||||
          Some(JsoupSanitizer.apply)
 | 
			
		||||
        ).map(makeHtmlBinary[F] _).map(b => Some(b))
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    Stream.eval(
 | 
			
		||||
      logger.debug(
 | 
			
		||||
@@ -53,25 +55,8 @@ object ReadMail {
 | 
			
		||||
          ))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
 | 
			
		||||
    val c = fixHtml(cnt)
 | 
			
		||||
    Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def fixHtml(cnt: BodyContent): BodyContent = {
 | 
			
		||||
    val str  = cnt.asString.trim.toLowerCase
 | 
			
		||||
    val head = htmlHeader(cnt.charsetOrUtf8)
 | 
			
		||||
    if (str.startsWith("<html")) cnt
 | 
			
		||||
    else
 | 
			
		||||
      cnt match {
 | 
			
		||||
        case BodyContent.StringContent(s) =>
 | 
			
		||||
          BodyContent(head + s + htmlHeaderEnd)
 | 
			
		||||
        case BodyContent.ByteContent(bv, cs) =>
 | 
			
		||||
          val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8))
 | 
			
		||||
          val end   = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8))
 | 
			
		||||
          BodyContent(begin ++ bv ++ end, cs)
 | 
			
		||||
      }
 | 
			
		||||
  }
 | 
			
		||||
  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
 | 
			
		||||
    Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)
 | 
			
		||||
 | 
			
		||||
  implicit class MimeTypeConv(m: emil.MimeType) {
 | 
			
		||||
    def toDocspell: MimeType =
 | 
			
		||||
@@ -85,16 +70,4 @@ object ReadMail {
 | 
			
		||||
      _ => "html-body",
 | 
			
		||||
      _ => "text-and-html-body"
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
  private def htmlHeader(cs: Charset): String =
 | 
			
		||||
    s"""<!DOCTYPE html>
 | 
			
		||||
       |<html>
 | 
			
		||||
       |<head>
 | 
			
		||||
       |<meta charset="${cs.name}"/>
 | 
			
		||||
       |</head>
 | 
			
		||||
       |<body>
 | 
			
		||||
       """
 | 
			
		||||
 | 
			
		||||
  private def htmlHeaderEnd: String =
 | 
			
		||||
    "</body></html>"
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -12,6 +12,8 @@ import docspell.convert._
 | 
			
		||||
import docspell.joex.scheduler._
 | 
			
		||||
import docspell.store.records._
 | 
			
		||||
import docspell.convert.ConversionResult.Handler
 | 
			
		||||
import docspell.convert.SanitizeHtml
 | 
			
		||||
import docspell.joex.extract.JsoupSanitizer
 | 
			
		||||
 | 
			
		||||
/** Goes through all attachments and creates a PDF version of it where
 | 
			
		||||
  * supported.
 | 
			
		||||
@@ -35,7 +37,9 @@ object ConvertPdf {
 | 
			
		||||
  ): Task[F, ProcessItemArgs, ItemData] =
 | 
			
		||||
    Task { ctx =>
 | 
			
		||||
      def convert(ra: RAttachment) =
 | 
			
		||||
        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
 | 
			
		||||
        findMime(ctx)(ra).flatMap(m =>
 | 
			
		||||
          convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
      for {
 | 
			
		||||
        ras <- item.attachments.traverse(convert)
 | 
			
		||||
@@ -52,10 +56,11 @@ object ConvertPdf {
 | 
			
		||||
 | 
			
		||||
  def convertSafe[F[_]: Sync: ContextShift](
 | 
			
		||||
      cfg: ConvertConfig,
 | 
			
		||||
      sanitizeHtml: SanitizeHtml,
 | 
			
		||||
      ctx: Context[F, ProcessItemArgs],
 | 
			
		||||
      item: ItemData
 | 
			
		||||
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
 | 
			
		||||
    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
 | 
			
		||||
    Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
 | 
			
		||||
      mime match {
 | 
			
		||||
        case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
 | 
			
		||||
          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user