Fix several bugs with handling e-mail files

- When converting from html->pdf, the wkhtmltopdf program exits with
  errors if the document contains invalid links. The content is now
  cleaned before being handed to wkhtmltopdf.
- Update emil library which fixes a bug when reading mails without
  explicit transfer encoding (8bit)
- Add an info header to converted mails
This commit is contained in:
Eike Kettner 2020-04-07 22:05:24 +02:00
parent 12672938a0
commit 1206105f0b
9 changed files with 115 additions and 52 deletions

View File

@ -295,6 +295,9 @@ val joex = project.in(file("modules/joex")).
Dependencies.circe ++
Dependencies.pureconfig ++
Dependencies.emilTnef ++
Dependencies.emilMarkdown ++
Dependencies.emilJsoup ++
Dependencies.jsoup ++
Dependencies.loggingApi ++
Dependencies.logging.map(_ % Runtime),
addCompilerPlugin(Dependencies.kindProjectorPlugin),

View File

@ -1,5 +1,6 @@
package docspell.common
import cats.effect._
import fs2.{Chunk, Pipe, Stream}
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
@ -42,6 +43,9 @@ object Binary {
util.decode[F](cs)
}
/** Drains the given byte stream and concatenates all of its chunks,
  * in order, into a single `ByteVector` (starting from an empty one).
  */
def loadAllBytes[F[_]: Sync](data: Stream[F, Byte]): F[ByteVector] =
  data.chunks.compile
    .fold(ByteVector.empty)((collected, chunk) => collected ++ chunk.toByteVector)
// This is a copy from org.http4s.util
// Http4s is licensed under the Apache License 2.0
private object util {
@ -85,5 +89,6 @@ object Binary {
if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
chunk.drop(3)
} else chunk
}
}

View File

@ -23,6 +23,7 @@ object Conversion {
def create[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
): Resource[F, Conversion[F]] =
@ -38,7 +39,10 @@ object Conversion {
case mt @ MimeType(_, "html", _) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
in,
handler
)
case mt @ Texts(_) =>
val cs = mt.charsetOrUtf8
@ -50,6 +54,7 @@ object Conversion {
cfg.wkhtmlpdf,
cfg.chunkSize,
StandardCharsets.UTF_8,
sanitizeHtml,
blocker,
logger
)(bytes, handler)

View File

@ -0,0 +1,16 @@
package docspell.convert
import scodec.bits.ByteVector
import java.nio.charset.Charset
/** A function for cleaning html content that is given as raw bytes.
*
* It has a single abstract method, so a plain function value can be
* supplied wherever a `SanitizeHtml` is expected.
*/
@FunctionalInterface
trait SanitizeHtml {
/** The given `bytes` are html which can be modified to strip out
* unwanted content.
*
* The result should use the same character encoding as the given
* charset implies, or utf8 if not specified.
*
* @param bytes the html content to sanitize
* @param charset the character encoding of `bytes`, if known
* @return the sanitized html, encoded as described above
*/
def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
}

View File

@ -3,9 +3,10 @@ package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Stream
import cats.implicits._
import fs2.{Chunk, Stream}
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.{ConversionResult, SanitizeHtml}
import docspell.convert.ConversionResult.Handler
import java.nio.charset.Charset
@ -15,6 +16,7 @@ object WkHtmlPdf {
cfg: WkHtmlPdfConfig,
chunkSize: Int,
charset: Charset,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
@ -22,9 +24,23 @@ object WkHtmlPdf {
ExternConv.readResult[F](blocker, chunkSize, logger)
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
// Sanitizing the html should (among other things) remove links
// using unsupported protocols like cid:, which further processing
// cannot handle (wkhtmltopdf exits with errors).
//
// Since jsoup will load everything anyway, a stream-based
// conversion to java's inputstream doesn't make much sense.
val inSane = Stream.evalUnChunk(
Binary
.loadAllBytes(in)
.map(bv => sanitizeHtml(bv, charset.some))
.map(bv => Chunk.byteVector(bv))
)
ExternConv
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
in,
inSane,
handler
)
}

View File

@ -0,0 +1,29 @@
package docspell.joex.extract
import org.jsoup.Jsoup
import org.jsoup.nodes._
import emil.jsoup._
import scodec.bits.ByteVector
import java.io.ByteArrayInputStream
import java.nio.charset.{Charset, StandardCharsets}
/** Html cleaning helpers based on jsoup. All variants run the
  * whitelist cleaning with `EmailWhitelist.default`.
  */
object JsoupSanitizer {

  /** Cleans the given document using `EmailWhitelist.default`.
    *
    * BIG NOTE: this changes the input document
    */
  def apply(doc: Document): Document =
    BodyClean.whitelistClean(EmailWhitelist.default)(doc)

  /** Parses the html string, cleans the resulting document and
    * returns its outer html.
    */
  def clean(html: String): String = {
    // Jsoup.clean is not used here: it throws away the html head,
    // which removes the charset if present
    val parsed  = Jsoup.parse(html)
    val cleaned = apply(parsed)
    cleaned.outerHtml
  }

  /** Parses the html bytes, cleans the resulting document and returns
    * its outer html as bytes.
    *
    * @param html the html content
    * @param cs the charset handed to jsoup for parsing (a `null`
    *   charset name is passed when empty); the result is encoded with
    *   this charset, or with UTF-8 when empty
    */
  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
    val parseCharsetName = cs.map(_.name).orNull
    val outputCharset    = cs.getOrElse(StandardCharsets.UTF_8)

    val parsed      = Jsoup.parse(new ByteArrayInputStream(html.toArray), parseCharsetName, "")
    val cleanedHtml = apply(parsed).outerHtml
    ByteVector.view(cleanedHtml.getBytes(outputCharset))
  }
}

View File

@ -6,17 +6,14 @@ import fs2.{Pipe, Stream}
import emil.{MimeType => _, _}
import emil.javamail.syntax._
import emil.tnef.TnefExtract
import emil.markdown._
import emil.jsoup.HtmlBodyView
import docspell.common._
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
import scodec.bits.ByteVector
import docspell.joex.extract.JsoupSanitizer
object ReadMail {
def read[F[_]: Sync](str: String): F[Mail[F]] =
Mail.deserialize(str)
def readBytesP[F[_]: ConcurrentEffect: ContextShift](
logger: Logger[F]
): Pipe[F, Byte, Binary[F]] =
@ -25,17 +22,22 @@ object ReadMail {
def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
s =>
Stream.eval(logger.debug(s"Converting e-mail file...")) >>
s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
s.through(Mail.readBytes[F])
def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
logger: Logger[F]
)(mail: Mail[F]): Stream[F, Binary[F]] = {
val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
_ => (None: Option[Binary[F]]).pure[F],
txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
html => html.html.map(c => makeHtmlBinary(c).some),
both => both.html.map(c => makeHtmlBinary(c).some)
)
val bodyEntry: F[Option[Binary[F]]] =
if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
else {
val markdownCfg = MarkdownConfig.defaultConfig
HtmlBodyView(
mail.body,
Some(mail.header),
Some(MarkdownBody.makeHtml(markdownCfg)),
Some(JsoupSanitizer.apply)
).map(makeHtmlBinary[F] _).map(b => Some(b))
}
Stream.eval(
logger.debug(
@ -53,25 +55,8 @@ object ReadMail {
))
}
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
val c = fixHtml(cnt)
Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8)
}
private def fixHtml(cnt: BodyContent): BodyContent = {
val str = cnt.asString.trim.toLowerCase
val head = htmlHeader(cnt.charsetOrUtf8)
if (str.startsWith("<html")) cnt
else
cnt match {
case BodyContent.StringContent(s) =>
BodyContent(head + s + htmlHeaderEnd)
case BodyContent.ByteContent(bv, cs) =>
val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8))
val end = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8))
BodyContent(begin ++ bv ++ end, cs)
}
}
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)
implicit class MimeTypeConv(m: emil.MimeType) {
def toDocspell: MimeType =
@ -85,16 +70,4 @@ object ReadMail {
_ => "html-body",
_ => "text-and-html-body"
)
private def htmlHeader(cs: Charset): String =
s"""<!DOCTYPE html>
|<html>
|<head>
|<meta charset="${cs.name}"/>
|</head>
|<body>
"""
private def htmlHeaderEnd: String =
"</body></html>"
}

View File

@ -12,6 +12,8 @@ import docspell.convert._
import docspell.joex.scheduler._
import docspell.store.records._
import docspell.convert.ConversionResult.Handler
import docspell.convert.SanitizeHtml
import docspell.joex.extract.JsoupSanitizer
/** Goes through all attachments and creates a PDF version of it where
* supported.
@ -35,7 +37,9 @@ object ConvertPdf {
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
def convert(ra: RAttachment) =
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
findMime(ctx)(ra).flatMap(m =>
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
)
for {
ras <- item.attachments.traverse(convert)
@ -52,10 +56,11 @@ object ConvertPdf {
def convertSafe[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml,
ctx: Context[F, ProcessItemArgs],
item: ItemData
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
mime match {
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>

View File

@ -6,11 +6,11 @@ object Dependencies {
val BcryptVersion = "0.4"
val BetterMonadicForVersion = "0.3.1"
val BitpeaceVersion = "0.4.5"
val BitpeaceVersion = "0.5.0"
val CalevVersion = "0.3.0"
val CirceVersion = "0.13.0"
val DoobieVersion = "0.9.0"
val EmilVersion = "0.4.0"
val EmilVersion = "0.5.0"
val FastparseVersion = "2.1.3"
val FlexmarkVersion = "0.61.0"
val FlywayVersion = "6.3.3"
@ -18,6 +18,7 @@ object Dependencies {
val H2Version = "1.4.200"
val Http4sVersion = "0.21.3"
val Icu4jVersion = "66.1"
val JsoupVersion = "1.13.1"
val KindProjectorVersion = "0.10.3"
val Log4sVersion = "1.8.2"
val LogbackVersion = "1.2.3"
@ -95,6 +96,16 @@ object Dependencies {
val emilTnef = Seq(
"com.github.eikek" %% "emil-tnef" % EmilVersion,
)
// The emil-markdown artifact, versioned together with the other
// emil modules via EmilVersion.
val emilMarkdown = Seq(
"com.github.eikek" %% "emil-markdown" % EmilVersion,
)
// The emil-jsoup artifact, versioned together with the other emil
// modules via EmilVersion.
val emilJsoup = Seq(
"com.github.eikek" %% "emil-jsoup" % EmilVersion,
)
// The jsoup library itself (a plain Java artifact, hence the single
// `%`), pinned to JsoupVersion.
val jsoup = Seq(
"org.jsoup" % "jsoup" % JsoupVersion
)
val stanfordNlpCore = Seq(
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(