From 1206105f0bcbdd38fbc9b58ba604d916599cbdce Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 7 Apr 2020 22:05:24 +0200 Subject: [PATCH] Fix several bugs with handling e-mail files - When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf. - Update the emil library, which fixes a bug when reading mails without explicit transfer encoding (8bit) - Add an info header to converted mails --- build.sbt | 3 + .../main/scala/docspell/common/Binary.scala | 5 ++ .../scala/docspell/convert/Conversion.scala | 7 ++- .../scala/docspell/convert/SanitizeHtml.scala | 16 +++++ .../docspell/convert/extern/WkHtmlPdf.scala | 22 ++++++- .../joex/extract/JsoupSanitizer.scala | 29 +++++++++ .../scala/docspell/joex/mail/ReadMail.scala | 61 ++++++------------- .../docspell/joex/process/ConvertPdf.scala | 9 ++- project/Dependencies.scala | 15 ++++- 9 files changed, 115 insertions(+), 52 deletions(-) create mode 100644 modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala diff --git a/build.sbt b/build.sbt index aa8eea03..6bb5c1c6 100644 --- a/build.sbt +++ b/build.sbt @@ -295,6 +295,9 @@ val joex = project.in(file("modules/joex")). 
Dependencies.circe ++ Dependencies.pureconfig ++ Dependencies.emilTnef ++ + Dependencies.emilMarkdown ++ + Dependencies.emilJsoup ++ + Dependencies.jsoup ++ Dependencies.loggingApi ++ Dependencies.logging.map(_ % Runtime), addCompilerPlugin(Dependencies.kindProjectorPlugin), diff --git a/modules/common/src/main/scala/docspell/common/Binary.scala b/modules/common/src/main/scala/docspell/common/Binary.scala index 88bcd99a..14237a1c 100644 --- a/modules/common/src/main/scala/docspell/common/Binary.scala +++ b/modules/common/src/main/scala/docspell/common/Binary.scala @@ -1,5 +1,6 @@ package docspell.common +import cats.effect._ import fs2.{Chunk, Pipe, Stream} import java.nio.charset.Charset import java.nio.charset.StandardCharsets @@ -42,6 +43,9 @@ object Binary { util.decode[F](cs) } + def loadAllBytes[F[_]: Sync](data: Stream[F, Byte]): F[ByteVector] = + data.chunks.map(_.toByteVector).compile.fold(ByteVector.empty)((r, e) => r ++ e) + // This is a copy from org.http4s.util // Http4s is licensed under the Apache License 2.0 private object util { @@ -85,5 +89,6 @@ object Binary { if (chunk.size >= 3 && chunk.take(3) == utf8Bom) { chunk.drop(3) } else chunk + } } diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 85e30c78..1b53513c 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -23,6 +23,7 @@ object Conversion { def create[F[_]: Sync: ContextShift]( cfg: ConvertConfig, + sanitizeHtml: SanitizeHtml, blocker: Blocker, logger: Logger[F] ): Resource[F, Conversion[F]] = @@ -38,7 +39,10 @@ object Conversion { case mt @ MimeType(_, "html", _) => val cs = mt.charsetOrUtf8 WkHtmlPdf - .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler) + .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)( + in, + handler + ) case mt @ Texts(_) => val 
cs = mt.charsetOrUtf8 @@ -50,6 +54,7 @@ object Conversion { cfg.wkhtmlpdf, cfg.chunkSize, StandardCharsets.UTF_8, + sanitizeHtml, blocker, logger )(bytes, handler) diff --git a/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala new file mode 100644 index 00000000..e119d410 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/SanitizeHtml.scala @@ -0,0 +1,16 @@ +package docspell.convert +import scodec.bits.ByteVector +import java.nio.charset.Charset + +@FunctionalInterface +trait SanitizeHtml { + + /** The given `bytes' are html which can be modified to strip out + * unwanted content. + * + * The result should use the same character encoding as the given + * charset implies, or utf8 if not specified. + */ + def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 8199191e..41c88040 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -3,9 +3,10 @@ package docspell.convert.extern import java.nio.file.Path import cats.effect._ -import fs2.Stream +import cats.implicits._ +import fs2.{Chunk, Stream} import docspell.common._ -import docspell.convert.ConversionResult +import docspell.convert.{ConversionResult, SanitizeHtml} import docspell.convert.ConversionResult.Handler import java.nio.charset.Charset @@ -15,6 +16,7 @@ object WkHtmlPdf { cfg: WkHtmlPdfConfig, chunkSize: Int, charset: Charset, + sanitizeHtml: SanitizeHtml, blocker: Blocker, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { @@ -22,9 +24,23 @@ object WkHtmlPdf { ExternConv.readResult[F](blocker, chunkSize, logger) val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) + + // html 
sanitize should (among other) remove links to invalid + // protocols like cid: which is not supported by further + // processing (wkhtmltopdf errors) + // + // Since jsoup will load everything anyways, a stream-based + // conversion to java's inputstream doesn't make much sense. + val inSane = Stream.evalUnChunk( + Binary + .loadAllBytes(in) + .map(bv => sanitizeHtml(bv, charset.some)) + .map(bv => Chunk.byteVector(bv)) + ) + ExternConv .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)( - in, + inSane, handler ) } diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala new file mode 100644 index 00000000..dbc1ba66 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala @@ -0,0 +1,29 @@ +package docspell.joex.extract + +import org.jsoup.Jsoup +import org.jsoup.nodes._ +import emil.jsoup._ +import scodec.bits.ByteVector +import java.io.ByteArrayInputStream +import java.nio.charset.{Charset, StandardCharsets} + +object JsoupSanitizer { + + //BIG NOTE: this changes the input document + def apply(doc: Document): Document = + BodyClean.whitelistClean(EmailWhitelist.default)(doc) + + def clean(html: String): String = { + //note: Jsoup.clean throws away the html head, which removes the + //charset if present + val doc = Jsoup.parse(html) + apply(doc).outerHtml + } + + def clean(html: ByteVector, cs: Option[Charset]): ByteVector = { + val in = new ByteArrayInputStream(html.toArray) + val doc = Jsoup.parse(in, cs.map(_.name).orNull, "") + ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8))) + } + +} diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 4528fa0a..c5b119d0 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ 
b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -6,17 +6,14 @@ import fs2.{Pipe, Stream} import emil.{MimeType => _, _} import emil.javamail.syntax._ import emil.tnef.TnefExtract +import emil.markdown._ +import emil.jsoup.HtmlBodyView import docspell.common._ -import java.nio.charset.StandardCharsets -import java.nio.charset.Charset -import scodec.bits.ByteVector +import docspell.joex.extract.JsoupSanitizer object ReadMail { - def read[F[_]: Sync](str: String): F[Mail[F]] = - Mail.deserialize(str) - def readBytesP[F[_]: ConcurrentEffect: ContextShift]( logger: Logger[F] ): Pipe[F, Byte, Binary[F]] = @@ -25,17 +22,22 @@ object ReadMail { def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] = s => Stream.eval(logger.debug(s"Converting e-mail file...")) >> - s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F]) + s.through(Mail.readBytes[F]) def mailToEntries[F[_]: ConcurrentEffect: ContextShift]( logger: Logger[F] )(mail: Mail[F]): Stream[F, Binary[F]] = { - val bodyEntry: F[Option[Binary[F]]] = mail.body.fold( - _ => (None: Option[Binary[F]]).pure[F], - txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some), - html => html.html.map(c => makeHtmlBinary(c).some), - both => both.html.map(c => makeHtmlBinary(c).some) - ) + val bodyEntry: F[Option[Binary[F]]] = + if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F] + else { + val markdownCfg = MarkdownConfig.defaultConfig + HtmlBodyView( + mail.body, + Some(mail.header), + Some(MarkdownBody.makeHtml(markdownCfg)), + Some(JsoupSanitizer.apply) + ).map(makeHtmlBinary[F] _).map(b => Some(b)) + } Stream.eval( logger.debug( @@ -53,25 +55,8 @@ object ReadMail { )) } - private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = { - val c = fixHtml(cnt) - Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8) - } - - private def fixHtml(cnt: BodyContent): BodyContent = { - val str = cnt.asString.trim.toLowerCase - val head = 
htmlHeader(cnt.charsetOrUtf8) - if (str.startsWith(" - BodyContent(head + s + htmlHeaderEnd) - case BodyContent.ByteContent(bv, cs) => - val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8)) - val end = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8)) - BodyContent(begin ++ bv ++ end, cs) - } - } + private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = + Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8) implicit class MimeTypeConv(m: emil.MimeType) { def toDocspell: MimeType = @@ -85,16 +70,4 @@ object ReadMail { _ => "html-body", _ => "text-and-html-body" ) - - private def htmlHeader(cs: Charset): String = - s""" - | - | - | - | - | - """ - - private def htmlHeaderEnd: String = - "" } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index f49a4d80..b3a93260 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -12,6 +12,8 @@ import docspell.convert._ import docspell.joex.scheduler._ import docspell.store.records._ import docspell.convert.ConversionResult.Handler +import docspell.convert.SanitizeHtml +import docspell.joex.extract.JsoupSanitizer /** Goes through all attachments and creates a PDF version of it where * supported. 
@@ -35,7 +37,9 @@ object ConvertPdf { ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => def convert(ra: RAttachment) = - findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m)) + findMime(ctx)(ra).flatMap(m => + convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m) + ) for { ras <- item.attachments.traverse(convert) @@ -52,10 +56,11 @@ object ConvertPdf { def convertSafe[F[_]: Sync: ContextShift]( cfg: ConvertConfig, + sanitizeHtml: SanitizeHtml, ctx: Context[F, ProcessItemArgs], item: ItemData )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = - Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv => + Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv => mime match { case mt if mt.baseEqual(Mimetype.`application/pdf`) => ctx.logger.info("Not going to convert a PDF file into a PDF.") *> diff --git a/project/Dependencies.scala b/project/Dependencies.scala index c12926bb..744bb9d4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -6,11 +6,11 @@ object Dependencies { val BcryptVersion = "0.4" val BetterMonadicForVersion = "0.3.1" - val BitpeaceVersion = "0.4.5" + val BitpeaceVersion = "0.5.0" val CalevVersion = "0.3.0" val CirceVersion = "0.13.0" val DoobieVersion = "0.9.0" - val EmilVersion = "0.4.0" + val EmilVersion = "0.5.0" val FastparseVersion = "2.1.3" val FlexmarkVersion = "0.61.0" val FlywayVersion = "6.3.3" @@ -18,6 +18,7 @@ object Dependencies { val H2Version = "1.4.200" val Http4sVersion = "0.21.3" val Icu4jVersion = "66.1" + val JsoupVersion = "1.13.1" val KindProjectorVersion = "0.10.3" val Log4sVersion = "1.8.2" val LogbackVersion = "1.2.3" @@ -95,6 +96,16 @@ object Dependencies { val emilTnef = Seq( "com.github.eikek" %% "emil-tnef" % EmilVersion, ) + val emilMarkdown = Seq( + "com.github.eikek" %% "emil-markdown" % EmilVersion, + ) + val emilJsoup = Seq( + "com.github.eikek" %% "emil-jsoup" % EmilVersion, + ) + + val jsoup = Seq( + "org.jsoup" % 
"jsoup" % JsoupVersion + ) val stanfordNlpCore = Seq( "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(