mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Fix several bugs with handling e-mail files
- When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf. - Update the emil library, which fixes a bug when reading mails without an explicit transfer encoding (8bit). - Add an info header to converted mails.
This commit is contained in:
@ -0,0 +1,29 @@
|
||||
package docspell.joex.extract
|
||||
|
||||
import org.jsoup.Jsoup
|
||||
import org.jsoup.nodes._
|
||||
import emil.jsoup._
|
||||
import scodec.bits.ByteVector
|
||||
import java.io.ByteArrayInputStream
|
||||
import java.nio.charset.{Charset, StandardCharsets}
|
||||
|
||||
/** Sanitizes HTML with jsoup, using emil's `BodyClean.whitelistClean`
  * together with `EmailWhitelist.default`.
  */
object JsoupSanitizer {

  /** Applies the whitelist cleaner to the given document.
    *
    * BIG NOTE: this changes the input document
    *
    * @param doc the parsed document to clean
    * @return the cleaned document
    */
  def apply(doc: Document): Document =
    BodyClean.whitelistClean(EmailWhitelist.default)(doc)

  /** Parses the given HTML string, cleans it and renders it back to a string.
    *
    * @param html the HTML markup to sanitize
    * @return the sanitized markup as produced by `outerHtml`
    */
  def clean(html: String): String = {
    //note: Jsoup.clean throws away the html head, which removes the
    //charset if present
    val doc = Jsoup.parse(html)
    apply(doc).outerHtml
  }

  /** Parses the given HTML bytes, cleans the document and serializes it again.
    *
    * The result is always encoded with `cs`, falling back to UTF-8 when `cs`
    * is empty.
    *
    * @param html the raw HTML bytes
    * @param cs   the charset of `html`; when empty, `null` is handed to
    *             `Jsoup.parse` as charset name (jsoup then determines the
    *             charset itself, per its documentation)
    * @return the sanitized HTML encoded with `cs`, or UTF-8 if `cs` is empty
    */
  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
    val target  = cs.getOrElse(StandardCharsets.UTF_8)
    val in      = new ByteArrayInputStream(html.toArray)
    val doc     = Jsoup.parse(in, cs.map(_.name).orNull, "")
    val cleaned = apply(doc)
    // The charset jsoup used for parsing (possibly detected from the markup)
    // can differ from the one used for encoding below. Align the document
    // with the output encoding so that its charset declaration and its
    // entity escaping match the bytes that are actually returned.
    cleaned.charset(target)
    ByteVector.view(cleaned.outerHtml.getBytes(target))
  }

}
|
@ -6,17 +6,14 @@ import fs2.{Pipe, Stream}
|
||||
import emil.{MimeType => _, _}
|
||||
import emil.javamail.syntax._
|
||||
import emil.tnef.TnefExtract
|
||||
import emil.markdown._
|
||||
import emil.jsoup.HtmlBodyView
|
||||
|
||||
import docspell.common._
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.charset.Charset
|
||||
import scodec.bits.ByteVector
|
||||
import docspell.joex.extract.JsoupSanitizer
|
||||
|
||||
object ReadMail {
|
||||
|
||||
/** Turns the given string into a mail by delegating to `Mail.deserialize`.
  *
  * @param str the serialized mail (presumably raw e-mail text; confirm
  *            against `Mail.deserialize`)
  * @return the deserialized mail inside the effect `F`
  */
def read[F[_]: Sync](str: String): F[Mail[F]] = {
  val decoded: F[Mail[F]] = Mail.deserialize(str)
  decoded
}
|
||||
|
||||
def readBytesP[F[_]: ConcurrentEffect: ContextShift](
|
||||
logger: Logger[F]
|
||||
): Pipe[F, Byte, Binary[F]] =
|
||||
@ -25,17 +22,22 @@ object ReadMail {
|
||||
def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] =
|
||||
s =>
|
||||
Stream.eval(logger.debug(s"Converting e-mail file...")) >>
|
||||
s.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
|
||||
s.through(Mail.readBytes[F])
|
||||
|
||||
def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
|
||||
logger: Logger[F]
|
||||
)(mail: Mail[F]): Stream[F, Binary[F]] = {
|
||||
val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
|
||||
_ => (None: Option[Binary[F]]).pure[F],
|
||||
txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
|
||||
html => html.html.map(c => makeHtmlBinary(c).some),
|
||||
both => both.html.map(c => makeHtmlBinary(c).some)
|
||||
)
|
||||
val bodyEntry: F[Option[Binary[F]]] =
|
||||
if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F]
|
||||
else {
|
||||
val markdownCfg = MarkdownConfig.defaultConfig
|
||||
HtmlBodyView(
|
||||
mail.body,
|
||||
Some(mail.header),
|
||||
Some(MarkdownBody.makeHtml(markdownCfg)),
|
||||
Some(JsoupSanitizer.apply)
|
||||
).map(makeHtmlBinary[F] _).map(b => Some(b))
|
||||
}
|
||||
|
||||
Stream.eval(
|
||||
logger.debug(
|
||||
@ -53,25 +55,8 @@ object ReadMail {
|
||||
))
|
||||
}
|
||||
|
||||
/** Creates the `mail.html` binary for a mail body.
  *
  * The content is first passed through `fixHtml`; its bytes and charset
  * (UTF-8 if none is set) are then handed to `Binary.html`.
  */
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
  val fixed   = fixHtml(cnt)
  val payload = fixed.bytes
  val charset = fixed.charsetOrUtf8
  Binary.html[F]("mail.html", payload, charset)
}
|
||||
|
||||
/** Makes sure the given body content is a complete HTML document.
  *
  * Content that already starts with an `<html` tag or an HTML doctype
  * declaration is returned unchanged. Anything else is treated as a fragment
  * and wrapped between `htmlHeader` and `htmlHeaderEnd`.
  *
  * @param cnt the html body content of a mail
  * @return `cnt` itself if it is already a complete document, otherwise the
  *         wrapped content
  */
private def fixHtml(cnt: BodyContent): BodyContent = {
  // Locale.ROOT keeps the tag detection independent of the JVM default
  // locale (e.g. "<HTML" would not lower-case to "<html" under a Turkish
  // locale).
  val start = cnt.asString.trim.toLowerCase(java.util.Locale.ROOT)
  // A document may begin with a doctype declaration instead of the html tag;
  // wrapping it again would nest a whole document inside a new body.
  val isCompleteDocument =
    start.startsWith("<html") || start.startsWith("<!doctype html")

  if (isCompleteDocument) cnt
  else {
    val charset = cnt.charsetOrUtf8
    val head    = htmlHeader(charset)
    cnt match {
      case BodyContent.StringContent(s) =>
        BodyContent(head + s + htmlHeaderEnd)
      case BodyContent.ByteContent(bv, cs) =>
        // encode the wrapper with the same charset as the wrapped bytes
        val begin = ByteVector.view(head.getBytes(charset))
        val end   = ByteVector.view(htmlHeaderEnd.getBytes(charset))
        BodyContent(begin ++ bv ++ end, cs)
    }
  }
}
|
||||
/** Creates the `mail.html` binary from the given body content, using its
  * bytes and its charset (UTF-8 if none is set).
  */
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
  val payload = cnt.bytes
  val charset = cnt.charsetOrUtf8
  Binary.html[F]("mail.html", payload, charset)
}
|
||||
|
||||
implicit class MimeTypeConv(m: emil.MimeType) {
|
||||
def toDocspell: MimeType =
|
||||
@ -85,16 +70,4 @@ object ReadMail {
|
||||
_ => "html-body",
|
||||
_ => "text-and-html-body"
|
||||
)
|
||||
|
||||
/** Opening part of an HTML document, up to and including the `<body>` tag.
  *
  * @param cs the charset announced in the `<meta charset>` element
  * @return the header markup, one element per line, ending with a newline
  */
private def htmlHeader(cs: Charset): String =
  // stripMargin is required: without it the `|` margin characters and the
  // indentation in front of them become part of the generated markup.
  s"""<!DOCTYPE html>
     |<html>
     |<head>
     |<meta charset="${cs.name}"/>
     |</head>
     |<body>
     |""".stripMargin
|
||||
|
||||
/** Closing tags for the `body` and `html` elements. */
private def htmlHeaderEnd: String = {
  val closingTags = "</body></html>"
  closingTags
}
|
||||
}
|
||||
|
@ -12,6 +12,8 @@ import docspell.convert._
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.SanitizeHtml
|
||||
import docspell.joex.extract.JsoupSanitizer
|
||||
|
||||
/** Goes through all attachments and creates a PDF version of it where
|
||||
* supported.
|
||||
@ -35,7 +37,9 @@ object ConvertPdf {
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
def convert(ra: RAttachment) =
|
||||
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
|
||||
findMime(ctx)(ra).flatMap(m =>
|
||||
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
|
||||
)
|
||||
|
||||
for {
|
||||
ras <- item.attachments.traverse(convert)
|
||||
@ -52,10 +56,11 @@ object ConvertPdf {
|
||||
|
||||
def convertSafe[F[_]: Sync: ContextShift](
|
||||
cfg: ConvertConfig,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
item: ItemData
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
|
||||
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
|
||||
mime match {
|
||||
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
|
||||
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
|
||||
|
Reference in New Issue
Block a user