Fix several bugs with handling e-mail files

- When converting from html->pdf, the wkhtmltopdf program exits with
  errors if the document contains invalid links. The content is now
  cleaned before being handed to wkhtmltopdf.
- Update the emil library, which fixes a bug when reading mails without
  an explicit transfer encoding (8bit)
- Add an info header to converted mails
This commit is contained in:
Eike Kettner
2020-04-07 22:05:24 +02:00
parent 12672938a0
commit 1206105f0b
9 changed files with 115 additions and 52 deletions

View File

@ -23,6 +23,7 @@ object Conversion {
def create[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
): Resource[F, Conversion[F]] =
@ -38,7 +39,10 @@ object Conversion {
case mt @ MimeType(_, "html", _) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
in,
handler
)
case mt @ Texts(_) =>
val cs = mt.charsetOrUtf8
@ -50,6 +54,7 @@ object Conversion {
cfg.wkhtmlpdf,
cfg.chunkSize,
StandardCharsets.UTF_8,
sanitizeHtml,
blocker,
logger
)(bytes, handler)

View File

@ -0,0 +1,16 @@
package docspell.convert
import scodec.bits.ByteVector
import java.nio.charset.Charset
/** A function for cleaning html content before it is processed
 * further.
 *
 * The trait declares exactly one abstract method, `apply`.
 */
@FunctionalInterface
trait SanitizeHtml {
/** The given `bytes` are html which can be modified to strip out
 * unwanted content.
 *
 * The result should use the same character encoding as the given
 * charset implies, or utf8 if not specified.
 *
 * @param bytes the html document as raw bytes
 * @param charset the character encoding of `bytes`; if `None`,
 *   utf8 is to be used
 * @return the (possibly modified) html as bytes
 */
def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
}

View File

@ -3,9 +3,10 @@ package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Stream
import cats.implicits._
import fs2.{Chunk, Stream}
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.{ConversionResult, SanitizeHtml}
import docspell.convert.ConversionResult.Handler
import java.nio.charset.Charset
@ -15,6 +16,7 @@ object WkHtmlPdf {
cfg: WkHtmlPdfConfig,
chunkSize: Int,
charset: Charset,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
@ -22,9 +24,23 @@ object WkHtmlPdf {
ExternConv.readResult[F](blocker, chunkSize, logger)
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
// Sanitizing the html should (among other things) remove links
// with invalid protocols like cid:, which are not supported by
// further processing (wkhtmltopdf exits with errors).
//
// Since jsoup will load everything anyway, a stream-based
// conversion to Java's InputStream doesn't make much sense.
val inSane = Stream.evalUnChunk(
Binary
.loadAllBytes(in)
.map(bv => sanitizeHtml(bv, charset.some))
.map(bv => Chunk.byteVector(bv))
)
ExternConv
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
in,
inSane,
handler
)
}