mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Fix several bugs with handling e-mail files
- When converting from html->pdf, the wkhtmltopdf program exits with errors if the document contains invalid links. The content is now cleaned before being handed to wkhtmltopdf.
- Update the emil library, which fixes a bug when reading mails without an explicit transfer encoding (8bit).
- Add an info header to converted mails.
This commit is contained in:
@ -23,6 +23,7 @@ object Conversion {
|
||||
|
||||
def create[F[_]: Sync: ContextShift](
|
||||
cfg: ConvertConfig,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Resource[F, Conversion[F]] =
|
||||
@ -38,7 +39,10 @@ object Conversion {
|
||||
case mt @ MimeType(_, "html", _) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
|
||||
case mt @ Texts(_) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
@ -50,6 +54,7 @@ object Conversion {
|
||||
cfg.wkhtmlpdf,
|
||||
cfg.chunkSize,
|
||||
StandardCharsets.UTF_8,
|
||||
sanitizeHtml,
|
||||
blocker,
|
||||
logger
|
||||
)(bytes, handler)
|
||||
|
@ -0,0 +1,16 @@
|
||||
package docspell.convert
|
||||
import scodec.bits.ByteVector
|
||||
import java.nio.charset.Charset
|
||||
|
||||
@FunctionalInterface
|
||||
trait SanitizeHtml {
|
||||
|
||||
/** Cleans html content by stripping out unwanted parts.
|
||||
*
|
||||
* The given `bytes` are html which can be modified to strip out
|
||||
* unwanted content.
|
||||
*
|
||||
* The result should use the same character encoding as the given
|
||||
* charset implies, or utf8 if not specified.
|
||||
*
|
||||
* @param bytes the html document as raw bytes
|
||||
* @param charset the character encoding of `bytes`; utf8 applies if not specified
|
||||
* @return the (possibly modified) html, encoded as described above
|
||||
*/
|
||||
def apply(bytes: ByteVector, charset: Option[Charset]): ByteVector
|
||||
|
||||
}
|
@ -3,9 +3,10 @@ package docspell.convert.extern
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
import cats.implicits._
|
||||
import fs2.{Chunk, Stream}
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.{ConversionResult, SanitizeHtml}
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import java.nio.charset.Charset
|
||||
|
||||
@ -15,6 +16,7 @@ object WkHtmlPdf {
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
charset: Charset,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
@ -22,9 +24,23 @@ object WkHtmlPdf {
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
|
||||
// html sanitize should (among other things) remove links to invalid
|
||||
// protocols like cid: which is not supported by further
|
||||
// processing (wkhtmltopdf errors)
|
||||
//
|
||||
// Since jsoup will load everything anyway, a stream-based
|
||||
// conversion to Java's InputStream doesn't make much sense.
|
||||
val inSane = Stream.evalUnChunk(
|
||||
Binary
|
||||
.loadAllBytes(in)
|
||||
.map(bv => sanitizeHtml(bv, charset.some))
|
||||
.map(bv => Chunk.byteVector(bv))
|
||||
)
|
||||
|
||||
ExternConv
|
||||
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
|
||||
in,
|
||||
inSane,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
Reference in New Issue
Block a user