Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is
now detected, although detection may not work for all files. The
default/fallback is UTF-8.

There is still a problem with mails that contain HTML parts that are not
UTF-8 encoded. The mail text is always returned as a string, so the
original encoding is lost. The HTML is then stored as UTF-8 bytes,
but wkhtmltopdf reads it as latin1. It seems that the `--encoding`
setting doesn't override the encoding declared by the document.
This commit is contained in:
Eike Kettner
2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions

View File

@ -32,18 +32,27 @@ object Conversion {
in: Stream[F, Byte]
): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.pdf =>
case Pdfs(_) =>
handler.run(ConversionResult.successPdf(in))
case MimeType.html =>
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
case mt @ MimeType(_, "html", _) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
case Texts(_) =>
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
case mt @ Texts(_) =>
val cs = mt.charsetOrUtf8
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
val bytes = Stream
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
.covary[F]
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
WkHtmlPdf.toPDF(
cfg.wkhtmlpdf,
cfg.chunkSize,
StandardCharsets.UTF_8,
blocker,
logger
)(bytes, handler)
}
case Images(mt) =>
@ -51,7 +60,9 @@ object Conversion {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
.info(
s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
) *>
handler.run(
ConversionResult.inputMalformed(
mt,
@ -59,14 +70,20 @@ object Conversion {
)
)
} else {
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
in,
handler
)
}
case None =>
logger.info(
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
) *>
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
in,
handler
)
}
case Office(_) =>
@ -90,6 +107,11 @@ object Conversion {
Some(m).filter(_.primary == "text")
}
// Extractor for use in pattern matches (`case Pdfs(_) =>`): yields the given
// mime type when `m.matches(MimeType.pdf)` is true and does not match otherwise.
// NOTE(review): presumably `matches` ignores parameters such as a charset,
// unlike a plain equality check against `MimeType.pdf` — confirm in MimeType.
object Pdfs {
def unapply(m: MimeType): Option[MimeType] =
Some(m).filter(_.matches(MimeType.pdf))
}
object Office {
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
@ -97,18 +119,33 @@ object Conversion {
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val rtf = MimeType.application("rtf")
val docx =
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx =
MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val rtf = MimeType.application("rtf")
// without a filename, tika returns application/zip for odt/ods files, since
// they are just zip files
val odfContainer = MimeType.zip
val all =
Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
Set(
odt,
ods,
odtAlias,
odsAlias,
msoffice,
ooxml,
docx,
xlsx,
xls,
doc,
rtf,
odfContainer
)
def unapply(m: MimeType): Option[MimeType] =
Some(m).filter(all.contains)

View File

@ -7,20 +7,23 @@ import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
import java.nio.charset.Charset
object WkHtmlPdf {
def toPDF[F[_]: Sync: ContextShift, A](
cfg: WkHtmlPdfConfig,
chunkSize: Int,
charset: Charset,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
ExternConv
.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
in,
handler
)

View File

@ -1,8 +1,9 @@
package docspell.convert.flexmark
import java.io.{InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
import java.util
import scala.util.Try
import cats.effect.Sync
import cats.implicits._
@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
import fs2.Stream
import scala.util.Try
import docspell.common._
object Markdown {
def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
val p = createParser()
val r = createRenderer()
Try {
val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
val reader = new InputStreamReader(is, cs)
val doc = p.parseReader(reader)
wrapHtml(r.render(doc), cfg)
}.toEither
@ -34,8 +35,8 @@ object Markdown {
wrapHtml(r.render(doc), cfg)
}
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))
private def wrapHtml(body: String, cfg: MarkdownConfig): String =
s"""<!DOCTYPE html>

View File

@ -7,6 +7,7 @@ import docspell.common._
import docspell.convert.FileChecks
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import java.nio.charset.StandardCharsets
object ExternConvTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
val wkCfg = WkHtmlPdfConfig(cfg, target)
val p =
WkHtmlPdf
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
.toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
storePdfHandler(dir.resolve("test.pdf"))
)