mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Improve handling of encodings
HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding declared by the document itself.
This commit is contained in:
@ -32,18 +32,27 @@ object Conversion {
|
||||
in: Stream[F, Byte]
|
||||
): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.pdf =>
|
||||
case Pdfs(_) =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
|
||||
case MimeType.html =>
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
case mt @ MimeType(_, "html", _) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
|
||||
|
||||
case Texts(_) =>
|
||||
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
|
||||
case mt @ Texts(_) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
|
||||
val bytes = Stream
|
||||
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
|
||||
.covary[F]
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
|
||||
WkHtmlPdf.toPDF(
|
||||
cfg.wkhtmlpdf,
|
||||
cfg.chunkSize,
|
||||
StandardCharsets.UTF_8,
|
||||
blocker,
|
||||
logger
|
||||
)(bytes, handler)
|
||||
}
|
||||
|
||||
case Images(mt) =>
|
||||
@ -51,7 +60,9 @@ object Conversion {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.maxImageSize) {
|
||||
logger
|
||||
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||
.info(
|
||||
s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
|
||||
) *>
|
||||
handler.run(
|
||||
ConversionResult.inputMalformed(
|
||||
mt,
|
||||
@ -59,14 +70,20 @@ object Conversion {
|
||||
)
|
||||
)
|
||||
} else {
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case None =>
|
||||
logger.info(
|
||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||
) *>
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case Office(_) =>
|
||||
@ -90,6 +107,11 @@ object Conversion {
|
||||
Some(m).filter(_.primary == "text")
|
||||
}
|
||||
|
||||
object Pdfs {
|
||||
def unapply(m: MimeType): Option[MimeType] =
|
||||
Some(m).filter(_.matches(MimeType.pdf))
|
||||
}
|
||||
|
||||
object Office {
|
||||
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||
@ -97,18 +119,33 @@ object Conversion {
|
||||
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
|
||||
val msoffice = MimeType.application("x-tika-msoffice")
|
||||
val ooxml = MimeType.application("x-tika-ooxml")
|
||||
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val rtf = MimeType.application("rtf")
|
||||
val docx =
|
||||
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx =
|
||||
MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val rtf = MimeType.application("rtf")
|
||||
|
||||
// without a filename, tika returns application/zip for odt/ods files, since
|
||||
// they are just zip files
|
||||
val odfContainer = MimeType.zip
|
||||
|
||||
val all =
|
||||
Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
|
||||
Set(
|
||||
odt,
|
||||
ods,
|
||||
odtAlias,
|
||||
odsAlias,
|
||||
msoffice,
|
||||
ooxml,
|
||||
docx,
|
||||
xlsx,
|
||||
xls,
|
||||
doc,
|
||||
rtf,
|
||||
odfContainer
|
||||
)
|
||||
|
||||
def unapply(m: MimeType): Option[MimeType] =
|
||||
Some(m).filter(all.contains)
|
||||
|
@ -7,20 +7,23 @@ import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import java.nio.charset.Charset
|
||||
|
||||
object WkHtmlPdf {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
charset: Charset,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
ExternConv
|
||||
.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
|
||||
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
|
@ -1,8 +1,9 @@
|
||||
package docspell.convert.flexmark
|
||||
|
||||
import java.io.{InputStream, InputStreamReader}
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.charset.Charset
|
||||
import java.util
|
||||
import scala.util.Try
|
||||
|
||||
import cats.effect.Sync
|
||||
import cats.implicits._
|
||||
@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
|
||||
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
|
||||
import fs2.Stream
|
||||
|
||||
import scala.util.Try
|
||||
import docspell.common._
|
||||
|
||||
object Markdown {
|
||||
|
||||
def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
|
||||
def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
|
||||
val p = createParser()
|
||||
val r = createRenderer()
|
||||
Try {
|
||||
val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
|
||||
val reader = new InputStreamReader(is, cs)
|
||||
val doc = p.parseReader(reader)
|
||||
wrapHtml(r.render(doc), cfg)
|
||||
}.toEither
|
||||
@ -34,8 +35,8 @@ object Markdown {
|
||||
wrapHtml(r.render(doc), cfg)
|
||||
}
|
||||
|
||||
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
|
||||
data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
|
||||
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
|
||||
data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))
|
||||
|
||||
private def wrapHtml(body: String, cfg: MarkdownConfig): String =
|
||||
s"""<!DOCTYPE html>
|
||||
|
@ -7,6 +7,7 @@ import docspell.common._
|
||||
import docspell.convert.FileChecks
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||
val p =
|
||||
WkHtmlPdf
|
||||
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
|
||||
.toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
|
||||
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
|
Reference in New Issue
Block a user