Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is now
detected, which may not work for all files. The default/fallback is
UTF-8.

There is still a problem with mails that contain HTML parts not in
UTF-8 encoding. The mail text is always returned as a string and the
original encoding is lost. Then the HTML is stored using UTF-8 bytes,
but wkhtmltopdf reads it using latin1. It seems that the `--encoding`
setting doesn't override the encoding provided by the document.
This commit is contained in:
Eike Kettner
2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions

View File

@ -231,7 +231,9 @@ docspell.joex {
"-s",
"A4",
"--encoding",
"UTF-8",
"{{encoding}}",
"--load-error-handling", "ignore",
"--load-media-error-handling", "ignore",
"-",
"{{outfile}}"
]

View File

@ -8,6 +8,7 @@ import emil.javamail.syntax._
import cats.Applicative
import docspell.common._
import java.nio.charset.StandardCharsets
object ReadMail {
@ -20,7 +21,7 @@ object ReadMail {
bytesToMail(s).flatMap(mailToEntries[F](logger))
def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
def mailToEntries[F[_]: Applicative](
logger: Logger[F]
@ -49,7 +50,7 @@ object ReadMail {
implicit class MimeTypeConv(m: emil.MimeType) {
def toDocspell: MimeType =
MimeType(m.primary, m.sub)
MimeType(m.primary, m.sub, m.params)
}
private def bodyType[F[_]](body: MailBody[F]): String =

View File

@ -57,7 +57,7 @@ object ConvertPdf {
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
mime match {
case Mimetype.`application/pdf` =>
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
@ -66,9 +66,10 @@ object ConvertPdf {
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val mt = MimeType(mime.primary, mime.sub, mime.params)
val handler = conversionHandler[F](ctx, cfg, ra, item)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
data
)
}
@ -104,7 +105,8 @@ object ConvertPdf {
(ra, None: Option[RAttachmentMeta]).pure[F]
case ConversionResult.Failure(ex) =>
ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
ctx.logger
.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
})
@ -114,7 +116,8 @@ object ConvertPdf {
ra: RAttachment,
pdf: Stream[F, Byte]
) = {
val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
val hint =
MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
val newName = ra.name.map(n => s"$n.pdf")
ctx.store.bitpeace
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
@ -122,7 +125,9 @@ object ConvertPdf {
.lastOrError
.map(fm => Ident.unsafe(fm.id))
.flatMap(fmId =>
ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
ctx.store
.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
.map(_ => fmId)
)
.map(fmId => ra.copy(fileId = fmId, name = newName))
}

View File

@ -70,7 +70,7 @@ object ExtractArchive {
archive: Option[RAttachmentArchive]
)(ra: RAttachment, mime: Mimetype): F[Extracted] =
mime match {
case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
extractZip(ctx, archive)(ra)
.flatTap(_ => cleanupParents(ctx, ra, archive))

View File

@ -76,7 +76,7 @@ object TextExtraction {
.getOrElse(Mimetype.`application/octet-stream`)
findMime
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
}
private def extractTextFallback[F[_]: Sync: ContextShift](