mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Improve handling of encodings
HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding declared by the document itself.
This commit is contained in:
@ -231,7 +231,9 @@ docspell.joex {
|
||||
"-s",
|
||||
"A4",
|
||||
"--encoding",
|
||||
"UTF-8",
|
||||
"{{encoding}}",
|
||||
"--load-error-handling", "ignore",
|
||||
"--load-media-error-handling", "ignore",
|
||||
"-",
|
||||
"{{outfile}}"
|
||||
]
|
||||
|
@ -8,6 +8,7 @@ import emil.javamail.syntax._
|
||||
import cats.Applicative
|
||||
|
||||
import docspell.common._
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
object ReadMail {
|
||||
|
||||
@ -20,7 +21,7 @@ object ReadMail {
|
||||
bytesToMail(s).flatMap(mailToEntries[F](logger))
|
||||
|
||||
def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
|
||||
data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
|
||||
data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
|
||||
|
||||
def mailToEntries[F[_]: Applicative](
|
||||
logger: Logger[F]
|
||||
@ -49,7 +50,7 @@ object ReadMail {
|
||||
|
||||
implicit class MimeTypeConv(m: emil.MimeType) {
|
||||
def toDocspell: MimeType =
|
||||
MimeType(m.primary, m.sub)
|
||||
MimeType(m.primary, m.sub, m.params)
|
||||
}
|
||||
|
||||
private def bodyType[F[_]](body: MailBody[F]): String =
|
||||
|
@ -57,7 +57,7 @@ object ConvertPdf {
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
|
||||
mime match {
|
||||
case Mimetype.`application/pdf` =>
|
||||
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
|
||||
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
@ -66,9 +66,10 @@ object ConvertPdf {
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val mt = MimeType(mime.primary, mime.sub, mime.params)
|
||||
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
|
||||
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
|
||||
data
|
||||
)
|
||||
}
|
||||
@ -104,7 +105,8 @@ object ConvertPdf {
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
case ConversionResult.Failure(ex) =>
|
||||
ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
|
||||
ctx.logger
|
||||
.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
})
|
||||
|
||||
@ -114,7 +116,8 @@ object ConvertPdf {
|
||||
ra: RAttachment,
|
||||
pdf: Stream[F, Byte]
|
||||
) = {
|
||||
val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
|
||||
val hint =
|
||||
MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
|
||||
val newName = ra.name.map(n => s"$n.pdf")
|
||||
ctx.store.bitpeace
|
||||
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
|
||||
@ -122,7 +125,9 @@ object ConvertPdf {
|
||||
.lastOrError
|
||||
.map(fm => Ident.unsafe(fm.id))
|
||||
.flatMap(fmId =>
|
||||
ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
|
||||
ctx.store
|
||||
.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
|
||||
.map(_ => fmId)
|
||||
)
|
||||
.map(fmId => ra.copy(fileId = fmId, name = newName))
|
||||
}
|
||||
|
@ -70,7 +70,7 @@ object ExtractArchive {
|
||||
archive: Option[RAttachmentArchive]
|
||||
)(ra: RAttachment, mime: Mimetype): F[Extracted] =
|
||||
mime match {
|
||||
case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
|
||||
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
|
||||
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
|
||||
extractZip(ctx, archive)(ra)
|
||||
.flatTap(_ => cleanupParents(ctx, ra, archive))
|
||||
|
@ -76,7 +76,7 @@ object TextExtraction {
|
||||
.getOrElse(Mimetype.`application/octet-stream`)
|
||||
|
||||
findMime
|
||||
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
|
||||
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
|
||||
}
|
||||
|
||||
private def extractTextFallback[F[_]: Sync: ContextShift](
|
||||
|
Reference in New Issue
Block a user