Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is
now detected, which may not work for all files. The default/fallback is
UTF-8.

There is still a problem with mails that contain HTML parts that are not
UTF-8 encoded. The mail text is always returned as a string, so the
original encoding is lost. The HTML is then stored as UTF-8 bytes,
but wkhtmltopdf reads it as latin1. It seems that the `--encoding`
setting doesn't override the encoding provided by the document.
This commit is contained in:
Eike Kettner
2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions

View File

@ -1,6 +1,8 @@
package docspell.common
import fs2.Stream
import fs2.{Pipe, Stream}
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {
@ -14,11 +16,67 @@ object Binary {
Binary[F](name, MimeType.octetStream, data)
def utf8[F[_]](name: String, content: String): Binary[F] =
Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
Binary[F](
name,
MimeType.octetStream,
Stream.emit(content).through(fs2.text.utf8Encode)
)
def text[F[_]](name: String, content: String): Binary[F] =
utf8(name, content).withMime(MimeType.plain)
utf8(name, content).withMime(MimeType.plain.withUtf8Charset)
def html[F[_]](name: String, content: String): Binary[F] =
utf8(name, content).withMime(MimeType.html)
utf8(name, content).withMime(MimeType.html.withUtf8Charset)
/** Builds a pipe that turns a byte stream into strings using charset `cs`.
  *
  * UTF-8 is delegated to fs2's own `utf8Decode`; any other charset is
  * handled by the generic decoder in the private `util` object.
  */
def decode[F[_]](cs: Charset): Pipe[F, Byte, String] = {
  val isUtf8 = cs == StandardCharsets.UTF_8
  if (isUtf8) fs2.text.utf8Decode else util.decode[F](cs)
}
// This is a copy from org.http4s.util
// Http4s is licensed under the Apache License 2.0
//
// Holds a charset-generic byte-to-string decoding pipe built on
// java.nio's CharsetDecoder.
private object util {
import fs2._
import java.nio._
// The three-byte sequence EF BB BF, i.e. the UTF-8 byte order mark.
private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)
// Returns a pipe that decodes bytes into strings with `charset`.
// Input is pulled in pieces of up to `charBufferSize * avgBytesPerChar`
// bytes; each piece is decoded and emitted as one string.
def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = {
// NOTE(review): the decoder is created once per call to `decode`, outside
// the pull, so every run of the returned pipe uses this same stateful
// instance — confirm the pipe is never run more than once or concurrently.
val decoder = charset.newDecoder
// Upper bound of chars produced per input byte, rounded up; used to size
// the output buffer.
val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
// Average number of bytes per char, rounded up; used to size each read.
val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
val charBufferSize = 128
_.repeatPull[String] {
_.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
// No more input: signal end-of-input to the decoder and flush it, emitting
// whatever it still produces (the buffer has room for a single char).
case None =>
val charBuffer = CharBuffer.allocate(1)
decoder.decode(ByteBuffer.allocate(0), charBuffer, true)
decoder.flush(charBuffer)
val outputString = charBuffer.flip().toString
if (outputString.isEmpty) Pull.done.as(None)
else Pull.output1(outputString).as(None)
case Some((chunk, stream)) =>
if (chunk.nonEmpty) {
// NOTE(review): the BOM check runs on every non-empty chunk and for every
// charset, not only on the first chunk of UTF-8 input — confirm intended.
val chunkWithoutBom = skipByteOrderMark(chunk)
val bytes = chunkWithoutBom.toArray
val byteBuffer = ByteBuffer.wrap(bytes)
val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte)
// endOfInput = false: more bytes may follow, so an incomplete trailing
// sequence is left unread in `byteBuffer`.
decoder.decode(byteBuffer, charBuffer, false)
// Push the bytes the decoder did not consume back in front of the
// remaining stream so they are decoded together with the next chunk.
val nextStream = stream.consChunk(Chunk.byteBuffer(byteBuffer.slice()))
Pull.output1(charBuffer.flip().toString).as(Some(nextStream))
} else {
// Empty chunk: emit nothing and continue with the rest of the stream.
Pull.output(Chunk.empty[String]).as(Some(stream))
}
}
}
}
// Drops the first three bytes of `chunk` when they equal `utf8Bom`;
// otherwise returns `chunk` unchanged.
private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] =
if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
chunk.drop(3)
} else chunk
}
}

View File

@ -2,13 +2,39 @@ package docspell.common
import docspell.common.syntax.all._
import io.circe.{Decoder, Encoder}
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
/** A MIME Type impl with just enough features for the use here.
*/
case class MimeType(primary: String, sub: String) {
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
/** Returns a copy of this MIME type with parameter `name` set to `value`.
  * An existing entry under the same name is replaced.
  */
def withParam(name: String, value: String): MimeType = {
  val updatedParams = params + (name -> value)
  copy(params = updatedParams)
}
/** Returns a copy whose `charset` parameter is the name of `cs`. */
def withCharset(cs: Charset): MimeType = {
  val charsetName = cs.name()
  withParam("charset", charsetName)
}
/** Returns a copy whose `charset` parameter names UTF-8. */
def withUtf8Charset: MimeType = {
  val utf8 = StandardCharsets.UTF_8
  withCharset(utf8)
}
/** Resolves the `charset` parameter to a JVM [[java.nio.charset.Charset]].
  *
  * @return the charset named by the `charset` parameter, or `None` when the
  *         parameter is absent, names a charset this JVM does not support,
  *         or is not a legal charset name at all
  */
def resolveCharset: Option[Charset] =
  params.get("charset").flatMap { raw =>
    // Parameter values may arrive as quoted strings (`charset="utf-8"`);
    // drop the surrounding quotes before the lookup.
    val name = raw.trim.stripPrefix("\"").stripSuffix("\"")
    // Charset.forName throws IllegalCharsetNameException for illegal names
    // and UnsupportedCharsetException for unknown ones; both are
    // IllegalArgumentExceptions and both simply mean "no usable charset".
    try Some(Charset.forName(name))
    catch { case _: IllegalArgumentException => None }
  }
/** The charset obtained from `resolveCharset`, or UTF-8 when that yields nothing. */
def charsetOrUtf8: Charset =
  resolveCharset match {
    case Some(cs) => cs
    case None     => StandardCharsets.UTF_8
  }
/** This MIME type without any parameters; returns `this` when there are none. */
def baseType: MimeType =
  if (params.nonEmpty) copy(params = Map.empty) else this
def asString: String =
s"$primary/$sub"
if (params.isEmpty) s"$primary/$sub"
else {
val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";")
s"$primary/$sub; $parameters"
}
def matches(other: MimeType): Boolean =
primary == other.primary &&
@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
object MimeType {
def application(sub: String): MimeType =
MimeType("application", partFromString(sub).throwLeft)
MimeType("application", sub, Map.empty)
def text(sub: String): MimeType =
MimeType("text", partFromString(sub).throwLeft)
MimeType("text", sub, Map.empty)
def image(sub: String): MimeType =
MimeType("image", partFromString(sub).throwLeft)
MimeType("image", sub, Map.empty)
// Characters accepted by `partFromString`: ASCII letters, ASCII digits
// and the symbols `*`, `-`, `.` and `+`.
private[this] val validChars: Set[Char] =
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
def parse(str: String): Either[String, MimeType] = {
def parsePrimary: Either[String, (String, String)] =
str.indexOf('/') match {
case -1 => Left(s"Invalid mediatype: $str")
case n => Right(str.take(n) -> str.drop(n + 1))
}
def parse(str: String): Either[String, MimeType] =
str.indexOf('/') match {
case -1 => Left(s"Invalid MIME type: $str")
case n =>
for {
prim <- partFromString(str.substring(0, n))
sub <- partFromString(str.substring(n + 1))
} yield MimeType(prim.toLowerCase, sub.toLowerCase)
}
def parseSub(s: String): Either[String, (String, String)] =
s.indexOf(';') match {
case -1 => Right((s, ""))
case n => Right((s.take(n), s.drop(n)))
}
def parseParams(s: String): Map[String, String] =
s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match {
case a :: b :: Nil => Some((a, b))
case _ => None
}).toMap
for {
pt <- parsePrimary
st <- parseSub(pt._2)
pa = parseParams(st._2)
} yield MimeType(pt._1, st._1, pa)
}
// Parses `str` with `parse` and unwraps the result via `throwLeft`.
// NOTE(review): `throwLeft` is presumably an extension from
// docspell.common.syntax that throws when `parse` yields a Left — confirm
// before using this on untrusted input.
def unsafe(str: String): MimeType =
parse(str).throwLeft
/** Validates one part of a MIME type.
  *
  * @return `Right(s)` when every character of `s` is in `validChars`
  *         (this includes the empty string), otherwise a `Left` with a
  *         message listing the allowed characters
  */
private def partFromString(s: String): Either[String, String] =
  Either.cond(
    s.forall(validChars.contains),
    s,
    s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}"
  )
// Predefined `application/...` MIME types.
val octetStream = application("octet-stream")
val pdf = application("pdf")
val zip = application("zip")
@ -55,6 +90,16 @@ object MimeType {
// Predefined `text/...` MIME types.
val html = text("html")
val plain = text("plain")
/** Extractor that succeeds for every MIME type `mt` where `mt.matches(pdf)`
  * holds, yielding `mt` itself.
  */
object PdfMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(pdf)) Some(mt) else None
}
/** Extractor that succeeds for every MIME type `mt` where `mt.matches(html)`
  * holds, yielding `mt` itself.
  */
object HtmlMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(html)) Some(mt) else None
}
/** Encodes a MimeType as a JSON string, using its `asString` rendering. */
implicit val jsonEncoder: Encoder[MimeType] =
  Encoder.encodeString.contramap((mt: MimeType) => mt.asString)