Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding declared by the document.
2025-06-21 18:08:25 +00:00 · 2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions
--- a/modules/common/src/main/scala/docspell/common/Binary.scala
+++ b/modules/common/src/main/scala/docspell/common/Binary.scala
@ -1,6 +1,8 @@
 package docspell.common

-import fs2.Stream
+import fs2.{Pipe, Stream}
+import java.nio.charset.Charset
+import java.nio.charset.StandardCharsets

 final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {

@ -14,11 +16,67 @@ object Binary {
    Binary[F](name, MimeType.octetStream, data)

  def utf8[F[_]](name: String, content: String): Binary[F] =
-    Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
+    Binary[F](
+      name = name,
+      mime = MimeType.octetStream, // generic type; `text`/`html` refine it via `withMime`
+      data = Stream.emit(content).through(fs2.text.utf8Encode) // content as UTF-8 bytes
+    )

  def text[F[_]](name: String, content: String): Binary[F] =
-    utf8(name, content).withMime(MimeType.plain)
+    utf8[F](name, content).withMime(MimeType.plain.withCharset(StandardCharsets.UTF_8)) // text/plain, declared as UTF-8

  def html[F[_]](name: String, content: String): Binary[F] =
-    utf8(name, content).withMime(MimeType.html)
+    utf8[F](name, content).withMime(MimeType.html.withCharset(StandardCharsets.UTF_8)) // text/html, declared as UTF-8
+
+  def decode[F[_]](cs: Charset): Pipe[F, Byte, String] = {
+    // UTF-8 is handled by fs2's built-in decoder; every other
+    // charset goes through the java.nio based `util.decode`.
+    val isUtf8 = cs == StandardCharsets.UTF_8
+    if (isUtf8) fs2.text.utf8Decode else util.decode[F](cs)
+  }
+
+  // Charset-aware streaming byte-to-string decoder. This is a copy from org.http4s.util
+  // Http4s is licensed under the Apache License 2.0
+  private object util {
+    import fs2._
+    import java.nio._
+    // The three bytes EF BB BF, i.e. the UTF-8 encoded byte order mark.
+    private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)
+    // Returns a pipe that decodes bytes into strings with a decoder of the given `charset`.
+    def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = {
+      val decoder         = charset.newDecoder // created once per `decode` call, reused for every chunk
+      val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
+      val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
+      val charBufferSize  = 128
+      // Each step pulls at most `charBufferSize * avgBytesPerChar` bytes and decodes them.
+      _.repeatPull[String] {
+        _.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
+          case None => // input exhausted: signal end-of-input to the decoder and flush it
+            val charBuffer = CharBuffer.allocate(1)
+            decoder.decode(ByteBuffer.allocate(0), charBuffer, true)
+            decoder.flush(charBuffer)
+            val outputString = charBuffer.flip().toString
+            if (outputString.isEmpty) Pull.done.as(None)
+            else Pull.output1(outputString).as(None)
+          case Some((chunk, stream)) =>
+            if (chunk.nonEmpty) {
+              val chunkWithoutBom = skipByteOrderMark(chunk) // NOTE(review): runs on every non-empty chunk and for any charset, not only a leading UTF-8 BOM — confirm intended
+              val bytes           = chunkWithoutBom.toArray
+              val byteBuffer      = ByteBuffer.wrap(bytes)
+              val charBuffer      = CharBuffer.allocate(bytes.length * maxCharsPerByte)
+              decoder.decode(byteBuffer, charBuffer, false) // endOfInput = false: more bytes may follow
+              val nextStream = stream.consChunk(Chunk.byteBuffer(byteBuffer.slice())) // bytes the decoder left unread are put back in front of the stream
+              Pull.output1(charBuffer.flip().toString).as(Some(nextStream))
+            } else {
+              Pull.output(Chunk.empty[String]).as(Some(stream))
+            }
+        }
+      }
+    }
+
+    private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] = // drops the first three bytes if they equal `utf8Bom`
+      if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
+        chunk.drop(3)
+      } else chunk
+  }
 }
--- a/modules/common/src/main/scala/docspell/common/MimeType.scala
+++ b/modules/common/src/main/scala/docspell/common/MimeType.scala
@ -2,13 +2,39 @@ package docspell.common

 import docspell.common.syntax.all._
 import io.circe.{Decoder, Encoder}
+import java.nio.charset.StandardCharsets
+import java.nio.charset.Charset

 /** A MIME Type impl with just enough features for the use here.
  */
-case class MimeType(primary: String, sub: String) {
+case class MimeType(primary: String, sub: String, params: Map[String, String]) {
+  def withParam(name: String, value: String): MimeType = // adds or replaces a single parameter
+    MimeType(primary, sub, params + (name -> value))
+
+  def withCharset(cs: Charset): MimeType = // stores `cs.name()` under the `charset` parameter
+    withParam(name = "charset", value = cs.name())
+
+  def withUtf8Charset: MimeType = // shortcut for setting the `charset` parameter to UTF-8
+    withParam("charset", StandardCharsets.UTF_8.name())
+
+  def resolveCharset: Option[Charset] = // None if the parameter is missing, malformed or unsupported
+    params.get("charset").flatMap { cs =>
+      val name = cs.trim.stripPrefix("\"").stripSuffix("\"") // parameter values may be quoted
+      scala.util.Try(Charset.forName(name)).toOption // forName throws on illegal or unknown names
+    }
+
+  def charsetOrUtf8: Charset = // UTF-8 is the fallback when `resolveCharset` yields nothing
+    resolveCharset.fold(StandardCharsets.UTF_8)(identity)
+
+  def baseType: MimeType = // same primary/sub type with all parameters removed
+    if (params.nonEmpty) copy(params = Map.empty) else this

  def asString: String =
-    s"$primary/$sub"
+    params.toList match { // rendered as `primary/sub` or `primary/sub; k=v;k2=v2`
+      case Nil => s"$primary/$sub"
+      case kvs =>
+        s"$primary/$sub; " + kvs.map { case (k, v) => s"$k=$v" }.mkString(";")
+    }

  def matches(other: MimeType): Boolean =
    primary == other.primary &&
@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
 object MimeType {

  def application(sub: String): MimeType =
-    MimeType("application", partFromString(sub).throwLeft)
+    MimeType(primary = "application", sub = sub, params = Map.empty) // `sub` is taken as given, no parameters

  def text(sub: String): MimeType =
-    MimeType("text", partFromString(sub).throwLeft)
+    MimeType(primary = "text", sub = sub, params = Map.empty) // `sub` is taken as given, no parameters

  def image(sub: String): MimeType =
-    MimeType("image", partFromString(sub).throwLeft)
+    MimeType(primary = "image", sub = sub, params = Map.empty) // `sub` is taken as given, no parameters

-  private[this] val validChars: Set[Char] =
-    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
+  def parse(str: String): Either[String, MimeType] = { // Left only when `str` contains no '/'
+    def parsePrimary: Either[String, (String, String)] = { // split at the first '/'
+      val n = str.indexOf('/')
+      if (n < 0) Left(s"Invalid mediatype: $str")
+      else Right(norm(str.take(n)) -> str.drop(n + 1))
+    }

-  def parse(str: String): Either[String, MimeType] =
-    str.indexOf('/') match {
-      case -1 => Left(s"Invalid MIME type: $str")
-      case n =>
-        for {
-          prim <- partFromString(str.substring(0, n))
-          sub  <- partFromString(str.substring(n + 1))
-        } yield MimeType(prim.toLowerCase, sub.toLowerCase)
-    }
+    // Media type, subtype and parameter names are case-insensitive: trim and lower-case them.
+    def norm(s: String): String = s.trim.toLowerCase(java.util.Locale.ROOT)
+    def parseSub(s: String): Either[String, (String, String)] = // (subtype, raw parameter string)
+      s.indexOf(';') match {
+        case -1 => Right((norm(s), ""))
+        case n  => Right((norm(s.take(n)), s.drop(n)))
+      }
+
+    def parseParams(s: String): Map[String, String] = // entries without '=' are ignored
+      s.split(';').toList.map(_.trim).filter(_.nonEmpty).map(_.split("=", 2).toList).collect {
+        case a :: b :: Nil => (norm(a), b.trim)
+      }.toMap
+
+    for {
+      pt <- parsePrimary
+      st <- parseSub(pt._2)
+    } yield MimeType(pt._1, st._1, parseParams(st._2))
+  }

  def unsafe(str: String): MimeType =
    parse(str).throwLeft

-  private def partFromString(s: String): Either[String, String] =
-    if (s.forall(validChars.contains)) Right(s)
-    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
-
  val octetStream = application("octet-stream")
  val pdf         = application("pdf")
  val zip         = application("zip")
@ -55,6 +90,16 @@ object MimeType {
  val html        = text("html")
  val plain       = text("plain")

+  object PdfMatch { // extractor: succeeds for any mime type where `matches(pdf)` holds
+    def unapply(mt: MimeType): Option[MimeType] =
+      if (mt.matches(pdf)) Some(mt) else None
+  }
+
+  object HtmlMatch { // extractor: succeeds for any mime type where `matches(html)` holds
+    def unapply(mt: MimeType): Option[MimeType] =
+      if (mt.matches(html)) Some(mt) else None
+  }
+
  implicit val jsonEncoder: Encoder[MimeType] =
    Encoder.encodeString.contramap(_.asString)