Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as latin1. It seems that the `--encoding` setting doesn't override the encoding declared by the document itself.
2025-11-04 12:30:12 +00:00 · 2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -0,0 +1,13 @@
+Docspell
+Copyright 2019-2020
+Licensed under the GPLv3
+
+This software contains portions of code from tika-parser
+https://tika.apache.org
+Copyright (C) Apache Software Foundation (ASF) <https://www.apache.org>
+Licensed under Apache License 2.0
+
+This software contains portions of code from http4s
+https://http4s.org
+Copyright 2013-2018 http4s.org
+Licensed under Apache License 2.0
--- a/build.sbt
+++ b/build.sbt
@@ -161,7 +161,8 @@ val files = project.in(file("modules/files")).
  settings(
    name := "docspell-files",
    libraryDependencies ++=
-      Dependencies.tika,
+      Dependencies.tika ++
+      Dependencies.icu4j,
    Test / sourceGenerators += Def.task {
      val base = (Test/resourceDirectory).value
      val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
--- a/modules/common/src/main/scala/docspell/common/Binary.scala
+++ b/modules/common/src/main/scala/docspell/common/Binary.scala
@@ -1,6 +1,8 @@
 package docspell.common

-import fs2.Stream
+import fs2.{Pipe, Stream}
+import java.nio.charset.Charset
+import java.nio.charset.StandardCharsets

 final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {

@@ -14,11 +16,67 @@ object Binary {
    Binary[F](name, MimeType.octetStream, data)

  def utf8[F[_]](name: String, content: String): Binary[F] =
-    Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
+    Binary[F](
+      name,
+      MimeType.octetStream,
+      Stream.emit(content).through(fs2.text.utf8Encode)
+    )

  def text[F[_]](name: String, content: String): Binary[F] =
-    utf8(name, content).withMime(MimeType.plain)
+    utf8(name, content).withMime(MimeType.plain.withUtf8Charset)

  def html[F[_]](name: String, content: String): Binary[F] =
-    utf8(name, content).withMime(MimeType.html)
+    utf8(name, content).withMime(MimeType.html.withUtf8Charset)
+
+  def decode[F[_]](cs: Charset): Pipe[F, Byte, String] =
+    if (cs == StandardCharsets.UTF_8) {
+      fs2.text.utf8Decode
+    } else {
+      util.decode[F](cs)
+    }
+
+  // This is a copy from org.http4s.util
+  // Http4s is licensed under the Apache License 2.0
+  private object util {
+    import fs2._
+    import java.nio._
+
+    private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)
+
+    def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = {
+      val decoder         = charset.newDecoder
+      val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
+      val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
+      val charBufferSize  = 128
+
+      _.repeatPull[String] {
+        _.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
+          case None =>
+            val charBuffer = CharBuffer.allocate(1)
+            decoder.decode(ByteBuffer.allocate(0), charBuffer, true)
+            decoder.flush(charBuffer)
+            val outputString = charBuffer.flip().toString
+            if (outputString.isEmpty) Pull.done.as(None)
+            else Pull.output1(outputString).as(None)
+          case Some((chunk, stream)) =>
+            if (chunk.nonEmpty) {
+              val chunkWithoutBom = skipByteOrderMark(chunk)
+              val bytes           = chunkWithoutBom.toArray
+              val byteBuffer      = ByteBuffer.wrap(bytes)
+              val charBuffer      = CharBuffer.allocate(bytes.length * maxCharsPerByte)
+              decoder.decode(byteBuffer, charBuffer, false)
+              val nextStream = stream.consChunk(Chunk.byteBuffer(byteBuffer.slice()))
+              Pull.output1(charBuffer.flip().toString).as(Some(nextStream))
+            } else {
+              Pull.output(Chunk.empty[String]).as(Some(stream))
+            }
+        }
+      }
+    }
+
+    private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] =
+      if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
+        chunk.drop(3)
+      } else chunk
+  }
 }
--- a/modules/common/src/main/scala/docspell/common/MimeType.scala
+++ b/modules/common/src/main/scala/docspell/common/MimeType.scala
@@ -2,13 +2,39 @@ package docspell.common

 import docspell.common.syntax.all._
 import io.circe.{Decoder, Encoder}
+import java.nio.charset.StandardCharsets
+import java.nio.charset.Charset

 /** A MIME Type impl with just enough features for the use here.
  */
-case class MimeType(primary: String, sub: String) {
+case class MimeType(primary: String, sub: String, params: Map[String, String]) {
+  def withParam(name: String, value: String): MimeType =
+    copy(params = params.updated(name, value))
+
+  def withCharset(cs: Charset): MimeType =
+    withParam("charset", cs.name())
+
+  def withUtf8Charset: MimeType =
+    withCharset(StandardCharsets.UTF_8)
+
+  def resolveCharset: Option[Charset] =
+    params.get("charset").flatMap { cs =>
+      if (Charset.isSupported(cs)) Some(Charset.forName(cs))
+      else None
+    }
+
+  def charsetOrUtf8: Charset =
+    resolveCharset.getOrElse(StandardCharsets.UTF_8)
+
+  def baseType: MimeType =
+    if (params.isEmpty) this else copy(params = Map.empty)

  def asString: String =
-    s"$primary/$sub"
+    if (params.isEmpty) s"$primary/$sub"
+    else {
+      val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";")
+      s"$primary/$sub; $parameters"
+    }

  def matches(other: MimeType): Boolean =
    primary == other.primary &&
@@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
 object MimeType {

  def application(sub: String): MimeType =
-    MimeType("application", partFromString(sub).throwLeft)
+    MimeType("application", sub, Map.empty)

  def text(sub: String): MimeType =
-    MimeType("text", partFromString(sub).throwLeft)
+    MimeType("text", sub, Map.empty)

  def image(sub: String): MimeType =
-    MimeType("image", partFromString(sub).throwLeft)
+    MimeType("image", sub, Map.empty)

-  private[this] val validChars: Set[Char] =
-    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
+  def parse(str: String): Either[String, MimeType] = {
+    def parsePrimary: Either[String, (String, String)] =
+      str.indexOf('/') match {
+        case -1 => Left(s"Invalid mediatype: $str")
+        case n => Right(str.take(n) -> str.drop(n + 1))
+      }

-  def parse(str: String): Either[String, MimeType] =
-    str.indexOf('/') match {
-      case -1 => Left(s"Invalid MIME type: $str")
-      case n =>
-        for {
-          prim <- partFromString(str.substring(0, n))
-          sub  <- partFromString(str.substring(n + 1))
-        } yield MimeType(prim.toLowerCase, sub.toLowerCase)
-    }
+    def parseSub(s: String): Either[String, (String, String)] =
+      s.indexOf(';') match {
+        case -1 => Right((s, ""))
+        case n => Right((s.take(n), s.drop(n)))
+      }
+
+    def parseParams(s: String): Map[String, String] =
+      s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match {
+        case a :: b :: Nil => Some((a, b))
+        case _ => None
+      }).toMap
+
+    for {
+      pt <- parsePrimary
+      st <- parseSub(pt._2)
+      pa  = parseParams(st._2)
+    } yield MimeType(pt._1, st._1, pa)
+  }

  def unsafe(str: String): MimeType =
    parse(str).throwLeft

-  private def partFromString(s: String): Either[String, String] =
-    if (s.forall(validChars.contains)) Right(s)
-    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
-
  val octetStream = application("octet-stream")
  val pdf         = application("pdf")
  val zip         = application("zip")
@@ -55,6 +90,16 @@ object MimeType {
  val html        = text("html")
  val plain       = text("plain")

+  object PdfMatch {
+    def unapply(mt: MimeType): Option[MimeType] =
+      Some(mt).filter(_.matches(pdf))
+  }
+
+  object HtmlMatch {
+    def unapply(mt: MimeType): Option[MimeType] =
+      Some(mt).filter(_.matches(html))
+  }
+
  implicit val jsonEncoder: Encoder[MimeType] =
    Encoder.encodeString.contramap(_.asString)

--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@@ -32,18 +32,27 @@ object Conversion {
          in: Stream[F, Byte]
      ): F[A] =
        TikaMimetype.resolve(dataType, in).flatMap {
-          case MimeType.pdf =>
+          case Pdfs(_) =>
            handler.run(ConversionResult.successPdf(in))

-          case MimeType.html =>
-            WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
+          case mt @ MimeType(_, "html", _) =>
+            val cs = mt.charsetOrUtf8
+            WkHtmlPdf
+              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)

-          case Texts(_) =>
-            Markdown.toHtml(in, cfg.markdown).flatMap { html =>
+          case mt @ Texts(_) =>
+            val cs = mt.charsetOrUtf8
+            Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
              val bytes = Stream
                .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
                .covary[F]
-              WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
+              WkHtmlPdf.toPDF(
+                cfg.wkhtmlpdf,
+                cfg.chunkSize,
+                StandardCharsets.UTF_8,
+                blocker,
+                logger
+              )(bytes, handler)
            }

          case Images(mt) =>
@@ -51,7 +60,9 @@ object Conversion {
              case Some(dim) =>
                if (dim.product > cfg.maxImageSize) {
                  logger
-                    .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
+                    .info(
+                      s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
+                    ) *>
                    handler.run(
                      ConversionResult.inputMalformed(
                        mt,
@@ -59,14 +70,20 @@ object Conversion {
                      )
                    )
                } else {
-                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
+                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
+                    in,
+                    handler
+                  )
                }

              case None =>
                logger.info(
                  s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
                ) *>
-                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
+                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
+                    in,
+                    handler
+                  )
            }

          case Office(_) =>
@@ -90,6 +107,11 @@ object Conversion {
      Some(m).filter(_.primary == "text")
  }

+  object Pdfs {
+    def unapply(m: MimeType): Option[MimeType] =
+      Some(m).filter(_.matches(MimeType.pdf))
+  }
+
  object Office {
    val odt      = MimeType.application("vnd.oasis.opendocument.text")
    val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
@@ -97,18 +119,33 @@ object Conversion {
    val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
    val msoffice = MimeType.application("x-tika-msoffice")
    val ooxml    = MimeType.application("x-tika-ooxml")
-    val docx     = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
-    val xlsx     = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
-    val xls      = MimeType.application("vnd.ms-excel")
-    val doc      = MimeType.application("msword")
-    val rtf      = MimeType.application("rtf")
+    val docx =
+      MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
+    val xlsx =
+      MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+    val xls = MimeType.application("vnd.ms-excel")
+    val doc = MimeType.application("msword")
+    val rtf = MimeType.application("rtf")

    // without a filename, tika returns application/zip for odt/ods files, since
    // they are just zip files
    val odfContainer = MimeType.zip

    val all =
-      Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
+      Set(
+        odt,
+        ods,
+        odtAlias,
+        odsAlias,
+        msoffice,
+        ooxml,
+        docx,
+        xlsx,
+        xls,
+        doc,
+        rtf,
+        odfContainer
+      )

    def unapply(m: MimeType): Option[MimeType] =
      Some(m).filter(all.contains)
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
@@ -7,20 +7,23 @@ import fs2.Stream
 import docspell.common._
 import docspell.convert.ConversionResult
 import docspell.convert.ConversionResult.Handler
+import java.nio.charset.Charset

 object WkHtmlPdf {

  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: WkHtmlPdfConfig,
      chunkSize: Int,
+      charset: Charset,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](blocker, chunkSize, logger)

+    val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
    ExternConv
-      .toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
+      .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
        in,
        handler
      )
--- a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala
+++ b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala
@@ -1,8 +1,9 @@
 package docspell.convert.flexmark

 import java.io.{InputStream, InputStreamReader}
-import java.nio.charset.StandardCharsets
+import java.nio.charset.Charset
 import java.util
+import scala.util.Try

 import cats.effect.Sync
 import cats.implicits._
@@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
 import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
 import fs2.Stream

-import scala.util.Try
+import docspell.common._

 object Markdown {

-  def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
+  def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
    val p = createParser()
    val r = createRenderer()
    Try {
-      val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
+      val reader = new InputStreamReader(is, cs)
      val doc    = p.parseReader(reader)
      wrapHtml(r.render(doc), cfg)
    }.toEither
@@ -34,8 +35,8 @@ object Markdown {
    wrapHtml(r.render(doc), cfg)
  }

-  def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
-    data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
+  def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
+    data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))

  private def wrapHtml(body: String, cfg: MarkdownConfig): String =
    s"""<!DOCTYPE html>
--- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala
+++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala
@@ -7,6 +7,7 @@ import docspell.common._
 import docspell.convert.FileChecks
 import docspell.files.{ExampleFiles, TestFiles}
 import minitest.SimpleTestSuite
+import java.nio.charset.StandardCharsets

 object ExternConvTest extends SimpleTestSuite with FileChecks {
  val blocker     = TestFiles.blocker
@@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
            val wkCfg = WkHtmlPdfConfig(cfg, target)
            val p =
              WkHtmlPdf
-                .toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
+                .toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
                  ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
                  storePdfHandler(dir.resolve("test.pdf"))
                )
--- a/modules/extract/NOTICE
+++ b/modules/extract/NOTICE
@@ -1,11 +0,0 @@
-The Java source files in docspell-extract are unmodified copies of
-those found in the Apache Tika parser project. It follows the
-NOTICE.txt file from Apache Tika parsers:
-
-Apache Tika parsers
-Copyright 2007-2019 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -31,7 +31,7 @@ object Extraction {
          lang: Language
      ): F[ExtractResult] =
        TikaMimetype.resolve(dataType, data).flatMap {
-          case MimeType.pdf =>
+          case MimeType.PdfMatch(_) =>
            PdfExtract
              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
              .map(ExtractResult.fromEither)
@@ -75,14 +75,15 @@ object Extraction {
                  doExtract
            }

-          case OdfType.container =>
+          case OdfType.ContainerMatch(_) =>
            logger
              .info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
              OdfExtract.get(data).map(ExtractResult.fromEither)

-          case mt @ MimeType("text", sub) if !sub.contains("html") =>
+          case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
+            val cs = mt.charsetOrUtf8
            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
-              data.through(fs2.text.utf8Decode).compile.last.map { txt =>
+              data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
                ExtractResult.success(txt.getOrElse("").trim)
              }

--- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala
@@ -12,5 +12,5 @@ object OcrType {
  val all = Set(jpeg, png, tiff, pdf)

  def unapply(mt: MimeType): Option[MimeType] =
-    Some(mt).filter(all.contains)
+    Some(mt).map(_.baseType).filter(all.contains)
 }
--- a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala
+++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala
@@ -14,5 +14,10 @@ object OdfType {
  val all = Set(odt, ods, odtAlias, odsAlias)

  def unapply(mt: MimeType): Option[MimeType] =
-    Some(mt).filter(all.contains)
+    Some(mt).map(_.baseType).filter(all.contains)
+
+  object ContainerMatch {
+    def unapply(mt: MimeType): Option[MimeType] =
+      Some(mt).filter(_.matches(container))
+  }
 }
--- a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala
@@ -14,6 +14,6 @@ object PoiType {
  val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)

  def unapply(arg: MimeType): Option[MimeType] =
-    Some(arg).filter(all.contains)
+    Some(arg).map(_.baseType).filter(all.contains)

 }
--- a/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java
+++ b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java
@@ -0,0 +1,11 @@
+package org.apache.tika.parser.txt;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+public final class IOUtils {
+
+    public static long readFully(InputStream in, byte[] buffer) throws IOException {
+        return in.read(buffer, 0, buffer.length);
+    }
+}
--- a/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+public class Icu4jEncodingDetector implements EncodingDetector {
+
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        CharsetDetector detector = new CharsetDetector();
+
+        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+        if (incomingCharset == null && incomingType != null) {
+            // TIKA-341: Use charset in content-type
+            MediaType mt = MediaType.parse(incomingType);
+            if (mt != null) {
+                incomingCharset = mt.getParameters().get("charset");
+            }
+        }
+
+        if (incomingCharset != null) {
+            String cleaned = CharsetUtils.clean(incomingCharset);
+            if (cleaned != null) {
+                detector.setDeclaredEncoding(cleaned);
+            } else {
+                // TODO: log a warning?
+            }
+        }
+
+        // TIKA-341 without enabling input filtering (stripping of tags)
+        // short HTML tests don't work well
+        detector.enableInputFilter(true);
+
+        detector.setText(input);
+
+        for (CharsetMatch match : detector.detectAll()) {
+            try {
+                return CharsetUtils.forName(match.getName());
+            } catch (Exception e) {
+                // ignore
+            }
+        }
+
+        return null;
+    }
+}
--- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
+++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
@@ -2,27 +2,32 @@ package docspell.files

 import java.io.BufferedInputStream
 import java.nio.file.{Files, Path}
+import java.nio.charset.Charset

+import scala.jdk.CollectionConverters._
+import scala.util.Using
 import cats.implicits._
 import cats.effect.Sync
-import docspell.common._
 import fs2.Stream
 import org.apache.tika.config.TikaConfig
 import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
 import org.apache.tika.mime.MediaType
-
-import scala.util.Using
+import org.apache.tika.parser.txt.Icu4jEncodingDetector
+import docspell.common._

 object TikaMimetype {
  private val tika = new TikaConfig().getDetector

  private def convert(mt: MediaType): MimeType =
-    Option(mt)
-      .map(_.toString)
-      .map(MimeType.parse)
-      .flatMap(_.toOption)
-      .map(normalize)
-      .getOrElse(MimeType.octetStream)
+    Option(mt) match {
+      case Some(_) =>
+        val params  = mt.getParameters.asScala.toMap
+        val primary = mt.getType
+        val sub     = mt.getSubtype
+        normalize(MimeType(primary, sub, params))
+      case None =>
+        MimeType.octetStream
+    }

  private def makeMetadata(hint: MimeTypeHint): Metadata = {
    val md = new Metadata
@@ -32,21 +37,55 @@ object TikaMimetype {
  }

  private def normalize(in: MimeType): MimeType = in match {
-    case MimeType(_, sub) if sub contains "xhtml" =>
-      MimeType.html
+    case MimeType(_, sub, p) if sub contains "xhtml" =>
+      MimeType.html.copy(params = p)
    case _ => in
  }

-  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
-    convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
+  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
+    val mt = convert(
+      tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
+    )
+    if (mt.primary == "text") {
+      charsetFromBytes(bv, hint) match {
+        case Some(cs) =>
+          mt.withCharset(cs)
+        case None =>
+          mt
+      }
+    } else mt
+  }
+
+  private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] =
+    Either
+      .catchNonFatal {
+        val cd = new Icu4jEncodingDetector()
+        val md = makeMetadata(hint)
+        Option(cd.detect(new java.io.ByteArrayInputStream(bv), md))
+      }
+      .toOption
+      .flatten
+
+  def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint) =
+    data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))

  def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
    data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))

  def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
    dt match {
-      case DataType.Exact(mt)  => mt.pure[F]
-      case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
+      case DataType.Exact(mt) =>
+        mt.resolveCharset match {
+          case None if mt.primary == "text" =>
+            detectCharset[F](data, MimeTypeHint.advertised(mt))
+              .map {
+                case Some(cs) => mt.withCharset(cs)
+                case None     => mt
+              }
+          case _ => mt.pure[F]
+        }
+      case DataType.Hint(hint) =>
+        TikaMimetype.detect(data, hint)
    }

  def detect[F[_]: Sync](file: Path): F[MimeType] =
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -231,7 +231,9 @@ docspell.joex {
          "-s",
          "A4",
          "--encoding",
-          "UTF-8",
+          "{{encoding}}",
+          "--load-error-handling", "ignore",
+          "--load-media-error-handling", "ignore",
          "-",
          "{{outfile}}"
        ]
--- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
+++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
@@ -8,6 +8,7 @@ import emil.javamail.syntax._
 import cats.Applicative

 import docspell.common._
+import java.nio.charset.StandardCharsets

 object ReadMail {

@@ -20,7 +21,7 @@ object ReadMail {
        bytesToMail(s).flatMap(mailToEntries[F](logger))

  def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
-    data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
+    data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])

  def mailToEntries[F[_]: Applicative](
      logger: Logger[F]
@@ -49,7 +50,7 @@ object ReadMail {

  implicit class MimeTypeConv(m: emil.MimeType) {
    def toDocspell: MimeType =
-      MimeType(m.primary, m.sub)
+      MimeType(m.primary, m.sub, m.params)
  }

  private def bodyType[F[_]](body: MailBody[F]): String =
--- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@@ -57,7 +57,7 @@ object ConvertPdf {
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
      mime match {
-        case Mimetype.`application/pdf` =>
+        case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
            (ra, None: Option[RAttachmentMeta]).pure[F]

@@ -66,9 +66,10 @@ object ConvertPdf {
            .get(ra.fileId.id)
            .unNoneTerminate
            .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+          val mt      = MimeType(mime.primary, mime.sub, mime.params)
          val handler = conversionHandler[F](ctx, cfg, ra, item)
          ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
-            conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
+            conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
              data
            )
      }
@@ -104,7 +105,8 @@ object ConvertPdf {
          (ra, None: Option[RAttachmentMeta]).pure[F]

      case ConversionResult.Failure(ex) =>
-        ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
+        ctx.logger
+          .error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
          (ra, None: Option[RAttachmentMeta]).pure[F]
    })

@@ -114,7 +116,8 @@ object ConvertPdf {
      ra: RAttachment,
      pdf: Stream[F, Byte]
  ) = {
-    val hint    = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
+    val hint =
+      MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
    val newName = ra.name.map(n => s"$n.pdf")
    ctx.store.bitpeace
      .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
@@ -122,7 +125,9 @@ object ConvertPdf {
      .lastOrError
      .map(fm => Ident.unsafe(fm.id))
      .flatMap(fmId =>
-        ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
+        ctx.store
+          .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
+          .map(_ => fmId)
      )
      .map(fmId => ra.copy(fileId = fmId, name = newName))
  }
--- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala
@@ -70,7 +70,7 @@ object ExtractArchive {
      archive: Option[RAttachmentArchive]
  )(ra: RAttachment, mime: Mimetype): F[Extracted] =
    mime match {
-      case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
+      case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
        ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
          extractZip(ctx, archive)(ra)
            .flatTap(_ => cleanupParents(ctx, ra, archive))
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -76,7 +76,7 @@ object TextExtraction {
        .getOrElse(Mimetype.`application/octet-stream`)

    findMime
-      .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
+      .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
  }

  private def extractTextFallback[F[_]: Sync: ContextShift](
--- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
@@ -448,7 +448,7 @@ trait Conversions {
  // MIME Type

  def fromContentType(header: `Content-Type`): MimeType =
-    MimeType(header.mediaType.mainType, header.mediaType.subType)
+    MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions)
 }

 object Conversions extends Conversions {
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -17,6 +17,7 @@ object Dependencies {
  val Fs2Version = "2.3.0"
  val H2Version = "1.4.200"
  val Http4sVersion = "0.21.1"
+  val Icu4jVersion = "66.1"
  val KindProjectorVersion = "0.10.3"
  val Log4sVersion = "1.8.2"
  val LogbackVersion = "1.2.3"
@@ -218,4 +219,7 @@ object Dependencies {
    "org.webjars" % "viewerjs" % ViewerJSVersion
  )

+  val icu4j = Seq(
+    "com.ibm.icu" % "icu4j" % Icu4jVersion
+  )
 }