diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 00000000..c87ceb0e --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,13 @@ +Docspell +Copyright 2019-2020 +Licensed under the GPLv3 + +This software contains portions of code from tika-parser +https://tika.apache.org +Copyright (C) Apache Software Foundation (ASF) +Licensed under Apache License 2.0 + +This software contains portions of code from http4s +https://http4s.org +Copyright 2013-2018 http4s.org +Licensed under Apache License 2.0 diff --git a/build.sbt b/build.sbt index b5b674e1..b4183a4d 100644 --- a/build.sbt +++ b/build.sbt @@ -161,7 +161,8 @@ val files = project.in(file("modules/files")). settings( name := "docspell-files", libraryDependencies ++= - Dependencies.tika, + Dependencies.tika ++ + Dependencies.icu4j, Test / sourceGenerators += Def.task { val base = (Test/resourceDirectory).value val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base) diff --git a/modules/common/src/main/scala/docspell/common/Binary.scala b/modules/common/src/main/scala/docspell/common/Binary.scala index 34f2059c..ec128b66 100644 --- a/modules/common/src/main/scala/docspell/common/Binary.scala +++ b/modules/common/src/main/scala/docspell/common/Binary.scala @@ -1,6 +1,8 @@ package docspell.common -import fs2.Stream +import fs2.{Pipe, Stream} +import java.nio.charset.Charset +import java.nio.charset.StandardCharsets final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) { @@ -14,11 +16,67 @@ object Binary { Binary[F](name, MimeType.octetStream, data) def utf8[F[_]](name: String, content: String): Binary[F] = - Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode)) + Binary[F]( + name, + MimeType.octetStream, + Stream.emit(content).through(fs2.text.utf8Encode) + ) def text[F[_]](name: String, content: String): Binary[F] = - utf8(name, content).withMime(MimeType.plain) + utf8(name, content).withMime(MimeType.plain.withUtf8Charset) def html[F[_]](name: String, content: String): Binary[F] = - utf8(name, content).withMime(MimeType.html) + utf8(name, content).withMime(MimeType.html.withUtf8Charset) + + def decode[F[_]](cs: Charset): Pipe[F, Byte, String] = + if (cs == StandardCharsets.UTF_8) { + fs2.text.utf8Decode + } else { + util.decode[F](cs) + } + + // This is a copy from org.http4s.util + // Http4s is licensed under the Apache License 2.0 + private object util { + import fs2._ + import java.nio._ + + private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte) + + def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = { + val decoder = charset.newDecoder + val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt + val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt + val charBufferSize = 128 + + _.repeatPull[String] { + _.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap { + case None => + val charBuffer = CharBuffer.allocate(1) + decoder.decode(ByteBuffer.allocate(0), charBuffer, true) + decoder.flush(charBuffer) + val outputString = charBuffer.flip().toString + if (outputString.isEmpty) Pull.done.as(None) + else Pull.output1(outputString).as(None) + case Some((chunk, stream)) => + if (chunk.nonEmpty) { + val chunkWithoutBom = skipByteOrderMark(chunk) + val bytes = chunkWithoutBom.toArray + val byteBuffer = ByteBuffer.wrap(bytes) + val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte) + decoder.decode(byteBuffer, charBuffer, false) + val nextStream = 
stream.consChunk(Chunk.byteBuffer(byteBuffer.slice())) + Pull.output1(charBuffer.flip().toString).as(Some(nextStream)) + } else { + Pull.output(Chunk.empty[String]).as(Some(stream)) + } + } + } + } + + private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] = + if (chunk.size >= 3 && chunk.take(3) == utf8Bom) { + chunk.drop(3) + } else chunk + } } diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index bffbb667..5acc048c 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -2,13 +2,39 @@ package docspell.common import docspell.common.syntax.all._ import io.circe.{Decoder, Encoder} +import java.nio.charset.StandardCharsets +import java.nio.charset.Charset /** A MIME Type impl with just enough features for the use here. */ -case class MimeType(primary: String, sub: String) { +case class MimeType(primary: String, sub: String, params: Map[String, String]) { + def withParam(name: String, value: String): MimeType = + copy(params = params.updated(name, value)) + + def withCharset(cs: Charset): MimeType = + withParam("charset", cs.name()) + + def withUtf8Charset: MimeType = + withCharset(StandardCharsets.UTF_8) + + def resolveCharset: Option[Charset] = + params.get("charset").flatMap { cs => + if (Charset.isSupported(cs)) Some(Charset.forName(cs)) + else None + } + + def charsetOrUtf8: Charset = + resolveCharset.getOrElse(StandardCharsets.UTF_8) + + def baseType: MimeType = + if (params.isEmpty) this else copy(params = Map.empty) def asString: String = - s"$primary/$sub" + if (params.isEmpty) s"$primary/$sub" + else { + val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";") + s"$primary/$sub; $parameters" + } def matches(other: MimeType): Boolean = primary == other.primary && @@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) { object MimeType { def application(sub: String): MimeType = - MimeType("application", partFromString(sub).throwLeft) + MimeType("application", sub, Map.empty) def text(sub: String): MimeType = - MimeType("text", partFromString(sub).throwLeft) + MimeType("text", sub, Map.empty) def image(sub: String): MimeType = - MimeType("image", partFromString(sub).throwLeft) + MimeType("image", sub, Map.empty) - private[this] val validChars: Set[Char] = - (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet + def parse(str: String): Either[String, MimeType] = { + def parsePrimary: Either[String, (String, String)] = + str.indexOf('/') match { + case -1 => Left(s"Invalid mediatype: $str") + case n => Right(str.take(n) -> str.drop(n + 1)) + } - def parse(str: String): Either[String, MimeType] = - str.indexOf('/') match { - case -1 => Left(s"Invalid MIME type: $str") - case n => - for { - prim <- partFromString(str.substring(0, n)) - sub <- partFromString(str.substring(n + 1)) - } yield MimeType(prim.toLowerCase, sub.toLowerCase) - } + def parseSub(s: String): Either[String, (String, String)] = + s.indexOf(';') match { + case -1 => Right((s, "")) + case n => Right((s.take(n), s.drop(n))) + } + + def parseParams(s: String): Map[String, String] = + s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match { + case a :: b :: Nil => Some((a, b)) + case _ => None + }).toMap + + for { + pt <- parsePrimary + st <- parseSub(pt._2) + pa = parseParams(st._2) + } yield MimeType(pt._1, st._1, pa) + } def unsafe(str: String): MimeType = 
parse(str).throwLeft - private def partFromString(s: String): Either[String, String] = - if (s.forall(validChars.contains)) Right(s) - else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}") - val octetStream = application("octet-stream") val pdf = application("pdf") val zip = application("zip") @@ -55,6 +90,16 @@ object MimeType { val html = text("html") val plain = text("plain") + object PdfMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(pdf)) + } + + object HtmlMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(html)) + } + implicit val jsonEncoder: Encoder[MimeType] = Encoder.encodeString.contramap(_.asString) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index f9320413..66029e35 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -32,18 +32,27 @@ object Conversion { in: Stream[F, Byte] ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { - case MimeType.pdf => + case Pdfs(_) => handler.run(ConversionResult.successPdf(in)) - case MimeType.html => - WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler) + case mt @ MimeType(_, "html", _) => + val cs = mt.charsetOrUtf8 + WkHtmlPdf + .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler) - case Texts(_) => - Markdown.toHtml(in, cfg.markdown).flatMap { html => + case mt @ Texts(_) => + val cs = mt.charsetOrUtf8 + Markdown.toHtml(in, cfg.markdown, cs).flatMap { html => val bytes = Stream .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8))) .covary[F] - WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler) + WkHtmlPdf.toPDF( + cfg.wkhtmlpdf, + cfg.chunkSize, + StandardCharsets.UTF_8, + blocker, + logger + )(bytes, handler) } case Images(mt) => @@ -51,7 +60,9 @@ object Conversion { case Some(dim) => if (dim.product > cfg.maxImageSize) { logger - .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> + .info( + s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})." + ) *> handler.run( ConversionResult.inputMalformed( mt, @@ -59,14 +70,20 @@ object Conversion { ) ) } else { - Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)( + in, + handler + ) } case None => logger.info( s"Cannot read image when determining size for ${mt.asString}. Converting anyways." 
) *> - Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)( + in, + handler + ) } case Office(_) => @@ -90,6 +107,11 @@ object Conversion { Some(m).filter(_.primary == "text") } + object Pdfs { + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(_.matches(MimeType.pdf)) + } + object Office { val odt = MimeType.application("vnd.oasis.opendocument.text") val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") @@ -97,18 +119,33 @@ object Conversion { val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") val msoffice = MimeType.application("x-tika-msoffice") val ooxml = MimeType.application("x-tika-ooxml") - val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") - val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") - val xls = MimeType.application("vnd.ms-excel") - val doc = MimeType.application("msword") - val rtf = MimeType.application("rtf") + val docx = + MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") + val xlsx = + MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") + val xls = MimeType.application("vnd.ms-excel") + val doc = MimeType.application("msword") + val rtf = MimeType.application("rtf") // without a filename, tika returns application/zip for odt/ods files, since // they are just zip files val odfContainer = MimeType.zip val all = - Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer) + Set( + odt, + ods, + odtAlias, + odsAlias, + msoffice, + ooxml, + docx, + xlsx, + xls, + doc, + rtf, + odfContainer + ) def unapply(m: MimeType): Option[MimeType] = Some(m).filter(all.contains) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 19473de3..8199191e 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -7,20 +7,23 @@ import fs2.Stream import docspell.common._ import docspell.convert.ConversionResult import docspell.convert.ConversionResult.Handler +import java.nio.charset.Charset object WkHtmlPdf { def toPDF[F[_]: Sync: ContextShift, A]( cfg: WkHtmlPdfConfig, chunkSize: Int, + charset: Charset, blocker: Blocker, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResult[F](blocker, chunkSize, logger) + val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) ExternConv - .toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)( + .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)( in, handler ) diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala index 7d185d86..543dec41 100644 --- a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala +++ b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala @@ -1,8 +1,9 @@ package docspell.convert.flexmark import java.io.{InputStream, InputStreamReader} -import java.nio.charset.StandardCharsets +import java.nio.charset.Charset import java.util +import scala.util.Try import 
cats.effect.Sync import cats.implicits._ @@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet} import fs2.Stream -import scala.util.Try +import docspell.common._ object Markdown { - def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = { + def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = { val p = createParser() val r = createRenderer() Try { - val reader = new InputStreamReader(is, StandardCharsets.UTF_8) + val reader = new InputStreamReader(is, cs) val doc = p.parseReader(reader) wrapHtml(r.render(doc), cfg) }.toEither @@ -34,8 +35,8 @@ object Markdown { wrapHtml(r.render(doc), cfg) } - def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] = - data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg)) + def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] = + data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg)) private def wrapHtml(body: String, cfg: MarkdownConfig): String = s""" diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 2834b2e0..780783bf 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -7,6 +7,7 @@ import docspell.common._ import docspell.convert.FileChecks import docspell.files.{ExampleFiles, TestFiles} import minitest.SimpleTestSuite +import java.nio.charset.StandardCharsets object ExternConvTest extends SimpleTestSuite with FileChecks { val blocker = TestFiles.blocker @@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks { val wkCfg = WkHtmlPdfConfig(cfg, target) val p = WkHtmlPdf - .toPDF[IO, Path](wkCfg, 8192, blocker, logger)( + .toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)( ExampleFiles.letter_de_html.readURL[IO](8192, blocker), storePdfHandler(dir.resolve("test.pdf")) ) diff --git a/modules/extract/NOTICE b/modules/extract/NOTICE deleted file mode 100644 index 05ccbbcc..00000000 --- a/modules/extract/NOTICE +++ /dev/null @@ -1,11 +0,0 @@ -The Java source files in docspell-extract are unmodified copies of -those found in the Apache Tika parser project. It follows the -NOTICE.txt file from Apache Tika parsers: - -Apache Tika parsers -Copyright 2007-2019 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 54c5cf10..ed86bfd3 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -31,7 +31,7 @@ object Extraction { lang: Language ): F[ExtractResult] = TikaMimetype.resolve(dataType, data).flatMap { - case MimeType.pdf => + case MimeType.PdfMatch(_) => PdfExtract .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .map(ExtractResult.fromEither) @@ -75,14 +75,15 @@ object Extraction { doExtract } - case OdfType.container => + case OdfType.ContainerMatch(_) => logger .info(s"File detected as ${OdfType.container}. 
Try to read as OpenDocument file.") *> OdfExtract.get(data).map(ExtractResult.fromEither) - case mt @ MimeType("text", sub) if !sub.contains("html") => + case mt @ MimeType("text", sub, _) if !sub.contains("html") => + val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> - data.through(fs2.text.utf8Decode).compile.last.map { txt => + data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => ExtractResult.success(txt.getOrElse("").trim) } diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala index 886b0e50..96a51005 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala @@ -12,5 +12,5 @@ object OcrType { val all = Set(jpeg, png, tiff, pdf) def unapply(mt: MimeType): Option[MimeType] = - Some(mt).filter(all.contains) + Some(mt).map(_.baseType).filter(all.contains) } diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala index 93c7ddcc..13dd3dd2 100644 --- a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala @@ -14,5 +14,10 @@ object OdfType { val all = Set(odt, ods, odtAlias, odsAlias) def unapply(mt: MimeType): Option[MimeType] = - Some(mt).filter(all.contains) + Some(mt).map(_.baseType).filter(all.contains) + + object ContainerMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(container)) + } } diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala index d2f9f4cb..23850994 100644 --- a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala @@ -14,6 +14,6 @@ object PoiType { val all = Set(msoffice, ooxml, docx, xlsx, xls, doc) def unapply(arg: MimeType): Option[MimeType] = - Some(arg).filter(all.contains) + Some(arg).map(_.baseType).filter(all.contains) } diff --git a/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java new file mode 100644 index 00000000..359e7bb0 --- /dev/null +++ b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java @@ -0,0 +1,11 @@ +package org.apache.tika.parser.txt; + +import java.io.InputStream; +import java.io.IOException; + +public final class IOUtils { + + public static long readFully(InputStream in, byte[] buffer) throws IOException { + return in.read(buffer, 0, buffer.length); + } +} diff --git a/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java new file mode 100644 index 00000000..7737aa72 --- /dev/null +++ b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.txt; + +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.CharsetUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; + +public class Icu4jEncodingDetector implements EncodingDetector { + + public Charset detect(InputStream input, Metadata metadata) + throws IOException { + if (input == null) { + return null; + } + + CharsetDetector detector = new CharsetDetector(); + + String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); + String incomingType = metadata.get(Metadata.CONTENT_TYPE); + if (incomingCharset == null && incomingType != null) { + // TIKA-341: Use charset in content-type + MediaType mt = MediaType.parse(incomingType); + if (mt != null) { + incomingCharset = mt.getParameters().get("charset"); + } + } + + if (incomingCharset != null) { + String cleaned = CharsetUtils.clean(incomingCharset); + if (cleaned != null) { + detector.setDeclaredEncoding(cleaned); + } else { + // TODO: log a warning? + } + } + + // TIKA-341 without enabling input filtering (stripping of tags) + // short HTML tests don't work well + detector.enableInputFilter(true); + + detector.setText(input); + + for (CharsetMatch match : detector.detectAll()) { + try { + return CharsetUtils.forName(match.getName()); + } catch (Exception e) { + // ignore + } + } + + return null; + } +} diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 3a1ea3cf..a9e594e6 100644 --- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -2,27 +2,32 @@ package docspell.files import java.io.BufferedInputStream import java.nio.file.{Files, Path} +import java.nio.charset.Charset +import scala.jdk.CollectionConverters._ +import scala.util.Using import cats.implicits._ import cats.effect.Sync -import docspell.common._ import fs2.Stream import org.apache.tika.config.TikaConfig import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} import org.apache.tika.mime.MediaType - -import scala.util.Using +import org.apache.tika.parser.txt.Icu4jEncodingDetector +import docspell.common._ object TikaMimetype { private val tika = new TikaConfig().getDetector private def convert(mt: MediaType): MimeType = - Option(mt) - .map(_.toString) - .map(MimeType.parse) - .flatMap(_.toOption) - .map(normalize) - .getOrElse(MimeType.octetStream) + Option(mt) match { + case Some(_) => + val params = mt.getParameters.asScala.toMap + val primary = mt.getType + val sub = mt.getSubtype + normalize(MimeType(primary, sub, params)) + case None => + MimeType.octetStream + } private def makeMetadata(hint: MimeTypeHint): Metadata = { val md = new Metadata @@ -32,21 +37,55 @@ object TikaMimetype { } private def normalize(in: MimeType): MimeType = in match { - case MimeType(_, sub) if sub contains "xhtml" => - MimeType.html + case MimeType(_, sub, p) if sub contains "xhtml" => + 
MimeType.html.copy(params = p) case _ => in } - private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = - convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) + private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = { + val mt = convert( + tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)) + ) + if (mt.primary == "text") { + charsetFromBytes(bv, hint) match { + case Some(cs) => + mt.withCharset(cs) + case None => + mt + } + } else mt + } + + private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = + Either + .catchNonFatal { + val cd = new Icu4jEncodingDetector() + val md = makeMetadata(hint) + Option(cd.detect(new java.io.ByteArrayInputStream(bv), md)) + } + .toOption + .flatten + + def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint) = + data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint)) def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = dt match { - case DataType.Exact(mt) => mt.pure[F] - case DataType.Hint(hint) => TikaMimetype.detect(data, hint) + case DataType.Exact(mt) => + mt.resolveCharset match { + case None if mt.primary == "text" => + detectCharset[F](data, MimeTypeHint.advertised(mt)) + .map { + case Some(cs) => mt.withCharset(cs) + case None => mt + } + case _ => mt.pure[F] + } + case DataType.Hint(hint) => + TikaMimetype.detect(data, hint) } def detect[F[_]: Sync](file: Path): F[MimeType] = diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index cd345cfb..c33d727c 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -231,7 +231,9 @@ docspell.joex { "-s", "A4", "--encoding", - "UTF-8", + "{{encoding}}", + "--load-error-handling", "ignore", + "--load-media-error-handling", "ignore", "-", "{{outfile}}" ] diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 2f4f8b54..3525b9f5 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -8,6 +8,7 @@ import emil.javamail.syntax._ import cats.Applicative import docspell.common._ +import java.nio.charset.StandardCharsets object ReadMail { @@ -20,7 +21,7 @@ object ReadMail { bytesToMail(s).flatMap(mailToEntries[F](logger)) def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] = - data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F]) + data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F]) def mailToEntries[F[_]: Applicative]( logger: Logger[F] @@ -49,7 +50,7 @@ object ReadMail { implicit class MimeTypeConv(m: emil.MimeType) { def toDocspell: MimeType = - MimeType(m.primary, m.sub) + MimeType(m.primary, m.sub, m.params) } private def bodyType[F[_]](body: MailBody[F]): String = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 542d1f5a..f49a4d80 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -57,7 +57,7 @@ object ConvertPdf { )(ra: RAttachment, 
mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv => mime match { - case Mimetype.`application/pdf` => + case mt if mt.baseEqual(Mimetype.`application/pdf`) => ctx.logger.info("Not going to convert a PDF file into a PDF.") *> (ra, None: Option[RAttachmentMeta]).pure[F] @@ -66,9 +66,10 @@ object ConvertPdf { .get(ra.fileId.id) .unNoneTerminate .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + val mt = MimeType(mime.primary, mime.sub, mime.params) val handler = conversionHandler[F](ctx, cfg, ra, item) ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> - conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)( + conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( data ) } @@ -104,7 +105,8 @@ object ConvertPdf { (ra, None: Option[RAttachmentMeta]).pure[F] case ConversionResult.Failure(ex) => - ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *> + ctx.logger + .error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *> (ra, None: Option[RAttachmentMeta]).pure[F] }) @@ -114,7 +116,8 @@ object ConvertPdf { ra: RAttachment, pdf: Stream[F, Byte] ) = { - val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) + val hint = + MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) val newName = ra.name.map(n => s"$n.pdf") ctx.store.bitpeace .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised)) @@ -122,7 +125,9 @@ object ConvertPdf { .lastOrError .map(fm => Ident.unsafe(fm.id)) .flatMap(fmId => - ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId) + ctx.store + .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)) + .map(_ => fmId) ) .map(fmId => ra.copy(fileId = fmId, name = newName)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index 62ea43cf..de973c67 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -70,7 +70,7 @@ object ExtractArchive { archive: Option[RAttachmentArchive] )(ra: RAttachment, mime: Mimetype): F[Extracted] = mime match { - case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) => + case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) => ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("")}.") *> extractZip(ctx, archive)(ra) .flatTap(_ => cleanupParents(ctx, ra, archive)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index e96a71be..ffe06f22 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -76,7 +76,7 @@ object TextExtraction { .getOrElse(Mimetype.`application/octet-stream`) findMime - .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang)) + .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang)) } private def extractTextFallback[F[_]: Sync: ContextShift]( diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala 
b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index f1726f8e..322f1abf 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -448,7 +448,7 @@ trait Conversions { // MIME Type def fromContentType(header: `Content-Type`): MimeType = - MimeType(header.mediaType.mainType, header.mediaType.subType) + MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions) } object Conversions extends Conversions { diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 74f66fe4..ac2817e4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -17,6 +17,7 @@ object Dependencies { val Fs2Version = "2.3.0" val H2Version = "1.4.200" val Http4sVersion = "0.21.1" + val Icu4jVersion = "66.1" val KindProjectorVersion = "0.10.3" val Log4sVersion = "1.8.2" val LogbackVersion = "1.2.3" @@ -218,4 +219,7 @@ object Dependencies { "org.webjars" % "viewerjs" % ViewerJSVersion ) + val icu4j = Seq( + "com.ibm.icu" % "icu4j" % Icu4jVersion + ) }
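
Illustrative usage (not part of the patch): a minimal sketch of how the charset-aware pieces introduced above are meant to compose — MimeType now carries its parameters, the charset is resolved from them (falling back to UTF-8), and Binary.decode turns bytes into text with that charset instead of assuming UTF-8. The sample string, the ISO-8859-1 value and the val names are hypothetical; only MimeType, Binary and the fs2/cats-effect types come from the changes above.

  import cats.effect.IO
  import cats.implicits._
  import fs2.{Chunk, Stream}
  import docspell.common._

  // a text mime type carrying its charset parameter, e.g. as produced by
  // TikaMimetype.resolve or MimeType.parse on a Content-Type style string
  val mt: MimeType = MimeType.unsafe("text/plain; charset=ISO-8859-1")
  val cs           = mt.charsetOrUtf8 // resolves the charset param, UTF-8 if absent/unsupported

  // decode a byte stream with the resolved charset rather than fs2.text.utf8Decode
  val bytes: Stream[IO, Byte] =
    Stream.chunk(Chunk.bytes("Grüße aus Köln".getBytes(cs)))
  val decoded: IO[String] =
    bytes.through(Binary.decode[IO](cs)).compile.foldMonoid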