Improve handling encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as latin1. It seems that the `--encoding` setting doesn't override the encoding declared by the document.
2025-08-05 02:24:52 +00:00 · 2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions
--- a/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java
+++ b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java
@ -0,0 +1,11 @@
+package org.apache.tika.parser.txt;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+public final class IOUtils {
+
+    public static long readFully(InputStream in, byte[] buffer) throws IOException {
+        return in.read(buffer, 0, buffer.length);
+    }
+}
--- a/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.txt;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.CharsetUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+public class Icu4jEncodingDetector implements EncodingDetector {
+
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        CharsetDetector detector = new CharsetDetector();
+
+        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+        if (incomingCharset == null && incomingType != null) {
+            // TIKA-341: Use charset in content-type
+            MediaType mt = MediaType.parse(incomingType);
+            if (mt != null) {
+                incomingCharset = mt.getParameters().get("charset");
+            }
+        }
+
+        if (incomingCharset != null) {
+            String cleaned = CharsetUtils.clean(incomingCharset);
+            if (cleaned != null) {
+                detector.setDeclaredEncoding(cleaned);
+            } else {
+                // TODO: log a warning?
+            }
+        }
+
+        // TIKA-341 without enabling input filtering (stripping of tags)
+        // short HTML tests don't work well
+        detector.enableInputFilter(true);
+
+        detector.setText(input);
+
+        for (CharsetMatch match : detector.detectAll()) {
+            try {
+                return CharsetUtils.forName(match.getName());
+            } catch (Exception e) {
+                // ignore
+            }
+        }
+
+        return null;
+    }
+}
--- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
+++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
@ -2,27 +2,32 @@ package docspell.files

 import java.io.BufferedInputStream
 import java.nio.file.{Files, Path}
+import java.nio.charset.Charset

+import scala.jdk.CollectionConverters._
+import scala.util.Using
 import cats.implicits._
 import cats.effect.Sync
-import docspell.common._
 import fs2.Stream
 import org.apache.tika.config.TikaConfig
 import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
 import org.apache.tika.mime.MediaType
-
-import scala.util.Using
+import org.apache.tika.parser.txt.Icu4jEncodingDetector
+import docspell.common._

 object TikaMimetype {
  private val tika = new TikaConfig().getDetector

  private def convert(mt: MediaType): MimeType =
-    Option(mt)
-      .map(_.toString)
-      .map(MimeType.parse)
-      .flatMap(_.toOption)
-      .map(normalize)
-      .getOrElse(MimeType.octetStream)
+    // A `null` media type falls back to `MimeType.octetStream`; otherwise the
+    // type, subtype and parameters are copied over and then normalized.
+    Option(mt)
+      .map { mediaType =>
+        val parameters = mediaType.getParameters.asScala.toMap
+        normalize(MimeType(mediaType.getType, mediaType.getSubtype, parameters))
+      }
+      .getOrElse(MimeType.octetStream)

  private def makeMetadata(hint: MimeTypeHint): Metadata = {
    val md = new Metadata
@ -32,21 +37,55 @@ object TikaMimetype {
  }

  private def normalize(in: MimeType): MimeType = in match {
-    case MimeType(_, sub) if sub contains "xhtml" =>
-      MimeType.html
+    // Subtypes containing "xhtml" are mapped to `MimeType.html`, keeping the
+    // parameters of the incoming type.
+    case MimeType(_, subtype, parameters) if subtype.contains("xhtml") =>
+      MimeType.html.copy(params = parameters)
    case _ => in
  }

-  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
-    convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
+  /** Detects the mime type of the given bytes. If the primary type is
+    * `text`, a charset detected from the same bytes is attached when one
+    * is found.
+    */
+  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
+    val input    = new java.io.ByteArrayInputStream(bv)
+    val detected = convert(tika.detect(input, makeMetadata(hint)))
+    if (detected.primary != "text") detected
+    else
+      charsetFromBytes(bv, hint)
+        .map(cs => detected.withCharset(cs))
+        .getOrElse(detected)
+  }
+
+  /** Best-effort charset detection on the given bytes using
+    * `Icu4jEncodingDetector`. Yields `None` if the detector returns `null`
+    * or throws a non-fatal exception.
+    */
+  private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = {
+    val attempt = Either.catchNonFatal {
+      val detector = new Icu4jEncodingDetector()
+      val metadata = makeMetadata(hint)
+      detector.detect(new java.io.ByteArrayInputStream(bv), metadata)
+    }
+    // A failed detection and a `null` result are both treated as "unknown".
+    attempt.toOption.flatMap(cs => Option(cs))
+  }
+
+  /** Tries to detect the charset of `data` by inspecting at most its first
+    * 8000 bytes.
+    *
+    * @param data the bytes to inspect; only a prefix is consumed
+    * @param hint passed on to the charset detection as metadata
+    * @return the detected charset, or `None` if none could be determined
+    */
+  def detectCharset[F[_]: Sync](
+      data: Stream[F, Byte],
+      hint: MimeTypeHint
+  ): F[Option[Charset]] =
+    data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))

  /** Detects the mime type of `data` from at most its first 64 bytes,
    * passing `hint` on to `fromBytes`.
    *
    * NOTE(review): `fromBytes` also runs charset detection for `text` types,
    * so that detection sees only these 64 bytes, whereas `detectCharset`
    * reads up to 8000 bytes. Confirm that this difference is intended.
    */
  def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
    data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))

  def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
    dt match {
-      case DataType.Exact(mt)  => mt.pure[F]
-      case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
+      // An exact `text` type that resolves to no charset gets one detected
+      // from the data; if detection finds nothing the type is returned as is.
+      case DataType.Exact(mt) if mt.resolveCharset.isEmpty && mt.primary == "text" =>
+        detectCharset[F](data, MimeTypeHint.advertised(mt))
+          .map(found => found.map(cs => mt.withCharset(cs)).getOrElse(mt))
+      // Every other exact type is used unchanged.
+      case DataType.Exact(mt) =>
+        mt.pure[F]
+      // Only a hint is known: detect the type from the data.
+      case DataType.Hint(hint) =>
+        TikaMimetype.detect(data, hint)
    }

  def detect[F[_]: Sync](file: Path): F[MimeType] =