Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is
now detected, which may not work for all files. The default/fallback is
UTF-8.

There is still a problem with mails that contain HTML parts that are
not encoded in UTF-8. The mail text is always returned as a string and
the original encoding is lost. The HTML is then stored using UTF-8
bytes, but wkhtmltopdf reads it using latin1. It seems that the
`--encoding` setting doesn't override the encoding declared by the
document.
This commit is contained in:
Eike Kettner
2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions

View File

@ -0,0 +1,11 @@
package org.apache.tika.parser.txt;
import java.io.InputStream;
import java.io.IOException;
public final class IOUtils {
public static long readFully(InputStream in, byte[] buffer) throws IOException {
return in.read(buffer, 0, buffer.length);
}
}

View File

@ -0,0 +1,75 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.txt;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
/**
 * {@link EncodingDetector} implementation that delegates to ICU4J's
 * {@link CharsetDetector}.
 */
public class Icu4jEncodingDetector implements EncodingDetector {

    /**
     * Guesses the charset of the given stream.
     *
     * @param input    the stream to inspect; {@code null} yields {@code null}
     * @param metadata consulted for a declared charset: the content-encoding
     *                 entry first, then the {@code charset} parameter of the
     *                 content-type entry
     * @return the first detector match whose name resolves to a
     *         {@link Charset}, or {@code null} if there is none
     * @throws IOException propagated from the underlying detector
     */
    public Charset detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return null;
        }

        CharsetDetector icu = new CharsetDetector();

        String declared = declaredCharset(metadata);
        if (declared != null) {
            String normalized = CharsetUtils.clean(declared);
            if (normalized != null) {
                icu.setDeclaredEncoding(normalized);
            }
            // TODO: log a warning when the declared charset cannot be cleaned?
        }

        // TIKA-341 without enabling input filtering (stripping of tags)
        // short HTML tests don't work well
        icu.enableInputFilter(true);
        icu.setText(input);

        CharsetMatch[] candidates = icu.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            try {
                return CharsetUtils.forName(candidates[i].getName());
            } catch (Exception e) {
                // not resolvable, try the next candidate
            }
        }
        return null;
    }

    /**
     * Returns the charset declared in the metadata, or {@code null} if none
     * is declared.
     */
    private static String declaredCharset(Metadata metadata) {
        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (encoding != null || contentType == null) {
            return encoding;
        }
        // TIKA-341: Use charset in content-type
        MediaType parsed = MediaType.parse(contentType);
        return parsed == null ? null : parsed.getParameters().get("charset");
    }
}

View File

@ -2,27 +2,32 @@ package docspell.files
import java.io.BufferedInputStream
import java.nio.file.{Files, Path}
import java.nio.charset.Charset
import scala.jdk.CollectionConverters._
import scala.util.Using
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import fs2.Stream
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.mime.MediaType
import scala.util.Using
import org.apache.tika.parser.txt.Icu4jEncodingDetector
import docspell.common._
object TikaMimetype {
// Detector obtained from a no-argument TikaConfig; created once and reused
// by the detection helpers in this object.
private val tika = new TikaConfig().getDetector
/** Converts tika's `MediaType` into a `MimeType`.
  *
  * A `null` argument yields `MimeType.octetStream`; any other result is
  * passed through `normalize`.
  */
private def convert(mt: MediaType): MimeType =
Option(mt)
.map(_.toString)
.map(MimeType.parse)
.flatMap(_.toOption)
.map(normalize)
.getOrElse(MimeType.octetStream)
Option(mt) match {
case Some(_) =>
val params = mt.getParameters.asScala.toMap
val primary = mt.getType
val sub = mt.getSubtype
normalize(MimeType(primary, sub, params))
case None =>
MimeType.octetStream
}
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
@ -32,21 +37,55 @@ object TikaMimetype {
}
/** Rewrites mime types whose subtype contains "xhtml" to `MimeType.html`
  * and returns every other mime type unchanged.
  *
  * The three-argument case carries the original parameters over via
  * `copy(params = p)`.
  */
private def normalize(in: MimeType): MimeType = in match {
case MimeType(_, sub) if sub contains "xhtml" =>
MimeType.html
case MimeType(_, sub, p) if sub contains "xhtml" =>
MimeType.html.copy(params = p)
case _ => in
}
/** Detects the mime type of `bv` with the tika detector, supplying the
  * metadata built from `hint`, and converts the result via `convert`.
  */
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
/** Detects the mime type of `bv` using tika and the metadata built from
  * `hint`. For mime types with primary type "text", a charset is detected
  * from the same bytes and attached when detection yields one.
  */
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
  val detected = tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
  val mime     = convert(detected)
  if (mime.primary != "text") mime
  else charsetFromBytes(bv, hint).fold(mime)(cs => mime.withCharset(cs))
}
/** Runs the ICU4J based encoding detector over `bv`, using the metadata
  * built from `hint`.
  *
  * Returns `None` when the detector yields `null` or when a non-fatal
  * exception is thrown.
  */
private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = {
  val attempt = Either.catchNonFatal {
    val in = new java.io.ByteArrayInputStream(bv)
    new Icu4jEncodingDetector().detect(in, makeMetadata(hint))
  }
  // the detector may return null, hence the extra Option wrapping
  attempt.toOption.flatMap(cs => Option(cs))
}
/** Detects the charset of `data` by inspecting at most its first 8000 bytes.
  *
  * @param data the bytes to inspect
  * @param hint the hint from which the detector's metadata is built
  * @return the detected charset, or `None` if detection yields nothing
  */
def detectCharset[F[_]: Sync](
    data: Stream[F, Byte],
    hint: MimeTypeHint
): F[Option[Charset]] =
  data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))
/** Detects the mime type of `data` from at most its first 64 bytes,
  * taking `hint` into account.
  */
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
  for {
    head <- data.take(64).compile.toVector
  } yield fromBytes(head.toArray, hint)
/** Resolves the mime type for `data` according to `dt`.
  *
  * With `DataType.Hint`, the mime type is detected from the data via
  * `TikaMimetype.detect`. With `DataType.Exact`, the given mime type is
  * used; in the charset-aware branch, a mime type with primary type "text"
  * for which `resolveCharset` returns `None` gets a charset detected from
  * the data and attached when detection yields one.
  */
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
dt match {
case DataType.Exact(mt) => mt.pure[F]
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
case DataType.Exact(mt) =>
mt.resolveCharset match {
case None if mt.primary == "text" =>
detectCharset[F](data, MimeTypeHint.advertised(mt))
.map {
case Some(cs) => mt.withCharset(cs)
case None => mt
}
case _ => mt.pure[F]
}
case DataType.Hint(hint) =>
TikaMimetype.detect(data, hint)
}
def detect[F[_]: Sync](file: Path): F[MimeType] =