mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-20 17:39:54 +00:00
Improve handling of encodings
HTML and text files are no longer fixed to UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not encoded in UTF-8: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored using UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding provided by the document.
This commit is contained in:
@ -0,0 +1,11 @@
|
||||
package org.apache.tika.parser.txt;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
public final class IOUtils {
|
||||
|
||||
public static long readFully(InputStream in, byte[] buffer) throws IOException {
|
||||
return in.read(buffer, 0, buffer.length);
|
||||
}
|
||||
}
|
@ -0,0 +1,75 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.tika.parser.txt;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import org.apache.tika.detect.EncodingDetector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.utils.CharsetUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
public class Icu4jEncodingDetector implements EncodingDetector {
|
||||
|
||||
public Charset detect(InputStream input, Metadata metadata)
|
||||
throws IOException {
|
||||
if (input == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
|
||||
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
|
||||
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (incomingCharset == null && incomingType != null) {
|
||||
// TIKA-341: Use charset in content-type
|
||||
MediaType mt = MediaType.parse(incomingType);
|
||||
if (mt != null) {
|
||||
incomingCharset = mt.getParameters().get("charset");
|
||||
}
|
||||
}
|
||||
|
||||
if (incomingCharset != null) {
|
||||
String cleaned = CharsetUtils.clean(incomingCharset);
|
||||
if (cleaned != null) {
|
||||
detector.setDeclaredEncoding(cleaned);
|
||||
} else {
|
||||
// TODO: log a warning?
|
||||
}
|
||||
}
|
||||
|
||||
// TIKA-341 without enabling input filtering (stripping of tags)
|
||||
// short HTML tests don't work well
|
||||
detector.enableInputFilter(true);
|
||||
|
||||
detector.setText(input);
|
||||
|
||||
for (CharsetMatch match : detector.detectAll()) {
|
||||
try {
|
||||
return CharsetUtils.forName(match.getName());
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
@ -2,27 +2,32 @@ package docspell.files
|
||||
|
||||
import java.io.BufferedInputStream
|
||||
import java.nio.file.{Files, Path}
|
||||
import java.nio.charset.Charset
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
import scala.util.Using
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import fs2.Stream
|
||||
import org.apache.tika.config.TikaConfig
|
||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
||||
import org.apache.tika.mime.MediaType
|
||||
|
||||
import scala.util.Using
|
||||
import org.apache.tika.parser.txt.Icu4jEncodingDetector
|
||||
import docspell.common._
|
||||
|
||||
object TikaMimetype {
|
||||
private val tika = new TikaConfig().getDetector
|
||||
|
||||
/** Converts a Tika `MediaType` into a `MimeType`. A `null` input is wrapped
  * with `Option(mt)` and yields `MimeType.octetStream`; any other result is
  * passed through `normalize`.
  *
  * NOTE(review): two bodies follow this signature. This looks like a rendered
  * diff hunk in which the removed body precedes the added one — confirm which
  * body is current before editing.
  */
private def convert(mt: MediaType): MimeType =
|
||||
// Variant 1: goes through the string form of `mt`; also falls back to
// octetStream when `MimeType.parse(...).toOption` is empty.
Option(mt)
|
||||
.map(_.toString)
|
||||
.map(MimeType.parse)
|
||||
.flatMap(_.toOption)
|
||||
.map(normalize)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
// Variant 2: builds the MimeType directly from type, subtype and the
// media-type parameters (converted to a Scala Map).
Option(mt) match {
|
||||
case Some(_) =>
|
||||
val params = mt.getParameters.asScala.toMap
|
||||
val primary = mt.getType
|
||||
val sub = mt.getSubtype
|
||||
normalize(MimeType(primary, sub, params))
|
||||
case None =>
|
||||
MimeType.octetStream
|
||||
}
|
||||
|
||||
private def makeMetadata(hint: MimeTypeHint): Metadata = {
|
||||
val md = new Metadata
|
||||
@ -32,21 +37,55 @@ object TikaMimetype {
|
||||
}
|
||||
|
||||
/** Maps every mime type whose subtype contains "xhtml" to `MimeType.html`;
  * all other mime types are returned unchanged.
  *
  * NOTE(review): the two xhtml cases look like the removed and the added
  * line of a rendered diff hunk — confirm which one is current.
  */
private def normalize(in: MimeType): MimeType = in match {
|
||||
// Two-field pattern: returns plain `MimeType.html`.
case MimeType(_, sub) if sub contains "xhtml" =>
|
||||
MimeType.html
|
||||
// Three-field pattern: returns `MimeType.html` carrying the input's params.
case MimeType(_, sub, p) if sub contains "xhtml" =>
|
||||
MimeType.html.copy(params = p)
|
||||
case _ => in
|
||||
}
|
||||
|
||||
/** Detects the mime type of the given bytes with Tika, using metadata built
  * from `hint`, and converts the result with `convert`.
  *
  * NOTE(review): `fromBytes` appears twice below. This looks like a rendered
  * diff hunk in which the removed definition precedes the added one —
  * confirm which definition is current before editing.
  */
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
|
||||
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
|
||||
// Second definition: same detection, but for a "text" primary type it also
// runs `charsetFromBytes` and attaches the detected charset, if any.
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
|
||||
val mt = convert(
|
||||
tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
|
||||
)
|
||||
if (mt.primary == "text") {
|
||||
charsetFromBytes(bv, hint) match {
|
||||
case Some(cs) =>
|
||||
mt.withCharset(cs)
|
||||
case None =>
|
||||
// No charset detected: keep the mime type as detected.
mt
|
||||
}
|
||||
} else mt
|
||||
}
|
||||
|
||||
/** Best-effort charset detection on a byte array.
  *
  * Runs the ICU4J-based detector with metadata derived from `hint`. A `null`
  * result from the detector and any non-fatal exception thrown while
  * detecting both yield `None`.
  */
private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] =
  scala.util
    .Try {
      val detector = new Icu4jEncodingDetector()
      val metadata = makeMetadata(hint)
      detector.detect(new java.io.ByteArrayInputStream(bv), metadata)
    }
    .toOption
    .flatMap(Option(_)) // the detector signals "unknown" with null
|
||||
|
||||
/** Tries to detect the charset of a byte stream.
  *
  * Only the first 8000 bytes (at most) are pulled from `data` and handed to
  * `charsetFromBytes` together with `hint`.
  * NOTE(review): 8000 presumably matches the detector's internal buffer
  * size — confirm.
  *
  * @param data the bytes to inspect
  * @param hint passed on to `charsetFromBytes`
  * @return the detected charset, or `None` if `charsetFromBytes` finds none
  */
def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Option[Charset]] =
  data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))
|
||||
|
||||
/** Detects the mime type of a byte stream.
  *
  * At most the first 64 bytes of `data` are pulled and passed to `fromBytes`
  * together with `hint`.
  */
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
  for {
    head <- data.take(64).compile.toVector
  } yield fromBytes(head.toArray, hint)
|
||||
|
||||
/** Resolves a `DataType` to a concrete `MimeType`.
  *
  * An exact type is used as given, a hint triggers detection on `data`.
  *
  * NOTE(review): the `DataType.Exact` and `DataType.Hint` cases each appear
  * twice below. This looks like a rendered diff hunk in which the removed
  * cases precede the added ones — confirm which cases are current.
  */
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
|
||||
dt match {
|
||||
// First pair of cases: exact type returned untouched, hint detected.
case DataType.Exact(mt) => mt.pure[F]
|
||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||
// Second pair of cases: an exact "text" type for which `resolveCharset`
// returns None gets a charset detected from `data`.
case DataType.Exact(mt) =>
|
||||
mt.resolveCharset match {
|
||||
case None if mt.primary == "text" =>
|
||||
// The exact mime type itself serves as the detection hint.
detectCharset[F](data, MimeTypeHint.advertised(mt))
|
||||
.map {
|
||||
case Some(cs) => mt.withCharset(cs)
|
||||
case None => mt
|
||||
}
|
||||
// Every other outcome of `resolveCharset`: keep the mime type as given.
case _ => mt.pure[F]
|
||||
}
|
||||
case DataType.Hint(hint) =>
|
||||
TikaMimetype.detect(data, hint)
|
||||
}
|
||||
|
||||
def detect[F[_]: Sync](file: Path): F[MimeType] =
|
||||
|
Reference in New Issue
Block a user