mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Improve handling of encodings
HTML and text files are no longer fixed to UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not encoded in UTF-8: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as latin1. It seems that the `--encoding` setting doesn't override the encoding declared by the document.
This commit is contained in:
		@@ -0,0 +1,11 @@
 | 
			
		||||
package org.apache.tika.parser.txt;
 | 
			
		||||
 | 
			
		||||
import java.io.InputStream;
 | 
			
		||||
import java.io.IOException;
 | 
			
		||||
 | 
			
		||||
/**
 * Small stream helper living in the Tika text-parser package.
 */
public final class IOUtils {

    /**
     * Reads from {@code in} until {@code buffer} is completely filled or the
     * stream is exhausted.
     *
     * <p>A single {@link InputStream#read(byte[], int, int)} call may return
     * fewer bytes than requested even though more data is still available, so
     * the read is repeated until the buffer is full or end-of-stream is hit.
     *
     * @param in     the stream to read from
     * @param buffer the destination; filled from index 0
     * @return the number of bytes stored in {@code buffer}; {@code -1} if the
     *         stream was already at its end and nothing could be read;
     *         {@code 0} if {@code buffer} is empty
     * @throws IOException if the underlying stream fails
     */
    public static long readFully(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int got = in.read(buffer, total, buffer.length - total);
            if (got < 0) {
                // End of stream: report -1 only when not a single byte was read,
                // mirroring what a plain read() would have returned.
                return total == 0 ? -1 : total;
            }
            total += got;
        }
        return total;
    }
}
 | 
			
		||||
@@ -0,0 +1,75 @@
 | 
			
		||||
/**
 | 
			
		||||
 * Licensed to the Apache Software Foundation (ASF) under one or more
 | 
			
		||||
 * contributor license agreements.  See the NOTICE file distributed with
 | 
			
		||||
 * this work for additional information regarding copyright ownership.
 | 
			
		||||
 * The ASF licenses this file to You under the Apache License, Version 2.0
 | 
			
		||||
 * (the "License"); you may not use this file except in compliance with
 | 
			
		||||
 * the License.  You may obtain a copy of the License at
 | 
			
		||||
 * <p/>
 | 
			
		||||
 * http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 * <p/>
 | 
			
		||||
 * Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
 * distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
 * See the License for the specific language governing permissions and
 | 
			
		||||
 * limitations under the License.
 | 
			
		||||
 */
 | 
			
		||||
package org.apache.tika.parser.txt;
 | 
			
		||||
 | 
			
		||||
import com.ibm.icu.text.CharsetDetector;
 | 
			
		||||
import com.ibm.icu.text.CharsetMatch;
 | 
			
		||||
import org.apache.tika.detect.EncodingDetector;
 | 
			
		||||
import org.apache.tika.metadata.Metadata;
 | 
			
		||||
import org.apache.tika.mime.MediaType;
 | 
			
		||||
import org.apache.tika.utils.CharsetUtils;
 | 
			
		||||
 | 
			
		||||
import java.io.IOException;
 | 
			
		||||
import java.io.InputStream;
 | 
			
		||||
import java.nio.charset.Charset;
 | 
			
		||||
 | 
			
		||||
public class Icu4jEncodingDetector implements EncodingDetector {
 | 
			
		||||
 | 
			
		||||
    public Charset detect(InputStream input, Metadata metadata)
 | 
			
		||||
            throws IOException {
 | 
			
		||||
        if (input == null) {
 | 
			
		||||
            return null;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        CharsetDetector detector = new CharsetDetector();
 | 
			
		||||
 | 
			
		||||
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
 | 
			
		||||
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
 | 
			
		||||
        if (incomingCharset == null && incomingType != null) {
 | 
			
		||||
            // TIKA-341: Use charset in content-type
 | 
			
		||||
            MediaType mt = MediaType.parse(incomingType);
 | 
			
		||||
            if (mt != null) {
 | 
			
		||||
                incomingCharset = mt.getParameters().get("charset");
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (incomingCharset != null) {
 | 
			
		||||
            String cleaned = CharsetUtils.clean(incomingCharset);
 | 
			
		||||
            if (cleaned != null) {
 | 
			
		||||
                detector.setDeclaredEncoding(cleaned);
 | 
			
		||||
            } else {
 | 
			
		||||
                // TODO: log a warning?
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // TIKA-341 without enabling input filtering (stripping of tags)
 | 
			
		||||
        // short HTML tests don't work well
 | 
			
		||||
        detector.enableInputFilter(true);
 | 
			
		||||
 | 
			
		||||
        detector.setText(input);
 | 
			
		||||
 | 
			
		||||
        for (CharsetMatch match : detector.detectAll()) {
 | 
			
		||||
            try {
 | 
			
		||||
                return CharsetUtils.forName(match.getName());
 | 
			
		||||
            } catch (Exception e) {
 | 
			
		||||
                // ignore
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        return null;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -2,27 +2,32 @@ package docspell.files
 | 
			
		||||
 | 
			
		||||
import java.io.BufferedInputStream
 | 
			
		||||
import java.nio.file.{Files, Path}
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
import scala.jdk.CollectionConverters._
 | 
			
		||||
import scala.util.Using
 | 
			
		||||
import cats.implicits._
 | 
			
		||||
import cats.effect.Sync
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import fs2.Stream
 | 
			
		||||
import org.apache.tika.config.TikaConfig
 | 
			
		||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
 | 
			
		||||
import org.apache.tika.mime.MediaType
 | 
			
		||||
 | 
			
		||||
import scala.util.Using
 | 
			
		||||
import org.apache.tika.parser.txt.Icu4jEncodingDetector
 | 
			
		||||
import docspell.common._
 | 
			
		||||
 | 
			
		||||
object TikaMimetype {
 | 
			
		||||
  private val tika = new TikaConfig().getDetector
 | 
			
		||||
 | 
			
		||||
  /** Converts a Tika media type into a `MimeType`, carrying over the media
    * type's parameters. A `null` media type maps to `MimeType.octetStream`.
    *
    * Only the parameter-preserving implementation is kept; the earlier
    * string-roundtrip body was left in front of it and would have become the
    * method body, leaving the `match` dangling outside the method.
    */
  private def convert(mt: MediaType): MimeType =
    Option(mt) match {
      case Some(m) =>
        val params = m.getParameters.asScala.toMap
        normalize(MimeType(m.getType, m.getSubtype, params))
      case None =>
        MimeType.octetStream
    }
 | 
			
		||||
 | 
			
		||||
  private def makeMetadata(hint: MimeTypeHint): Metadata = {
 | 
			
		||||
    val md = new Metadata
 | 
			
		||||
@@ -32,21 +37,55 @@ object TikaMimetype {
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /** Maps any mime type whose subtype contains "xhtml" to `MimeType.html`,
    * keeping the original parameters. All other types pass through unchanged.
    *
    * The stale two-field pattern is dropped: it conflicts in arity with the
    * three-field extractor used below and would discard the parameters.
    */
  private def normalize(in: MimeType): MimeType = in match {
    case MimeType(_, sub, p) if sub contains "xhtml" =>
      MimeType.html.copy(params = p)
    case _ => in
  }
 | 
			
		||||
 | 
			
		||||
  /** Detects the mime type of the given bytes with Tika, using `hint` as
    * metadata. For `text` types the charset is detected as well and, when
    * found, attached to the result.
    *
    * This is the single definition of `fromBytes`; the earlier one-line
    * variant without charset detection duplicated the method name.
    */
  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
    val mt = convert(
      tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
    )
    if (mt.primary == "text")
      charsetFromBytes(bv, hint) match {
        case Some(cs) => mt.withCharset(cs)
        case None     => mt
      }
    else mt
  }
 | 
			
		||||
 | 
			
		||||
  /** Runs the ICU4J encoding detector over the given bytes, using `hint` as
    * metadata. Yields `None` when the detector returns `null` or throws a
    * non-fatal error.
    */
  private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = {
    val attempt = Either.catchNonFatal {
      val detector = new Icu4jEncodingDetector()
      val meta     = makeMetadata(hint)
      detector.detect(new java.io.ByteArrayInputStream(bv), meta)
    }
    // A successful run may still hand back null, hence the Option(...) wrap.
    attempt.toOption.flatMap(cs => Option(cs))
  }
 | 
			
		||||
 | 
			
		||||
  /** Detects the charset from at most the first 8000 bytes of `data`. */
  def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint) = {
    val head = data.take(8000).compile.toVector
    head.map(prefix => charsetFromBytes(prefix.toArray, hint))
  }
 | 
			
		||||
 | 
			
		||||
  /** Detects the mime type from at most the first 64 bytes of `data`. */
  def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = {
    val head = data.take(64).compile.toVector
    head.map(prefix => fromBytes(prefix.toArray, hint))
  }
 | 
			
		||||
 | 
			
		||||
  /** Resolves a `DataType` to a concrete `MimeType`.
    *
    *   - `Exact`: the given type is used. If it is a `text` type without a
    *     resolvable charset, the charset is detected from `data` and attached
    *     when found.
    *   - `Hint`: the type is detected from `data` using the hint.
    *
    * The stale `Exact`/`Hint` cases that preceded these were removed; being
    * first, they matched every input and made the charset detection below
    * unreachable.
    */
  def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
    dt match {
      case DataType.Exact(mt) =>
        mt.resolveCharset match {
          case None if mt.primary == "text" =>
            detectCharset[F](data, MimeTypeHint.advertised(mt))
              .map {
                case Some(cs) => mt.withCharset(cs)
                case None     => mt
              }
          case _ => mt.pure[F]
        }
      case DataType.Hint(hint) =>
        TikaMimetype.detect(data, hint)
    }
 | 
			
		||||
 | 
			
		||||
  def detect[F[_]: Sync](file: Path): F[MimeType] =
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user