mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-25 16:45:05 +00:00
Improve handling encodings
HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files. The default/fallback is UTF-8. There is still a problem with mails that contain HTML parts not encoded in UTF-8: the mail text is always returned as a string and the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding declared by the document itself.
This commit is contained in:
parent
b265421a46
commit
cf7ccd572c
13
NOTICE.txt
Normal file
13
NOTICE.txt
Normal file
@ -0,0 +1,13 @@
|
||||
Docspell
|
||||
Copyright 2019-2020
|
||||
Licensed under the GPLv3
|
||||
|
||||
This software contains portions of code from tika-parser
|
||||
https://tika.apache.org
|
||||
Copyright (C) Apache Software Foundation (ASF) <https://www.apache.org>
|
||||
Licensed under Apache License 2.0
|
||||
|
||||
This software contains portions of code from http4s
|
||||
https://http4s.org
|
||||
Copyright 2013-2018 http4s.org
|
||||
Licensed under Apache License 2.0
|
@ -161,7 +161,8 @@ val files = project.in(file("modules/files")).
|
||||
settings(
|
||||
name := "docspell-files",
|
||||
libraryDependencies ++=
|
||||
Dependencies.tika,
|
||||
Dependencies.tika ++
|
||||
Dependencies.icu4j,
|
||||
Test / sourceGenerators += Def.task {
|
||||
val base = (Test/resourceDirectory).value
|
||||
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
|
||||
|
@ -1,6 +1,8 @@
|
||||
package docspell.common
|
||||
|
||||
import fs2.Stream
|
||||
import fs2.{Pipe, Stream}
|
||||
import java.nio.charset.Charset
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {
|
||||
|
||||
@ -14,11 +16,67 @@ object Binary {
|
||||
Binary[F](name, MimeType.octetStream, data)
|
||||
|
||||
def utf8[F[_]](name: String, content: String): Binary[F] =
|
||||
Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
|
||||
Binary[F](
|
||||
name,
|
||||
MimeType.octetStream,
|
||||
Stream.emit(content).through(fs2.text.utf8Encode)
|
||||
)
|
||||
|
||||
def text[F[_]](name: String, content: String): Binary[F] =
|
||||
utf8(name, content).withMime(MimeType.plain)
|
||||
utf8(name, content).withMime(MimeType.plain.withUtf8Charset)
|
||||
|
||||
def html[F[_]](name: String, content: String): Binary[F] =
|
||||
utf8(name, content).withMime(MimeType.html)
|
||||
utf8(name, content).withMime(MimeType.html.withUtf8Charset)
|
||||
|
||||
def decode[F[_]](cs: Charset): Pipe[F, Byte, String] =
|
||||
if (cs == StandardCharsets.UTF_8) {
|
||||
fs2.text.utf8Decode
|
||||
} else {
|
||||
util.decode[F](cs)
|
||||
}
|
||||
|
||||
// This is a copy from org.http4s.util
|
||||
// Http4s is licensed under the Apache License 2.0
|
||||
private object util {
  import fs2._
  import java.nio._
  import java.nio.charset.CodingErrorAction

  private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)

  /** Decodes a byte stream into strings using the given charset.
    *
    * Modified from the http4s original in three ways:
    *
    *  - Malformed and unmappable input is replaced by the decoder's
    *    replacement string instead of being reported. The reported result
    *    used to be ignored and the offending bytes were pushed back onto the
    *    stream, so the pipe kept re-reading the same bytes and never
    *    terminated.
    *  - Undecoded trailing bytes are carried over explicitly and handed to
    *    the decoder with `endOfInput = true` once the stream ends, so a
    *    truncated multi-byte sequence terminates the stream as well.
    *  - A UTF-8 byte order mark is only stripped from the first non-empty
    *    chunk, not from every chunk.
    *
    * The decoder is stateful, therefore it is created inside
    * `Stream.suspend` so that every evaluation of the stream gets its own.
    *
    * @param charset the charset used to interpret the incoming bytes
    * @return a pipe emitting the decoded text in pieces; empty pieces are
    *         not emitted
    */
  def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = { in =>
    Stream.suspend {
      val decoder = charset.newDecoder
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE)
      val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
      val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
      val charBufferSize  = 128

      def emit(str: String): Pull[F, String, Unit] =
        if (str.isEmpty) Pull.done else Pull.output1(str)

      // `carry` holds bytes the decoder could not consume yet (an incomplete
      // multi-byte sequence); `first` is true until a non-empty chunk was seen.
      def go(s: Stream[F, Byte], carry: Array[Byte], first: Boolean): Pull[F, String, Unit] =
        s.pull.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
          case None =>
            // End of input: force the decoder to deal with whatever is left.
            // One extra slot leaves room for a replacement and flushed output.
            val charBuffer = CharBuffer.allocate((carry.length + 1) * maxCharsPerByte)
            decoder.decode(ByteBuffer.wrap(carry), charBuffer, true)
            decoder.flush(charBuffer)
            emit(charBuffer.flip().toString)

          case Some((chunk, rest)) =>
            val fresh      = if (first) skipByteOrderMark(chunk) else chunk
            val bytes      = carry ++ fresh.toArray
            val byteBuffer = ByteBuffer.wrap(bytes)
            val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte)
            decoder.decode(byteBuffer, charBuffer, false)
            // Everything after the buffer's position was not consumed.
            val leftover = bytes.drop(byteBuffer.position())
            emit(charBuffer.flip().toString) >> go(rest, leftover, first && chunk.isEmpty)
        }

      go(in, Array.emptyByteArray, first = true).stream
    }
  }

  /** Drops a leading UTF-8 byte order mark from `chunk`, if there is one. */
  private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] =
    if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
      chunk.drop(3)
    } else chunk
}
|
||||
}
|
||||
|
@ -2,13 +2,39 @@ package docspell.common
|
||||
|
||||
import docspell.common.syntax.all._
|
||||
import io.circe.{Decoder, Encoder}
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.charset.Charset
|
||||
|
||||
/** A MIME Type impl with just enough features for the use here.
|
||||
*/
|
||||
case class MimeType(primary: String, sub: String) {
|
||||
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
|
||||
def withParam(name: String, value: String): MimeType =
|
||||
copy(params = params.updated(name, value))
|
||||
|
||||
def withCharset(cs: Charset): MimeType =
|
||||
withParam("charset", cs.name())
|
||||
|
||||
def withUtf8Charset: MimeType =
|
||||
withCharset(StandardCharsets.UTF_8)
|
||||
|
||||
/** Resolves the `charset` parameter to a JVM charset, if present and usable.
  *
  * Surrounding whitespace and double quotes are removed first, because a
  * parameter value may arrive quoted (`charset="utf-8"`). A name that is
  * syntactically illegal or unknown to the JVM yields `None`;
  * `Charset.isSupported` alone would throw for an illegal name.
  */
def resolveCharset: Option[Charset] =
  params
    .get("charset")
    .map(_.trim.stripPrefix("\"").stripSuffix("\"").trim)
    .filter(_.nonEmpty)
    .flatMap(cs => scala.util.Try(Charset.forName(cs)).toOption)
|
||||
|
||||
def charsetOrUtf8: Charset =
|
||||
resolveCharset.getOrElse(StandardCharsets.UTF_8)
|
||||
|
||||
def baseType: MimeType =
|
||||
if (params.isEmpty) this else copy(params = Map.empty)
|
||||
|
||||
def asString: String =
|
||||
s"$primary/$sub"
|
||||
if (params.isEmpty) s"$primary/$sub"
|
||||
else {
|
||||
val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";")
|
||||
s"$primary/$sub; $parameters"
|
||||
}
|
||||
|
||||
def matches(other: MimeType): Boolean =
|
||||
primary == other.primary &&
|
||||
@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
|
||||
object MimeType {
|
||||
|
||||
def application(sub: String): MimeType =
|
||||
MimeType("application", partFromString(sub).throwLeft)
|
||||
MimeType("application", sub, Map.empty)
|
||||
|
||||
def text(sub: String): MimeType =
|
||||
MimeType("text", partFromString(sub).throwLeft)
|
||||
MimeType("text", sub, Map.empty)
|
||||
|
||||
def image(sub: String): MimeType =
|
||||
MimeType("image", partFromString(sub).throwLeft)
|
||||
MimeType("image", sub, Map.empty)
|
||||
|
||||
private[this] val validChars: Set[Char] =
|
||||
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
|
||||
def parse(str: String): Either[String, MimeType] = {
|
||||
def parsePrimary: Either[String, (String, String)] =
|
||||
str.indexOf('/') match {
|
||||
case -1 => Left(s"Invalid mediatype: $str")
|
||||
case n => Right(str.take(n) -> str.drop(n + 1))
|
||||
}
|
||||
|
||||
def parse(str: String): Either[String, MimeType] =
|
||||
str.indexOf('/') match {
|
||||
case -1 => Left(s"Invalid MIME type: $str")
|
||||
case n =>
|
||||
for {
|
||||
prim <- partFromString(str.substring(0, n))
|
||||
sub <- partFromString(str.substring(n + 1))
|
||||
} yield MimeType(prim.toLowerCase, sub.toLowerCase)
|
||||
}
|
||||
def parseSub(s: String): Either[String, (String, String)] =
|
||||
s.indexOf(';') match {
|
||||
case -1 => Right((s, ""))
|
||||
case n => Right((s.take(n), s.drop(n)))
|
||||
}
|
||||
|
||||
def parseParams(s: String): Map[String, String] =
|
||||
s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match {
|
||||
case a :: b :: Nil => Some((a, b))
|
||||
case _ => None
|
||||
}).toMap
|
||||
|
||||
for {
|
||||
pt <- parsePrimary
|
||||
st <- parseSub(pt._2)
|
||||
pa = parseParams(st._2)
|
||||
} yield MimeType(pt._1, st._1, pa)
|
||||
}
|
||||
|
||||
def unsafe(str: String): MimeType =
|
||||
parse(str).throwLeft
|
||||
|
||||
private def partFromString(s: String): Either[String, String] =
|
||||
if (s.forall(validChars.contains)) Right(s)
|
||||
else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
|
||||
|
||||
val octetStream = application("octet-stream")
|
||||
val pdf = application("pdf")
|
||||
val zip = application("zip")
|
||||
@ -55,6 +90,16 @@ object MimeType {
|
||||
val html = text("html")
|
||||
val plain = text("plain")
|
||||
|
||||
object PdfMatch {
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(_.matches(pdf))
|
||||
}
|
||||
|
||||
object HtmlMatch {
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(_.matches(html))
|
||||
}
|
||||
|
||||
implicit val jsonEncoder: Encoder[MimeType] =
|
||||
Encoder.encodeString.contramap(_.asString)
|
||||
|
||||
|
@ -32,18 +32,27 @@ object Conversion {
|
||||
in: Stream[F, Byte]
|
||||
): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.pdf =>
|
||||
case Pdfs(_) =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
|
||||
case MimeType.html =>
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
case mt @ MimeType(_, "html", _) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
|
||||
|
||||
case Texts(_) =>
|
||||
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
|
||||
case mt @ Texts(_) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
|
||||
val bytes = Stream
|
||||
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
|
||||
.covary[F]
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
|
||||
WkHtmlPdf.toPDF(
|
||||
cfg.wkhtmlpdf,
|
||||
cfg.chunkSize,
|
||||
StandardCharsets.UTF_8,
|
||||
blocker,
|
||||
logger
|
||||
)(bytes, handler)
|
||||
}
|
||||
|
||||
case Images(mt) =>
|
||||
@ -51,7 +60,9 @@ object Conversion {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.maxImageSize) {
|
||||
logger
|
||||
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||
.info(
|
||||
s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
|
||||
) *>
|
||||
handler.run(
|
||||
ConversionResult.inputMalformed(
|
||||
mt,
|
||||
@ -59,14 +70,20 @@ object Conversion {
|
||||
)
|
||||
)
|
||||
} else {
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case None =>
|
||||
logger.info(
|
||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||
) *>
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case Office(_) =>
|
||||
@ -90,6 +107,11 @@ object Conversion {
|
||||
Some(m).filter(_.primary == "text")
|
||||
}
|
||||
|
||||
object Pdfs {
|
||||
def unapply(m: MimeType): Option[MimeType] =
|
||||
Some(m).filter(_.matches(MimeType.pdf))
|
||||
}
|
||||
|
||||
object Office {
|
||||
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||
@ -97,18 +119,33 @@ object Conversion {
|
||||
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
|
||||
val msoffice = MimeType.application("x-tika-msoffice")
|
||||
val ooxml = MimeType.application("x-tika-ooxml")
|
||||
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val rtf = MimeType.application("rtf")
|
||||
val docx =
|
||||
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx =
|
||||
MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val rtf = MimeType.application("rtf")
|
||||
|
||||
// without a filename, tika returns application/zip for odt/ods files, since
|
||||
// they are just zip files
|
||||
val odfContainer = MimeType.zip
|
||||
|
||||
val all =
|
||||
Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
|
||||
Set(
|
||||
odt,
|
||||
ods,
|
||||
odtAlias,
|
||||
odsAlias,
|
||||
msoffice,
|
||||
ooxml,
|
||||
docx,
|
||||
xlsx,
|
||||
xls,
|
||||
doc,
|
||||
rtf,
|
||||
odfContainer
|
||||
)
|
||||
|
||||
def unapply(m: MimeType): Option[MimeType] =
|
||||
Some(m).filter(all.contains)
|
||||
|
@ -7,20 +7,23 @@ import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import java.nio.charset.Charset
|
||||
|
||||
object WkHtmlPdf {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
charset: Charset,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
ExternConv
|
||||
.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
|
||||
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
|
@ -1,8 +1,9 @@
|
||||
package docspell.convert.flexmark
|
||||
|
||||
import java.io.{InputStream, InputStreamReader}
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.charset.Charset
|
||||
import java.util
|
||||
import scala.util.Try
|
||||
|
||||
import cats.effect.Sync
|
||||
import cats.implicits._
|
||||
@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
|
||||
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
|
||||
import fs2.Stream
|
||||
|
||||
import scala.util.Try
|
||||
import docspell.common._
|
||||
|
||||
object Markdown {
|
||||
|
||||
def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
|
||||
def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
|
||||
val p = createParser()
|
||||
val r = createRenderer()
|
||||
Try {
|
||||
val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
|
||||
val reader = new InputStreamReader(is, cs)
|
||||
val doc = p.parseReader(reader)
|
||||
wrapHtml(r.render(doc), cfg)
|
||||
}.toEither
|
||||
@ -34,8 +35,8 @@ object Markdown {
|
||||
wrapHtml(r.render(doc), cfg)
|
||||
}
|
||||
|
||||
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
|
||||
data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
|
||||
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
|
||||
data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))
|
||||
|
||||
private def wrapHtml(body: String, cfg: MarkdownConfig): String =
|
||||
s"""<!DOCTYPE html>
|
||||
|
@ -7,6 +7,7 @@ import docspell.common._
|
||||
import docspell.convert.FileChecks
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||
val p =
|
||||
WkHtmlPdf
|
||||
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
|
||||
.toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
|
||||
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
|
@ -1,11 +0,0 @@
|
||||
The Java source files in docspell-extract are unmodified copies of
|
||||
those found in the Apache Tika parser project. It follows the
|
||||
NOTICE.txt file from Apache Tika parsers:
|
||||
|
||||
Apache Tika parsers
|
||||
Copyright 2007-2019 The Apache Software Foundation
|
||||
|
||||
This product includes software developed at
|
||||
The Apache Software Foundation (http://www.apache.org/).
|
||||
|
||||
|
@ -31,7 +31,7 @@ object Extraction {
|
||||
lang: Language
|
||||
): F[ExtractResult] =
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.pdf =>
|
||||
case MimeType.PdfMatch(_) =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(ExtractResult.fromEither)
|
||||
@ -75,14 +75,15 @@ object Extraction {
|
||||
doExtract
|
||||
}
|
||||
|
||||
case OdfType.container =>
|
||||
case OdfType.ContainerMatch(_) =>
|
||||
logger
|
||||
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt @ MimeType("text", sub) if !sub.contains("html") =>
|
||||
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
}
|
||||
|
||||
|
@ -12,5 +12,5 @@ object OcrType {
|
||||
val all = Set(jpeg, png, tiff, pdf)
|
||||
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(all.contains)
|
||||
Some(mt).map(_.baseType).filter(all.contains)
|
||||
}
|
||||
|
@ -14,5 +14,10 @@ object OdfType {
|
||||
val all = Set(odt, ods, odtAlias, odsAlias)
|
||||
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(all.contains)
|
||||
Some(mt).map(_.baseType).filter(all.contains)
|
||||
|
||||
object ContainerMatch {
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(_.matches(container))
|
||||
}
|
||||
}
|
||||
|
@ -14,6 +14,6 @@ object PoiType {
|
||||
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
||||
|
||||
def unapply(arg: MimeType): Option[MimeType] =
|
||||
Some(arg).filter(all.contains)
|
||||
Some(arg).map(_.baseType).filter(all.contains)
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,11 @@
|
||||
package org.apache.tika.parser.txt;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
public final class IOUtils {
|
||||
|
||||
public static long readFully(InputStream in, byte[] buffer) throws IOException {
|
||||
return in.read(buffer, 0, buffer.length);
|
||||
}
|
||||
}
|
@ -0,0 +1,75 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.tika.parser.txt;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import org.apache.tika.detect.EncodingDetector;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.mime.MediaType;
|
||||
import org.apache.tika.utils.CharsetUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
public class Icu4jEncodingDetector implements EncodingDetector {
|
||||
|
||||
public Charset detect(InputStream input, Metadata metadata)
|
||||
throws IOException {
|
||||
if (input == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
|
||||
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
|
||||
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
|
||||
if (incomingCharset == null && incomingType != null) {
|
||||
// TIKA-341: Use charset in content-type
|
||||
MediaType mt = MediaType.parse(incomingType);
|
||||
if (mt != null) {
|
||||
incomingCharset = mt.getParameters().get("charset");
|
||||
}
|
||||
}
|
||||
|
||||
if (incomingCharset != null) {
|
||||
String cleaned = CharsetUtils.clean(incomingCharset);
|
||||
if (cleaned != null) {
|
||||
detector.setDeclaredEncoding(cleaned);
|
||||
} else {
|
||||
// TODO: log a warning?
|
||||
}
|
||||
}
|
||||
|
||||
// TIKA-341 without enabling input filtering (stripping of tags)
|
||||
// short HTML tests don't work well
|
||||
detector.enableInputFilter(true);
|
||||
|
||||
detector.setText(input);
|
||||
|
||||
for (CharsetMatch match : detector.detectAll()) {
|
||||
try {
|
||||
return CharsetUtils.forName(match.getName());
|
||||
} catch (Exception e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
@ -2,27 +2,32 @@ package docspell.files
|
||||
|
||||
import java.io.BufferedInputStream
|
||||
import java.nio.file.{Files, Path}
|
||||
import java.nio.charset.Charset
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
import scala.util.Using
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import fs2.Stream
|
||||
import org.apache.tika.config.TikaConfig
|
||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
||||
import org.apache.tika.mime.MediaType
|
||||
|
||||
import scala.util.Using
|
||||
import org.apache.tika.parser.txt.Icu4jEncodingDetector
|
||||
import docspell.common._
|
||||
|
||||
object TikaMimetype {
|
||||
private val tika = new TikaConfig().getDetector
|
||||
|
||||
private def convert(mt: MediaType): MimeType =
|
||||
Option(mt)
|
||||
.map(_.toString)
|
||||
.map(MimeType.parse)
|
||||
.flatMap(_.toOption)
|
||||
.map(normalize)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
Option(mt) match {
|
||||
case Some(_) =>
|
||||
val params = mt.getParameters.asScala.toMap
|
||||
val primary = mt.getType
|
||||
val sub = mt.getSubtype
|
||||
normalize(MimeType(primary, sub, params))
|
||||
case None =>
|
||||
MimeType.octetStream
|
||||
}
|
||||
|
||||
private def makeMetadata(hint: MimeTypeHint): Metadata = {
|
||||
val md = new Metadata
|
||||
@ -32,21 +37,55 @@ object TikaMimetype {
|
||||
}
|
||||
|
||||
private def normalize(in: MimeType): MimeType = in match {
|
||||
case MimeType(_, sub) if sub contains "xhtml" =>
|
||||
MimeType.html
|
||||
case MimeType(_, sub, p) if sub contains "xhtml" =>
|
||||
MimeType.html.copy(params = p)
|
||||
case _ => in
|
||||
}
|
||||
|
||||
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
|
||||
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
|
||||
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
|
||||
val mt = convert(
|
||||
tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
|
||||
)
|
||||
if (mt.primary == "text") {
|
||||
charsetFromBytes(bv, hint) match {
|
||||
case Some(cs) =>
|
||||
mt.withCharset(cs)
|
||||
case None =>
|
||||
mt
|
||||
}
|
||||
} else mt
|
||||
}
|
||||
|
||||
private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] =
|
||||
Either
|
||||
.catchNonFatal {
|
||||
val cd = new Icu4jEncodingDetector()
|
||||
val md = makeMetadata(hint)
|
||||
Option(cd.detect(new java.io.ByteArrayInputStream(bv), md))
|
||||
}
|
||||
.toOption
|
||||
.flatten
|
||||
|
||||
/** Tries to detect the charset of `data` from at most its first 8000 bytes.
  *
  * @param data the bytes to inspect; only a prefix is consumed
  * @param hint passed on to the detector as metadata
  * @return the detected charset, or `None` if `charsetFromBytes` finds none
  */
def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Option[Charset]] =
  data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))
|
||||
|
||||
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
|
||||
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
|
||||
|
||||
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
|
||||
dt match {
|
||||
case DataType.Exact(mt) => mt.pure[F]
|
||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||
case DataType.Exact(mt) =>
|
||||
mt.resolveCharset match {
|
||||
case None if mt.primary == "text" =>
|
||||
detectCharset[F](data, MimeTypeHint.advertised(mt))
|
||||
.map {
|
||||
case Some(cs) => mt.withCharset(cs)
|
||||
case None => mt
|
||||
}
|
||||
case _ => mt.pure[F]
|
||||
}
|
||||
case DataType.Hint(hint) =>
|
||||
TikaMimetype.detect(data, hint)
|
||||
}
|
||||
|
||||
def detect[F[_]: Sync](file: Path): F[MimeType] =
|
||||
|
@ -231,7 +231,9 @@ docspell.joex {
|
||||
"-s",
|
||||
"A4",
|
||||
"--encoding",
|
||||
"UTF-8",
|
||||
"{{encoding}}",
|
||||
"--load-error-handling", "ignore",
|
||||
"--load-media-error-handling", "ignore",
|
||||
"-",
|
||||
"{{outfile}}"
|
||||
]
|
||||
|
@ -8,6 +8,7 @@ import emil.javamail.syntax._
|
||||
import cats.Applicative
|
||||
|
||||
import docspell.common._
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
object ReadMail {
|
||||
|
||||
@ -20,7 +21,7 @@ object ReadMail {
|
||||
bytesToMail(s).flatMap(mailToEntries[F](logger))
|
||||
|
||||
def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
|
||||
data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
|
||||
data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
|
||||
|
||||
def mailToEntries[F[_]: Applicative](
|
||||
logger: Logger[F]
|
||||
@ -49,7 +50,7 @@ object ReadMail {
|
||||
|
||||
implicit class MimeTypeConv(m: emil.MimeType) {
|
||||
def toDocspell: MimeType =
|
||||
MimeType(m.primary, m.sub)
|
||||
MimeType(m.primary, m.sub, m.params)
|
||||
}
|
||||
|
||||
private def bodyType[F[_]](body: MailBody[F]): String =
|
||||
|
@ -57,7 +57,7 @@ object ConvertPdf {
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
|
||||
mime match {
|
||||
case Mimetype.`application/pdf` =>
|
||||
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
|
||||
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
@ -66,9 +66,10 @@ object ConvertPdf {
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val mt = MimeType(mime.primary, mime.sub, mime.params)
|
||||
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
|
||||
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
|
||||
data
|
||||
)
|
||||
}
|
||||
@ -104,7 +105,8 @@ object ConvertPdf {
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
|
||||
case ConversionResult.Failure(ex) =>
|
||||
ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
|
||||
ctx.logger
|
||||
.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
|
||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||
})
|
||||
|
||||
@ -114,7 +116,8 @@ object ConvertPdf {
|
||||
ra: RAttachment,
|
||||
pdf: Stream[F, Byte]
|
||||
) = {
|
||||
val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
|
||||
val hint =
|
||||
MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
|
||||
val newName = ra.name.map(n => s"$n.pdf")
|
||||
ctx.store.bitpeace
|
||||
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
|
||||
@ -122,7 +125,9 @@ object ConvertPdf {
|
||||
.lastOrError
|
||||
.map(fm => Ident.unsafe(fm.id))
|
||||
.flatMap(fmId =>
|
||||
ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
|
||||
ctx.store
|
||||
.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
|
||||
.map(_ => fmId)
|
||||
)
|
||||
.map(fmId => ra.copy(fileId = fmId, name = newName))
|
||||
}
|
||||
|
@ -70,7 +70,7 @@ object ExtractArchive {
|
||||
archive: Option[RAttachmentArchive]
|
||||
)(ra: RAttachment, mime: Mimetype): F[Extracted] =
|
||||
mime match {
|
||||
case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
|
||||
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
|
||||
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
|
||||
extractZip(ctx, archive)(ra)
|
||||
.flatTap(_ => cleanupParents(ctx, ra, archive))
|
||||
|
@ -76,7 +76,7 @@ object TextExtraction {
|
||||
.getOrElse(Mimetype.`application/octet-stream`)
|
||||
|
||||
findMime
|
||||
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
|
||||
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
|
||||
}
|
||||
|
||||
private def extractTextFallback[F[_]: Sync: ContextShift](
|
||||
|
@ -448,7 +448,7 @@ trait Conversions {
|
||||
// MIME Type
|
||||
|
||||
def fromContentType(header: `Content-Type`): MimeType =
|
||||
MimeType(header.mediaType.mainType, header.mediaType.subType)
|
||||
MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions)
|
||||
}
|
||||
|
||||
object Conversions extends Conversions {
|
||||
|
@ -17,6 +17,7 @@ object Dependencies {
|
||||
val Fs2Version = "2.3.0"
|
||||
val H2Version = "1.4.200"
|
||||
val Http4sVersion = "0.21.1"
|
||||
val Icu4jVersion = "66.1"
|
||||
val KindProjectorVersion = "0.10.3"
|
||||
val Log4sVersion = "1.8.2"
|
||||
val LogbackVersion = "1.2.3"
|
||||
@ -218,4 +219,7 @@ object Dependencies {
|
||||
"org.webjars" % "viewerjs" % ViewerJSVersion
|
||||
)
|
||||
|
||||
val icu4j = Seq(
|
||||
"com.ibm.icu" % "icu4j" % Icu4jVersion
|
||||
)
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user