Improve handling of encodings

HTML and text files are no longer assumed to be UTF-8. The encoding is
now detected, which may not work for all files. The default/fallback is
UTF-8.

There is still a problem with mails that contain HTML parts that are not
UTF-8 encoded. The mail text is always returned as a string, so the
original encoding is lost. The HTML is then stored as UTF-8 bytes,
but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding`
setting doesn't override the encoding provided by the document.
This commit is contained in:
Eike Kettner 2020-03-23 22:43:15 +01:00
parent b265421a46
commit cf7ccd572c
23 changed files with 383 additions and 92 deletions

13
NOTICE.txt Normal file
View File

@ -0,0 +1,13 @@
Docspell
Copyright 2019-2020
Licensed under the GPLv3
This software contains portions of code from tika-parser
https://tika.apache.org
Copyright (C) Apache Software Foundation (ASF) <https://www.apache.org>
Licensed under Apache License 2.0
This software contains portions of code from http4s
https://http4s.org
Copyright 2013-2018 http4s.org
Licensed under Apache License 2.0

View File

@ -161,7 +161,8 @@ val files = project.in(file("modules/files")).
settings(
name := "docspell-files",
libraryDependencies ++=
Dependencies.tika,
Dependencies.tika ++
Dependencies.icu4j,
Test / sourceGenerators += Def.task {
val base = (Test/resourceDirectory).value
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)

View File

@ -1,6 +1,8 @@
package docspell.common
import fs2.Stream
import fs2.{Pipe, Stream}
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {
@ -14,11 +16,67 @@ object Binary {
Binary[F](name, MimeType.octetStream, data)
def utf8[F[_]](name: String, content: String): Binary[F] =
Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
Binary[F](
name,
MimeType.octetStream,
Stream.emit(content).through(fs2.text.utf8Encode)
)
def text[F[_]](name: String, content: String): Binary[F] =
utf8(name, content).withMime(MimeType.plain)
utf8(name, content).withMime(MimeType.plain.withUtf8Charset)
def html[F[_]](name: String, content: String): Binary[F] =
utf8(name, content).withMime(MimeType.html)
utf8(name, content).withMime(MimeType.html.withUtf8Charset)
/** Returns a pipe that turns bytes into strings using the given charset.
  *
  * UTF-8 input is handled by `fs2.text.utf8Decode`; every other charset
  * is delegated to `util.decode`.
  */
def decode[F[_]](cs: Charset): Pipe[F, Byte, String] =
  if (cs != StandardCharsets.UTF_8) util.decode[F](cs)
  else fs2.text.utf8Decode
// This is a copy from org.http4s.util
// Http4s is licensed under the Apache License 2.0
// Charset-generic byte-to-string decoding built on java.nio's CharsetDecoder.
private object util {
import fs2._
import java.nio._
// The three bytes EF BB BF, i.e. the UTF-8 encoded byte order mark.
private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)
// Builds a pipe that decodes the incoming bytes with `charset` and emits
// one string per consumed chunk.
def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = {
// The decoder is created once per call to `decode`, outside of the pull
// below, so it is shared by everything the returned pipe does.
val decoder = charset.newDecoder
val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
val charBufferSize = 128
_.repeatPull[String] {
// Request up to `charBufferSize * avgBytesPerChar` bytes per step; fewer
// are accepted (allowFewer = true).
_.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
case None =>
// End of input: signal it to the decoder (endOfInput = true) with an
// empty byte buffer, then flush whatever the decoder still holds.
// NOTE(review): the target buffer holds a single char and the results
// of decode/flush are not inspected — confirm one char always suffices.
val charBuffer = CharBuffer.allocate(1)
decoder.decode(ByteBuffer.allocate(0), charBuffer, true)
decoder.flush(charBuffer)
val outputString = charBuffer.flip().toString
if (outputString.isEmpty) Pull.done.as(None)
else Pull.output1(outputString).as(None)
case Some((chunk, stream)) =>
if (chunk.nonEmpty) {
// NOTE(review): the BOM check runs for every chunk, not only the first
// one, and independent of `charset` — confirm this is intended.
val chunkWithoutBom = skipByteOrderMark(chunk)
val bytes = chunkWithoutBom.toArray
val byteBuffer = ByteBuffer.wrap(bytes)
val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte)
decoder.decode(byteBuffer, charBuffer, false)
// Bytes the decoder did not consume (what remains in `byteBuffer`) are
// pushed back in front of the rest of the stream for the next step.
val nextStream = stream.consChunk(Chunk.byteBuffer(byteBuffer.slice()))
Pull.output1(charBuffer.flip().toString).as(Some(nextStream))
} else {
Pull.output(Chunk.empty[String]).as(Some(stream))
}
}
}
}
// Drops the first three bytes of `chunk` if they equal `utf8Bom`;
// otherwise returns the chunk unchanged.
private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] =
if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
chunk.drop(3)
} else chunk
}
}

View File

@ -2,13 +2,39 @@ package docspell.common
import docspell.common.syntax.all._
import io.circe.{Decoder, Encoder}
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
/** A MIME Type impl with just enough features for the use here.
*/
case class MimeType(primary: String, sub: String) {
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
def withParam(name: String, value: String): MimeType =
copy(params = params.updated(name, value))
def withCharset(cs: Charset): MimeType =
withParam("charset", cs.name())
def withUtf8Charset: MimeType =
withCharset(StandardCharsets.UTF_8)
/** Looks up the `charset` parameter and resolves it to a JVM charset.
  *
  * Surrounding whitespace and double quotes are removed first, so a
  * quoted value such as `charset="utf-8"` resolves as well.
  *
  * @return the charset, or `None` if there is no `charset` parameter,
  *         its value is not a legal charset name, or the JVM does not
  *         support it
  */
def resolveCharset: Option[Charset] =
  params.get("charset").flatMap { raw =>
    val name = raw.trim.stripPrefix("\"").stripSuffix("\"").trim
    // `Charset.isSupported` and `Charset.forName` throw an
    // IllegalCharsetNameException for syntactically illegal names rather
    // than reporting "unsupported", so the lookup has to be guarded.
    scala.util.Try(Charset.forName(name)).toOption
  }
def charsetOrUtf8: Charset =
resolveCharset.getOrElse(StandardCharsets.UTF_8)
def baseType: MimeType =
if (params.isEmpty) this else copy(params = Map.empty)
def asString: String =
s"$primary/$sub"
if (params.isEmpty) s"$primary/$sub"
else {
val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";")
s"$primary/$sub; $parameters"
}
def matches(other: MimeType): Boolean =
primary == other.primary &&
@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
object MimeType {
def application(sub: String): MimeType =
MimeType("application", partFromString(sub).throwLeft)
MimeType("application", sub, Map.empty)
def text(sub: String): MimeType =
MimeType("text", partFromString(sub).throwLeft)
MimeType("text", sub, Map.empty)
def image(sub: String): MimeType =
MimeType("image", partFromString(sub).throwLeft)
MimeType("image", sub, Map.empty)
private[this] val validChars: Set[Char] =
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
def parse(str: String): Either[String, MimeType] = {
def parsePrimary: Either[String, (String, String)] =
str.indexOf('/') match {
case -1 => Left(s"Invalid mediatype: $str")
case n => Right(str.take(n) -> str.drop(n + 1))
}
def parse(str: String): Either[String, MimeType] =
str.indexOf('/') match {
case -1 => Left(s"Invalid MIME type: $str")
case n =>
for {
prim <- partFromString(str.substring(0, n))
sub <- partFromString(str.substring(n + 1))
} yield MimeType(prim.toLowerCase, sub.toLowerCase)
}
def parseSub(s: String): Either[String, (String, String)] =
s.indexOf(';') match {
case -1 => Right((s, ""))
case n => Right((s.take(n), s.drop(n)))
}
def parseParams(s: String): Map[String, String] =
s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match {
case a :: b :: Nil => Some((a, b))
case _ => None
}).toMap
for {
pt <- parsePrimary
st <- parseSub(pt._2)
pa = parseParams(st._2)
} yield MimeType(pt._1, st._1, pa)
}
def unsafe(str: String): MimeType =
parse(str).throwLeft
private def partFromString(s: String): Either[String, String] =
if (s.forall(validChars.contains)) Right(s)
else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
val octetStream = application("octet-stream")
val pdf = application("pdf")
val zip = application("zip")
@ -55,6 +90,16 @@ object MimeType {
val html = text("html")
val plain = text("plain")
/** Extractor that succeeds for every MIME type for which `matches(pdf)` holds. */
object PdfMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(pdf)) Some(mt) else None
}
/** Extractor that succeeds for every MIME type for which `matches(html)` holds. */
object HtmlMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(html)) Some(mt) else None
}
implicit val jsonEncoder: Encoder[MimeType] =
Encoder.encodeString.contramap(_.asString)

View File

@ -32,18 +32,27 @@ object Conversion {
in: Stream[F, Byte]
): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.pdf =>
case Pdfs(_) =>
handler.run(ConversionResult.successPdf(in))
case MimeType.html =>
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
case mt @ MimeType(_, "html", _) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
case Texts(_) =>
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
case mt @ Texts(_) =>
val cs = mt.charsetOrUtf8
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
val bytes = Stream
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
.covary[F]
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
WkHtmlPdf.toPDF(
cfg.wkhtmlpdf,
cfg.chunkSize,
StandardCharsets.UTF_8,
blocker,
logger
)(bytes, handler)
}
case Images(mt) =>
@ -51,7 +60,9 @@ object Conversion {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
.info(
s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
) *>
handler.run(
ConversionResult.inputMalformed(
mt,
@ -59,14 +70,20 @@ object Conversion {
)
)
} else {
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
in,
handler
)
}
case None =>
logger.info(
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
) *>
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
in,
handler
)
}
case Office(_) =>
@ -90,6 +107,11 @@ object Conversion {
Some(m).filter(_.primary == "text")
}
/** Extractor that succeeds for every MIME type for which `matches(MimeType.pdf)` holds. */
object Pdfs {
  def unapply(m: MimeType): Option[MimeType] =
    if (m.matches(MimeType.pdf)) Some(m) else None
}
object Office {
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
@ -97,18 +119,33 @@ object Conversion {
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val rtf = MimeType.application("rtf")
val docx =
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx =
MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val rtf = MimeType.application("rtf")
// without a filename, tika returns application/zip for odt/ods files, since
// they are just zip files
val odfContainer = MimeType.zip
val all =
Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
Set(
odt,
ods,
odtAlias,
odsAlias,
msoffice,
ooxml,
docx,
xlsx,
xls,
doc,
rtf,
odfContainer
)
def unapply(m: MimeType): Option[MimeType] =
Some(m).filter(all.contains)

View File

@ -7,20 +7,23 @@ import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
import java.nio.charset.Charset
object WkHtmlPdf {
def toPDF[F[_]: Sync: ContextShift, A](
cfg: WkHtmlPdfConfig,
chunkSize: Int,
charset: Charset,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
ExternConv
.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
in,
handler
)

View File

@ -1,8 +1,9 @@
package docspell.convert.flexmark
import java.io.{InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
import java.util
import scala.util.Try
import cats.effect.Sync
import cats.implicits._
@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
import fs2.Stream
import scala.util.Try
import docspell.common._
object Markdown {
def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
val p = createParser()
val r = createRenderer()
Try {
val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
val reader = new InputStreamReader(is, cs)
val doc = p.parseReader(reader)
wrapHtml(r.render(doc), cfg)
}.toEither
@ -34,8 +35,8 @@ object Markdown {
wrapHtml(r.render(doc), cfg)
}
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))
private def wrapHtml(body: String, cfg: MarkdownConfig): String =
s"""<!DOCTYPE html>

View File

@ -7,6 +7,7 @@ import docspell.common._
import docspell.convert.FileChecks
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
import java.nio.charset.StandardCharsets
object ExternConvTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
val wkCfg = WkHtmlPdfConfig(cfg, target)
val p =
WkHtmlPdf
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
.toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
storePdfHandler(dir.resolve("test.pdf"))
)

View File

@ -1,11 +0,0 @@
The Java source files in docspell-extract are unmodified copies of
those found in the Apache Tika parser project. It follows the
NOTICE.txt file from Apache Tika parsers:
Apache Tika parsers
Copyright 2007-2019 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

View File

@ -31,7 +31,7 @@ object Extraction {
lang: Language
): F[ExtractResult] =
TikaMimetype.resolve(dataType, data).flatMap {
case MimeType.pdf =>
case MimeType.PdfMatch(_) =>
PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(ExtractResult.fromEither)
@ -75,14 +75,15 @@ object Extraction {
doExtract
}
case OdfType.container =>
case OdfType.ContainerMatch(_) =>
logger
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
OdfExtract.get(data).map(ExtractResult.fromEither)
case mt @ MimeType("text", sub) if !sub.contains("html") =>
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim)
}

View File

@ -12,5 +12,5 @@ object OcrType {
val all = Set(jpeg, png, tiff, pdf)
def unapply(mt: MimeType): Option[MimeType] =
Some(mt).filter(all.contains)
Some(mt).map(_.baseType).filter(all.contains)
}

View File

@ -14,5 +14,10 @@ object OdfType {
val all = Set(odt, ods, odtAlias, odsAlias)
def unapply(mt: MimeType): Option[MimeType] =
Some(mt).filter(all.contains)
Some(mt).map(_.baseType).filter(all.contains)
/** Extractor that succeeds for every MIME type for which `matches(container)` holds. */
object ContainerMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(container)) Some(mt) else None
}
}

View File

@ -14,6 +14,6 @@ object PoiType {
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
def unapply(arg: MimeType): Option[MimeType] =
Some(arg).filter(all.contains)
Some(arg).map(_.baseType).filter(all.contains)
}

View File

@ -0,0 +1,11 @@
package org.apache.tika.parser.txt;
import java.io.InputStream;
import java.io.IOException;
public final class IOUtils {
public static long readFully(InputStream in, byte[] buffer) throws IOException {
return in.read(buffer, 0, buffer.length);
}
}

View File

@ -0,0 +1,75 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.txt;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
/**
 * {@link EncodingDetector} that delegates the guessing to ICU4J's
 * {@link CharsetDetector}.
 */
public class Icu4jEncodingDetector implements EncodingDetector {

    /**
     * Guesses the charset of the given stream.
     *
     * <p>A charset found in the metadata (content encoding, or else the
     * {@code charset} parameter of the content type) is handed to the
     * detector as the declared encoding.
     *
     * @param input    the stream to inspect; may be {@code null}
     * @param metadata metadata that may carry a declared encoding
     * @return the first detected match that can be turned into a
     *         {@link Charset}, or {@code null} if the input is {@code null}
     *         or no match could be resolved
     * @throws IOException if the stream cannot be read
     */
    public Charset detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return null;
        }

        CharsetDetector icu = new CharsetDetector();

        String declared = metadata.get(Metadata.CONTENT_ENCODING);
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (declared == null && contentType != null) {
            // TIKA-341: Use charset in content-type
            MediaType mediaType = MediaType.parse(contentType);
            if (mediaType != null) {
                declared = mediaType.getParameters().get("charset");
            }
        }

        if (declared != null) {
            String normalized = CharsetUtils.clean(declared);
            // TODO: log a warning when the declared charset cannot be cleaned?
            if (normalized != null) {
                icu.setDeclaredEncoding(normalized);
            }
        }

        // TIKA-341 without enabling input filtering (stripping of tags)
        // short HTML tests don't work well
        icu.enableInputFilter(true);
        icu.setText(input);

        CharsetMatch[] candidates = icu.detectAll();
        for (int i = 0; i < candidates.length; i++) {
            try {
                return CharsetUtils.forName(candidates[i].getName());
            } catch (Exception e) {
                // this candidate cannot be resolved; try the next one
            }
        }

        return null;
    }
}

View File

@ -2,27 +2,32 @@ package docspell.files
import java.io.BufferedInputStream
import java.nio.file.{Files, Path}
import java.nio.charset.Charset
import scala.jdk.CollectionConverters._
import scala.util.Using
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import fs2.Stream
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.mime.MediaType
import scala.util.Using
import org.apache.tika.parser.txt.Icu4jEncodingDetector
import docspell.common._
object TikaMimetype {
private val tika = new TikaConfig().getDetector
private def convert(mt: MediaType): MimeType =
Option(mt)
.map(_.toString)
.map(MimeType.parse)
.flatMap(_.toOption)
.map(normalize)
.getOrElse(MimeType.octetStream)
Option(mt) match {
case Some(_) =>
val params = mt.getParameters.asScala.toMap
val primary = mt.getType
val sub = mt.getSubtype
normalize(MimeType(primary, sub, params))
case None =>
MimeType.octetStream
}
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
@ -32,21 +37,55 @@ object TikaMimetype {
}
private def normalize(in: MimeType): MimeType = in match {
case MimeType(_, sub) if sub contains "xhtml" =>
MimeType.html
case MimeType(_, sub, p) if sub contains "xhtml" =>
MimeType.html.copy(params = p)
case _ => in
}
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
/** Detects the MIME type of the given bytes via tika; for `text` types
  * the charset detected from the same bytes is attached, if there is one.
  */
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
  val detected =
    convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))

  if (detected.primary != "text") detected
  else charsetFromBytes(bv, hint).fold(detected)(cs => detected.withCharset(cs))
}
/** Runs the ICU4J based detector on the given bytes.
  *
  * Yields `None` when the detector returns `null` or fails with a
  * non-fatal exception.
  */
private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = {
  val attempt = Either.catchNonFatal {
    val detector = new Icu4jEncodingDetector()
    val metadata = makeMetadata(hint)
    detector.detect(new java.io.ByteArrayInputStream(bv), metadata)
  }
  attempt.toOption.flatMap(cs => Option(cs))
}
/** Tries to detect the charset from the first 8000 bytes of `data`.
  *
  * @param data the bytes to inspect; at most 8000 are consumed
  * @param hint passed on to `charsetFromBytes`
  * @return the detected charset, or `None` if detection yields nothing
  */
def detectCharset[F[_]: Sync](
    data: Stream[F, Byte],
    hint: MimeTypeHint
): F[Option[Charset]] =
  data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))
/** Detects the MIME type from the first 64 bytes of `data` and the given hint.
  *
  * NOTE(review): for `text` types `fromBytes` also runs the charset
  * detection on these same 64 bytes, whereas `detectCharset` samples up
  * to 8000 bytes — confirm that 64 bytes are enough here.
  */
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
dt match {
case DataType.Exact(mt) => mt.pure[F]
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
case DataType.Exact(mt) =>
mt.resolveCharset match {
case None if mt.primary == "text" =>
detectCharset[F](data, MimeTypeHint.advertised(mt))
.map {
case Some(cs) => mt.withCharset(cs)
case None => mt
}
case _ => mt.pure[F]
}
case DataType.Hint(hint) =>
TikaMimetype.detect(data, hint)
}
def detect[F[_]: Sync](file: Path): F[MimeType] =

View File

@ -231,7 +231,9 @@ docspell.joex {
"-s",
"A4",
"--encoding",
"UTF-8",
"{{encoding}}",
"--load-error-handling", "ignore",
"--load-media-error-handling", "ignore",
"-",
"{{outfile}}"
]

View File

@ -8,6 +8,7 @@ import emil.javamail.syntax._
import cats.Applicative
import docspell.common._
import java.nio.charset.StandardCharsets
object ReadMail {
@ -20,7 +21,7 @@ object ReadMail {
bytesToMail(s).flatMap(mailToEntries[F](logger))
def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
def mailToEntries[F[_]: Applicative](
logger: Logger[F]
@ -49,7 +50,7 @@ object ReadMail {
implicit class MimeTypeConv(m: emil.MimeType) {
def toDocspell: MimeType =
MimeType(m.primary, m.sub)
MimeType(m.primary, m.sub, m.params)
}
private def bodyType[F[_]](body: MailBody[F]): String =

View File

@ -57,7 +57,7 @@ object ConvertPdf {
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
mime match {
case Mimetype.`application/pdf` =>
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
@ -66,9 +66,10 @@ object ConvertPdf {
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val mt = MimeType(mime.primary, mime.sub, mime.params)
val handler = conversionHandler[F](ctx, cfg, ra, item)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
data
)
}
@ -104,7 +105,8 @@ object ConvertPdf {
(ra, None: Option[RAttachmentMeta]).pure[F]
case ConversionResult.Failure(ex) =>
ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
ctx.logger
.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
})
@ -114,7 +116,8 @@ object ConvertPdf {
ra: RAttachment,
pdf: Stream[F, Byte]
) = {
val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
val hint =
MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
val newName = ra.name.map(n => s"$n.pdf")
ctx.store.bitpeace
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
@ -122,7 +125,9 @@ object ConvertPdf {
.lastOrError
.map(fm => Ident.unsafe(fm.id))
.flatMap(fmId =>
ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
ctx.store
.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
.map(_ => fmId)
)
.map(fmId => ra.copy(fileId = fmId, name = newName))
}

View File

@ -70,7 +70,7 @@ object ExtractArchive {
archive: Option[RAttachmentArchive]
)(ra: RAttachment, mime: Mimetype): F[Extracted] =
mime match {
case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
extractZip(ctx, archive)(ra)
.flatTap(_ => cleanupParents(ctx, ra, archive))

View File

@ -76,7 +76,7 @@ object TextExtraction {
.getOrElse(Mimetype.`application/octet-stream`)
findMime
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
}
private def extractTextFallback[F[_]: Sync: ContextShift](

View File

@ -448,7 +448,7 @@ trait Conversions {
// MIME Type
def fromContentType(header: `Content-Type`): MimeType =
MimeType(header.mediaType.mainType, header.mediaType.subType)
MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions)
}
object Conversions extends Conversions {

View File

@ -17,6 +17,7 @@ object Dependencies {
val Fs2Version = "2.3.0"
val H2Version = "1.4.200"
val Http4sVersion = "0.21.1"
val Icu4jVersion = "66.1"
val KindProjectorVersion = "0.10.3"
val Log4sVersion = "1.8.2"
val LogbackVersion = "1.2.3"
@ -218,4 +219,7 @@ object Dependencies {
"org.webjars" % "viewerjs" % ViewerJSVersion
)
// ICU4J; its com.ibm.icu.text.CharsetDetector is used by Icu4jEncodingDetector.
val icu4j = Seq(
"com.ibm.icu" % "icu4j" % Icu4jVersion
)
}