diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 00000000..c87ceb0e --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,13 @@ +Docspell +Copyright 2019-2020 +Licensed under the GPLv3 + +This software contains portions of code from tika-parser +https://tika.apache.org +Copyright (C) Apache Software Foundation (ASF) +Licensed under Apache License 2.0 + +This software contains portions of code from http4s +https://http4s.org +Copyright 2013-2018 http4s.org +Licensed under Apache License 2.0 diff --git a/build.sbt b/build.sbt index b5b674e1..b4183a4d 100644 --- a/build.sbt +++ b/build.sbt @@ -161,7 +161,8 @@ val files = project.in(file("modules/files")). settings( name := "docspell-files", libraryDependencies ++= - Dependencies.tika, + Dependencies.tika ++ + Dependencies.icu4j, Test / sourceGenerators += Def.task { val base = (Test/resourceDirectory).value val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base) diff --git a/modules/common/src/main/scala/docspell/common/Binary.scala b/modules/common/src/main/scala/docspell/common/Binary.scala index 34f2059c..ec128b66 100644 --- a/modules/common/src/main/scala/docspell/common/Binary.scala +++ b/modules/common/src/main/scala/docspell/common/Binary.scala @@ -1,6 +1,8 @@ package docspell.common -import fs2.Stream +import fs2.{Pipe, Stream} +import java.nio.charset.Charset +import java.nio.charset.StandardCharsets final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) { @@ -14,11 +16,67 @@ object Binary { Binary[F](name, MimeType.octetStream, data) def utf8[F[_]](name: String, content: String): Binary[F] = - Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode)) + Binary[F]( + name, + MimeType.octetStream, + Stream.emit(content).through(fs2.text.utf8Encode) + ) def text[F[_]](name: String, content: String): Binary[F] = - utf8(name, content).withMime(MimeType.plain) + utf8(name, content).withMime(MimeType.plain.withUtf8Charset) def html[F[_]](name: String, content: String): Binary[F] = - utf8(name, content).withMime(MimeType.html) + utf8(name, content).withMime(MimeType.html.withUtf8Charset) + + def decode[F[_]](cs: Charset): Pipe[F, Byte, String] = + if (cs == StandardCharsets.UTF_8) { + fs2.text.utf8Decode + } else { + util.decode[F](cs) + } + + // This is a copy from org.http4s.util + // Http4s is licensed under the Apache License 2.0 + private object util { + import fs2._ + import java.nio._ + + private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte) + + def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = { + val decoder = charset.newDecoder + val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt + val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt + val charBufferSize = 128 + + _.repeatPull[String] { + _.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap { + case None => + val charBuffer = CharBuffer.allocate(1) + decoder.decode(ByteBuffer.allocate(0), charBuffer, true) + decoder.flush(charBuffer) + val outputString = charBuffer.flip().toString + if (outputString.isEmpty) Pull.done.as(None) + else Pull.output1(outputString).as(None) + case Some((chunk, stream)) => + if (chunk.nonEmpty) { + val chunkWithoutBom = skipByteOrderMark(chunk) + val bytes = chunkWithoutBom.toArray + val byteBuffer = ByteBuffer.wrap(bytes) + val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte) + decoder.decode(byteBuffer, charBuffer, false) + val nextStream = 
stream.consChunk(Chunk.byteBuffer(byteBuffer.slice())) + Pull.output1(charBuffer.flip().toString).as(Some(nextStream)) + } else { + Pull.output(Chunk.empty[String]).as(Some(stream)) + } + } + } + } + + private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] = + if (chunk.size >= 3 && chunk.take(3) == utf8Bom) { + chunk.drop(3) + } else chunk + } } diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index bffbb667..5acc048c 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -2,13 +2,39 @@ package docspell.common import docspell.common.syntax.all._ import io.circe.{Decoder, Encoder} +import java.nio.charset.StandardCharsets +import java.nio.charset.Charset /** A MIME Type impl with just enough features for the use here. */ -case class MimeType(primary: String, sub: String) { +case class MimeType(primary: String, sub: String, params: Map[String, String]) { + def withParam(name: String, value: String): MimeType = + copy(params = params.updated(name, value)) + + def withCharset(cs: Charset): MimeType = + withParam("charset", cs.name()) + + def withUtf8Charset: MimeType = + withCharset(StandardCharsets.UTF_8) + + def resolveCharset: Option[Charset] = + params.get("charset").flatMap { cs => + if (Charset.isSupported(cs)) Some(Charset.forName(cs)) + else None + } + + def charsetOrUtf8: Charset = + resolveCharset.getOrElse(StandardCharsets.UTF_8) + + def baseType: MimeType = + if (params.isEmpty) this else copy(params = Map.empty) def asString: String = - s"$primary/$sub" + if (params.isEmpty) s"$primary/$sub" + else { + val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";") + s"$primary/$sub; $parameters" + } def matches(other: MimeType): Boolean = primary == other.primary && @@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) { object MimeType { def application(sub: String): MimeType = - MimeType("application", partFromString(sub).throwLeft) + MimeType("application", sub, Map.empty) def text(sub: String): MimeType = - MimeType("text", partFromString(sub).throwLeft) + MimeType("text", sub, Map.empty) def image(sub: String): MimeType = - MimeType("image", partFromString(sub).throwLeft) + MimeType("image", sub, Map.empty) - private[this] val validChars: Set[Char] = - (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet + def parse(str: String): Either[String, MimeType] = { + def parsePrimary: Either[String, (String, String)] = + str.indexOf('/') match { + case -1 => Left(s"Invalid mediatype: $str") + case n => Right(str.take(n) -> str.drop(n + 1)) + } - def parse(str: String): Either[String, MimeType] = - str.indexOf('/') match { - case -1 => Left(s"Invalid MIME type: $str") - case n => - for { - prim <- partFromString(str.substring(0, n)) - sub <- partFromString(str.substring(n + 1)) - } yield MimeType(prim.toLowerCase, sub.toLowerCase) - } + def parseSub(s: String): Either[String, (String, String)] = + s.indexOf(';') match { + case -1 => Right((s, "")) + case n => Right((s.take(n), s.drop(n))) + } + + def parseParams(s: String): Map[String, String] = + s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match { + case a :: b :: Nil => Some((a, b)) + case _ => None + }).toMap + + for { + pt <- parsePrimary + st <- parseSub(pt._2) + pa = parseParams(st._2) + } yield MimeType(pt._1, st._1, pa) + } def unsafe(str: String): MimeType = 
parse(str).throwLeft - private def partFromString(s: String): Either[String, String] = - if (s.forall(validChars.contains)) Right(s) - else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}") - val octetStream = application("octet-stream") val pdf = application("pdf") val zip = application("zip") @@ -55,6 +90,16 @@ object MimeType { val html = text("html") val plain = text("plain") + object PdfMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(pdf)) + } + + object HtmlMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(html)) + } + implicit val jsonEncoder: Encoder[MimeType] = Encoder.encodeString.contramap(_.asString) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index f9320413..66029e35 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -32,18 +32,27 @@ object Conversion { in: Stream[F, Byte] ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { - case MimeType.pdf => + case Pdfs(_) => handler.run(ConversionResult.successPdf(in)) - case MimeType.html => - WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler) + case mt @ MimeType(_, "html", _) => + val cs = mt.charsetOrUtf8 + WkHtmlPdf + .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler) - case Texts(_) => - Markdown.toHtml(in, cfg.markdown).flatMap { html => + case mt @ Texts(_) => + val cs = mt.charsetOrUtf8 + Markdown.toHtml(in, cfg.markdown, cs).flatMap { html => val bytes = Stream .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8))) .covary[F] - WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler) + WkHtmlPdf.toPDF( + cfg.wkhtmlpdf, + cfg.chunkSize, + StandardCharsets.UTF_8, + blocker, + logger + )(bytes, handler) } case Images(mt) => @@ -51,7 +60,9 @@ object Conversion { case Some(dim) => if (dim.product > cfg.maxImageSize) { logger - .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> + .info( + s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})." + ) *> handler.run( ConversionResult.inputMalformed( mt, @@ -59,14 +70,20 @@ object Conversion { ) ) } else { - Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)( + in, + handler + ) } case None => logger.info( s"Cannot read image when determining size for ${mt.asString}. Converting anyways." 
) *> - Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)( + in, + handler + ) } case Office(_) => @@ -90,6 +107,11 @@ object Conversion { Some(m).filter(_.primary == "text") } + object Pdfs { + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(_.matches(MimeType.pdf)) + } + object Office { val odt = MimeType.application("vnd.oasis.opendocument.text") val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") @@ -97,18 +119,33 @@ object Conversion { val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") val msoffice = MimeType.application("x-tika-msoffice") val ooxml = MimeType.application("x-tika-ooxml") - val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") - val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") - val xls = MimeType.application("vnd.ms-excel") - val doc = MimeType.application("msword") - val rtf = MimeType.application("rtf") + val docx = + MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") + val xlsx = + MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") + val xls = MimeType.application("vnd.ms-excel") + val doc = MimeType.application("msword") + val rtf = MimeType.application("rtf") // without a filename, tika returns application/zip for odt/ods files, since // they are just zip files val odfContainer = MimeType.zip val all = - Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer) + Set( + odt, + ods, + odtAlias, + odsAlias, + msoffice, + ooxml, + docx, + xlsx, + xls, + doc, + rtf, + odfContainer + ) def unapply(m: MimeType): Option[MimeType] = Some(m).filter(all.contains) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 19473de3..8199191e 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -7,20 +7,23 @@ import fs2.Stream import docspell.common._ import docspell.convert.ConversionResult import docspell.convert.ConversionResult.Handler +import java.nio.charset.Charset object WkHtmlPdf { def toPDF[F[_]: Sync: ContextShift, A]( cfg: WkHtmlPdfConfig, chunkSize: Int, + charset: Charset, blocker: Blocker, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResult[F](blocker, chunkSize, logger) + val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) ExternConv - .toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)( + .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)( in, handler ) diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala index 7d185d86..543dec41 100644 --- a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala +++ b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala @@ -1,8 +1,9 @@ package docspell.convert.flexmark import java.io.{InputStream, InputStreamReader} -import java.nio.charset.StandardCharsets +import java.nio.charset.Charset import java.util +import scala.util.Try import 
cats.effect.Sync import cats.implicits._ @@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet} import fs2.Stream -import scala.util.Try +import docspell.common._ object Markdown { - def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = { + def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = { val p = createParser() val r = createRenderer() Try { - val reader = new InputStreamReader(is, StandardCharsets.UTF_8) + val reader = new InputStreamReader(is, cs) val doc = p.parseReader(reader) wrapHtml(r.render(doc), cfg) }.toEither @@ -34,8 +35,8 @@ object Markdown { wrapHtml(r.render(doc), cfg) } - def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] = - data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg)) + def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] = + data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg)) private def wrapHtml(body: String, cfg: MarkdownConfig): String = s""" diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 2834b2e0..780783bf 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -7,6 +7,7 @@ import docspell.common._ import docspell.convert.FileChecks import docspell.files.{ExampleFiles, TestFiles} import minitest.SimpleTestSuite +import java.nio.charset.StandardCharsets object ExternConvTest extends SimpleTestSuite with FileChecks { val blocker = TestFiles.blocker @@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks { val wkCfg = WkHtmlPdfConfig(cfg, target) val p = WkHtmlPdf - .toPDF[IO, Path](wkCfg, 8192, blocker, logger)( + .toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)( ExampleFiles.letter_de_html.readURL[IO](8192, blocker), storePdfHandler(dir.resolve("test.pdf")) ) diff --git a/modules/extract/NOTICE b/modules/extract/NOTICE deleted file mode 100644 index 05ccbbcc..00000000 --- a/modules/extract/NOTICE +++ /dev/null @@ -1,11 +0,0 @@ -The Java source files in docspell-extract are unmodified copies of -those found in the Apache Tika parser project. It follows the -NOTICE.txt file from Apache Tika parsers: - -Apache Tika parsers -Copyright 2007-2019 The Apache Software Foundation - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). - - diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 54c5cf10..ed86bfd3 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -31,7 +31,7 @@ object Extraction { lang: Language ): F[ExtractResult] = TikaMimetype.resolve(dataType, data).flatMap { - case MimeType.pdf => + case MimeType.PdfMatch(_) => PdfExtract .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .map(ExtractResult.fromEither) @@ -75,14 +75,15 @@ object Extraction { doExtract } - case OdfType.container => + case OdfType.ContainerMatch(_) => logger .info(s"File detected as ${OdfType.container}. 
Try to read as OpenDocument file.") *> OdfExtract.get(data).map(ExtractResult.fromEither) - case mt @ MimeType("text", sub) if !sub.contains("html") => + case mt @ MimeType("text", sub, _) if !sub.contains("html") => + val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> - data.through(fs2.text.utf8Decode).compile.last.map { txt => + data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => ExtractResult.success(txt.getOrElse("").trim) } diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala index 886b0e50..96a51005 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala @@ -12,5 +12,5 @@ object OcrType { val all = Set(jpeg, png, tiff, pdf) def unapply(mt: MimeType): Option[MimeType] = - Some(mt).filter(all.contains) + Some(mt).map(_.baseType).filter(all.contains) } diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala index 93c7ddcc..13dd3dd2 100644 --- a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala @@ -14,5 +14,10 @@ object OdfType { val all = Set(odt, ods, odtAlias, odsAlias) def unapply(mt: MimeType): Option[MimeType] = - Some(mt).filter(all.contains) + Some(mt).map(_.baseType).filter(all.contains) + + object ContainerMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(container)) + } } diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala index d2f9f4cb..23850994 100644 --- a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala @@ -14,6 +14,6 @@ object PoiType { val all = Set(msoffice, ooxml, docx, xlsx, xls, doc) def unapply(arg: MimeType): Option[MimeType] = - Some(arg).filter(all.contains) + Some(arg).map(_.baseType).filter(all.contains) } diff --git a/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java new file mode 100644 index 00000000..359e7bb0 --- /dev/null +++ b/modules/files/src/main/java/org/apache/tika/parser/txt/IOUtils.java @@ -0,0 +1,11 @@ +package org.apache.tika.parser.txt; + +import java.io.InputStream; +import java.io.IOException; + +public final class IOUtils { + + public static long readFully(InputStream in, byte[] buffer) throws IOException { + return in.read(buffer, 0, buffer.length); + } +} diff --git a/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java new file mode 100644 index 00000000..7737aa72 --- /dev/null +++ b/modules/files/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.txt; + +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; +import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.utils.CharsetUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; + +public class Icu4jEncodingDetector implements EncodingDetector { + + public Charset detect(InputStream input, Metadata metadata) + throws IOException { + if (input == null) { + return null; + } + + CharsetDetector detector = new CharsetDetector(); + + String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); + String incomingType = metadata.get(Metadata.CONTENT_TYPE); + if (incomingCharset == null && incomingType != null) { + // TIKA-341: Use charset in content-type + MediaType mt = MediaType.parse(incomingType); + if (mt != null) { + incomingCharset = mt.getParameters().get("charset"); + } + } + + if (incomingCharset != null) { + String cleaned = CharsetUtils.clean(incomingCharset); + if (cleaned != null) { + detector.setDeclaredEncoding(cleaned); + } else { + // TODO: log a warning? + } + } + + // TIKA-341 without enabling input filtering (stripping of tags) + // short HTML tests don't work well + detector.enableInputFilter(true); + + detector.setText(input); + + for (CharsetMatch match : detector.detectAll()) { + try { + return CharsetUtils.forName(match.getName()); + } catch (Exception e) { + // ignore + } + } + + return null; + } +} diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 3a1ea3cf..a9e594e6 100644 --- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -2,27 +2,32 @@ package docspell.files import java.io.BufferedInputStream import java.nio.file.{Files, Path} +import java.nio.charset.Charset +import scala.jdk.CollectionConverters._ +import scala.util.Using import cats.implicits._ import cats.effect.Sync -import docspell.common._ import fs2.Stream import org.apache.tika.config.TikaConfig import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} import org.apache.tika.mime.MediaType - -import scala.util.Using +import org.apache.tika.parser.txt.Icu4jEncodingDetector +import docspell.common._ object TikaMimetype { private val tika = new TikaConfig().getDetector private def convert(mt: MediaType): MimeType = - Option(mt) - .map(_.toString) - .map(MimeType.parse) - .flatMap(_.toOption) - .map(normalize) - .getOrElse(MimeType.octetStream) + Option(mt) match { + case Some(_) => + val params = mt.getParameters.asScala.toMap + val primary = mt.getType + val sub = mt.getSubtype + normalize(MimeType(primary, sub, params)) + case None => + MimeType.octetStream + } private def makeMetadata(hint: MimeTypeHint): Metadata = { val md = new Metadata @@ -32,21 +37,55 @@ object TikaMimetype { } private def normalize(in: MimeType): MimeType = in match { - case MimeType(_, sub) if sub contains "xhtml" => - MimeType.html + case MimeType(_, sub, p) if sub contains "xhtml" => + 
MimeType.html.copy(params = p) case _ => in } - private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = - convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) + private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = { + val mt = convert( + tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)) + ) + if (mt.primary == "text") { + charsetFromBytes(bv, hint) match { + case Some(cs) => + mt.withCharset(cs) + case None => + mt + } + } else mt + } + + private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] = + Either + .catchNonFatal { + val cd = new Icu4jEncodingDetector() + val md = makeMetadata(hint) + Option(cd.detect(new java.io.ByteArrayInputStream(bv), md)) + } + .toOption + .flatten + + def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint) = + data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint)) def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = dt match { - case DataType.Exact(mt) => mt.pure[F] - case DataType.Hint(hint) => TikaMimetype.detect(data, hint) + case DataType.Exact(mt) => + mt.resolveCharset match { + case None if mt.primary == "text" => + detectCharset[F](data, MimeTypeHint.advertised(mt)) + .map { + case Some(cs) => mt.withCharset(cs) + case None => mt + } + case _ => mt.pure[F] + } + case DataType.Hint(hint) => + TikaMimetype.detect(data, hint) } def detect[F[_]: Sync](file: Path): F[MimeType] = diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index cd345cfb..c33d727c 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -231,7 +231,9 @@ docspell.joex { "-s", "A4", "--encoding", - "UTF-8", + "{{encoding}}", + "--load-error-handling", "ignore", + "--load-media-error-handling", "ignore", "-", "{{outfile}}" ] diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 2f4f8b54..3525b9f5 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -8,6 +8,7 @@ import emil.javamail.syntax._ import cats.Applicative import docspell.common._ +import java.nio.charset.StandardCharsets object ReadMail { @@ -20,7 +21,7 @@ object ReadMail { bytesToMail(s).flatMap(mailToEntries[F](logger)) def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] = - data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F]) + data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F]) def mailToEntries[F[_]: Applicative]( logger: Logger[F] @@ -49,7 +50,7 @@ object ReadMail { implicit class MimeTypeConv(m: emil.MimeType) { def toDocspell: MimeType = - MimeType(m.primary, m.sub) + MimeType(m.primary, m.sub, m.params) } private def bodyType[F[_]](body: MailBody[F]): String = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 542d1f5a..f49a4d80 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -57,7 +57,7 @@ object ConvertPdf { )(ra: RAttachment, 
mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv => mime match { - case Mimetype.`application/pdf` => + case mt if mt.baseEqual(Mimetype.`application/pdf`) => ctx.logger.info("Not going to convert a PDF file into a PDF.") *> (ra, None: Option[RAttachmentMeta]).pure[F] @@ -66,9 +66,10 @@ object ConvertPdf { .get(ra.fileId.id) .unNoneTerminate .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + val mt = MimeType(mime.primary, mime.sub, mime.params) val handler = conversionHandler[F](ctx, cfg, ra, item) ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> - conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)( + conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( data ) } @@ -104,7 +105,8 @@ object ConvertPdf { (ra, None: Option[RAttachmentMeta]).pure[F] case ConversionResult.Failure(ex) => - ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *> + ctx.logger + .error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *> (ra, None: Option[RAttachmentMeta]).pure[F] }) @@ -114,7 +116,8 @@ object ConvertPdf { ra: RAttachment, pdf: Stream[F, Byte] ) = { - val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) + val hint = + MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) val newName = ra.name.map(n => s"$n.pdf") ctx.store.bitpeace .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised)) @@ -122,7 +125,9 @@ object ConvertPdf { .lastOrError .map(fm => Ident.unsafe(fm.id)) .flatMap(fmId => - ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId) + ctx.store + .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)) + .map(_ => fmId) ) .map(fmId => ra.copy(fileId = fmId, name = newName)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index 62ea43cf..de973c67 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -70,7 +70,7 @@ object ExtractArchive { archive: Option[RAttachmentArchive] )(ra: RAttachment, mime: Mimetype): F[Extracted] = mime match { - case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) => + case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) => ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("")}.") *> extractZip(ctx, archive)(ra) .flatTap(_ => cleanupParents(ctx, ra, archive)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index e96a71be..ffe06f22 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -76,7 +76,7 @@ object TextExtraction { .getOrElse(Mimetype.`application/octet-stream`) findMime - .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang)) + .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang)) } private def extractTextFallback[F[_]: Sync: ContextShift]( diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala 
b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index f1726f8e..322f1abf 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -448,7 +448,7 @@ trait Conversions { // MIME Type def fromContentType(header: `Content-Type`): MimeType = - MimeType(header.mediaType.mainType, header.mediaType.subType) + MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions) } object Conversions extends Conversions { diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 74f66fe4..ac2817e4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -17,6 +17,7 @@ object Dependencies { val Fs2Version = "2.3.0" val H2Version = "1.4.200" val Http4sVersion = "0.21.1" + val Icu4jVersion = "66.1" val KindProjectorVersion = "0.10.3" val Log4sVersion = "1.8.2" val LogbackVersion = "1.2.3" @@ -218,4 +219,7 @@ object Dependencies { "org.webjars" % "viewerjs" % ViewerJSVersion ) + val icu4j = Seq( + "com.ibm.icu" % "icu4j" % Icu4jVersion + ) }
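
Illustrative usage (not part of the patch): a minimal sketch of how the charset-aware pieces introduced above are meant to compose — MimeType now carries its parameters, the charset is resolved from them (falling back to UTF-8), and Binary.decode turns bytes into text with that charset instead of assuming UTF-8. The sample string, the ISO-8859-1 value and the val names are hypothetical; only MimeType, Binary and the fs2/cats-effect types come from the changes above.

  import cats.effect.IO
  import cats.implicits._
  import fs2.{Chunk, Stream}
  import docspell.common._

  // a text mime type carrying its charset parameter, e.g. as produced by
  // TikaMimetype.resolve or MimeType.parse on a Content-Type style string
  val mt: MimeType = MimeType.unsafe("text/plain; charset=ISO-8859-1")
  val cs           = mt.charsetOrUtf8 // resolves the charset param, UTF-8 if absent/unsupported

  // decode a byte stream with the resolved charset rather than fs2.text.utf8Decode
  val bytes: Stream[IO, Byte] =
    Stream.chunk(Chunk.bytes("Grüße aus Köln".getBytes(cs)))
  val decoded: IO[String] =
    bytes.through(Binary.decode[IO](cs)).compile.foldMonoid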