mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-04 12:30:12 +00:00 
			
		
		
		
	Improve handling encodings
HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding declared by the document.
This commit is contained in:
		
							
								
								
									
										13
									
								
								NOTICE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								NOTICE.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,13 @@
 | 
			
		||||
Docspell
 | 
			
		||||
Copyright 2019-2020
 | 
			
		||||
Licensed under the GPLv3
 | 
			
		||||
 | 
			
		||||
This software contains portions of code from tika-parser
 | 
			
		||||
https://tika.apache.org
 | 
			
		||||
Copyright (C) Apache Software Foundation (ASF) <https://www.apache.org>
 | 
			
		||||
Licensed under Apache License 2.0
 | 
			
		||||
 | 
			
		||||
This software contains portions of code from http4s
 | 
			
		||||
https://http4s.org
 | 
			
		||||
Copyright 2013-2018 http4s.org
 | 
			
		||||
Licensed under Apache License 2.0
 | 
			
		||||
@@ -161,7 +161,8 @@ val files = project.in(file("modules/files")).
 | 
			
		||||
  settings(
 | 
			
		||||
    name := "docspell-files",
 | 
			
		||||
    libraryDependencies ++=
 | 
			
		||||
      Dependencies.tika,
 | 
			
		||||
      Dependencies.tika ++
 | 
			
		||||
      Dependencies.icu4j,
 | 
			
		||||
    Test / sourceGenerators += Def.task {
 | 
			
		||||
      val base = (Test/resourceDirectory).value
 | 
			
		||||
      val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,8 @@
 | 
			
		||||
package docspell.common
 | 
			
		||||
 | 
			
		||||
import fs2.Stream
 | 
			
		||||
import fs2.{Pipe, Stream}
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
 | 
			
		||||
final case class Binary[F[_]](name: String, mime: MimeType, data: Stream[F, Byte]) {
 | 
			
		||||
 | 
			
		||||
@@ -14,11 +16,67 @@ object Binary {
 | 
			
		||||
    Binary[F](name, MimeType.octetStream, data)
 | 
			
		||||
 | 
			
		||||
  def utf8[F[_]](name: String, content: String): Binary[F] =
 | 
			
		||||
    Binary[F](name, MimeType.octetStream, Stream.emit(content).through(fs2.text.utf8Encode))
 | 
			
		||||
    Binary[F](
 | 
			
		||||
      name,
 | 
			
		||||
      MimeType.octetStream,
 | 
			
		||||
      Stream.emit(content).through(fs2.text.utf8Encode)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
  def text[F[_]](name: String, content: String): Binary[F] =
 | 
			
		||||
    utf8(name, content).withMime(MimeType.plain)
 | 
			
		||||
    utf8(name, content).withMime(MimeType.plain.withUtf8Charset)
 | 
			
		||||
 | 
			
		||||
  def html[F[_]](name: String, content: String): Binary[F] =
 | 
			
		||||
    utf8(name, content).withMime(MimeType.html)
 | 
			
		||||
    utf8(name, content).withMime(MimeType.html.withUtf8Charset)
 | 
			
		||||
 | 
			
		||||
  def decode[F[_]](cs: Charset): Pipe[F, Byte, String] =
 | 
			
		||||
    if (cs == StandardCharsets.UTF_8) {
 | 
			
		||||
      fs2.text.utf8Decode
 | 
			
		||||
    } else {
 | 
			
		||||
      util.decode[F](cs)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  // This is a copy from org.http4s.util
 | 
			
		||||
  // Http4s is licensed under the Apache License 2.0
 | 
			
		||||
  private object util {
    import fs2._
    import java.nio._

    /** The three bytes of a UTF-8 encoded byte order mark. */
    private val utf8Bom: Chunk[Byte] = Chunk(0xef.toByte, 0xbb.toByte, 0xbf.toByte)

    /** Decodes a byte stream into strings using the given charset.
      *
      * Bytes are pulled in windows of `128 * avgBytesPerChar` bytes and fed
      * to a `CharsetDecoder`. Bytes the decoder did not consume (e.g. an
      * incomplete multi-byte sequence at the end of a window) are pushed
      * back in front of the remaining input. A leading UTF-8 byte order
      * mark is removed, but only at the very start of the stream: the same
      * three bytes occurring later are ordinary content and are decoded.
      *
      * @param charset the charset used to decode the bytes
      * @return a pipe emitting the decoded text piecewise
      */
    def decode[F[_]](charset: Charset): Pipe[F, Byte, String] = {
      val decoder         = charset.newDecoder
      val maxCharsPerByte = math.ceil(decoder.maxCharsPerByte().toDouble).toInt
      val avgBytesPerChar = math.ceil(1.0 / decoder.averageCharsPerByte().toDouble).toInt
      val charBufferSize  = 128

      // `atStart` is true until the first non-empty window has been decoded;
      // it guards the byte-order-mark removal.
      def go(s: Stream[F, Byte], atStart: Boolean): Pull[F, String, Unit] =
        s.pull.unconsN(charBufferSize * avgBytesPerChar, allowFewer = true).flatMap {
          case None =>
            // End of input: signal it to the decoder and flush its state.
            val charBuffer = CharBuffer.allocate(1)
            decoder.decode(ByteBuffer.allocate(0), charBuffer, true)
            decoder.flush(charBuffer)
            val outputString = charBuffer.flip().toString
            if (outputString.isEmpty) Pull.done
            else Pull.output1(outputString)

          case Some((chunk, rest)) =>
            if (chunk.nonEmpty) {
              val input      = if (atStart) skipByteOrderMark(chunk) else chunk
              val bytes      = input.toArray
              val byteBuffer = ByteBuffer.wrap(bytes)
              val charBuffer = CharBuffer.allocate(bytes.length * maxCharsPerByte)
              decoder.decode(byteBuffer, charBuffer, false)
              // Whatever the decoder left unread goes back in front of the rest.
              val nextStream = rest.consChunk(Chunk.byteBuffer(byteBuffer.slice()))
              Pull.output1(charBuffer.flip().toString) >> go(nextStream, atStart = false)
            } else {
              go(rest, atStart)
            }
        }

      in => go(in, atStart = true).stream
    }

    /** Drops the first three bytes of `chunk` if they are a UTF-8 byte order mark. */
    private def skipByteOrderMark[F[_]](chunk: Chunk[Byte]): Chunk[Byte] =
      if (chunk.size >= 3 && chunk.take(3) == utf8Bom) {
        chunk.drop(3)
      } else chunk
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -2,13 +2,39 @@ package docspell.common
 | 
			
		||||
 | 
			
		||||
import docspell.common.syntax.all._
 | 
			
		||||
import io.circe.{Decoder, Encoder}
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
/** A MIME Type impl with just enough features for the use here.
 | 
			
		||||
  */
 | 
			
		||||
case class MimeType(primary: String, sub: String) {
 | 
			
		||||
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
 | 
			
		||||
  def withParam(name: String, value: String): MimeType =
 | 
			
		||||
    copy(params = params.updated(name, value))
 | 
			
		||||
 | 
			
		||||
  def withCharset(cs: Charset): MimeType =
 | 
			
		||||
    withParam("charset", cs.name())
 | 
			
		||||
 | 
			
		||||
  def withUtf8Charset: MimeType =
 | 
			
		||||
    withCharset(StandardCharsets.UTF_8)
 | 
			
		||||
 | 
			
		||||
  /** Resolves the `charset` parameter to a `java.nio.charset.Charset`.
    *
    * Returns `None` if there is no `charset` parameter or if its value
    * cannot be turned into a charset, either because the JVM does not
    * support it or because it is not a legal charset name.
    */
  def resolveCharset: Option[Charset] =
    params.get("charset").flatMap { cs =>
      // Charset.isSupported / Charset.forName throw for syntactically illegal
      // names (e.g. a value still wrapped in quotes) instead of reporting
      // "unsupported"; such values are treated like an unknown charset.
      scala.util.Try(Charset.forName(cs)).toOption
    }
 | 
			
		||||
 | 
			
		||||
  def charsetOrUtf8: Charset =
 | 
			
		||||
    resolveCharset.getOrElse(StandardCharsets.UTF_8)
 | 
			
		||||
 | 
			
		||||
  def baseType: MimeType =
 | 
			
		||||
    if (params.isEmpty) this else copy(params = Map.empty)
 | 
			
		||||
 | 
			
		||||
  def asString: String =
 | 
			
		||||
    s"$primary/$sub"
 | 
			
		||||
    if (params.isEmpty) s"$primary/$sub"
 | 
			
		||||
    else {
 | 
			
		||||
      val parameters = params.toList.map(t => s"${t._1}=${t._2}").mkString(";")
 | 
			
		||||
      s"$primary/$sub; $parameters"
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  def matches(other: MimeType): Boolean =
 | 
			
		||||
    primary == other.primary &&
 | 
			
		||||
@@ -18,34 +44,43 @@ case class MimeType(primary: String, sub: String) {
 | 
			
		||||
object MimeType {
 | 
			
		||||
 | 
			
		||||
  def application(sub: String): MimeType =
 | 
			
		||||
    MimeType("application", partFromString(sub).throwLeft)
 | 
			
		||||
    MimeType("application", sub, Map.empty)
 | 
			
		||||
 | 
			
		||||
  def text(sub: String): MimeType =
 | 
			
		||||
    MimeType("text", partFromString(sub).throwLeft)
 | 
			
		||||
    MimeType("text", sub, Map.empty)
 | 
			
		||||
 | 
			
		||||
  def image(sub: String): MimeType =
 | 
			
		||||
    MimeType("image", partFromString(sub).throwLeft)
 | 
			
		||||
    MimeType("image", sub, Map.empty)
 | 
			
		||||
 | 
			
		||||
  private[this] val validChars: Set[Char] =
 | 
			
		||||
    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
 | 
			
		||||
  def parse(str: String): Either[String, MimeType] = {
 | 
			
		||||
    def parsePrimary: Either[String, (String, String)] =
 | 
			
		||||
      str.indexOf('/') match {
 | 
			
		||||
        case -1 => Left(s"Invalid mediatype: $str")
 | 
			
		||||
        case n => Right(str.take(n) -> str.drop(n + 1))
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
  def parse(str: String): Either[String, MimeType] =
 | 
			
		||||
    str.indexOf('/') match {
 | 
			
		||||
      case -1 => Left(s"Invalid MIME type: $str")
 | 
			
		||||
      case n =>
 | 
			
		||||
        for {
 | 
			
		||||
          prim <- partFromString(str.substring(0, n))
 | 
			
		||||
          sub  <- partFromString(str.substring(n + 1))
 | 
			
		||||
        } yield MimeType(prim.toLowerCase, sub.toLowerCase)
 | 
			
		||||
    }
 | 
			
		||||
    def parseSub(s: String): Either[String, (String, String)] =
 | 
			
		||||
      s.indexOf(';') match {
 | 
			
		||||
        case -1 => Right((s, ""))
 | 
			
		||||
        case n => Right((s.take(n), s.drop(n)))
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    def parseParams(s: String): Map[String, String] =
 | 
			
		||||
      s.split(';').map(_.trim).filter(_.nonEmpty).toList.flatMap(p => p.split("=", 2).toList match {
 | 
			
		||||
        case a :: b :: Nil => Some((a, b))
 | 
			
		||||
        case _ => None
 | 
			
		||||
      }).toMap
 | 
			
		||||
 | 
			
		||||
    for {
 | 
			
		||||
      pt <- parsePrimary
 | 
			
		||||
      st <- parseSub(pt._2)
 | 
			
		||||
      pa  = parseParams(st._2)
 | 
			
		||||
    } yield MimeType(pt._1, st._1, pa)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  def unsafe(str: String): MimeType =
 | 
			
		||||
    parse(str).throwLeft
 | 
			
		||||
 | 
			
		||||
  private def partFromString(s: String): Either[String, String] =
 | 
			
		||||
    if (s.forall(validChars.contains)) Right(s)
 | 
			
		||||
    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
 | 
			
		||||
 | 
			
		||||
  val octetStream = application("octet-stream")
 | 
			
		||||
  val pdf         = application("pdf")
 | 
			
		||||
  val zip         = application("zip")
 | 
			
		||||
@@ -55,6 +90,16 @@ object MimeType {
 | 
			
		||||
  val html        = text("html")
 | 
			
		||||
  val plain       = text("plain")
 | 
			
		||||
 | 
			
		||||
  object PdfMatch {
 | 
			
		||||
    def unapply(mt: MimeType): Option[MimeType] =
 | 
			
		||||
      Some(mt).filter(_.matches(pdf))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  object HtmlMatch {
 | 
			
		||||
    def unapply(mt: MimeType): Option[MimeType] =
 | 
			
		||||
      Some(mt).filter(_.matches(html))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  implicit val jsonEncoder: Encoder[MimeType] =
 | 
			
		||||
    Encoder.encodeString.contramap(_.asString)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -32,18 +32,27 @@ object Conversion {
 | 
			
		||||
          in: Stream[F, Byte]
 | 
			
		||||
      ): F[A] =
 | 
			
		||||
        TikaMimetype.resolve(dataType, in).flatMap {
 | 
			
		||||
          case MimeType.pdf =>
 | 
			
		||||
          case Pdfs(_) =>
 | 
			
		||||
            handler.run(ConversionResult.successPdf(in))
 | 
			
		||||
 | 
			
		||||
          case MimeType.html =>
 | 
			
		||||
            WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
 | 
			
		||||
          case mt @ MimeType(_, "html", _) =>
 | 
			
		||||
            val cs = mt.charsetOrUtf8
 | 
			
		||||
            WkHtmlPdf
 | 
			
		||||
              .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, blocker, logger)(in, handler)
 | 
			
		||||
 | 
			
		||||
          case Texts(_) =>
 | 
			
		||||
            Markdown.toHtml(in, cfg.markdown).flatMap { html =>
 | 
			
		||||
          case mt @ Texts(_) =>
 | 
			
		||||
            val cs = mt.charsetOrUtf8
 | 
			
		||||
            Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
 | 
			
		||||
              val bytes = Stream
 | 
			
		||||
                .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
 | 
			
		||||
                .covary[F]
 | 
			
		||||
              WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
 | 
			
		||||
              WkHtmlPdf.toPDF(
 | 
			
		||||
                cfg.wkhtmlpdf,
 | 
			
		||||
                cfg.chunkSize,
 | 
			
		||||
                StandardCharsets.UTF_8,
 | 
			
		||||
                blocker,
 | 
			
		||||
                logger
 | 
			
		||||
              )(bytes, handler)
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
          case Images(mt) =>
 | 
			
		||||
@@ -51,7 +60,9 @@ object Conversion {
 | 
			
		||||
              case Some(dim) =>
 | 
			
		||||
                if (dim.product > cfg.maxImageSize) {
 | 
			
		||||
                  logger
 | 
			
		||||
                    .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
 | 
			
		||||
                    .info(
 | 
			
		||||
                      s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize})."
 | 
			
		||||
                    ) *>
 | 
			
		||||
                    handler.run(
 | 
			
		||||
                      ConversionResult.inputMalformed(
 | 
			
		||||
                        mt,
 | 
			
		||||
@@ -59,14 +70,20 @@ object Conversion {
 | 
			
		||||
                      )
 | 
			
		||||
                    )
 | 
			
		||||
                } else {
 | 
			
		||||
                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
 | 
			
		||||
                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
 | 
			
		||||
                    in,
 | 
			
		||||
                    handler
 | 
			
		||||
                  )
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
              case None =>
 | 
			
		||||
                logger.info(
 | 
			
		||||
                  s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
 | 
			
		||||
                ) *>
 | 
			
		||||
                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
 | 
			
		||||
                  Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
 | 
			
		||||
                    in,
 | 
			
		||||
                    handler
 | 
			
		||||
                  )
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
          case Office(_) =>
 | 
			
		||||
@@ -90,6 +107,11 @@ object Conversion {
 | 
			
		||||
      Some(m).filter(_.primary == "text")
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  object Pdfs {
 | 
			
		||||
    def unapply(m: MimeType): Option[MimeType] =
 | 
			
		||||
      Some(m).filter(_.matches(MimeType.pdf))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  object Office {
 | 
			
		||||
    val odt      = MimeType.application("vnd.oasis.opendocument.text")
 | 
			
		||||
    val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
 | 
			
		||||
@@ -97,18 +119,33 @@ object Conversion {
 | 
			
		||||
    val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
 | 
			
		||||
    val msoffice = MimeType.application("x-tika-msoffice")
 | 
			
		||||
    val ooxml    = MimeType.application("x-tika-ooxml")
 | 
			
		||||
    val docx     = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
 | 
			
		||||
    val xlsx     = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 | 
			
		||||
    val xls      = MimeType.application("vnd.ms-excel")
 | 
			
		||||
    val doc      = MimeType.application("msword")
 | 
			
		||||
    val rtf      = MimeType.application("rtf")
 | 
			
		||||
    val docx =
 | 
			
		||||
      MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
 | 
			
		||||
    val xlsx =
 | 
			
		||||
      MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
 | 
			
		||||
    val xls = MimeType.application("vnd.ms-excel")
 | 
			
		||||
    val doc = MimeType.application("msword")
 | 
			
		||||
    val rtf = MimeType.application("rtf")
 | 
			
		||||
 | 
			
		||||
    // without a filename, tika returns application/zip for odt/ods files, since
 | 
			
		||||
    // they are just zip files
 | 
			
		||||
    val odfContainer = MimeType.zip
 | 
			
		||||
 | 
			
		||||
    val all =
 | 
			
		||||
      Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)
 | 
			
		||||
      Set(
 | 
			
		||||
        odt,
 | 
			
		||||
        ods,
 | 
			
		||||
        odtAlias,
 | 
			
		||||
        odsAlias,
 | 
			
		||||
        msoffice,
 | 
			
		||||
        ooxml,
 | 
			
		||||
        docx,
 | 
			
		||||
        xlsx,
 | 
			
		||||
        xls,
 | 
			
		||||
        doc,
 | 
			
		||||
        rtf,
 | 
			
		||||
        odfContainer
 | 
			
		||||
      )
 | 
			
		||||
 | 
			
		||||
    /** Matches if the given type, ignoring its parameters, is a member of `all`.
      *
      * The comparison uses `baseType` so that a type carrying parameters
      * (such as a charset) is still recognised; the value handed to the
      * pattern is the original `m`, parameters included.
      */
    def unapply(m: MimeType): Option[MimeType] =
      Some(m).filter(mt => all.contains(mt.baseType))
 | 
			
		||||
 
 | 
			
		||||
@@ -7,20 +7,23 @@ import fs2.Stream
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import docspell.convert.ConversionResult
 | 
			
		||||
import docspell.convert.ConversionResult.Handler
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
object WkHtmlPdf {
 | 
			
		||||
 | 
			
		||||
  def toPDF[F[_]: Sync: ContextShift, A](
 | 
			
		||||
      cfg: WkHtmlPdfConfig,
 | 
			
		||||
      chunkSize: Int,
 | 
			
		||||
      charset: Charset,
 | 
			
		||||
      blocker: Blocker,
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
 | 
			
		||||
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
 | 
			
		||||
      ExternConv.readResult[F](blocker, chunkSize, logger)
 | 
			
		||||
 | 
			
		||||
    val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
 | 
			
		||||
    ExternConv
 | 
			
		||||
      .toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
 | 
			
		||||
      .toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
 | 
			
		||||
        in,
 | 
			
		||||
        handler
 | 
			
		||||
      )
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,9 @@
 | 
			
		||||
package docspell.convert.flexmark
 | 
			
		||||
 | 
			
		||||
import java.io.{InputStream, InputStreamReader}
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
import java.util
 | 
			
		||||
import scala.util.Try
 | 
			
		||||
 | 
			
		||||
import cats.effect.Sync
 | 
			
		||||
import cats.implicits._
 | 
			
		||||
@@ -13,15 +14,15 @@ import com.vladsch.flexmark.parser.Parser
 | 
			
		||||
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
 | 
			
		||||
import fs2.Stream
 | 
			
		||||
 | 
			
		||||
import scala.util.Try
 | 
			
		||||
import docspell.common._
 | 
			
		||||
 | 
			
		||||
object Markdown {
 | 
			
		||||
 | 
			
		||||
  def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
 | 
			
		||||
  def toHtml(is: InputStream, cfg: MarkdownConfig, cs: Charset): Either[Throwable, String] = {
 | 
			
		||||
    val p = createParser()
 | 
			
		||||
    val r = createRenderer()
 | 
			
		||||
    Try {
 | 
			
		||||
      val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
 | 
			
		||||
      val reader = new InputStreamReader(is, cs)
 | 
			
		||||
      val doc    = p.parseReader(reader)
 | 
			
		||||
      wrapHtml(r.render(doc), cfg)
 | 
			
		||||
    }.toEither
 | 
			
		||||
@@ -34,8 +35,8 @@ object Markdown {
 | 
			
		||||
    wrapHtml(r.render(doc), cfg)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
 | 
			
		||||
    data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
 | 
			
		||||
  def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig, cs: Charset): F[String] =
 | 
			
		||||
    data.through(Binary.decode(cs)).compile.foldMonoid.map(str => toHtml(str, cfg))
 | 
			
		||||
 | 
			
		||||
  private def wrapHtml(body: String, cfg: MarkdownConfig): String =
 | 
			
		||||
    s"""<!DOCTYPE html>
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@ import docspell.common._
 | 
			
		||||
import docspell.convert.FileChecks
 | 
			
		||||
import docspell.files.{ExampleFiles, TestFiles}
 | 
			
		||||
import minitest.SimpleTestSuite
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
 | 
			
		||||
object ExternConvTest extends SimpleTestSuite with FileChecks {
 | 
			
		||||
  val blocker     = TestFiles.blocker
 | 
			
		||||
@@ -31,7 +32,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
 | 
			
		||||
            val wkCfg = WkHtmlPdfConfig(cfg, target)
 | 
			
		||||
            val p =
 | 
			
		||||
              WkHtmlPdf
 | 
			
		||||
                .toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
 | 
			
		||||
                .toPDF[IO, Path](wkCfg, 8192, StandardCharsets.UTF_8, blocker, logger)(
 | 
			
		||||
                  ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
 | 
			
		||||
                  storePdfHandler(dir.resolve("test.pdf"))
 | 
			
		||||
                )
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +0,0 @@
 | 
			
		||||
The Java source files in docspell-extract are unmodified copies of
 | 
			
		||||
those found in the Apache Tika parser project. It follows the
 | 
			
		||||
NOTICE.txt file from Apache Tika parsers:
 | 
			
		||||
 | 
			
		||||
Apache Tika parsers
 | 
			
		||||
Copyright 2007-2019 The Apache Software Foundation
 | 
			
		||||
 | 
			
		||||
This product includes software developed at
 | 
			
		||||
The Apache Software Foundation (http://www.apache.org/).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -31,7 +31,7 @@ object Extraction {
 | 
			
		||||
          lang: Language
 | 
			
		||||
      ): F[ExtractResult] =
 | 
			
		||||
        TikaMimetype.resolve(dataType, data).flatMap {
 | 
			
		||||
          case MimeType.pdf =>
 | 
			
		||||
          case MimeType.PdfMatch(_) =>
 | 
			
		||||
            PdfExtract
 | 
			
		||||
              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
 | 
			
		||||
              .map(ExtractResult.fromEither)
 | 
			
		||||
@@ -75,14 +75,15 @@ object Extraction {
 | 
			
		||||
                  doExtract
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
          case OdfType.container =>
 | 
			
		||||
          case OdfType.ContainerMatch(_) =>
 | 
			
		||||
            logger
 | 
			
		||||
              .info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
 | 
			
		||||
              OdfExtract.get(data).map(ExtractResult.fromEither)
 | 
			
		||||
 | 
			
		||||
          case mt @ MimeType("text", sub) if !sub.contains("html") =>
 | 
			
		||||
          case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
 | 
			
		||||
            val cs = mt.charsetOrUtf8
 | 
			
		||||
            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
 | 
			
		||||
              data.through(fs2.text.utf8Decode).compile.last.map { txt =>
 | 
			
		||||
              data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
 | 
			
		||||
                ExtractResult.success(txt.getOrElse("").trim)
 | 
			
		||||
              }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -12,5 +12,5 @@ object OcrType {
 | 
			
		||||
  val all = Set(jpeg, png, tiff, pdf)
 | 
			
		||||
 | 
			
		||||
  def unapply(mt: MimeType): Option[MimeType] =
 | 
			
		||||
    Some(mt).filter(all.contains)
 | 
			
		||||
    Some(mt).map(_.baseType).filter(all.contains)
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -14,5 +14,10 @@ object OdfType {
 | 
			
		||||
  val all = Set(odt, ods, odtAlias, odsAlias)
 | 
			
		||||
 | 
			
		||||
  def unapply(mt: MimeType): Option[MimeType] =
 | 
			
		||||
    Some(mt).filter(all.contains)
 | 
			
		||||
    Some(mt).map(_.baseType).filter(all.contains)
 | 
			
		||||
 | 
			
		||||
  object ContainerMatch {
 | 
			
		||||
    def unapply(mt: MimeType): Option[MimeType] =
 | 
			
		||||
      Some(mt).filter(_.matches(container))
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -14,6 +14,6 @@ object PoiType {
 | 
			
		||||
  val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
 | 
			
		||||
 | 
			
		||||
  def unapply(arg: MimeType): Option[MimeType] =
 | 
			
		||||
    Some(arg).filter(all.contains)
 | 
			
		||||
    Some(arg).map(_.baseType).filter(all.contains)
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,11 @@
 | 
			
		||||
package org.apache.tika.parser.txt;
 | 
			
		||||
 | 
			
		||||
import java.io.InputStream;
 | 
			
		||||
import java.io.IOException;
 | 
			
		||||
 | 
			
		||||
public final class IOUtils {
 | 
			
		||||
 | 
			
		||||
    public static long readFully(InputStream in, byte[] buffer) throws IOException {
 | 
			
		||||
        return in.read(buffer, 0, buffer.length);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -0,0 +1,75 @@
 | 
			
		||||
/**
 | 
			
		||||
 * Licensed to the Apache Software Foundation (ASF) under one or more
 | 
			
		||||
 * contributor license agreements.  See the NOTICE file distributed with
 | 
			
		||||
 * this work for additional information regarding copyright ownership.
 | 
			
		||||
 * The ASF licenses this file to You under the Apache License, Version 2.0
 | 
			
		||||
 * (the "License"); you may not use this file except in compliance with
 | 
			
		||||
 * the License.  You may obtain a copy of the License at
 | 
			
		||||
 * <p/>
 | 
			
		||||
 * http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
 * <p/>
 | 
			
		||||
 * Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
 * distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
 * See the License for the specific language governing permissions and
 | 
			
		||||
 * limitations under the License.
 | 
			
		||||
 */
 | 
			
		||||
package org.apache.tika.parser.txt;
 | 
			
		||||
 | 
			
		||||
import com.ibm.icu.text.CharsetDetector;
 | 
			
		||||
import com.ibm.icu.text.CharsetMatch;
 | 
			
		||||
import org.apache.tika.detect.EncodingDetector;
 | 
			
		||||
import org.apache.tika.metadata.Metadata;
 | 
			
		||||
import org.apache.tika.mime.MediaType;
 | 
			
		||||
import org.apache.tika.utils.CharsetUtils;
 | 
			
		||||
 | 
			
		||||
import java.io.IOException;
 | 
			
		||||
import java.io.InputStream;
 | 
			
		||||
import java.nio.charset.Charset;
 | 
			
		||||
 | 
			
		||||
public class Icu4jEncodingDetector implements EncodingDetector {
 | 
			
		||||
 | 
			
		||||
    public Charset detect(InputStream input, Metadata metadata)
 | 
			
		||||
            throws IOException {
 | 
			
		||||
        if (input == null) {
 | 
			
		||||
            return null;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        CharsetDetector detector = new CharsetDetector();
 | 
			
		||||
 | 
			
		||||
        String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
 | 
			
		||||
        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
 | 
			
		||||
        if (incomingCharset == null && incomingType != null) {
 | 
			
		||||
            // TIKA-341: Use charset in content-type
 | 
			
		||||
            MediaType mt = MediaType.parse(incomingType);
 | 
			
		||||
            if (mt != null) {
 | 
			
		||||
                incomingCharset = mt.getParameters().get("charset");
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (incomingCharset != null) {
 | 
			
		||||
            String cleaned = CharsetUtils.clean(incomingCharset);
 | 
			
		||||
            if (cleaned != null) {
 | 
			
		||||
                detector.setDeclaredEncoding(cleaned);
 | 
			
		||||
            } else {
 | 
			
		||||
                // TODO: log a warning?
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // TIKA-341 without enabling input filtering (stripping of tags)
 | 
			
		||||
        // short HTML tests don't work well
 | 
			
		||||
        detector.enableInputFilter(true);
 | 
			
		||||
 | 
			
		||||
        detector.setText(input);
 | 
			
		||||
 | 
			
		||||
        for (CharsetMatch match : detector.detectAll()) {
 | 
			
		||||
            try {
 | 
			
		||||
                return CharsetUtils.forName(match.getName());
 | 
			
		||||
            } catch (Exception e) {
 | 
			
		||||
                // ignore
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        return null;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -2,27 +2,32 @@ package docspell.files
 | 
			
		||||
 | 
			
		||||
import java.io.BufferedInputStream
 | 
			
		||||
import java.nio.file.{Files, Path}
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
 | 
			
		||||
import scala.jdk.CollectionConverters._
 | 
			
		||||
import scala.util.Using
 | 
			
		||||
import cats.implicits._
 | 
			
		||||
import cats.effect.Sync
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import fs2.Stream
 | 
			
		||||
import org.apache.tika.config.TikaConfig
 | 
			
		||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
 | 
			
		||||
import org.apache.tika.mime.MediaType
 | 
			
		||||
 | 
			
		||||
import scala.util.Using
 | 
			
		||||
import org.apache.tika.parser.txt.Icu4jEncodingDetector
 | 
			
		||||
import docspell.common._
 | 
			
		||||
 | 
			
		||||
object TikaMimetype {
 | 
			
		||||
  private val tika = new TikaConfig().getDetector
 | 
			
		||||
 | 
			
		||||
  private def convert(mt: MediaType): MimeType =
 | 
			
		||||
    Option(mt)
 | 
			
		||||
      .map(_.toString)
 | 
			
		||||
      .map(MimeType.parse)
 | 
			
		||||
      .flatMap(_.toOption)
 | 
			
		||||
      .map(normalize)
 | 
			
		||||
      .getOrElse(MimeType.octetStream)
 | 
			
		||||
    Option(mt) match {
 | 
			
		||||
      case Some(_) =>
 | 
			
		||||
        val params  = mt.getParameters.asScala.toMap
 | 
			
		||||
        val primary = mt.getType
 | 
			
		||||
        val sub     = mt.getSubtype
 | 
			
		||||
        normalize(MimeType(primary, sub, params))
 | 
			
		||||
      case None =>
 | 
			
		||||
        MimeType.octetStream
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  private def makeMetadata(hint: MimeTypeHint): Metadata = {
 | 
			
		||||
    val md = new Metadata
 | 
			
		||||
@@ -32,21 +37,55 @@ object TikaMimetype {
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def normalize(in: MimeType): MimeType = in match {
 | 
			
		||||
    case MimeType(_, sub) if sub contains "xhtml" =>
 | 
			
		||||
      MimeType.html
 | 
			
		||||
    case MimeType(_, sub, p) if sub contains "xhtml" =>
 | 
			
		||||
      MimeType.html.copy(params = p)
 | 
			
		||||
    case _ => in
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
 | 
			
		||||
    convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
 | 
			
		||||
  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
 | 
			
		||||
    val mt = convert(
 | 
			
		||||
      tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))
 | 
			
		||||
    )
 | 
			
		||||
    if (mt.primary == "text") {
 | 
			
		||||
      charsetFromBytes(bv, hint) match {
 | 
			
		||||
        case Some(cs) =>
 | 
			
		||||
          mt.withCharset(cs)
 | 
			
		||||
        case None =>
 | 
			
		||||
          mt
 | 
			
		||||
      }
 | 
			
		||||
    } else mt
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def charsetFromBytes(bv: Array[Byte], hint: MimeTypeHint): Option[Charset] =
 | 
			
		||||
    Either
 | 
			
		||||
      .catchNonFatal {
 | 
			
		||||
        val cd = new Icu4jEncodingDetector()
 | 
			
		||||
        val md = makeMetadata(hint)
 | 
			
		||||
        Option(cd.detect(new java.io.ByteArrayInputStream(bv), md))
 | 
			
		||||
      }
 | 
			
		||||
      .toOption
 | 
			
		||||
      .flatten
 | 
			
		||||
 | 
			
		||||
  def detectCharset[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint) =
 | 
			
		||||
    data.take(8000).compile.toVector.map(bytes => charsetFromBytes(bytes.toArray, hint))
 | 
			
		||||
 | 
			
		||||
  def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
 | 
			
		||||
    data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
 | 
			
		||||
 | 
			
		||||
  def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
 | 
			
		||||
    dt match {
 | 
			
		||||
      case DataType.Exact(mt)  => mt.pure[F]
 | 
			
		||||
      case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
 | 
			
		||||
      case DataType.Exact(mt) =>
 | 
			
		||||
        mt.resolveCharset match {
 | 
			
		||||
          case None if mt.primary == "text" =>
 | 
			
		||||
            detectCharset[F](data, MimeTypeHint.advertised(mt))
 | 
			
		||||
              .map {
 | 
			
		||||
                case Some(cs) => mt.withCharset(cs)
 | 
			
		||||
                case None     => mt
 | 
			
		||||
              }
 | 
			
		||||
          case _ => mt.pure[F]
 | 
			
		||||
        }
 | 
			
		||||
      case DataType.Hint(hint) =>
 | 
			
		||||
        TikaMimetype.detect(data, hint)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  def detect[F[_]: Sync](file: Path): F[MimeType] =
 | 
			
		||||
 
 | 
			
		||||
@@ -231,7 +231,9 @@ docspell.joex {
 | 
			
		||||
          "-s",
 | 
			
		||||
          "A4",
 | 
			
		||||
          "--encoding",
 | 
			
		||||
          "UTF-8",
 | 
			
		||||
          "{{encoding}}",
 | 
			
		||||
          "--load-error-handling", "ignore",
 | 
			
		||||
          "--load-media-error-handling", "ignore",
 | 
			
		||||
          "-",
 | 
			
		||||
          "{{outfile}}"
 | 
			
		||||
        ]
 | 
			
		||||
 
 | 
			
		||||
@@ -8,6 +8,7 @@ import emil.javamail.syntax._
 | 
			
		||||
import cats.Applicative
 | 
			
		||||
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
 | 
			
		||||
object ReadMail {
 | 
			
		||||
 | 
			
		||||
@@ -20,7 +21,7 @@ object ReadMail {
 | 
			
		||||
        bytesToMail(s).flatMap(mailToEntries[F](logger))
 | 
			
		||||
 | 
			
		||||
  def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
 | 
			
		||||
    data.through(fs2.text.utf8Decode).foldMonoid.evalMap(read[F])
 | 
			
		||||
    data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
 | 
			
		||||
 | 
			
		||||
  def mailToEntries[F[_]: Applicative](
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
@@ -49,7 +50,7 @@ object ReadMail {
 | 
			
		||||
 | 
			
		||||
  implicit class MimeTypeConv(m: emil.MimeType) {
 | 
			
		||||
    def toDocspell: MimeType =
 | 
			
		||||
      MimeType(m.primary, m.sub)
 | 
			
		||||
      MimeType(m.primary, m.sub, m.params)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def bodyType[F[_]](body: MailBody[F]): String =
 | 
			
		||||
 
 | 
			
		||||
@@ -57,7 +57,7 @@ object ConvertPdf {
 | 
			
		||||
  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
 | 
			
		||||
    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
 | 
			
		||||
      mime match {
 | 
			
		||||
        case Mimetype.`application/pdf` =>
 | 
			
		||||
        case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
 | 
			
		||||
          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
 | 
			
		||||
            (ra, None: Option[RAttachmentMeta]).pure[F]
 | 
			
		||||
 | 
			
		||||
@@ -66,9 +66,10 @@ object ConvertPdf {
 | 
			
		||||
            .get(ra.fileId.id)
 | 
			
		||||
            .unNoneTerminate
 | 
			
		||||
            .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
 | 
			
		||||
          val mt      = MimeType(mime.primary, mime.sub, mime.params)
 | 
			
		||||
          val handler = conversionHandler[F](ctx, cfg, ra, item)
 | 
			
		||||
          ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
 | 
			
		||||
            conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
 | 
			
		||||
            conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
 | 
			
		||||
              data
 | 
			
		||||
            )
 | 
			
		||||
      }
 | 
			
		||||
@@ -104,7 +105,8 @@ object ConvertPdf {
 | 
			
		||||
          (ra, None: Option[RAttachmentMeta]).pure[F]
 | 
			
		||||
 | 
			
		||||
      case ConversionResult.Failure(ex) =>
 | 
			
		||||
        ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
 | 
			
		||||
        ctx.logger
 | 
			
		||||
          .error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
 | 
			
		||||
          (ra, None: Option[RAttachmentMeta]).pure[F]
 | 
			
		||||
    })
 | 
			
		||||
 | 
			
		||||
@@ -114,7 +116,8 @@ object ConvertPdf {
 | 
			
		||||
      ra: RAttachment,
 | 
			
		||||
      pdf: Stream[F, Byte]
 | 
			
		||||
  ) = {
 | 
			
		||||
    val hint    = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
 | 
			
		||||
    val hint =
 | 
			
		||||
      MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
 | 
			
		||||
    val newName = ra.name.map(n => s"$n.pdf")
 | 
			
		||||
    ctx.store.bitpeace
 | 
			
		||||
      .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
 | 
			
		||||
@@ -122,7 +125,9 @@ object ConvertPdf {
 | 
			
		||||
      .lastOrError
 | 
			
		||||
      .map(fm => Ident.unsafe(fm.id))
 | 
			
		||||
      .flatMap(fmId =>
 | 
			
		||||
        ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
 | 
			
		||||
        ctx.store
 | 
			
		||||
          .transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName))
 | 
			
		||||
          .map(_ => fmId)
 | 
			
		||||
      )
 | 
			
		||||
      .map(fmId => ra.copy(fileId = fmId, name = newName))
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -70,7 +70,7 @@ object ExtractArchive {
 | 
			
		||||
      archive: Option[RAttachmentArchive]
 | 
			
		||||
  )(ra: RAttachment, mime: Mimetype): F[Extracted] =
 | 
			
		||||
    mime match {
 | 
			
		||||
      case Mimetype.`application/zip` if ra.name.exists(_.endsWith(".zip")) =>
 | 
			
		||||
      case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
 | 
			
		||||
        ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
 | 
			
		||||
          extractZip(ctx, archive)(ra)
 | 
			
		||||
            .flatTap(_ => cleanupParents(ctx, ra, archive))
 | 
			
		||||
 
 | 
			
		||||
@@ -76,7 +76,7 @@ object TextExtraction {
 | 
			
		||||
        .getOrElse(Mimetype.`application/octet-stream`)
 | 
			
		||||
 | 
			
		||||
    findMime
 | 
			
		||||
      .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
 | 
			
		||||
      .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def extractTextFallback[F[_]: Sync: ContextShift](
 | 
			
		||||
 
 | 
			
		||||
@@ -448,7 +448,7 @@ trait Conversions {
 | 
			
		||||
  // MIME Type
 | 
			
		||||
 | 
			
		||||
  def fromContentType(header: `Content-Type`): MimeType =
 | 
			
		||||
    MimeType(header.mediaType.mainType, header.mediaType.subType)
 | 
			
		||||
    MimeType(header.mediaType.mainType, header.mediaType.subType, header.mediaType.extensions)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
object Conversions extends Conversions {
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,7 @@ object Dependencies {
 | 
			
		||||
  val Fs2Version = "2.3.0"
 | 
			
		||||
  val H2Version = "1.4.200"
 | 
			
		||||
  val Http4sVersion = "0.21.1"
 | 
			
		||||
  val Icu4jVersion = "66.1"
 | 
			
		||||
  val KindProjectorVersion = "0.10.3"
 | 
			
		||||
  val Log4sVersion = "1.8.2"
 | 
			
		||||
  val LogbackVersion = "1.2.3"
 | 
			
		||||
@@ -218,4 +219,7 @@ object Dependencies {
 | 
			
		||||
    "org.webjars" % "viewerjs" % ViewerJSVersion
 | 
			
		||||
  )
 | 
			
		||||
 | 
			
		||||
  val icu4j = Seq(
 | 
			
		||||
    "com.ibm.icu" % "icu4j" % Icu4jVersion
 | 
			
		||||
  )
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user