Simplify MimeType class and parse mimetypes in a more lenient way

This commit is contained in:
eikek 2021-09-23 14:09:39 +02:00
parent 617f931a14
commit 1761526e20
7 changed files with 321 additions and 64 deletions
modules
common/src
main/scala/docspell/common
test/scala/docspell/common
files/src
main/scala/docspell/files
test/scala/docspell/files
restserver/src/main/scala/docspell/restserver/conv
store/src/main/scala/docspell/store/syntax

@ -9,6 +9,8 @@ package docspell.common
import java.nio.charset.Charset
import java.nio.charset.StandardCharsets
import scala.util.Try
import cats.data.NonEmptyList
import docspell.common.syntax.all._
@ -16,33 +18,31 @@ import docspell.common.syntax.all._
import io.circe.{Decoder, Encoder}
/** A MIME Type impl with just enough features for the use here. */
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
def withParam(name: String, value: String): MimeType =
copy(params = params.updated(name, value))
case class MimeType(primary: String, sub: String, charset: Option[Charset]) {
def withCharset(cs: Charset): MimeType =
withParam("charset", cs.name())
copy(charset = Some(cs))
def withUtf8Charset: MimeType =
withCharset(StandardCharsets.UTF_8)
def resolveCharset: Option[Charset] =
params.get("charset").flatMap { cs =>
if (Charset.isSupported(cs)) Some(Charset.forName(cs))
else None
}
/** Returns a copy carrying the charset named `csName`.
  *
  * If the name is malformed or no such charset can be looked up, this
  * instance is returned unchanged.
  */
def withCharsetName(csName: String): MimeType =
  Try(Charset.forName(csName)).toOption
    .map(withCharset)
    .getOrElse(this)
def charsetOrUtf8: Charset =
resolveCharset.getOrElse(StandardCharsets.UTF_8)
charset.getOrElse(StandardCharsets.UTF_8)
def baseType: MimeType =
if (params.isEmpty) this else copy(params = Map.empty)
if (charset.isEmpty) this else copy(charset = None)
def asString: String =
if (params.isEmpty) s"$primary/$sub"
else {
val parameters = params.toList.map(t => s"""${t._1}="${t._2}"""").mkString(";")
s"$primary/$sub; $parameters"
charset match {
case Some(cs) =>
s"$primary/$sub; charset=\"${cs.name()}\""
case None =>
s"$primary/$sub"
}
def matches(other: MimeType): Boolean =
@ -53,46 +53,16 @@ case class MimeType(primary: String, sub: String, params: Map[String, String]) {
object MimeType {
def application(sub: String): MimeType =
MimeType("application", sub, Map.empty)
MimeType("application", sub, None)
def text(sub: String): MimeType =
MimeType("text", sub, Map.empty)
MimeType("text", sub, None)
def image(sub: String): MimeType =
MimeType("image", sub, Map.empty)
MimeType("image", sub, None)
def parse(str: String): Either[String, MimeType] = {
def parsePrimary: Either[String, (String, String)] =
str.indexOf('/') match {
case -1 => Left(s"Invalid mediatype: $str")
case n => Right(str.take(n) -> str.drop(n + 1))
}
def parseSub(s: String): Either[String, (String, String)] =
s.indexOf(';') match {
case -1 => Right((s, ""))
case n => Right((s.take(n), s.drop(n)))
}
def parseParams(s: String): Map[String, String] =
s.split(';')
.map(_.trim)
.filter(_.nonEmpty)
.toList
.flatMap(p =>
p.split("=", 2).toList match {
case a :: b :: Nil => Some((a, b))
case _ => None
}
)
.toMap
for {
pt <- parsePrimary
st <- parseSub(pt._2)
pa = parseParams(st._2)
} yield MimeType(pt._1, st._1, pa)
}
def parse(str: String): Either[String, MimeType] =
Parser.parse(str)
def unsafe(str: String): MimeType =
parse(str).throwLeft
@ -105,8 +75,9 @@ object MimeType {
val tiff = image("tiff")
val html = text("html")
val plain = text("plain")
val json = application("json")
val emls = NonEmptyList.of(
MimeType("message", "rfc822", Map.empty),
MimeType("message", "rfc822", None),
application("mbox")
)
@ -158,4 +129,88 @@ object MimeType {
implicit val jsonDecoder: Decoder[MimeType] =
Decoder.decodeString.emap(parse)
/** A small, lenient parser for mime type strings such as
  * `text/html; charset="UTF-8"`.
  *
  * Primary and sub type are validated and lower-cased. Of all parameters
  * only `charset` is evaluated; every other parameter is dropped. A
  * charset value that is malformed or cannot be looked up yields `None`
  * instead of an error.
  */
private object Parser {

  /** Parses `s` into a mime type or returns an error message. */
  def parse(s: String): Either[String, MimeType] =
    mimeType(s).map(_._1)

  /** Outcome of one parse step: an error message, or the parsed value
    * together with the remaining input.
    */
  type Result[A] = Either[String, (A, String)]
  type P[A] = String => Result[A]

  private[this] val tokenExtraChars = "+-$%*._~".toSet

  /** Runs `pa`, feeds its remaining input into `pb` and combines both
    * values with `f`. The first failure is returned as is.
    */
  private def seq[A, B, C](pa: P[A], pb: P[B])(f: (A, B) => C): P[C] =
    in =>
      pa(in).flatMap { case (a, restA) =>
        pb(restA).map { case (b, restB) => (f(a, b), restB) }
      }

  /** Splits the input at the first char that does not satisfy `p`. That
    * char (the delimiter) is dropped and both parts are trimmed. This
    * step never fails.
    */
  private def takeWhile(p: Char => Boolean): P[String] =
    in => {
      val (prefix, suffix) = in.span(p)
      Right((prefix.trim, suffix.drop(1).trim))
    }

  /** Runs `p` and turns a successful value that fails `test` into `err`. */
  private def check[A](p: P[A], test: A => Boolean, err: => String): P[A] =
    in => p(in).filterOrElse(r => test(r._1), err)

  //https://datatracker.ietf.org/doc/html/rfc7230#section-3.2.6
  // Lenient approximation of a token: letters, digits and a few extra chars.
  private def isToken(s: String): Boolean =
    s.nonEmpty && s.forall(c => c.isLetterOrDigit || tokenExtraChars.contains(c))

  private val baseType: P[MimeType] = {
    val primary = check(
      takeWhile(_ != '/'),
      isToken,
      "Primary type must be non-empty and contain valid characters"
    )
    val sub = check(
      takeWhile(_ != ';'),
      isToken,
      "Subtype must be non-empty and contain valid characters"
    )
    // Locale.ROOT keeps lower-casing independent of the JVM default locale
    // (a Turkish default would otherwise map 'I' to a dotless 'ı').
    seq(primary, sub)((p, s) =>
      MimeType(
        p.toLowerCase(java.util.Locale.ROOT),
        s.toLowerCase(java.util.Locale.ROOT),
        None
      )
    )
  }

  //https://datatracker.ietf.org/doc/html/rfc2046#section-4.1.2
  /** Looks for the first parameter named `charset` (case-insensitive) in a
    * `;` separated parameter list. The value is unquoted and trimmed.
    *
    * The lookup is wrapped in `Try` because `Charset.forName` throws for
    * empty, illegal or unsupported names; all of these result in `None`.
    */
  private val charset: P[Option[Charset]] = in => {
    val csName = in
      .split(';')
      .iterator
      .map(_.split("=", 2))
      .collectFirst {
        case Array(name, value) if name.trim.equalsIgnoreCase("charset") =>
          unquote(value.trim).trim
      }

    csName match {
      case None       => Right((None, in))
      case Some(name) => Right((Try(Charset.forName(name)).toOption, ""))
    }
  }

  private val mimeType: P[MimeType] =
    seq(baseType, charset)((bt, cs) => bt.copy(charset = cs))

  /** Strips matching pairs of surrounding double quotes, repeatedly. */
  @scala.annotation.tailrec
  private def unquote(s: String): String = {
    val len = s.length
    if (len >= 2 && s.charAt(0) == '"' && s.charAt(len - 1) == '"')
      unquote(s.substring(1, len - 1))
    else s
  }
}
}

@ -10,10 +10,7 @@ trait EitherSyntax {
implicit final class LeftStringEitherOps[A](e: Either[String, A]) {
def throwLeft: A =
e match {
case Right(a) => a
case Left(err) => sys.error(err)
}
e.fold(sys.error, identity)
}
implicit final class ThrowableLeftEitherOps[A](e: Either[Throwable, A]) {

@ -0,0 +1,128 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.common
import java.nio.charset.{Charset, StandardCharsets}
import scala.jdk.CollectionConverters._
import munit.ScalaCheckSuite
import org.scalacheck.Gen
import org.scalacheck.Prop.forAll
/** Tests rendering and lenient parsing of [[MimeType]] values. */
class MimeTypeTest extends ScalaCheckSuite {

  // Parses `input` via `MimeType.unsafe` and compares it to `expected`.
  // The implicit location keeps failures pointing at the calling line.
  private def assertParsed(input: String, expected: MimeType)(implicit
      loc: munit.Location
  ): Unit =
    assertEquals(MimeType.unsafe(input), expected)

  test("asString") {
    val latin1Html = MimeType.html.withCharset(StandardCharsets.ISO_8859_1)
    val utf8Html = MimeType.html.withUtf8Charset

    assertEquals(MimeType.html.asString, "text/html")
    assertEquals(latin1Html.asString, "text/html; charset=\"ISO-8859-1\"")
    assertEquals(utf8Html.asString, "text/html; charset=\"UTF-8\"")
  }

  test("parse without params") {
    assertParsed("application/pdf", MimeType.pdf)
    assertParsed("image/jpeg", MimeType.jpeg)
    assertParsed("image/jpeg ", MimeType.jpeg)
    assertParsed(" image/jpeg ", MimeType.jpeg)
    assertParsed(" image / jpeg ", MimeType.jpeg)
    assertParsed("application/xml+html", MimeType.application("xml+html"))
    // the sub type is expected in lower case after parsing
    assertParsed(
      "application/vnd.openxmlformats-officedocument.presentationml.viewProps+xml",
      MimeType.application(
        "vnd.openxmlformats-officedocument.presentationml.viewprops+xml"
      )
    )
    assertParsed(
      "application/vnd.powerbuilder75-s",
      MimeType.application("vnd.powerbuilder75-s")
    )
  }

  test("parse with charset") {
    val utf8Plain = MimeType.plain.withUtf8Charset

    assertParsed("text/plain; charset=UTF-8", utf8Plain)
    assertParsed("text/plain; CHARSET=UTF-8", utf8Plain)
    assertParsed("text/plain; CharSet=UTF-8", utf8Plain)
    assertParsed(
      "text/html; charset=\"ISO-8859-1\"",
      MimeType.html.withCharset(StandardCharsets.ISO_8859_1)
    )
  }

  test("parse with charset and more params") {
    val utf8Plain = MimeType.plain.withUtf8Charset

    assertParsed("text/plain; charset=UTF-8; action=test", utf8Plain)
    assertParsed("text/plain; run=\"2\"; charset=UTF-8; action=test", utf8Plain)
  }

  test("parse without charset but params") {
    assertParsed("image/jpeg; action=urn:2", MimeType.jpeg)
  }

  test("parse some stranger values") {
    val latin1Plain = MimeType.plain.withCharset(StandardCharsets.ISO_8859_1)

    assertParsed("text/plain; charset=\"\"ISO-8859-1\"\"", latin1Plain)
    assertParsed("text/plain; charset=\"\" ISO-8859-1 \"\"", latin1Plain)
  }

  test("parse invalid mime types") {
    List("", "_ _/plain", "/", "()").foreach { input =>
      assert(MimeType.parse(input).isLeft)
    }
  }

  // Round trip: whatever `asString` renders must parse back to the same value.
  property("read own asString") {
    forAll(MimeTypeTest.mimeType) { (generated: MimeType) =>
      assertEquals(MimeType.unsafe(generated.asString), generated)
    }
  }
}
/** Generators shared by the [[MimeType]] tests. */
object MimeTypeTest {

  /** Base types (without charset) the generator chooses from. */
  val someTypes: List[MimeType] =
    List(
      MimeType.plain,
      MimeType.html
    ) ++ MimeType.emls.toList

  // All charsets known to the running JVM.
  private val charsets: List[Charset] =
    Charset.availableCharsets().values().asScala.toList

  /** Generates one of `someTypes`, optionally combined with one of the
    * available charsets.
    *
    * `Gen.oneOf` / `Gen.option` pick a single element directly instead of
    * building a random subset and taking its head, so the "no charset" case
    * is generated regularly rather than only for an empty subset.
    */
  val mimeType: Gen[MimeType] =
    for {
      base <- Gen.oneOf(someTypes)
      cs <- Gen.option(Gen.oneOf(charsets))
    } yield base.copy(charset = cs)
}

@ -31,10 +31,10 @@ object TikaMimetype {
private def convert(mt: MediaType): MimeType =
Option(mt) match {
case Some(_) =>
val params = mt.getParameters.asScala.toMap
val cs = mt.getParameters.asScala.toMap.get("charset").getOrElse("unknown")
val primary = mt.getType
val sub = mt.getSubtype
normalize(MimeType(primary, sub, params))
normalize(MimeType(primary, sub, None).withCharsetName(cs))
case None =>
MimeType.octetStream
}
@ -48,8 +48,8 @@ object TikaMimetype {
private def normalize(in: MimeType): MimeType =
in match {
case MimeType(_, sub, p) if sub contains "xhtml" =>
MimeType.html.copy(params = p)
case MimeType(_, sub, cs) if sub contains "xhtml" =>
MimeType.html.copy(charset = cs)
case _ => in
}
@ -86,7 +86,7 @@ object TikaMimetype {
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
dt match {
case DataType.Exact(mt) =>
mt.resolveCharset match {
mt.charset match {
case None if mt.primary == "text" =>
detectCharset[F](data, MimeTypeHint.advertised(mt))
.map {

@ -0,0 +1,72 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.files
import docspell.common.{MimeType, MimeTypeHint}
import munit.FunSuite
import scodec.bits.ByteVector
/** Tests mime type detection via `TikaMimetype.detect` from content bytes
  * combined with different hints.
  */
class TikaMimetypeTest extends FunSuite {

  // Thin wrapper so every test goes through the same detection entry point.
  private def detect(bv: ByteVector, hint: MimeTypeHint): MimeType =
    TikaMimetype.detect(bv, hint)

  test("detect text/plain") {
    val mt = detect(ByteVector.view("hello world".getBytes), MimeTypeHint.none)
    // only the base type is compared here; the charset is not checked
    assertEquals(mt.baseType, MimeType.plain)
  }

  test("detect image/jpeg") {
    val mt = detect(
      ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="),
      MimeTypeHint.none
    )
    assertEquals(mt, MimeType.jpeg)
  }

  test("detect image/png") {
    val mt = detect(
      ByteVector.fromValidBase64("iVBORw0KGgoAAAANSUhEUgAAA2I="),
      MimeTypeHint.none
    )
    assertEquals(mt, MimeType.png)
  }

  // The two JSON tests previously shared one name, which made them
  // indistinguishable in test reports; each now names the hint it uses.
  test("detect application/json with filename hint") {
    val mt =
      detect(
        ByteVector.view("""{"name":"me"}""".getBytes),
        MimeTypeHint.filename("me.json")
      )
    assertEquals(mt, MimeType.json)
  }

  test("detect application/json with advertised hint") {
    val mt = detect(
      ByteVector.view("""{"name":"me"}""".getBytes),
      MimeTypeHint.advertised("application/json")
    )
    assertEquals(mt, MimeType.json)
  }

  test("detect image/jpeg wrong advertised") {
    val mt = detect(
      ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="),
      MimeTypeHint.advertised("image/png")
    )
    assertEquals(mt, MimeType.jpeg)
  }

  test("just filename") {
    assertEquals(
      detect(ByteVector.empty, MimeTypeHint.filename("doc.pdf")),
      MimeType.pdf
    )
  }
}

@ -730,8 +730,8 @@ trait Conversions {
MimeType(
header.mediaType.mainType,
header.mediaType.subType,
header.mediaType.extensions
)
None
).withCharsetName(header.mediaType.extensions.get("charset").getOrElse("unknown"))
}
object Conversions extends Conversions {

@ -12,11 +12,16 @@ object MimeTypes {
implicit final class EmilMimeTypeOps(emt: emil.MimeType) {
def toLocal: MimeType =
MimeType(emt.primary, emt.sub, emt.params)
MimeType(emt.primary, emt.sub, None)
.withCharsetName(emt.params.get("charset").getOrElse("unknown"))
}
implicit final class DocspellMimeTypeOps(mt: MimeType) {
def toEmil: emil.MimeType =
emil.MimeType(mt.primary, mt.sub, mt.params)
emil.MimeType(
mt.primary,
mt.sub,
mt.charset.map(cs => Map("charset" -> cs.name())).getOrElse(Map.empty)
)
}
}