From 1761526e20bc97c5628ec728e575e971cf916ff6 Mon Sep 17 00:00:00 2001 From: eikek Date: Thu, 23 Sep 2021 14:09:39 +0200 Subject: [PATCH] Simplify MimeType class and parse mimetypes in a more lenient way --- .../main/scala/docspell/common/MimeType.scala | 157 ++++++++++++------ .../docspell/common/syntax/EitherSyntax.scala | 5 +- .../scala/docspell/common/MimeTypeTest.scala | 128 ++++++++++++++ .../scala/docspell/files/TikaMimetype.scala | 10 +- .../docspell/files/TikaMimetypeTest.scala | 72 ++++++++ .../restserver/conv/Conversions.scala | 4 +- .../docspell/store/syntax/MimeTypes.scala | 9 +- 7 files changed, 321 insertions(+), 64 deletions(-) create mode 100644 modules/common/src/test/scala/docspell/common/MimeTypeTest.scala create mode 100644 modules/files/src/test/scala/docspell/files/TikaMimetypeTest.scala diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index c8d76611..61fc7494 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -9,6 +9,8 @@ package docspell.common import java.nio.charset.Charset import java.nio.charset.StandardCharsets +import scala.util.Try + import cats.data.NonEmptyList import docspell.common.syntax.all._ @@ -16,33 +18,31 @@ import docspell.common.syntax.all._ import io.circe.{Decoder, Encoder} /** A MIME Type impl with just enough features for the use here. */ -case class MimeType(primary: String, sub: String, params: Map[String, String]) { - def withParam(name: String, value: String): MimeType = - copy(params = params.updated(name, value)) +case class MimeType(primary: String, sub: String, charset: Option[Charset]) { def withCharset(cs: Charset): MimeType = - withParam("charset", cs.name()) + copy(charset = Some(cs)) def withUtf8Charset: MimeType = withCharset(StandardCharsets.UTF_8) - def resolveCharset: Option[Charset] = - params.get("charset").flatMap { cs => - if (Charset.isSupported(cs)) Some(Charset.forName(cs)) - else None - } + def withCharsetName(csName: String): MimeType = + if (Try(Charset.isSupported(csName)).getOrElse(false)) + withCharset(Charset.forName(csName)) + else this def charsetOrUtf8: Charset = - resolveCharset.getOrElse(StandardCharsets.UTF_8) + charset.getOrElse(StandardCharsets.UTF_8) def baseType: MimeType = - if (params.isEmpty) this else copy(params = Map.empty) + if (charset.isEmpty) this else copy(charset = None) def asString: String = - if (params.isEmpty) s"$primary/$sub" - else { - val parameters = params.toList.map(t => s"""${t._1}="${t._2}"""").mkString(";") - s"$primary/$sub; $parameters" + charset match { + case Some(cs) => + s"$primary/$sub; charset=\"${cs.name()}\"" + case None => + s"$primary/$sub" } def matches(other: MimeType): Boolean = @@ -53,46 +53,16 @@ case class MimeType(primary: String, sub: String, params: Map[String, String]) { object MimeType { def application(sub: String): MimeType = - MimeType("application", sub, Map.empty) + MimeType("application", sub, None) def text(sub: String): MimeType = - MimeType("text", sub, Map.empty) + MimeType("text", sub, None) def image(sub: String): MimeType = - MimeType("image", sub, Map.empty) + MimeType("image", sub, None) - def parse(str: String): Either[String, MimeType] = { - def parsePrimary: Either[String, (String, String)] = - str.indexOf('/') match { - case -1 => Left(s"Invalid mediatype: $str") - case n => Right(str.take(n) -> str.drop(n + 1)) - } - - def parseSub(s: String): Either[String, (String, String)] = - s.indexOf(';') match { - case -1 => Right((s, "")) - case n => Right((s.take(n), s.drop(n))) - } - - def parseParams(s: String): Map[String, String] = - s.split(';') - .map(_.trim) - .filter(_.nonEmpty) - .toList - .flatMap(p => - p.split("=", 2).toList match { - case a :: b :: Nil => Some((a, b)) - case _ => None - } - ) - .toMap - - for { - pt <- parsePrimary - st <- parseSub(pt._2) - pa = parseParams(st._2) - } yield MimeType(pt._1, st._1, pa) - } + def parse(str: String): Either[String, MimeType] = + Parser.parse(str) def unsafe(str: String): MimeType = parse(str).throwLeft @@ -105,8 +75,9 @@ object MimeType { val tiff = image("tiff") val html = text("html") val plain = text("plain") + val json = application("json") val emls = NonEmptyList.of( - MimeType("message", "rfc822", Map.empty), + MimeType("message", "rfc822", None), application("mbox") ) @@ -158,4 +129,88 @@ object MimeType { implicit val jsonDecoder: Decoder[MimeType] = Decoder.decodeString.emap(parse) + + private object Parser { + def parse(s: String): Either[String, MimeType] = + mimeType(s).map(_._1) + + type Result[A] = Either[String, (A, String)] + type P[A] = String => Result[A] + + private[this] val tokenExtraChars = "+-$%*._~".toSet + + private def seq[A, B, C](pa: P[A], pb: P[B])(f: (A, B) => C): P[C] = + in => + pa(in) match { + case Right((a, resta)) => + pb(resta) match { + case Right((b, restb)) => + Right((f(a, b), restb)) + case left => + left.asInstanceOf[Result[C]] + } + case left => + left.asInstanceOf[Result[C]] + } + + private def takeWhile(p: Char => Boolean): P[String] = + in => { + val (prefix, suffix) = in.span(p) + Right((prefix.trim, suffix.drop(1).trim)) + } + + private def check[A](p: P[A], test: A => Boolean, err: => String): P[A] = + in => + p(in) match { + case r @ Right((a, _)) => + if (test(a)) r else Left(err) + case left => + left + } + + //https://datatracker.ietf.org/doc/html/rfc7230#section-3.2.6 + private def isToken(s: String): Boolean = + s.nonEmpty && s.forall(c => c.isLetterOrDigit || tokenExtraChars.contains(c)) + + private val baseType: P[MimeType] = { + val primary = check( + takeWhile(_ != '/'), + isToken, + "Primary type must be non-empty and contain valid characters" + ) + val sub = check( + takeWhile(_ != ';'), + isToken, + "Subtype must be non-empty and contain valid characters" + ) + seq(primary, sub)((p, s) => MimeType(p.toLowerCase, s.toLowerCase, None)) + } + + //https://datatracker.ietf.org/doc/html/rfc2046#section-4.1.2 + private val charset: P[Option[Charset]] = in => + in.trim.toLowerCase.indexOf("charset=") match { + case -1 => Right((None, in)) + case n => + val csValueStart = in.substring(n + "charset=".length).trim + val csName = csValueStart.indexOf(';') match { + case -1 => unquote(csValueStart).trim + case n => unquote(csValueStart.substring(0, n)).trim + } + if (Charset.isSupported(csName)) Right((Some(Charset.forName(csName)), "")) + else Right((None, "")) + } + + private val mimeType = + seq(baseType, charset)((bt, cs) => bt.copy(charset = cs)) + + private def unquote(s: String): String = { + val len = s.length + if (len == 0 || len == 1) s + else { + if (s.charAt(0) == '"' && s.charAt(len - 1) == '"') + unquote(s.substring(1, len - 1)) + else s + } + } + } } diff --git a/modules/common/src/main/scala/docspell/common/syntax/EitherSyntax.scala b/modules/common/src/main/scala/docspell/common/syntax/EitherSyntax.scala index 0457e232..5415353b 100644 --- a/modules/common/src/main/scala/docspell/common/syntax/EitherSyntax.scala +++ b/modules/common/src/main/scala/docspell/common/syntax/EitherSyntax.scala @@ -10,10 +10,7 @@ trait EitherSyntax { implicit final class LeftStringEitherOps[A](e: Either[String, A]) { def throwLeft: A = - e match { - case Right(a) => a - case Left(err) => sys.error(err) - } + e.fold(sys.error, identity) } implicit final class ThrowableLeftEitherOps[A](e: Either[Throwable, A]) { diff --git a/modules/common/src/test/scala/docspell/common/MimeTypeTest.scala b/modules/common/src/test/scala/docspell/common/MimeTypeTest.scala new file mode 100644 index 00000000..2f13c1dc --- /dev/null +++ b/modules/common/src/test/scala/docspell/common/MimeTypeTest.scala @@ -0,0 +1,128 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.common + +import java.nio.charset.{Charset, StandardCharsets} + +import scala.jdk.CollectionConverters._ + +import munit.ScalaCheckSuite +import org.scalacheck.Gen +import org.scalacheck.Prop.forAll + +class MimeTypeTest extends ScalaCheckSuite { + + test("asString") { + assertEquals(MimeType.html.asString, "text/html") + assertEquals( + MimeType.html.withCharset(StandardCharsets.ISO_8859_1).asString, + "text/html; charset=\"ISO-8859-1\"" + ) + assertEquals( + MimeType.html.withUtf8Charset.asString, + "text/html; charset=\"UTF-8\"" + ) + } + + test("parse without params") { + assertEquals(MimeType.unsafe("application/pdf"), MimeType.pdf) + assertEquals(MimeType.unsafe("image/jpeg"), MimeType.jpeg) + + assertEquals(MimeType.unsafe("image/jpeg "), MimeType.jpeg) + assertEquals(MimeType.unsafe(" image/jpeg "), MimeType.jpeg) + assertEquals(MimeType.unsafe(" image / jpeg "), MimeType.jpeg) + + assertEquals( + MimeType.unsafe("application/xml+html"), + MimeType.application("xml+html") + ) + assertEquals( + MimeType.unsafe( + "application/vnd.openxmlformats-officedocument.presentationml.viewProps+xml" + ), + MimeType.application( + "vnd.openxmlformats-officedocument.presentationml.viewprops+xml" + ) + ) + assertEquals( + MimeType.unsafe("application/vnd.powerbuilder75-s"), + MimeType.application("vnd.powerbuilder75-s") + ) + } + + test("parse with charset") { + assertEquals( + MimeType.unsafe("text/plain; charset=UTF-8"), + MimeType.plain.withUtf8Charset + ) + assertEquals( + MimeType.unsafe("text/plain; CHARSET=UTF-8"), + MimeType.plain.withUtf8Charset + ) + assertEquals( + MimeType.unsafe("text/plain; CharSet=UTF-8"), + MimeType.plain.withUtf8Charset + ) + assertEquals( + MimeType.unsafe("text/html; charset=\"ISO-8859-1\""), + MimeType.html.withCharset(StandardCharsets.ISO_8859_1) + ) + } + + test("parse with charset and more params") { + assertEquals( + MimeType.unsafe("text/plain; charset=UTF-8; action=test"), + MimeType.plain.withUtf8Charset + ) + assertEquals( + MimeType.unsafe("text/plain; run=\"2\"; charset=UTF-8; action=test"), + MimeType.plain.withUtf8Charset + ) + } + + test("parse without charset but params") { + assertEquals(MimeType.unsafe("image/jpeg; action=urn:2"), MimeType.jpeg) + } + + test("parse some stranger values") { + assertEquals( + MimeType.unsafe("text/plain; charset=\"\"ISO-8859-1\"\""), + MimeType.plain.withCharset(StandardCharsets.ISO_8859_1) + ) + assertEquals( + MimeType.unsafe("text/plain; charset=\"\" ISO-8859-1 \"\""), + MimeType.plain.withCharset(StandardCharsets.ISO_8859_1) + ) + } + + test("parse invalid mime types") { + assert(MimeType.parse("").isLeft) + assert(MimeType.parse("_ _/plain").isLeft) + assert(MimeType.parse("/").isLeft) + assert(MimeType.parse("()").isLeft) + } + + property("read own asString") { + forAll(MimeTypeTest.mimeType) { mt: MimeType => + assertEquals(MimeType.unsafe(mt.asString), mt) + } + } +} + +object MimeTypeTest { + val someTypes = List( + MimeType.plain, + MimeType.html + ) ++ MimeType.emls.toList + + val mimeType = + for { + base <- Gen.atLeastOne(someTypes) + cs <- Gen.someOf(Charset.availableCharsets().values().asScala) + } yield base.head.copy(charset = cs.headOption) + +} diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 7c0be3e5..24c5bd54 100644 --- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -31,10 +31,10 @@ object TikaMimetype { private def convert(mt: MediaType): MimeType = Option(mt) match { case Some(_) => - val params = mt.getParameters.asScala.toMap + val cs = mt.getParameters.asScala.toMap.get("charset").getOrElse("unknown") val primary = mt.getType val sub = mt.getSubtype - normalize(MimeType(primary, sub, params)) + normalize(MimeType(primary, sub, None).withCharsetName(cs)) case None => MimeType.octetStream } @@ -48,8 +48,8 @@ object TikaMimetype { private def normalize(in: MimeType): MimeType = in match { - case MimeType(_, sub, p) if sub contains "xhtml" => - MimeType.html.copy(params = p) + case MimeType(_, sub, cs) if sub contains "xhtml" => + MimeType.html.copy(charset = cs) case _ => in } @@ -86,7 +86,7 @@ object TikaMimetype { def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = dt match { case DataType.Exact(mt) => - mt.resolveCharset match { + mt.charset match { case None if mt.primary == "text" => detectCharset[F](data, MimeTypeHint.advertised(mt)) .map { diff --git a/modules/files/src/test/scala/docspell/files/TikaMimetypeTest.scala b/modules/files/src/test/scala/docspell/files/TikaMimetypeTest.scala new file mode 100644 index 00000000..1cc10359 --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/TikaMimetypeTest.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.files + +import docspell.common.{MimeType, MimeTypeHint} + +import munit.FunSuite +import scodec.bits.ByteVector + +class TikaMimetypeTest extends FunSuite { + + private def detect(bv: ByteVector, hint: MimeTypeHint): MimeType = + TikaMimetype.detect(bv, hint) + + test("detect text/plain") { + val mt = detect(ByteVector.view("hello world".getBytes), MimeTypeHint.none) + assertEquals(mt.baseType, MimeType.plain) + } + + test("detect image/jpeg") { + val mt = detect( + ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="), + MimeTypeHint.none + ) + assertEquals(mt, MimeType.jpeg) + } + + test("detect image/png") { + val mt = detect( + ByteVector.fromValidBase64("iVBORw0KGgoAAAANSUhEUgAAA2I="), + MimeTypeHint.none + ) + assertEquals(mt, MimeType.png) + } + + test("detect application/json") { + val mt = + detect( + ByteVector.view("""{"name":"me"}""".getBytes), + MimeTypeHint.filename("me.json") + ) + assertEquals(mt, MimeType.json) + } + + test("detect application/json") { + val mt = detect( + ByteVector.view("""{"name":"me"}""".getBytes), + MimeTypeHint.advertised("application/json") + ) + assertEquals(mt, MimeType.json) + } + + test("detect image/jpeg wrong advertised") { + val mt = detect( + ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="), + MimeTypeHint.advertised("image/png") + ) + assertEquals(mt, MimeType.jpeg) + } + + test("just filename") { + assertEquals( + detect(ByteVector.empty, MimeTypeHint.filename("doc.pdf")), + MimeType.pdf + ) + } + +} diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index a893dd1d..7cc03c6b 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -730,8 +730,8 @@ trait Conversions { MimeType( header.mediaType.mainType, header.mediaType.subType, - header.mediaType.extensions - ) + None + ).withCharsetName(header.mediaType.extensions.get("charset").getOrElse("unknown")) } object Conversions extends Conversions { diff --git a/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala b/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala index 40efd836..fa55fd99 100644 --- a/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala +++ b/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala @@ -12,11 +12,16 @@ object MimeTypes { implicit final class EmilMimeTypeOps(emt: emil.MimeType) { def toLocal: MimeType = - MimeType(emt.primary, emt.sub, emt.params) + MimeType(emt.primary, emt.sub, None) + .withCharsetName(emt.params.get("charset").getOrElse("unknown")) } implicit final class DocspellMimeTypeOps(mt: MimeType) { def toEmil: emil.MimeType = - emil.MimeType(mt.primary, mt.sub, mt.params) + emil.MimeType( + mt.primary, + mt.sub, + mt.charset.map(cs => Map("charset" -> cs.name())).getOrElse(Map.empty) + ) } }