mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
Simplify MimeType class and parse mimetypes in a more lenient way
This commit is contained in:
parent
617f931a14
commit
1761526e20
modules
common/src
main/scala/docspell/common
test/scala/docspell/common
files/src
restserver/src/main/scala/docspell/restserver/conv
store/src/main/scala/docspell/store/syntax
@ -9,6 +9,8 @@ package docspell.common
|
||||
import java.nio.charset.Charset
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
import scala.util.Try
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
|
||||
import docspell.common.syntax.all._
|
||||
@ -16,33 +18,31 @@ import docspell.common.syntax.all._
|
||||
import io.circe.{Decoder, Encoder}
|
||||
|
||||
/** A MIME Type impl with just enough features for the use here. */
|
||||
case class MimeType(primary: String, sub: String, params: Map[String, String]) {
|
||||
def withParam(name: String, value: String): MimeType =
|
||||
copy(params = params.updated(name, value))
|
||||
case class MimeType(primary: String, sub: String, charset: Option[Charset]) {
|
||||
|
||||
def withCharset(cs: Charset): MimeType =
|
||||
withParam("charset", cs.name())
|
||||
copy(charset = Some(cs))
|
||||
|
||||
def withUtf8Charset: MimeType =
|
||||
withCharset(StandardCharsets.UTF_8)
|
||||
|
||||
def resolveCharset: Option[Charset] =
|
||||
params.get("charset").flatMap { cs =>
|
||||
if (Charset.isSupported(cs)) Some(Charset.forName(cs))
|
||||
else None
|
||||
}
|
||||
def withCharsetName(csName: String): MimeType =
|
||||
if (Try(Charset.isSupported(csName)).getOrElse(false))
|
||||
withCharset(Charset.forName(csName))
|
||||
else this
|
||||
|
||||
def charsetOrUtf8: Charset =
|
||||
resolveCharset.getOrElse(StandardCharsets.UTF_8)
|
||||
charset.getOrElse(StandardCharsets.UTF_8)
|
||||
|
||||
def baseType: MimeType =
|
||||
if (params.isEmpty) this else copy(params = Map.empty)
|
||||
if (charset.isEmpty) this else copy(charset = None)
|
||||
|
||||
def asString: String =
|
||||
if (params.isEmpty) s"$primary/$sub"
|
||||
else {
|
||||
val parameters = params.toList.map(t => s"""${t._1}="${t._2}"""").mkString(";")
|
||||
s"$primary/$sub; $parameters"
|
||||
charset match {
|
||||
case Some(cs) =>
|
||||
s"$primary/$sub; charset=\"${cs.name()}\""
|
||||
case None =>
|
||||
s"$primary/$sub"
|
||||
}
|
||||
|
||||
def matches(other: MimeType): Boolean =
|
||||
@ -53,46 +53,16 @@ case class MimeType(primary: String, sub: String, params: Map[String, String]) {
|
||||
object MimeType {
|
||||
|
||||
def application(sub: String): MimeType =
|
||||
MimeType("application", sub, Map.empty)
|
||||
MimeType("application", sub, None)
|
||||
|
||||
def text(sub: String): MimeType =
|
||||
MimeType("text", sub, Map.empty)
|
||||
MimeType("text", sub, None)
|
||||
|
||||
def image(sub: String): MimeType =
|
||||
MimeType("image", sub, Map.empty)
|
||||
MimeType("image", sub, None)
|
||||
|
||||
def parse(str: String): Either[String, MimeType] = {
|
||||
def parsePrimary: Either[String, (String, String)] =
|
||||
str.indexOf('/') match {
|
||||
case -1 => Left(s"Invalid mediatype: $str")
|
||||
case n => Right(str.take(n) -> str.drop(n + 1))
|
||||
}
|
||||
|
||||
def parseSub(s: String): Either[String, (String, String)] =
|
||||
s.indexOf(';') match {
|
||||
case -1 => Right((s, ""))
|
||||
case n => Right((s.take(n), s.drop(n)))
|
||||
}
|
||||
|
||||
def parseParams(s: String): Map[String, String] =
|
||||
s.split(';')
|
||||
.map(_.trim)
|
||||
.filter(_.nonEmpty)
|
||||
.toList
|
||||
.flatMap(p =>
|
||||
p.split("=", 2).toList match {
|
||||
case a :: b :: Nil => Some((a, b))
|
||||
case _ => None
|
||||
}
|
||||
)
|
||||
.toMap
|
||||
|
||||
for {
|
||||
pt <- parsePrimary
|
||||
st <- parseSub(pt._2)
|
||||
pa = parseParams(st._2)
|
||||
} yield MimeType(pt._1, st._1, pa)
|
||||
}
|
||||
def parse(str: String): Either[String, MimeType] =
|
||||
Parser.parse(str)
|
||||
|
||||
def unsafe(str: String): MimeType =
|
||||
parse(str).throwLeft
|
||||
@ -105,8 +75,9 @@ object MimeType {
|
||||
val tiff = image("tiff")
|
||||
val html = text("html")
|
||||
val plain = text("plain")
|
||||
val json = application("json")
|
||||
val emls = NonEmptyList.of(
|
||||
MimeType("message", "rfc822", Map.empty),
|
||||
MimeType("message", "rfc822", None),
|
||||
application("mbox")
|
||||
)
|
||||
|
||||
@ -158,4 +129,88 @@ object MimeType {
|
||||
|
||||
implicit val jsonDecoder: Decoder[MimeType] =
|
||||
Decoder.decodeString.emap(parse)
|
||||
|
||||
/** Lenient parser for media type strings of the form
  * `primary/sub[; name=value]*`.
  *
  * Primary and sub type are trimmed and lower-cased. Of all parameters only
  * `charset` is evaluated, every other parameter is dropped. A missing,
  * malformed or unsupported charset results in `None`; it never fails the
  * parse and never throws.
  */
private object Parser {

  /** Parses `s`. A `Left` carries a message when the primary or the sub type
    * is empty or contains characters that are not allowed in a token.
    */
  def parse(s: String): Either[String, MimeType] =
    mimeType(s).map(_._1)

  /** A parsed value paired with the input that is left to consume. */
  type Result[A] = Either[String, (A, String)]
  type P[A] = String => Result[A]

  private[this] val tokenExtraChars = "+-$%*._~".toSet

  // Case-insensitive, so `CHARSET=` and `CharSet=` are found as well. Matching
  // on the input itself (instead of a lower-cased copy) keeps the match offset
  // valid for slicing the very same string.
  private[this] val charsetKey = "(?i)charset=".r

  /** Runs `pa`, feeds its remaining input to `pb` and combines both values
    * with `f`. The first failure is returned unchanged.
    */
  private def seq[A, B, C](pa: P[A], pb: P[B])(f: (A, B) => C): P[C] =
    in =>
      pa(in).flatMap { case (a, restA) =>
        pb(restA).map { case (b, restB) => (f(a, b), restB) }
      }

  /** Consumes characters while `p` holds. The first character failing `p`
    * (the delimiter) is dropped; value and remainder are both trimmed.
    */
  private def takeWhile(p: Char => Boolean): P[String] =
    in => {
      val (prefix, suffix) = in.span(p)
      Right((prefix.trim, suffix.drop(1).trim))
    }

  /** Turns a successful result of `p` into `Left(err)` if `test` rejects it. */
  private def check[A](p: P[A], test: A => Boolean, err: => String): P[A] =
    in => p(in).flatMap(r => Either.cond(test(r._1), r, err))

  //https://datatracker.ietf.org/doc/html/rfc7230#section-3.2.6
  private def isToken(s: String): Boolean =
    s.nonEmpty && s.forall(c => c.isLetterOrDigit || tokenExtraChars.contains(c))

  private val baseType: P[MimeType] = {
    val primary = check(
      takeWhile(_ != '/'),
      isToken,
      "Primary type must be non-empty and contain valid characters"
    )
    val sub = check(
      takeWhile(_ != ';'),
      isToken,
      "Subtype must be non-empty and contain valid characters"
    )
    seq(primary, sub)((p, s) => MimeType(p.toLowerCase, s.toLowerCase, None))
  }

  //https://datatracker.ietf.org/doc/html/rfc2046#section-4.1.2
  private val charset: P[Option[Charset]] = in =>
    charsetKey.findFirstMatchIn(in) match {
      case None => Right((None, in))
      case Some(m) =>
        val afterKey = in.substring(m.end)
        val rawValue = afterKey.indexOf(';') match {
          case -1 => afterKey
          case n  => afterKey.substring(0, n)
        }
        // Trim before unquoting: `"UTF-8" ; x=1` must lose its quotes, too.
        val csName = unquote(rawValue.trim).trim
        // `Charset.forName` (like `Charset.isSupported`) throws for empty or
        // syntactically illegal names; being lenient means mapping that to None.
        Right((Try(Charset.forName(csName)).toOption, ""))
    }

  private val mimeType: P[MimeType] =
    seq(baseType, charset)((bt, cs) => bt.copy(charset = cs))

  /** Strips all pairs of enclosing double quotes, e.g. `""x""` becomes `x`. */
  @scala.annotation.tailrec
  private def unquote(s: String): String = {
    val len = s.length
    if (len >= 2 && s.charAt(0) == '"' && s.charAt(len - 1) == '"')
      unquote(s.substring(1, len - 1))
    else s
  }
}
|
||||
}
|
||||
|
@ -10,10 +10,7 @@ trait EitherSyntax {
|
||||
|
||||
implicit final class LeftStringEitherOps[A](e: Either[String, A]) {
|
||||
def throwLeft: A =
|
||||
e match {
|
||||
case Right(a) => a
|
||||
case Left(err) => sys.error(err)
|
||||
}
|
||||
e.fold(sys.error, identity)
|
||||
}
|
||||
|
||||
implicit final class ThrowableLeftEitherOps[A](e: Either[Throwable, A]) {
|
||||
|
128
modules/common/src/test/scala/docspell/common/MimeTypeTest.scala
Normal file
128
modules/common/src/test/scala/docspell/common/MimeTypeTest.scala
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright 2020 Eike K. & Contributors
|
||||
*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package docspell.common
|
||||
|
||||
import java.nio.charset.{Charset, StandardCharsets}
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
|
||||
import munit.ScalaCheckSuite
|
||||
import org.scalacheck.Gen
|
||||
import org.scalacheck.Prop.forAll
|
||||
|
||||
/** Tests rendering (`asString`) and lenient parsing of [[MimeType]] values. */
class MimeTypeTest extends ScalaCheckSuite {

  test("asString") {
    assertEquals(MimeType.html.asString, "text/html")
    assertEquals(
      MimeType.html.withCharset(StandardCharsets.ISO_8859_1).asString,
      "text/html; charset=\"ISO-8859-1\""
    )
    assertEquals(
      MimeType.html.withUtf8Charset.asString,
      "text/html; charset=\"UTF-8\""
    )
  }

  test("parse without params") {
    assertEquals(MimeType.unsafe("application/pdf"), MimeType.pdf)
    assertEquals(MimeType.unsafe("image/jpeg"), MimeType.jpeg)

    // surrounding whitespace is ignored
    assertEquals(MimeType.unsafe("image/jpeg "), MimeType.jpeg)
    assertEquals(MimeType.unsafe(" image/jpeg "), MimeType.jpeg)
    assertEquals(MimeType.unsafe(" image / jpeg "), MimeType.jpeg)

    assertEquals(
      MimeType.unsafe("application/xml+html"),
      MimeType.application("xml+html")
    )
    // the sub type is lower-cased by the parser
    assertEquals(
      MimeType.unsafe(
        "application/vnd.openxmlformats-officedocument.presentationml.viewProps+xml"
      ),
      MimeType.application(
        "vnd.openxmlformats-officedocument.presentationml.viewprops+xml"
      )
    )
    assertEquals(
      MimeType.unsafe("application/vnd.powerbuilder75-s"),
      MimeType.application("vnd.powerbuilder75-s")
    )
  }

  test("parse with charset") {
    assertEquals(
      MimeType.unsafe("text/plain; charset=UTF-8"),
      MimeType.plain.withUtf8Charset
    )
    assertEquals(
      MimeType.unsafe("text/plain; CHARSET=UTF-8"),
      MimeType.plain.withUtf8Charset
    )
    assertEquals(
      MimeType.unsafe("text/plain; CharSet=UTF-8"),
      MimeType.plain.withUtf8Charset
    )
    assertEquals(
      MimeType.unsafe("text/html; charset=\"ISO-8859-1\""),
      MimeType.html.withCharset(StandardCharsets.ISO_8859_1)
    )
  }

  test("parse with charset and more params") {
    assertEquals(
      MimeType.unsafe("text/plain; charset=UTF-8; action=test"),
      MimeType.plain.withUtf8Charset
    )
    assertEquals(
      MimeType.unsafe("text/plain; run=\"2\"; charset=UTF-8; action=test"),
      MimeType.plain.withUtf8Charset
    )
  }

  test("parse without charset but params") {
    assertEquals(MimeType.unsafe("image/jpeg; action=urn:2"), MimeType.jpeg)
  }

  test("parse some stranger values") {
    assertEquals(
      MimeType.unsafe("text/plain; charset=\"\"ISO-8859-1\"\""),
      MimeType.plain.withCharset(StandardCharsets.ISO_8859_1)
    )
    assertEquals(
      MimeType.unsafe("text/plain; charset=\"\" ISO-8859-1 \"\""),
      MimeType.plain.withCharset(StandardCharsets.ISO_8859_1)
    )
  }

  test("parse invalid mime types") {
    assert(MimeType.parse("").isLeft)
    assert(MimeType.parse("_ _/plain").isLeft)
    assert(MimeType.parse("/").isLeft)
    assert(MimeType.parse("()").isLeft)
  }

  // Round trip: whatever `asString` renders must be parsed back to an equal value.
  property("read own asString") {
    // Parenthesised parameter: the bare `mt: MimeType =>` form is deprecated
    // in recent Scala 2.13 releases and not accepted by Scala 3.
    forAll(MimeTypeTest.mimeType) { (mt: MimeType) =>
      assertEquals(MimeType.unsafe(mt.asString), mt)
    }
  }
}
|
||||
|
||||
/** Generators shared by the [[MimeType]] property tests. */
object MimeTypeTest {

  /** A few base types (no charset) to draw from. */
  val someTypes: List[MimeType] =
    List(MimeType.plain, MimeType.html) ++ MimeType.emls.toList

  // Materialised once; every charset the running JVM knows about.
  private val allCharsets: List[Charset] =
    Charset.availableCharsets().values().asScala.toList

  /** Generates one of `someTypes`, with or without a charset.
    *
    * `Gen.option` produces `None` regularly. Taking the head of a random
    * subset of all charsets would be empty only when the subset has size 0,
    * so the "no charset" case would almost never be generated.
    */
  val mimeType: Gen[MimeType] =
    for {
      base <- Gen.oneOf(someTypes)
      cs <- Gen.option(Gen.oneOf(allCharsets))
    } yield base.copy(charset = cs)
}
|
@ -31,10 +31,10 @@ object TikaMimetype {
|
||||
private def convert(mt: MediaType): MimeType =
|
||||
Option(mt) match {
|
||||
case Some(_) =>
|
||||
val params = mt.getParameters.asScala.toMap
|
||||
val cs = mt.getParameters.asScala.toMap.get("charset").getOrElse("unknown")
|
||||
val primary = mt.getType
|
||||
val sub = mt.getSubtype
|
||||
normalize(MimeType(primary, sub, params))
|
||||
normalize(MimeType(primary, sub, None).withCharsetName(cs))
|
||||
case None =>
|
||||
MimeType.octetStream
|
||||
}
|
||||
@ -48,8 +48,8 @@ object TikaMimetype {
|
||||
|
||||
private def normalize(in: MimeType): MimeType =
|
||||
in match {
|
||||
case MimeType(_, sub, p) if sub contains "xhtml" =>
|
||||
MimeType.html.copy(params = p)
|
||||
case MimeType(_, sub, cs) if sub contains "xhtml" =>
|
||||
MimeType.html.copy(charset = cs)
|
||||
case _ => in
|
||||
}
|
||||
|
||||
@ -86,7 +86,7 @@ object TikaMimetype {
|
||||
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
|
||||
dt match {
|
||||
case DataType.Exact(mt) =>
|
||||
mt.resolveCharset match {
|
||||
mt.charset match {
|
||||
case None if mt.primary == "text" =>
|
||||
detectCharset[F](data, MimeTypeHint.advertised(mt))
|
||||
.map {
|
||||
|
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Copyright 2020 Eike K. & Contributors
|
||||
*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package docspell.files
|
||||
|
||||
import docspell.common.{MimeType, MimeTypeHint}
|
||||
|
||||
import munit.FunSuite
|
||||
import scodec.bits.ByteVector
|
||||
|
||||
/** Checks mime type detection from content bytes and/or a [[MimeTypeHint]]. */
class TikaMimetypeTest extends FunSuite {

  private def detect(bv: ByteVector, hint: MimeTypeHint): MimeType =
    TikaMimetype.detect(bv, hint)

  test("detect text/plain") {
    val mt = detect(ByteVector.view("hello world".getBytes), MimeTypeHint.none)
    // compare the base type only, so a charset on the detected type is ignored
    assertEquals(mt.baseType, MimeType.plain)
  }

  test("detect image/jpeg") {
    val mt = detect(
      ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="),
      MimeTypeHint.none
    )
    assertEquals(mt, MimeType.jpeg)
  }

  test("detect image/png") {
    val mt = detect(
      ByteVector.fromValidBase64("iVBORw0KGgoAAAANSUhEUgAAA2I="),
      MimeTypeHint.none
    )
    assertEquals(mt, MimeType.png)
  }

  test("detect application/json") {
    val mt =
      detect(
        ByteVector.view("""{"name":"me"}""".getBytes),
        MimeTypeHint.filename("me.json")
      )
    assertEquals(mt, MimeType.json)
  }

  // Distinct name: two tests called "detect application/json" cannot be told
  // apart in a test report. This one passes the advertised type, not a filename.
  test("detect application/json advertised") {
    val mt = detect(
      ByteVector.view("""{"name":"me"}""".getBytes),
      MimeTypeHint.advertised("application/json")
    )
    assertEquals(mt, MimeType.json)
  }

  test("detect image/jpeg wrong advertised") {
    val mt = detect(
      ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA="),
      MimeTypeHint.advertised("image/png")
    )
    assertEquals(mt, MimeType.jpeg)
  }

  test("just filename") {
    assertEquals(
      detect(ByteVector.empty, MimeTypeHint.filename("doc.pdf")),
      MimeType.pdf
    )
  }

}
|
@ -730,8 +730,8 @@ trait Conversions {
|
||||
MimeType(
|
||||
header.mediaType.mainType,
|
||||
header.mediaType.subType,
|
||||
header.mediaType.extensions
|
||||
)
|
||||
None
|
||||
).withCharsetName(header.mediaType.extensions.get("charset").getOrElse("unknown"))
|
||||
}
|
||||
|
||||
object Conversions extends Conversions {
|
||||
|
@ -12,11 +12,16 @@ object MimeTypes {
|
||||
|
||||
implicit final class EmilMimeTypeOps(emt: emil.MimeType) {
|
||||
def toLocal: MimeType =
|
||||
MimeType(emt.primary, emt.sub, emt.params)
|
||||
MimeType(emt.primary, emt.sub, None)
|
||||
.withCharsetName(emt.params.get("charset").getOrElse("unknown"))
|
||||
}
|
||||
|
||||
implicit final class DocspellMimeTypeOps(mt: MimeType) {
|
||||
def toEmil: emil.MimeType =
|
||||
emil.MimeType(mt.primary, mt.sub, mt.params)
|
||||
emil.MimeType(
|
||||
mt.primary,
|
||||
mt.sub,
|
||||
mt.charset.map(cs => Map("charset" -> cs.name())).getOrElse(Map.empty)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user