Simplify MimeType class and parse mimetypes in a more lenient way

This commit is contained in:
eikek
2021-09-23 14:09:39 +02:00
parent 617f931a14
commit 1761526e20
7 changed files with 321 additions and 64 deletions

View File

@ -31,10 +31,10 @@ object TikaMimetype {
// Converts a Tika `MediaType` into a `MimeType`; a null input is wrapped via
// `Option(mt)` and falls back to `MimeType.octetStream`.
// NOTE(review): this is a diff hunk without +/- markers. The two `val params`/`val cs`
// lines and the two `normalize(...)` lines look like removed/added pairs
// (presumably the first of each pair is the removed one) -- confirm against the repository.
private def convert(mt: MediaType): MimeType =
Option(mt) match {
case Some(_) =>
val params = mt.getParameters.asScala.toMap
// Reads the "charset" media type parameter, using "unknown" when it is absent.
val cs = mt.getParameters.asScala.toMap.get("charset").getOrElse("unknown")
val primary = mt.getType
val sub = mt.getSubtype
normalize(MimeType(primary, sub, params))
normalize(MimeType(primary, sub, None).withCharsetName(cs))
case None =>
MimeType.octetStream
}
@ -48,8 +48,8 @@ object TikaMimetype {
// Maps every mime type whose subtype contains "xhtml" to `MimeType.html`, carrying
// over the third field of the input; all other values are returned unchanged.
// NOTE(review): diff hunk without +/- markers -- the `p`/`params` and `cs`/`charset`
// lines appear to be the before/after variants of the same case clause.
private def normalize(in: MimeType): MimeType =
in match {
case MimeType(_, sub, p) if sub contains "xhtml" =>
MimeType.html.copy(params = p)
case MimeType(_, sub, cs) if sub contains "xhtml" =>
MimeType.html.copy(charset = cs)
case _ => in
}
@ -86,7 +86,7 @@ object TikaMimetype {
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
dt match {
case DataType.Exact(mt) =>
mt.resolveCharset match {
mt.charset match {
case None if mt.primary == "text" =>
detectCharset[F](data, MimeTypeHint.advertised(mt))
.map {

View File

@ -0,0 +1,72 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.files
import docspell.common.{MimeType, MimeTypeHint}
import munit.FunSuite
import scodec.bits.ByteVector
/** Exercises `TikaMimetype.detect` with small in-memory samples and the
  * different kinds of `MimeTypeHint` (none, filename, advertised type).
  */
class TikaMimetypeTest extends FunSuite {
  import java.nio.charset.StandardCharsets

  /** Base64 sample used by the JPEG tests below. */
  private val jpegSample: ByteVector =
    ByteVector.fromValidBase64("/9j/4AAQSkZJRgABAgAAZABkAAA=")

  /** Base64 sample used by the PNG test below. */
  private val pngSample: ByteVector =
    ByteVector.fromValidBase64("iVBORw0KGgoAAAANSUhEUgAAA2I=")

  /** Small JSON document shared by the JSON tests. */
  private val jsonSample: ByteVector = utf8("""{"name":"me"}""")

  /** Encodes `s` as UTF-8 so the samples do not depend on the platform default charset. */
  private def utf8(s: String): ByteVector =
    ByteVector.view(s.getBytes(StandardCharsets.UTF_8))

  private def detect(bv: ByteVector, hint: MimeTypeHint): MimeType =
    TikaMimetype.detect(bv, hint)

  test("detect text/plain") {
    val mt = detect(utf8("hello world"), MimeTypeHint.none)
    // Only the base type is compared; any charset attached to the result is ignored.
    assertEquals(mt.baseType, MimeType.plain)
  }

  test("detect image/jpeg") {
    assertEquals(detect(jpegSample, MimeTypeHint.none), MimeType.jpeg)
  }

  test("detect image/png") {
    assertEquals(detect(pngSample, MimeTypeHint.none), MimeType.png)
  }

  // The two JSON tests carry distinct names so that a failure report
  // identifies which hint variant broke.
  test("detect application/json by filename") {
    assertEquals(detect(jsonSample, MimeTypeHint.filename("me.json")), MimeType.json)
  }

  test("detect application/json by advertised type") {
    assertEquals(
      detect(jsonSample, MimeTypeHint.advertised("application/json")),
      MimeType.json
    )
  }

  test("detect image/jpeg wrong advertised") {
    // The content is expected to win over a contradicting advertised type.
    assertEquals(detect(jpegSample, MimeTypeHint.advertised("image/png")), MimeType.jpeg)
  }

  test("just filename") {
    // No content at all: the expected result comes from the file name alone.
    assertEquals(
      detect(ByteVector.empty, MimeTypeHint.filename("doc.pdf")),
      MimeType.pdf
    )
  }
}