Fix uploads with utf8 bytes in filenames

This adds a modified parser for the `Content-Disposition` header to fix
the issue named in the title. The parser in http4s for the
`Content-Disposition` header removes filenames that are sent as plain
UTF-8 bytes. (The issue links of the original message are not preserved here.)
This commit is contained in:
eikek 2021-11-07 21:23:23 +01:00
parent 33c68373fb
commit b041e2616d
3 changed files with 255 additions and 1 deletions
modules/restserver/src
main/scala/docspell/restserver
test/scala/docspell/restserver/http4s

@ -22,6 +22,7 @@ import docspell.common.syntax.all._
import docspell.ftsclient.FtsResult
import docspell.restapi.model._
import docspell.restserver.conv.Conversions._
import docspell.restserver.http4s.ContentDisposition
import docspell.store.queries.{AttachmentLight => QAttachmentLight, IdRefCount}
import docspell.store.records._
import docspell.store.{AddResult, UpdateResult}
@ -377,7 +378,11 @@ trait Conversions {
.filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
.map(p =>
OUpload
.File(p.filename, p.headers.get[`Content-Type`].map(fromContentType), p.body)
.File(
ContentDisposition.getFileName(p),
p.headers.get[`Content-Type`].map(fromContentType),
p.body
)
)
for {
metaData <- meta

@ -0,0 +1,154 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.restserver.http4s
import java.nio.charset.{Charset, StandardCharsets}
import cats.implicits._
import cats.parse.{Parser, Parser0, Rfc5234}
import org.http4s.headers.`Content-Disposition`
import org.http4s.internal.CharPredicate
import org.http4s.multipart.Part
import org.http4s.{Header, ParseFailure, ParseResult}
import org.typelevel.ci.CIString
import org.typelevel.ci._
/** A replacement for http4s' `Content-Disposition` class with a slightly modified parser
  * to allow utf8 filenames.
  *
  * http4s' `Part` class retrieves the filename through the original header class; to go
  * through this class instead, use:
  *
  * {{{
  * ContentDisposition.getFileName(part)
  * }}}
  *
  * where `part` is of type `multipart.Part[F]`.
  *
  * @param dispositionType
  *   the leading token of the header value, e.g. `form-data`
  * @param parameters
  *   the `key=value` parameters following the disposition type, keyed by their
  *   case-insensitive name
  */
case class ContentDisposition(dispositionType: String, parameters: Map[CIString, String])
object ContentDisposition {

  /** Looks up the file name announced by the `Content-Disposition` header of `part`.
    *
    * The extended parameter `filename*` takes precedence over the plain `filename`
    * parameter (as recommended by RFC 6266, section 4.3); if it is missing, the plain
    * `filename` parameter is used.
    *
    * @return
    *   the file name, or `None` if no `ContentDisposition` header value is available or
    *   it carries neither parameter
    */
  def getFileName[F[_]](part: Part[F]): Option[String] =
    part.headers.get[ContentDisposition].flatMap { cd =>
      cd.parameters.get(ci"filename*").orElse(cd.parameters.get(ci"filename"))
    }

  /** Parser for the value of a regular (non-extended) parameter.
    *
    * Accepts either a double-quoted string (no escape handling; everything up to the
    * next `"` is taken verbatim) or an unquoted run of arbitrary characters that extends
    * to the next `;` or the end of the input. No restriction to ASCII is applied, which
    * is what allows raw utf8 file names.
    */
  private[http4s] val mimeValue: Parser[String] = {
    val value = Parser.anyChar.repUntilAs[String](Parser.char(';').orElse(Parser.end))
    // `charsWhile0` (instead of `charsWhile`) so that an empty quoted value such as
    // `filename=""` is accepted rather than failing the parse of the whole header.
    val qvalue =
      Rfc5234.dquote *> Parser.charsWhile0(c => c != '"') <* Rfc5234.dquote
    qvalue.orElse(value)
  }

  // --- taken from http4s (v0.23.6) with modification; Licensed under Apache License 2.0

  /** Parses a raw `Content-Disposition` header value.
    *
    * @param s
    *   the header value, e.g. `form-data; name="file"; filename="a.txt"`
    * @return
    *   the parsed header or a `ParseFailure` if `s` is not accepted by the parser
    */
  def parse(s: String): ParseResult[ContentDisposition] =
    fromParser(parser, "Invalid Content-Disposition header")(s)

  /** Runs `parser` against the complete input `s`.
    *
    * A parser error is turned into a `ParseFailure` carrying `errorMessage`; a
    * `ParseFailure` thrown while parsing is returned as a `Left` as well.
    */
  private def fromParser[A](parser: Parser0[A], errorMessage: => String)(
      s: String
  ): ParseResult[A] =
    try parser.parseAll(s).leftMap(e => ParseFailure(errorMessage, e.toString))
    catch { case p: ParseFailure => p.asLeft[A] }

  /* ALPHA = %x41-5A / %x61-7A ; A-Z / a-z */
  private[this] val alpha: Parser[Char] =
    Parser
      .charIn(0x41.toChar to 0x5a.toChar)
      .orElse(Parser.charIn(0x61.toChar to 0x7a.toChar))

  /* DIGIT = %x30-39
   * ; 0-9 */
  private[this] val digit: Parser[Char] =
    Parser.charIn(0x30.toChar to 0x39.toChar)

  /* The spec references RFC2234, which is 0-9A-F, but it also
   * explicitly permits lowercase. */
  private[this] val hexdig: Parser[Char] =
    digit.orElse(Parser.charIn("ABCDEFabcdef"))

  /** The subset of RFC 7230 grammar rules needed by the header parser. */
  private[http4s] object Rfc7230 {
    /* `tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
     * "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA`
     */
    val tchar: Parser[Char] = Parser.charIn("!#$%&'*+-.^_`|~").orElse(digit).orElse(alpha)

    /* `token = 1*tchar` */
    val token: Parser[String] = tchar.rep.string

    val htab: Parser[Unit] =
      Parser.char('\t')

    val sp: Parser[Unit] =
      Parser.char(' ')

    /* `OWS = *( SP / HTAB )` */
    val ows: Parser0[Unit] = sp.orElse(htab).rep0.void
  }

  /** Builds the header parser.
    *
    * The header is a disposition type token followed by any number of `; key=value`
    * parameters. Parameters whose name ends with `*` are parsed as extended values
    * (either a quoted string or `charset'language'percent-encoded-value`); all other
    * parameters are parsed with `paramValueParser`.
    *
    * @param paramValueParser
    *   parser used for the values of regular (non-extended) parameters
    */
  private[http4s] def makeParser(paramValueParser: Parser[String]) = {
    sealed trait ValueChar
    case class AsciiChar(c: Char) extends ValueChar
    case class EncodedChar(a: Char, b: Char) extends ValueChar

    val attrChar = alpha
      .orElse(digit)
      .orElse(Parser.charIn('!', '#', '$', '&', '+', '-', '.', '^', '_', '`', '|', '~'))
      .map { (a: Char) =>
        AsciiChar(a)
      }
    val pctEncoded = (Parser.string("%") *> hexdig ~ hexdig).map {
      case (a: Char, b: Char) => EncodedChar(a, b)
    }
    val valueChars = attrChar.orElse(pctEncoded).rep
    val language =
      (Parser.string(Rfc5234.alpha.rep) ~ (Parser.char('-') *> alpha.rep(1)).?).string
    val charset: Parser[Charset] =
      Parser.oneOf(
        Parser.ignoreCase("UTF-8").as(StandardCharsets.UTF_8) ::
          Parser.ignoreCase("ISO-8859-1").as(StandardCharsets.ISO_8859_1) ::
          Parser.ignoreCase("US-ASCII").as(StandardCharsets.US_ASCII) ::
          Nil
      )

    // Extended value given as a plain quoted string.
    val quotedExtValue =
      Rfc5234.dquote *> Parser.charsWhile0(CharPredicate.All -- '"') <* Rfc5234.dquote

    // Extended value of the form charset'language'value; charset defaults to UTF-8.
    val encodedExtValue =
      (charset.? ~ (Parser.string("'") *> language.? <* Parser.string("'")) ~ valueChars)
        .map { case ((charset, _), values) =>
          // Collect all bytes first and decode them in one go: decoding every
          // percent-encoded byte on its own would break multi-byte sequences
          // (e.g. `%D0%9A` in UTF-8) into replacement characters.
          val bytes = values.map {
            case EncodedChar(a, b) =>
              ((Character.digit(a, 16) << 4) + Character.digit(b, 16)).toByte
            // attr-chars are plain ASCII, which all supported charsets encode as a
            // single identical byte
            case AsciiChar(c) => c.toByte
          }
          new String(bytes.toList.toArray, charset.getOrElse(StandardCharsets.UTF_8))
        }

    val extValue = quotedExtValue | encodedExtValue

    val parameter = for {
      tok <- Rfc7230.token <* Parser.string("=") <* Rfc7230.ows
      v <- if (tok.endsWith("*")) extValue else paramValueParser
    } yield (CIString(tok), v)

    (Rfc7230.token ~ (Parser.string(";") *> Rfc7230.ows *> parameter).rep0).map {
      case (token: String, params: List[(CIString, String)]) =>
        ContentDisposition(token, params.toMap)
    }
  }

  // The parser used by `parse`; regular parameter values go through `mimeValue`.
  private val parser = makeParser(mimeValue)
  //private val origParser = makeParser(Rfc7230.token | Rfc7230.quotedString)

  /** Header instance that makes `headers.get[ContentDisposition]` available.
    *
    * The header name and the rendering are borrowed from http4s' own
    * `Content-Disposition` instance; only parsing is replaced by `parse`.
    */
  implicit val headerInstance: Header[ContentDisposition, Header.Single] = {
    val oh = `Content-Disposition`.headerInstance
    Header.createRendered(
      oh.name,
      v => oh.value(`Content-Disposition`(v.dispositionType, v.parameters)),
      parse
    )
  }
}

@ -0,0 +1,95 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.restserver.http4s
import munit._
import org.http4s.headers.`Content-Disposition`
import org.typelevel.ci._
class ContentDispositionTest extends FunSuite {

  /** Parses `s` with the modified parser; a parse failure is thrown. */
  def parseGood(s: String): ContentDisposition =
    ContentDisposition.parse(s) match {
      case Right(header) => header
      case Left(failure) => throw failure
    }

  /** Parses `s` with the original http4s parser and converts the result; a parse
    * failure is thrown.
    */
  def parseOrig(s: String): ContentDisposition =
    `Content-Disposition`.parse(s) match {
      case Right(orig)   => ContentDisposition(orig.dispositionType, orig.parameters)
      case Left(failure) => throw failure
    }

  /** Expected value: a `form-data` disposition carrying the given parameters. */
  private def formData(params: (CIString, String)*): ContentDisposition =
    ContentDisposition("form-data", params.toMap)

  test("allow rfc2231 parameters with charset") {
    val parsed = parseGood("form-data; name*=us-ascii''This%20is%20%2A%2A%2Afun%2A%2A%2A")
    assertEquals(parsed, formData(ci"name*" -> "This is ***fun***"))
  }

  test("allow rfc2231 parameters with charset and language") {
    val parsed = parseGood("form-data; name*=utf-8'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A")
    assertEquals(parsed, formData(ci"name*" -> "This is ***fun***"))
  }

  test("allow rfc2231 parameters without charset and language") {
    val parsed = parseGood("form-data; name*=''This%20is%20%2A%2A%2Afun%2A%2A%2A")
    assertEquals(parsed, formData(ci"name*" -> "This is ***fun***"))
  }

  test("allow rfc2231 parameters with quoted strings") {
    val parsed = parseGood("form-data; name*=\"This is ***fun***\"")
    assertEquals(parsed, formData(ci"name*" -> "This is ***fun***"))
  }

  test("allow utf8 bytes in filename") {
    val expected = formData(ci"name" -> "file", ci"filename" -> "Константинополя.txt")

    // quoted parameter values
    assertEquals(
      parseGood("""form-data; name="file"; filename="Константинополя.txt""""),
      expected
    )
    // unquoted parameter values
    assertEquals(
      parseGood("""form-data; name=file; filename=Константинополя.txt"""),
      expected
    )
  }

  // interestingly, this works with the original header impl from http4s. but
  // i've never seen it being used like that in clients
  test("unicode in filename with original header impl and filename*") {
    val parsed = parseOrig("""form-data; name="file"; filename*="Константинополя.txt"""")
    assertEquals(
      parsed,
      formData(ci"name" -> "file", ci"filename*" -> "Константинополя.txt")
    )
  }

  test("allow simple values") {
    val inputsAndValues = List(
      "form-data; name=hello" -> "hello",
      "form-data; name=\"hello\"" -> "hello",
      "form-data; name=\"hello you\"" -> "hello you"
    )
    inputsAndValues.foreach { case (input, value) =>
      assertEquals(parseGood(input), formData(ci"name" -> value))
    }
  }
}