From b041e2616de9c7938c3e430de9b2ddee5b914026 Mon Sep 17 00:00:00 2001 From: eikek Date: Sun, 7 Nov 2021 21:23:23 +0100 Subject: [PATCH] Fix uploads with utf8 bytes in filenames This adds a modified parser for `Content-Disposition` header to fix issue #991. The parser in http4s for `Content-Disposition` header removes filenames that are sent as plain utf8 bytes. See also http4s/http4s#5053. --- .../restserver/conv/Conversions.scala | 7 +- .../http4s/ContentDisposition.scala | 154 ++++++++++++++++++ .../http4s/ContentDispositionTest.scala | 95 +++++++++++ 3 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala create mode 100644 modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index 03142eaf..057064c3 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -22,6 +22,7 @@ import docspell.common.syntax.all._ import docspell.ftsclient.FtsResult import docspell.restapi.model._ import docspell.restserver.conv.Conversions._ +import docspell.restserver.http4s.ContentDisposition import docspell.store.queries.{AttachmentLight => QAttachmentLight, IdRefCount} import docspell.store.records._ import docspell.store.{AddResult, UpdateResult} @@ -377,7 +378,11 @@ trait Conversions { .filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta"))) .map(p => OUpload - .File(p.filename, p.headers.get[`Content-Type`].map(fromContentType), p.body) + .File( + ContentDisposition.getFileName(p), + p.headers.get[`Content-Type`].map(fromContentType), + p.body + ) ) for { metaData <- meta diff --git 
a/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala b/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala
new file mode 100644
index 00000000..7a5e389b
--- /dev/null
+++ b/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.restserver.http4s
+
+import java.nio.charset.{Charset, StandardCharsets}
+
+import cats.implicits._
+import cats.parse.{Parser, Parser0, Rfc5234}
+
+import org.http4s.headers.`Content-Disposition`
+import org.http4s.internal.CharPredicate
+import org.http4s.multipart.Part
+import org.http4s.{Header, ParseFailure, ParseResult}
+import org.typelevel.ci.CIString
+import org.typelevel.ci._
+
+/** A replacement for http4s' `Content-Disposition` class with a slightly modified parser
+  * to allow utf8 filenames.
+  *
+  * It stands in for the `filename` accessor of the `Part` class, which relies on the
+  * http4s class. Use it as follows:
+  *
+  * {{{
+  * ContentDisposition.getFileName(part)
+  * }}}
+  *
+  * where `part` is of type `multipart.Part[F]`.
+  *
+  * @param dispositionType
+  *   the leading token of the header value, e.g. `form-data`
+  * @param parameters
+  *   the `name=value` pairs following the disposition type; names compare
+  *   case-insensitively and keep a trailing `*` of the extended notation
+  */
+case class ContentDisposition(dispositionType: String, parameters: Map[CIString, String])
+
+object ContentDisposition {
+
+  /** Returns the value of the `filename` parameter of the part's `Content-Disposition`
+    * header.
+    *
+    * The result is `None` if no header value is available or if it carries no
+    * `filename` parameter. The extended `filename*` parameter is stored under its own
+    * key and is not consulted here.
+    */
+  def getFileName[F[_]](part: Part[F]): Option[String] =
+    part.headers.get[ContentDisposition].flatMap(_.parameters.get(ci"filename"))
+
+  /** Parses a plain (non-extended) parameter value.
+    *
+    * A value is either enclosed in double quotes or extends to the next `;` or the end
+    * of the input. Apart from that no character is rejected, which is what lets
+    * non-ascii filenames through. Backslash escapes inside quotes are not interpreted.
+    */
+  private[http4s] val mimeValue: Parser[String] = {
+    val value = Parser.anyChar.repUntilAs[String](Parser.char(';').orElse(Parser.end))
+    // `charsWhile0` (not `charsWhile`): a quoted value may be empty, e.g. `filename=""`
+    val qvalue =
+      Rfc5234.dquote *> Parser.charsWhile0(c => c != '"') <* Rfc5234.dquote
+    qvalue.orElse(value)
+  }
+
+  // --- taken from http4s (v0.23.6) with modification; Licensed under Apache License 2.0
+
+  /** Parses the value of a `Content-Disposition` header.
+    *
+    * @param s
+    *   the header value, without the header name
+    * @return
+    *   the disposition type with its parameters, or a `ParseFailure` if `s` is not
+    *   matched completely
+    */
+  def parse(s: String): ParseResult[ContentDisposition] =
+    fromParser(parser, "Invalid Content-Disposition header")(s)
+
+  /** Runs `parser` against the complete input `s`, turning a parser error into a
+    * `ParseFailure` that carries `errorMessage`.
+    */
+  private def fromParser[A](parser: Parser0[A], errorMessage: => String)(
+      s: String
+  ): ParseResult[A] =
+    try parser.parseAll(s).leftMap(e => ParseFailure(errorMessage, e.toString))
+    catch { case p: ParseFailure => p.asLeft[A] }
+
+  /* ALPHA = %x41-5A / %x61-7A ; A-Z / a-z */
+  private[this] val alpha: Parser[Char] =
+    Parser
+      .charIn(0x41.toChar to 0x5a.toChar)
+      .orElse(Parser.charIn(0x61.toChar to 0x7a.toChar))
+
+  /* DIGIT = %x30-39
+   * ; 0-9 */
+  private[this] val digit: Parser[Char] =
+    Parser.charIn(0x30.toChar to 0x39.toChar)
+
+  /* The spec references RFC2234, which is 0-9A-F, but it also
+   * explicitly permits lowercase. */
+  private[this] val hexdig: Parser[Char] =
+    digit.orElse(Parser.charIn("ABCDEFabcdef"))
+
+  private[http4s] object Rfc7230 {
+    /* `tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
+     *  "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA`
+     */
+    val tchar: Parser[Char] = Parser.charIn("!#$%&'*+-.^_`|~").orElse(digit).orElse(alpha)
+
+    /* `token = 1*tchar` */
+    val token: Parser[String] = tchar.rep.string
+
+    val htab: Parser[Unit] =
+      Parser.char('\t')
+
+    val sp: Parser[Unit] =
+      Parser.char(' ')
+
+    /* `OWS = *( SP / HTAB )` */
+    val ows: Parser0[Unit] = sp.orElse(htab).rep0.void
+
+  }
+
+  /** Builds the header parser: a disposition type followed by `;`-separated parameters.
+    *
+    * Parameters whose name ends with `*` are read as extended values (either a quoted
+    * string or `charset'language'percent-encoded-value`); all other values are read
+    * with `paramValueParser`.
+    */
+  private[http4s] def makeParser(paramValueParser: Parser[String]) = {
+    sealed trait ValueChar
+    case class AsciiChar(c: Char) extends ValueChar
+    case class EncodedChar(a: Char, b: Char) extends ValueChar
+
+    val attrChar = alpha
+      .orElse(digit)
+      .orElse(Parser.charIn('!', '#', '$', '&', '+', '-', '.', '^', '_', '`', '|', '~'))
+      .map { (a: Char) =>
+        AsciiChar(a)
+      }
+    val pctEncoded = (Parser.string("%") *> hexdig ~ hexdig).map {
+      case (a: Char, b: Char) => EncodedChar(a, b)
+    }
+    val valueChars = attrChar.orElse(pctEncoded).rep
+    val language =
+      (Parser.string(Rfc5234.alpha.rep) ~ (Parser.char('-') *> alpha.rep(1)).?).string
+    val charset: Parser[Charset] =
+      Parser.oneOf(
+        Parser.ignoreCase("UTF-8").as(StandardCharsets.UTF_8) ::
+          Parser.ignoreCase("ISO-8859-1").as(StandardCharsets.ISO_8859_1) ::
+          Parser.ignoreCase("US-ASCII").as(StandardCharsets.US_ASCII) ::
+          Nil
+      )
+    val quotedExtValue =
+      Rfc5234.dquote *> Parser.charsWhile0(CharPredicate.All -- '"') <* Rfc5234.dquote
+    val encodedExtValue =
+      (charset.? ~ (Parser.string("'") *> language.? <* Parser.string("'")) ~ valueChars)
+        .map { case ((cs, _), values) =>
+          // Collect all octets first and decode them in one go. Decoding every
+          // percent-encoded octet on its own breaks multi-byte sequences: `%E2%82%AC`
+          // would yield three replacement characters instead of a euro sign.
+          val octets = values.toList.map {
+            case EncodedChar(a, b) =>
+              ((Character.digit(a, 16) << 4) + Character.digit(b, 16)).toByte
+            // attr-chars are plain ascii, i.e. a single octet in every supported charset
+            case AsciiChar(c) => c.toByte
+          }
+          new String(octets.toArray, cs.getOrElse(StandardCharsets.UTF_8))
+        }
+    val extValue = quotedExtValue | encodedExtValue
+
+    val parameter = for {
+      tok <- Rfc7230.token <* Parser.string("=") <* Rfc7230.ows
+      v <- if (tok.endsWith("*")) extValue else paramValueParser
+    } yield (CIString(tok), v)
+
+    (Rfc7230.token ~ (Parser.string(";") *> Rfc7230.ows *> parameter).rep0).map {
+      case (token: String, params: List[(CIString, String)]) =>
+        ContentDisposition(token, params.toMap)
+    }
+  }
+
+  private val parser = makeParser(mimeValue)
+  //private val origParser = makeParser(Rfc7230.token | Rfc7230.quotedString)
+
+  /** Header instance that renders through http4s' `Content-Disposition` and parses
+    * with [[parse]].
+    */
+  implicit val headerInstance: Header[ContentDisposition, Header.Single] = {
+    val oh = `Content-Disposition`.headerInstance
+    Header.createRendered(
+      oh.name,
+      v => oh.value(`Content-Disposition`(v.dispositionType, v.parameters)),
+      parse
+    )
+  }
+}
diff --git a/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala b/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala
new file mode 100644
index 00000000..e5f6b38f
--- /dev/null
+++ b/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2020 Eike K. 
& Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.restserver.http4s
+
+import munit._
+import org.http4s.headers.`Content-Disposition`
+import org.typelevel.ci._
+
+/** Checks the modified `Content-Disposition` parser against extended (`name*`) and
+  * plain parameter values, including filenames sent as raw utf8.
+  */
+class ContentDispositionTest extends FunSuite {
+
+  /** Parses `s` with the modified parser.
+    *
+    * A parse failure is thrown, which fails the calling test.
+    */
+  def parseGood(s: String): ContentDisposition =
+    ContentDisposition.parse(s) match {
+      case Right(cd)     => cd
+      case Left(failure) => throw failure
+    }
+
+  /** Parses `s` with the original http4s parser and converts the result into the
+    * replacement class; a parse failure is thrown.
+    */
+  def parseOrig(s: String): ContentDisposition =
+    `Content-Disposition`.parse(s) match {
+      case Right(orig)   => ContentDisposition(orig.dispositionType, orig.parameters)
+      case Left(failure) => throw failure
+    }
+
+  // The expected result for a `form-data` header with the given parameters.
+  private def formData(params: (CIString, String)*): ContentDisposition =
+    ContentDisposition("form-data", params.toMap)
+
+  // All extended-value tests below decode to this one parameter.
+  private val thisIsFun = formData(ci"name*" -> "This is ***fun***")
+
+  test("allow rfc2231 parameters with charset") {
+    val header = "form-data; name*=us-ascii''This%20is%20%2A%2A%2Afun%2A%2A%2A"
+    assertEquals(parseGood(header), thisIsFun)
+  }
+
+  test("allow rfc2231 parameters with charset and language") {
+    val header = "form-data; name*=utf-8'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A"
+    assertEquals(parseGood(header), thisIsFun)
+  }
+
+  test("allow rfc2231 parameters without charset and language") {
+    val header = "form-data; name*=''This%20is%20%2A%2A%2Afun%2A%2A%2A"
+    assertEquals(parseGood(header), thisIsFun)
+  }
+
+  test("allow rfc2231 parameters with quoted strings") {
+    val header = "form-data; name*=\"This is ***fun***\""
+    assertEquals(parseGood(header), thisIsFun)
+  }
+
+  test("allow utf8 bytes in filename") {
+    val expected =
+      formData(ci"name" -> "file", ci"filename" -> "Константинополя.txt")
+    // the same parameters, once quoted and once as bare values
+    val headers = List(
+      """form-data; name="file"; filename="Константинополя.txt"""",
+      """form-data; name=file; filename=Константинополя.txt"""
+    )
+    headers.foreach(header => assertEquals(parseGood(header), expected))
+  }
+
+  // Interestingly, this works with the original header implementation from http4s,
+  // but I've never seen clients use it like that.
+  test("unicode in filename with original header impl and filename*") {
+    val header = """form-data; name="file"; filename*="Константинополя.txt""""
+    val expected =
+      formData(ci"name" -> "file", ci"filename*" -> "Константинополя.txt")
+    assertEquals(parseOrig(header), expected)
+  }
+
+  test("allow simple values") {
+    // header value -> expected value of the `name` parameter
+    val cases = List(
+      "form-data; name=hello" -> "hello",
+      "form-data; name=\"hello\"" -> "hello",
+      "form-data; name=\"hello you\"" -> "hello you"
+    )
+    cases.foreach { case (header, name) =>
+      assertEquals(parseGood(header), formData(ci"name" -> name))
+    }
+  }
+}