mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	Fix uploads with utf8 bytes in filenames
This adds a modified parser for `Content-Disposition` header to fix issue #991. The parser in http4s for `Content-Disposition` header removes filenames that are sent as plain utf8 bytes. See also http4s/http4s#5053.
This commit is contained in:
		| @@ -22,6 +22,7 @@ import docspell.common.syntax.all._ | ||||
| import docspell.ftsclient.FtsResult | ||||
| import docspell.restapi.model._ | ||||
| import docspell.restserver.conv.Conversions._ | ||||
| import docspell.restserver.http4s.ContentDisposition | ||||
| import docspell.store.queries.{AttachmentLight => QAttachmentLight, IdRefCount} | ||||
| import docspell.store.records._ | ||||
| import docspell.store.{AddResult, UpdateResult} | ||||
| @@ -377,7 +378,11 @@ trait Conversions { | ||||
|       .filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta"))) | ||||
|       .map(p => | ||||
|         OUpload | ||||
|           .File(p.filename, p.headers.get[`Content-Type`].map(fromContentType), p.body) | ||||
|           .File( | ||||
|             ContentDisposition.getFileName(p), | ||||
|             p.headers.get[`Content-Type`].map(fromContentType), | ||||
|             p.body | ||||
|           ) | ||||
|       ) | ||||
|     for { | ||||
|       metaData <- meta | ||||
|   | ||||
| @@ -0,0 +1,154 @@ | ||||
| /* | ||||
|  * Copyright 2020 Eike K. & Contributors | ||||
|  * | ||||
|  * SPDX-License-Identifier: AGPL-3.0-or-later | ||||
|  */ | ||||
|  | ||||
| package docspell.restserver.http4s | ||||
|  | ||||
| import java.nio.charset.{Charset, StandardCharsets} | ||||
|  | ||||
| import cats.implicits._ | ||||
| import cats.parse.{Parser, Parser0, Rfc5234} | ||||
|  | ||||
| import org.http4s.headers.`Content-Disposition` | ||||
| import org.http4s.internal.CharPredicate | ||||
| import org.http4s.multipart.Part | ||||
| import org.http4s.{Header, ParseFailure, ParseResult} | ||||
| import org.typelevel.ci.CIString | ||||
| import org.typelevel.ci._ | ||||
|  | ||||
| /** A replacement for http4s' `Content-Disposition` class with a slightly modified parser | ||||
|   * to allow utf8 filenames. | ||||
|   * | ||||
|   * The usage of this class is already in the `Part` class to retrieve the filename. This | ||||
|   * class can be used as follows: | ||||
|   * | ||||
|   * {{{ ContentDisposition.getFileName(part) }} | ||||
|   * | ||||
|   * where `part` is of type `multipart.Part[F]`. | ||||
|   */ | ||||
| case class ContentDisposition(dispositionType: String, parameters: Map[CIString, String]) | ||||
|  | ||||
| object ContentDisposition { | ||||
|  | ||||
|   def getFileName[F[_]](part: Part[F]): Option[String] = | ||||
|     part.headers.get[ContentDisposition].flatMap(_.parameters.get(ci"filename")) | ||||
|  | ||||
|   private[http4s] val mimeValue: Parser[String] = { | ||||
|     val value = Parser.anyChar.repUntilAs[String](Parser.char(';').orElse(Parser.end)) | ||||
|     val qvalue = | ||||
|       Rfc5234.dquote *> Parser.charsWhile(c => c != '"').string <* Rfc5234.dquote | ||||
|     qvalue.orElse(value) | ||||
|   } | ||||
|  | ||||
|   // --- taken from http4s (v0.23.6) with modification; Licensed under Apache License 2.0 | ||||
|  | ||||
|   def parse(s: String): ParseResult[ContentDisposition] = | ||||
|     fromParser(parser, "Invalid Content-Disposition header")(s) | ||||
|  | ||||
|   private def fromParser[A](parser: Parser0[A], errorMessage: => String)( | ||||
|       s: String | ||||
|   ): ParseResult[A] = | ||||
|     try parser.parseAll(s).leftMap(e => ParseFailure(errorMessage, e.toString)) | ||||
|     catch { case p: ParseFailure => p.asLeft[A] } | ||||
|  | ||||
|   /* ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z */ | ||||
|   private[this] val alpha: Parser[Char] = | ||||
|     Parser | ||||
|       .charIn(0x41.toChar to 0x5a.toChar) | ||||
|       .orElse(Parser.charIn(0x61.toChar to 0x7a.toChar)) | ||||
|  | ||||
|   /* DIGIT          =  %x30-39 | ||||
|    *                       ; 0-9 */ | ||||
|   private[this] val digit: Parser[Char] = | ||||
|     Parser.charIn(0x30.toChar to 0x39.toChar) | ||||
|  | ||||
|   /* The spec references RFC2234, which is 0-9A-F, but it also | ||||
|    * explicitly permits lowercase. */ | ||||
|   private[this] val hexdig: Parser[Char] = | ||||
|     digit.orElse(Parser.charIn("ABCDEFabcdef")) | ||||
|  | ||||
|   private[http4s] object Rfc7230 { | ||||
|     /* `tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / | ||||
|      *  "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA` | ||||
|      */ | ||||
|     val tchar: Parser[Char] = Parser.charIn("!#$%&'*+-.^_`|~").orElse(digit).orElse(alpha) | ||||
|  | ||||
|     /* `token = 1*tchar` */ | ||||
|     val token: Parser[String] = tchar.rep.string | ||||
|  | ||||
|     val htab: Parser[Unit] = | ||||
|       Parser.char('\t') | ||||
|  | ||||
|     val sp: Parser[Unit] = | ||||
|       Parser.char(' ') | ||||
|  | ||||
|     /* `OWS = *( SP / HTAB )` */ | ||||
|     val ows: Parser0[Unit] = sp.orElse(htab).rep0.void | ||||
|  | ||||
|   } | ||||
|   private[http4s] def makeParser(paramValueParser: Parser[String]) = { | ||||
|     sealed trait ValueChar | ||||
|     case class AsciiChar(c: Char) extends ValueChar | ||||
|     case class EncodedChar(a: Char, b: Char) extends ValueChar | ||||
|  | ||||
|     val attrChar = alpha | ||||
|       .orElse(digit) | ||||
|       .orElse(Parser.charIn('!', '#', '$', '&', '+', '-', '.', '^', '_', '`', '|', '~')) | ||||
|       .map { (a: Char) => | ||||
|         AsciiChar(a) | ||||
|       } | ||||
|     val pctEncoded = (Parser.string("%") *> hexdig ~ hexdig).map { | ||||
|       case (a: Char, b: Char) => EncodedChar(a, b) | ||||
|     } | ||||
|     val valueChars = attrChar.orElse(pctEncoded).rep | ||||
|     val language = | ||||
|       (Parser.string(Rfc5234.alpha.rep) ~ (Parser.char('-') *> alpha.rep(1)).?).string | ||||
|     val charset: Parser[Charset] = | ||||
|       Parser.oneOf( | ||||
|         Parser.ignoreCase("UTF-8").as(StandardCharsets.UTF_8) :: | ||||
|           Parser.ignoreCase("ISO-8859-1").as(StandardCharsets.ISO_8859_1) :: | ||||
|           Parser.ignoreCase("US-ASCII").as(StandardCharsets.US_ASCII) :: | ||||
|           Nil | ||||
|       ) | ||||
|     val extValue = (Rfc5234.dquote *> Parser.charsWhile0( | ||||
|       CharPredicate.All -- '"' | ||||
|     ) <* Rfc5234.dquote) | (charset.? ~ (Parser.string("'") *> language.? <* Parser | ||||
|       .string( | ||||
|         "'" | ||||
|       )) ~ valueChars).map { case ((charset, _), values) => | ||||
|       values | ||||
|         .map { | ||||
|           case EncodedChar(a: Char, b: Char) => | ||||
|             val charByte = (Character.digit(a, 16) << 4) + Character.digit(b, 16) | ||||
|             new String(Array(charByte.toByte), charset.getOrElse(StandardCharsets.UTF_8)) | ||||
|           case AsciiChar(a) => a.toString | ||||
|         } | ||||
|         .toList | ||||
|         .mkString | ||||
|     } | ||||
|  | ||||
|     val parameter = for { | ||||
|       tok <- Rfc7230.token <* Parser.string("=") <* Rfc7230.ows | ||||
|       v <- if (tok.endsWith("*")) extValue else paramValueParser | ||||
|     } yield (CIString(tok), v) | ||||
|  | ||||
|     (Rfc7230.token ~ (Parser.string(";") *> Rfc7230.ows *> parameter).rep0).map { | ||||
|       case (token: String, params: List[(CIString, String)]) => | ||||
|         ContentDisposition(token, params.toMap) | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   private val parser = makeParser(mimeValue) | ||||
|   //private val origParser = makeParser(Rfc7230.token | Rfc7230.quotedString) | ||||
|  | ||||
|   implicit val headerInstance: Header[ContentDisposition, Header.Single] = { | ||||
|     val oh = `Content-Disposition`.headerInstance | ||||
|     Header.createRendered( | ||||
|       oh.name, | ||||
|       v => oh.value(`Content-Disposition`(v.dispositionType, v.parameters)), | ||||
|       parse | ||||
|     ) | ||||
|   } | ||||
| } | ||||
| @@ -0,0 +1,95 @@ | ||||
| /* | ||||
|  * Copyright 2020 Eike K. & Contributors | ||||
|  * | ||||
|  * SPDX-License-Identifier: AGPL-3.0-or-later | ||||
|  */ | ||||
|  | ||||
| package docspell.restserver.http4s | ||||
|  | ||||
| import munit._ | ||||
| import org.http4s.headers.`Content-Disposition` | ||||
| import org.typelevel.ci._ | ||||
|  | ||||
| class ContentDispositionTest extends FunSuite { | ||||
|   def parseGood(s: String): ContentDisposition = ContentDisposition | ||||
|     .parse(s) | ||||
|     .fold(throw _, identity) | ||||
|  | ||||
|   def parseOrig(s: String): ContentDisposition = | ||||
|     `Content-Disposition` | ||||
|       .parse(s) | ||||
|       .map(v => ContentDisposition(v.dispositionType, v.parameters)) | ||||
|       .fold(throw _, identity) | ||||
|  | ||||
|   test("allow rfc2231 parameters with charset") { | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name*=us-ascii''This%20is%20%2A%2A%2Afun%2A%2A%2A"), | ||||
|       ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***")) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   test("allow rfc2231 parameters with charset and language") { | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name*=utf-8'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A"), | ||||
|       ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***")) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   test("allow rfc2231 parameters without charset and language") { | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name*=''This%20is%20%2A%2A%2Afun%2A%2A%2A"), | ||||
|       ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***")) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   test("allow rfc2231 parameters with quoted strings") { | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name*=\"This is ***fun***\""), | ||||
|       ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***")) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   test("allow utf8 bytes in filename") { | ||||
|     assertEquals( | ||||
|       parseGood("""form-data; name="file"; filename="Константинополя.txt""""), | ||||
|       ContentDisposition( | ||||
|         "form-data", | ||||
|         Map(ci"name" -> "file", ci"filename" -> "Константинополя.txt") | ||||
|       ) | ||||
|     ) | ||||
|     assertEquals( | ||||
|       parseGood("""form-data; name=file; filename=Константинополя.txt"""), | ||||
|       ContentDisposition( | ||||
|         "form-data", | ||||
|         Map(ci"name" -> "file", ci"filename" -> "Константинополя.txt") | ||||
|       ) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   // interestingly, this works with the original header impl from http4s. but | ||||
|   // i've never seen it being used like that in clients | ||||
|   test("unicode in filename with original header impl and filename*") { | ||||
|     assertEquals( | ||||
|       parseOrig("""form-data; name="file"; filename*="Константинополя.txt""""), | ||||
|       ContentDisposition( | ||||
|         "form-data", | ||||
|         Map(ci"name" -> "file", ci"filename*" -> "Константинополя.txt") | ||||
|       ) | ||||
|     ) | ||||
|   } | ||||
|  | ||||
|   test("allow simple values") { | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name=hello"), | ||||
|       ContentDisposition("form-data", Map(ci"name" -> "hello")) | ||||
|     ) | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name=\"hello\""), | ||||
|       ContentDisposition("form-data", Map(ci"name" -> "hello")) | ||||
|     ) | ||||
|     assertEquals( | ||||
|       parseGood("form-data; name=\"hello you\""), | ||||
|       ContentDisposition("form-data", Map(ci"name" -> "hello you")) | ||||
|     ) | ||||
|   } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user