From b041e2616de9c7938c3e430de9b2ddee5b914026 Mon Sep 17 00:00:00 2001
From: eikek <eike.kettner@posteo.de>
Date: Sun, 7 Nov 2021 21:23:23 +0100
Subject: [PATCH] Fix uploads with utf8 bytes in filenames

This adds a modified parser for the `Content-Disposition` header to fix
issue #991. The parser in http4s for the `Content-Disposition` header
removes filenames that are sent as plain utf8 bytes. See also
http4s/http4s#5053.
---
 .../restserver/conv/Conversions.scala         |   7 +-
 .../http4s/ContentDisposition.scala           | 154 ++++++++++++++++++
 .../http4s/ContentDispositionTest.scala       |  95 +++++++++++
 3 files changed, 255 insertions(+), 1 deletion(-)
 create mode 100644 modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala
 create mode 100644 modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala

diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
index 03142eaf..057064c3 100644
--- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
@@ -22,6 +22,7 @@ import docspell.common.syntax.all._
 import docspell.ftsclient.FtsResult
 import docspell.restapi.model._
 import docspell.restserver.conv.Conversions._
+import docspell.restserver.http4s.ContentDisposition
 import docspell.store.queries.{AttachmentLight => QAttachmentLight, IdRefCount}
 import docspell.store.records._
 import docspell.store.{AddResult, UpdateResult}
@@ -377,7 +378,11 @@ trait Conversions {
       .filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
       .map(p =>
         OUpload
-          .File(p.filename, p.headers.get[`Content-Type`].map(fromContentType), p.body)
+          .File(
+            ContentDisposition.getFileName(p),
+            p.headers.get[`Content-Type`].map(fromContentType),
+            p.body
+          )
       )
     for {
       metaData <- meta
diff --git a/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala b/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala
new file mode 100644
index 00000000..7a5e389b
--- /dev/null
+++ b/modules/restserver/src/main/scala/docspell/restserver/http4s/ContentDisposition.scala
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.restserver.http4s
+
+import java.nio.charset.{Charset, StandardCharsets}
+
+import cats.implicits._
+import cats.parse.{Parser, Parser0, Rfc5234}
+
+import org.http4s.headers.`Content-Disposition`
+import org.http4s.internal.CharPredicate
+import org.http4s.multipart.Part
+import org.http4s.{Header, ParseFailure, ParseResult}
+import org.typelevel.ci.CIString
+import org.typelevel.ci._
+
+/** A replacement for http4s' `Content-Disposition` class with a slightly modified parser
+  * to allow utf8 filenames.
+  *
+  * It is meant to be used in place of `Part.filename` to retrieve the filename of a
+  * multipart part:
+  *
+  * {{{ ContentDisposition.getFileName(part) }}}
+  *
+  * where `part` is of type `multipart.Part[F]`.
+  */
+case class ContentDisposition(dispositionType: String, parameters: Map[CIString, String])
+
+object ContentDisposition {
+
+  def getFileName[F[_]](part: Part[F]): Option[String] =
+    part.headers.get[ContentDisposition].map(_.parameters).flatMap(_.get(ci"filename"))
+
+  // Parameter value: a quoted string (possibly empty) or any chars up to `;` or the end.
+  private[http4s] val mimeValue: Parser[String] = {
+    val value = Parser.anyChar.repUntilAs[String](Parser.char(';').orElse(Parser.end))
+    val qvalue = Rfc5234.dquote *> Parser.charsWhile0(_ != '"') <* Rfc5234.dquote
+    qvalue.orElse(value)
+  }
+
+  // --- taken from http4s (v0.23.6) with modification; Licensed under Apache License 2.0
+
+  def parse(s: String): ParseResult[ContentDisposition] =
+    fromParser(parser, "Invalid Content-Disposition header")(s)
+
+  private def fromParser[A](parser: Parser0[A], errorMessage: => String)(
+      s: String
+  ): ParseResult[A] =
+    try parser.parseAll(s).left.map(err => ParseFailure(errorMessage, err.toString))
+    catch { case failure: ParseFailure => Left(failure) }
+
+  /* ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z */
+  private[this] val alpha: Parser[Char] = {
+    val upperCase = Parser.charIn('A' to 'Z')
+    upperCase.orElse(Parser.charIn('a' to 'z'))
+  }
+
+  /* DIGIT          =  %x30-39
+   *                       ; 0-9 */
+  private[this] val digit: Parser[Char] =
+    Parser.charIn('0' to '9')
+
+  /* The spec references RFC2234, which is 0-9A-F, but it also
+   * explicitly permits lowercase. */
+  private[this] val hexdig: Parser[Char] =
+    digit.orElse(Parser.charIn(('A' to 'F') ++ ('a' to 'f')))
+
+  private[http4s] object Rfc7230 {
+    /* `tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
+     *  "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA`
+     */
+    val tchar: Parser[Char] =
+      Parser.oneOf(Parser.charIn("!#$%&'*+-.^_`|~") :: digit :: alpha :: Nil)
+
+    /* `token = 1*tchar` */
+    val token: Parser[String] = tchar.rep.string
+
+    /* horizontal tab */
+    val htab: Parser[Unit] = Parser.char('\t')
+
+    /* a single space */
+    val sp: Parser[Unit] = Parser.char(' ')
+
+    /* `OWS = *( SP / HTAB )` */
+    val ows: Parser0[Unit] = (sp | htab).rep0.void
+  }
+  // Header parser; `paramValueParser` parses values of params not ending in `*`.
+  private[http4s] def makeParser(paramValueParser: Parser[String]) = {
+    sealed trait ValueChar
+    case class AsciiChar(c: Char) extends ValueChar
+    case class EncodedChar(a: Char, b: Char) extends ValueChar
+
+    val attrChar = alpha
+      .orElse(digit)
+      .orElse(Parser.charIn('!', '#', '$', '&', '+', '-', '.', '^', '_', '`', '|', '~'))
+      .map { (a: Char) =>
+        AsciiChar(a)
+      }
+    val pctEncoded = (Parser.string("%") *> hexdig ~ hexdig).map {
+      case (a: Char, b: Char) => EncodedChar(a, b)
+    }
+    val valueChars = attrChar.orElse(pctEncoded).rep
+    val language =
+      (Parser.string(Rfc5234.alpha.rep) ~ (Parser.char('-') *> alpha.rep(1)).?).string
+    val charset: Parser[Charset] =
+      Parser.oneOf(
+        Parser.ignoreCase("UTF-8").as(StandardCharsets.UTF_8) ::
+          Parser.ignoreCase("ISO-8859-1").as(StandardCharsets.ISO_8859_1) ::
+          Parser.ignoreCase("US-ASCII").as(StandardCharsets.US_ASCII) ::
+          Nil
+      )
+    val extValue = (Rfc5234.dquote *> Parser.charsWhile0(
+      CharPredicate.All -- '"'
+    ) <* Rfc5234.dquote) | (charset.? ~ (Parser.string("'") *> language.? <* Parser
+      .string(
+        "'"
+      )) ~ valueChars).map { case ((charset, _), values) =>
+      // Decode all bytes in one go: a multi-byte character (e.g. `%C3%A4` in utf8)
+      // cannot be decoded one byte at a time.
+      val bytes = values.toList.map {
+        case EncodedChar(a: Char, b: Char) =>
+          ((Character.digit(a, 16) << 4) + Character.digit(b, 16)).toByte
+        case AsciiChar(a) => a.toByte
+      }
+      new String(bytes.toArray, charset.getOrElse(StandardCharsets.UTF_8))
+    }
+
+    val parameter = for {
+      tok <- Rfc7230.token <* Parser.string("=") <* Rfc7230.ows
+      v <- if (tok.endsWith("*")) extValue else paramValueParser
+    } yield (CIString(tok), v)
+
+    (Rfc7230.token ~ (Parser.string(";") *> Rfc7230.ows *> parameter).rep0).map {
+      case (token: String, params: List[(CIString, String)]) =>
+        ContentDisposition(token, params.toMap)
+    }
+  }
+
+  private val parser = makeParser(mimeValue)
+  //private val origParser = makeParser(Rfc7230.token | Rfc7230.quotedString)
+
+  implicit val headerInstance: Header[ContentDisposition, Header.Single] = {
+    val upstream = `Content-Disposition`.headerInstance // name and rendering are reused
+    Header.createRendered(
+      upstream.name,
+      cd => upstream.value(`Content-Disposition`(cd.dispositionType, cd.parameters)),
+      parse
+    )
+  }
+}
diff --git a/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala b/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala
new file mode 100644
index 00000000..e5f6b38f
--- /dev/null
+++ b/modules/restserver/src/test/scala/docspell/restserver/http4s/ContentDispositionTest.scala
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.restserver.http4s
+
+import munit._
+import org.http4s.headers.`Content-Disposition`
+import org.typelevel.ci._
+
+class ContentDispositionTest extends FunSuite {
+  /** Parses with the parser under test; a parse failure is thrown. */
+  def parseGood(s: String): ContentDisposition =
+    ContentDisposition.parse(s).fold(failure => throw failure, cd => cd)
+
+  def parseOrig(s: String): ContentDisposition =
+    `Content-Disposition`.parse(s) match {
+      case Right(h)      => ContentDisposition(h.dispositionType, h.parameters)
+      case Left(failure) => throw failure
+    }
+
+  test("allow rfc2231 parameters with charset") {
+    assertEquals(
+      parseGood("form-data; name*=us-ascii''This%20is%20%2A%2A%2Afun%2A%2A%2A"),
+      ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***"))
+    )
+  }
+
+  test("allow rfc2231 parameters with charset and language") {
+    assertEquals(
+      parseGood("form-data; name*=utf-8'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A"),
+      ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***"))
+    )
+  }
+
+  test("allow rfc2231 parameters without charset and language") {
+    assertEquals(
+      parseGood("form-data; name*=''This%20is%20%2A%2A%2Afun%2A%2A%2A"),
+      ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***"))
+    )
+  }
+
+  test("allow rfc2231 parameters with quoted strings") {
+    assertEquals(
+      parseGood("form-data; name*=\"This is ***fun***\""),
+      ContentDisposition("form-data", Map(ci"name*" -> "This is ***fun***"))
+    )
+  }
+
+  test("allow utf8 bytes in filename") {
+    assertEquals(
+      parseGood("""form-data; name="file"; filename="Константинополя.txt""""),
+      ContentDisposition(
+        "form-data",
+        Map(ci"name" -> "file", ci"filename" -> "Константинополя.txt")
+      )
+    )
+    assertEquals(
+      parseGood("""form-data; name=file; filename=Константинополя.txt"""),
+      ContentDisposition(
+        "form-data",
+        Map(ci"name" -> "file", ci"filename" -> "Константинополя.txt")
+      )
+    )
+  }
+
+  // Interestingly, this works with the original header impl from http4s, but
+  // I've never seen it being used like that by clients.
+  test("unicode in filename with original header impl and filename*") {
+    assertEquals(
+      parseOrig("""form-data; name="file"; filename*="Константинополя.txt""""),
+      ContentDisposition(
+        "form-data",
+        Map(ci"name" -> "file", ci"filename*" -> "Константинополя.txt")
+      )
+    )
+  }
+
+  test("allow simple values") {
+    assertEquals(
+      parseGood("form-data; name=hello"),
+      ContentDisposition("form-data", Map(ci"name" -> "hello"))
+    )
+    assertEquals(
+      parseGood("form-data; name=\"hello\""),
+      ContentDisposition("form-data", Map(ci"name" -> "hello"))
+    )
+    assertEquals(
+      parseGood("form-data; name=\"hello you\""),
+      ContentDisposition("form-data", Map(ci"name" -> "hello you"))
+    )
+  }
+}