diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
index dc8817ed..ca675f02 100644
--- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
+++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
@@ -13,9 +13,11 @@ import emil.jsoup._
import scodec.bits.ByteVector
object JsoupSanitizer {
+ private val whitelist = // default email whitelist, extended so "class" attributes are kept
+ EmailWhitelist.default.addAttributes(":all", "class") // ":all" is jsoup's pseudo tag for every tag — confirm for the jsoup version in use
private val change =
- BodyClean.whitelistClean(EmailWhitelist.default)
+ BodyClean.whitelistClean(whitelist) // clean with the extended whitelist instead of the bare default
def clean(html: String): String =
BodyClean.modifyContent(change)(BodyContent(html)).asString
diff --git a/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala
new file mode 100644
index 00000000..c893ef8a
--- /dev/null
+++ b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.joex.extract
+
+import java.nio.charset.StandardCharsets
+
+import munit.FunSuite
+import org.jsoup.Jsoup
+
+class JsoupSanitizerTest extends FunSuite { // checks what JsoupSanitizer.clean leaves in an HTML document
+
+ test("keep interesting tags and attributes") { // sanitizes the fixture below and inspects the result
+ val cleaned = JsoupSanitizer.clean(html) // run the sanitizer on the raw fixture
+ val doc = Jsoup.parse(cleaned) // re-parse the output so it can be queried with selectors
+
+ assertEquals(doc.head().getElementsByTag("link").size(), 1) // exactly one link element remains in head
+ assertEquals(doc.head().getElementsByTag("style").size(), 1) // exactly one style element remains in head
+ assertEquals(doc.charset(), StandardCharsets.UTF_8) // the parsed document reports UTF-8
+ assertEquals(doc.head().select("meta[charset]").attr("charset").toUpperCase, "UTF-8") // charset value of the head's meta[charset] selection, compared upper-cased
+ assert(doc.select("*[class]").size() > 0) // at least one element still carries a class attribute
+ assert(doc.select("*[style]").size() > 0) // at least one element still carries a style attribute
+ }
+
+ def html = // invoice fixture fed to the sanitizer. NOTE(review): as visible here the literal has no markup (no link/style tags, no class/style attributes) although the assertions above need them — confirm the fixture is intact
+ """
+ |
+ |
+ |
+ |
+ |
+ | A simple, clean, and responsive HTML invoice template
+ |
+ |
+ |
+ |
+ | Some html template for an invoice
+ | It is something simple.
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ | |
+ |
+ | Invoice #: 123
+ | Created: January 1, 2015
+ | Due: February 1, 2015
+ | |
+ |
+ |
+ | |
+ |
+ |
+ |
+ |
+ |
+ |
+ | Company, Inc.
+ | 456 Rosewood Road
+ | Flowerville, MI 12345
+ | |
+ |
+ | Acme Corp.
+ | John Doe
+ | john@example.com
+ | |
+ |
+ |
+ | |
+ |
+ |
+ | Payment Method |
+ | Check # |
+ |
+ |
+ | Check |
+ | 1000 |
+ |
+ |
+ | Item |
+ | Price |
+ |
+ |
+ | Website design |
+ | $300.00 |
+ |
+ |
+ | Domain name (1 year) |
+ | $10.00 |
+ |
+ |
+ | |
+ | Total: $385.00 |
+ |
+ |
+ |
+ |
+ |
+ |""".stripMargin
+}