diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala index dc8817ed..ca675f02 100644 --- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala +++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala @@ -13,9 +13,11 @@ import emil.jsoup._ import scodec.bits.ByteVector object JsoupSanitizer { + private val whitelist = + EmailWhitelist.default.addAttributes(":all", "class") private val change = - BodyClean.whitelistClean(EmailWhitelist.default) + BodyClean.whitelistClean(whitelist) def clean(html: String): String = BodyClean.modifyContent(change)(BodyContent(html)).asString diff --git a/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala new file mode 100644 index 00000000..c893ef8a --- /dev/null +++ b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala @@ -0,0 +1,133 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.joex.extract + +import java.nio.charset.StandardCharsets + +import munit.FunSuite +import org.jsoup.Jsoup + +class JsoupSanitizerTest extends FunSuite { + + test("keep interesting tags and attributes") { + val cleaned = JsoupSanitizer.clean(html) + val doc = Jsoup.parse(cleaned) + + assertEquals(doc.head().getElementsByTag("link").size(), 1) + assertEquals(doc.head().getElementsByTag("style").size(), 1) + assertEquals(doc.charset(), StandardCharsets.UTF_8) + assertEquals(doc.head().select("meta[charset]").attr("charset").toUpperCase, "UTF-8") + assert(doc.select("*[class]").size() > 0) + assert(doc.select("*[style]").size() > 0) + } + + def html = + """ + | + | + | + | + | + | A simple, clean, and responsive HTML invoice template + | + | + | + | + |

Some html template for an invoice

+ |

It is something simple.

+ |
+ | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + |
+ | + | + | + | + | + |
+ | Company logo + | + | Invoice #: 123
+ | Created: January 1, 2015
+ | Due: February 1, 2015 + |
+ |
+ | + | + | + | + | + |
+ | Company, Inc.
+ | 456 Rosewood Road
+ | Flowerville, MI 12345 + |
+ | Acme Corp.
+ | John Doe
+ | john@example.com + |
+ |
Payment MethodCheck #
Check1000
ItemPrice
Website design$300.00
Domain name (1 year)$10.00
Total: $385.00
+ |
+ | + | + |""".stripMargin +}