diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala index dbc1ba66..52589148 100644 --- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala +++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala @@ -1,29 +1,18 @@ package docspell.joex.extract -import org.jsoup.Jsoup -import org.jsoup.nodes._ +import emil.BodyContent import emil.jsoup._ import scodec.bits.ByteVector -import java.io.ByteArrayInputStream -import java.nio.charset.{Charset, StandardCharsets} +import java.nio.charset.Charset object JsoupSanitizer { - //BIG NOTE: this changes the input document - def apply(doc: Document): Document = - BodyClean.whitelistClean(EmailWhitelist.default)(doc) + val change = + BodyClean.whitelistClean(EmailWhitelist.default) - def clean(html: String): String = { - //note: Jsoup.clean throws away the html head, which removes the - //charset if present - val doc = Jsoup.parse(html) - apply(doc).outerHtml - } - - def clean(html: ByteVector, cs: Option[Charset]): ByteVector = { - val in = new ByteArrayInputStream(html.toArray) - val doc = Jsoup.parse(in, cs.map(_.name).orNull, "") - ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8))) - } + def clean(html: String): String = + BodyClean.modifyContent(change)(BodyContent(html)).asString + def clean(html: ByteVector, cs: Option[Charset]): ByteVector = + BodyClean.modifyContent(change)(BodyContent(html, cs)).bytes } diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index c5b119d0..280c23c0 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -35,7 +35,7 @@ object ReadMail { mail.body, Some(mail.header), Some(MarkdownBody.makeHtml(markdownCfg)), - Some(JsoupSanitizer.apply) + Some(JsoupSanitizer.change) ).map(makeHtmlBinary[F] _).map(b => Some(b)) }