Simplify the jsoup sanitizer by reusing the implementation from emil

This commit is contained in:
Eike Kettner 2020-05-14 23:54:04 +02:00
parent dee697e466
commit 6747a86fea
2 changed files with 9 additions and 20 deletions

View File

@ -1,29 +1,18 @@
package docspell.joex.extract
import org.jsoup.Jsoup
import org.jsoup.nodes._
import emil.BodyContent
import emil.jsoup._
import scodec.bits.ByteVector
import java.io.ByteArrayInputStream
import java.nio.charset.{Charset, StandardCharsets}
import java.nio.charset.Charset
// Sanitizes HTML e-mail bodies by applying emil's `BodyClean.whitelistClean`
// with `EmailWhitelist.default`.
//
// NOTE(review): this listing appears to be a commit diff rendered without
// +/- markers, so the pre-commit members (`apply` and the Jsoup-based `clean`
// overloads) and the post-commit members (`change` and the
// `BodyClean.modifyContent`-based `clean` overloads) are interleaved here.
// Presumably only the latter remain after the commit -- confirm against the
// repository.
object JsoupSanitizer {
//BIG NOTE: this changes the input document
// Runs the whitelist cleaner on `doc` and returns the resulting document.
def apply(doc: Document): Document =
BodyClean.whitelistClean(EmailWhitelist.default)(doc)
// The whitelist cleaner built from `EmailWhitelist.default`, exposed as a
// value so callers can pass it along (ReadMail uses `JsoupSanitizer.change`).
val change =
BodyClean.whitelistClean(EmailWhitelist.default)
// Cleans an HTML string: parses it into a jsoup document, applies the
// whitelist cleaner via `apply` and renders the result with `outerHtml`.
def clean(html: String): String = {
//note: Jsoup.clean throws away the html head, which removes the
//charset if present
val doc = Jsoup.parse(html)
apply(doc).outerHtml
}
// Cleans HTML given as bytes. `cs` is the charset of the input, if known.
// When `cs` is empty, `null` is passed to `Jsoup.parse` as the charset name
// (presumably jsoup then detects the charset itself -- confirm against the
// jsoup docs) and the cleaned output is encoded as UTF-8.
def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
val in = new ByteArrayInputStream(html.toArray)
// The last argument is presumably jsoup's base URI, left empty here.
val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
}
// Cleans an HTML string by wrapping it in a `BodyContent`, passing `change`
// to `BodyClean.modifyContent` and reading the result back via `asString`.
def clean(html: String): String =
BodyClean.modifyContent(change)(BodyContent(html)).asString
// Byte-based variant: wraps the bytes and the optional charset in a
// `BodyContent`, applies `change` the same way and returns the result's `bytes`.
def clean(html: ByteVector, cs: Option[Charset]): ByteVector =
BodyClean.modifyContent(change)(BodyContent(html, cs)).bytes
}

View File

@ -35,7 +35,7 @@ object ReadMail {
mail.body,
Some(mail.header),
Some(MarkdownBody.makeHtml(markdownCfg)),
Some(JsoupSanitizer.apply)
Some(JsoupSanitizer.change)
).map(makeHtmlBinary[F] _).map(b => Some(b))
}