Simplify jsoup sanitizer to reuse from emil

This commit is contained in:
Eike Kettner 2020-05-14 23:54:04 +02:00
parent dee697e466
commit 6747a86fea
2 changed files with 9 additions and 20 deletions

View File

@@ -1,29 +1,18 @@
package docspell.joex.extract package docspell.joex.extract
import org.jsoup.Jsoup import emil.BodyContent
import org.jsoup.nodes._
import emil.jsoup._ import emil.jsoup._
import scodec.bits.ByteVector import scodec.bits.ByteVector
import java.io.ByteArrayInputStream import java.nio.charset.Charset
import java.nio.charset.{Charset, StandardCharsets}
object JsoupSanitizer { object JsoupSanitizer {
//BIG NOTE: this changes the input document val change =
def apply(doc: Document): Document = BodyClean.whitelistClean(EmailWhitelist.default)
BodyClean.whitelistClean(EmailWhitelist.default)(doc)
def clean(html: String): String = { def clean(html: String): String =
//note: Jsoup.clean throws away the html head, which removes the BodyClean.modifyContent(change)(BodyContent(html)).asString
//charset if present
val doc = Jsoup.parse(html)
apply(doc).outerHtml
}
def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
val in = new ByteArrayInputStream(html.toArray)
val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
}
def clean(html: ByteVector, cs: Option[Charset]): ByteVector =
BodyClean.modifyContent(change)(BodyContent(html, cs)).bytes
} }

View File

@@ -35,7 +35,7 @@ object ReadMail {
mail.body, mail.body,
Some(mail.header), Some(mail.header),
Some(MarkdownBody.makeHtml(markdownCfg)), Some(MarkdownBody.makeHtml(markdownCfg)),
Some(JsoupSanitizer.apply) Some(JsoupSanitizer.change)
).map(makeHtmlBinary[F] _).map(b => Some(b)) ).map(makeHtmlBinary[F] _).map(b => Some(b))
} }