mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 13:32:51 +00:00
Simplify jsoup sanitizer to reuse from emil
This commit is contained in:
parent
dee697e466
commit
6747a86fea
@ -1,29 +1,18 @@
|
||||
package docspell.joex.extract
|
||||
|
||||
import org.jsoup.Jsoup
|
||||
import org.jsoup.nodes._
|
||||
import emil.BodyContent
|
||||
import emil.jsoup._
|
||||
import scodec.bits.ByteVector
|
||||
import java.io.ByteArrayInputStream
|
||||
import java.nio.charset.{Charset, StandardCharsets}
|
||||
import java.nio.charset.Charset
|
||||
|
||||
object JsoupSanitizer {
|
||||
|
||||
//BIG NOTE: this changes the input document
|
||||
def apply(doc: Document): Document =
|
||||
BodyClean.whitelistClean(EmailWhitelist.default)(doc)
|
||||
val change =
|
||||
BodyClean.whitelistClean(EmailWhitelist.default)
|
||||
|
||||
def clean(html: String): String = {
|
||||
//note: Jsoup.clean throws away the html head, which removes the
|
||||
//charset if present
|
||||
val doc = Jsoup.parse(html)
|
||||
apply(doc).outerHtml
|
||||
}
|
||||
|
||||
def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
|
||||
val in = new ByteArrayInputStream(html.toArray)
|
||||
val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
|
||||
ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
|
||||
}
|
||||
def clean(html: String): String =
|
||||
BodyClean.modifyContent(change)(BodyContent(html)).asString
|
||||
|
||||
def clean(html: ByteVector, cs: Option[Charset]): ByteVector =
|
||||
BodyClean.modifyContent(change)(BodyContent(html, cs)).bytes
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ object ReadMail {
|
||||
mail.body,
|
||||
Some(mail.header),
|
||||
Some(MarkdownBody.makeHtml(markdownCfg)),
|
||||
Some(JsoupSanitizer.apply)
|
||||
Some(JsoupSanitizer.change)
|
||||
).map(makeHtmlBinary[F] _).map(b => Some(b))
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user