Simplify the jsoup sanitizer by reusing functionality from emil

2025-06-25 05:48:26 +00:00 · 2020-05-14 23:54:04 +02:00
parent dee697e466
commit 6747a86fea
2 changed files with 9 additions and 20 deletions
--- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
+++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
@@ -1,29 +1,18 @@
 package docspell.joex.extract

-import org.jsoup.Jsoup
-import org.jsoup.nodes._
+import emil.BodyContent
 import emil.jsoup._
 import scodec.bits.ByteVector
-import java.io.ByteArrayInputStream
-import java.nio.charset.{Charset, StandardCharsets}
+import java.nio.charset.Charset

 object JsoupSanitizer {

-  //BIG NOTE: this changes the input document
-  def apply(doc: Document): Document =
-    BodyClean.whitelistClean(EmailWhitelist.default)(doc)
+  val change =
+    BodyClean.whitelistClean(EmailWhitelist.default)

-  def clean(html: String): String = {
-    //note: Jsoup.clean throws away the html head, which removes the
-    //charset if present
-    val doc = Jsoup.parse(html)
-    apply(doc).outerHtml
-  }
-
-  def clean(html: ByteVector, cs: Option[Charset]): ByteVector = {
-    val in  = new ByteArrayInputStream(html.toArray)
-    val doc = Jsoup.parse(in, cs.map(_.name).orNull, "")
-    ByteVector.view(apply(doc).outerHtml.getBytes(cs.getOrElse(StandardCharsets.UTF_8)))
-  }
+  def clean(html: String): String =
+    BodyClean.modifyContent(change)(BodyContent(html)).asString

+  def clean(html: ByteVector, cs: Option[Charset]): ByteVector =
+    BodyClean.modifyContent(change)(BodyContent(html, cs)).bytes
 }
--- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
+++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala
@@ -35,7 +35,7 @@ object ReadMail {
          mail.body,
          Some(mail.header),
          Some(MarkdownBody.makeHtml(markdownCfg)),
-          Some(JsoupSanitizer.apply)
+          Some(JsoupSanitizer.change)
        ).map(makeHtmlBinary[F] _).map(b => Some(b))
      }