mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Extract tnef attachments and fix incomplete html
The wkhtmltopdf requires the content encoding set correctly in the document.
This commit is contained in:
		@@ -204,6 +204,7 @@ val store = project.in(file("modules/store")).
 | 
			
		||||
      Dependencies.flyway ++
 | 
			
		||||
      Dependencies.loggingApi ++
 | 
			
		||||
      Dependencies.emil ++
 | 
			
		||||
      Dependencies.emilDoobie ++
 | 
			
		||||
      Dependencies.calev
 | 
			
		||||
  ).dependsOn(common)
 | 
			
		||||
 | 
			
		||||
@@ -293,6 +294,7 @@ val joex = project.in(file("modules/joex")).
 | 
			
		||||
      Dependencies.http4s ++
 | 
			
		||||
      Dependencies.circe ++
 | 
			
		||||
      Dependencies.pureconfig ++
 | 
			
		||||
      Dependencies.emilTnef ++
 | 
			
		||||
      Dependencies.loggingApi ++
 | 
			
		||||
      Dependencies.logging.map(_ % Runtime),
 | 
			
		||||
    addCompilerPlugin(Dependencies.kindProjectorPlugin),
 | 
			
		||||
 
 | 
			
		||||
@@ -5,17 +5,21 @@ import cats.implicits._
 | 
			
		||||
import fs2.{Pipe, Stream}
 | 
			
		||||
import emil.{MimeType => _, _}
 | 
			
		||||
import emil.javamail.syntax._
 | 
			
		||||
import cats.Applicative
 | 
			
		||||
import emil.tnef.TnefExtract
 | 
			
		||||
 | 
			
		||||
import docspell.common._
 | 
			
		||||
import java.nio.charset.StandardCharsets
 | 
			
		||||
import java.nio.charset.Charset
 | 
			
		||||
import scodec.bits.ByteVector
 | 
			
		||||
 | 
			
		||||
object ReadMail {
 | 
			
		||||
 | 
			
		||||
  def read[F[_]: Sync](str: String): F[Mail[F]] =
 | 
			
		||||
    Mail.deserialize(str)
 | 
			
		||||
 | 
			
		||||
  def readBytesP[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Binary[F]] =
 | 
			
		||||
  def readBytesP[F[_]: ConcurrentEffect: ContextShift](
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  ): Pipe[F, Byte, Binary[F]] =
 | 
			
		||||
    s =>
 | 
			
		||||
      Stream.eval(logger.debug(s"Converting e-mail into its parts")) >>
 | 
			
		||||
        bytesToMail(s).flatMap(mailToEntries[F](logger))
 | 
			
		||||
@@ -23,14 +27,14 @@ object ReadMail {
 | 
			
		||||
  def bytesToMail[F[_]: Sync](data: Stream[F, Byte]): Stream[F, Mail[F]] =
 | 
			
		||||
    data.through(Binary.decode(StandardCharsets.US_ASCII)).foldMonoid.evalMap(read[F])
 | 
			
		||||
 | 
			
		||||
  def mailToEntries[F[_]: Applicative](
 | 
			
		||||
  def mailToEntries[F[_]: ConcurrentEffect: ContextShift](
 | 
			
		||||
      logger: Logger[F]
 | 
			
		||||
  )(mail: Mail[F]): Stream[F, Binary[F]] = {
 | 
			
		||||
    val bodyEntry: F[Option[Binary[F]]] = mail.body.fold(
 | 
			
		||||
      _ => (None: Option[Binary[F]]).pure[F],
 | 
			
		||||
      txt => txt.text.map(c => Binary.text[F]("mail.txt", c.bytes, c.charsetOrUtf8).some),
 | 
			
		||||
      html => html.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some),
 | 
			
		||||
      both => both.html.map(c => Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8).some)
 | 
			
		||||
      html => html.html.map(c => makeHtmlBinary(c).some),
 | 
			
		||||
      both => both.html.map(c => makeHtmlBinary(c).some)
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    Stream.eval(
 | 
			
		||||
@@ -42,12 +46,33 @@ object ReadMail {
 | 
			
		||||
        .eval(bodyEntry)
 | 
			
		||||
        .flatMap(e => Stream.emits(e.toSeq)) ++
 | 
			
		||||
        Stream
 | 
			
		||||
          .emits(mail.attachments.all)
 | 
			
		||||
          .eval(TnefExtract.replace(mail))
 | 
			
		||||
          .flatMap(m => Stream.emits(m.attachments.all))
 | 
			
		||||
          .map(a =>
 | 
			
		||||
            Binary(a.filename.getOrElse("noname"), a.mimeType.toDocspell, a.content)
 | 
			
		||||
          ))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = {
 | 
			
		||||
    val c = fixHtml(cnt)
 | 
			
		||||
    Binary.html[F]("mail.html", c.bytes, c.charsetOrUtf8)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  private def fixHtml(cnt: BodyContent): BodyContent = {
 | 
			
		||||
    val str = cnt.asString.trim.toLowerCase
 | 
			
		||||
    val head = htmlHeader(cnt.charsetOrUtf8)
 | 
			
		||||
    if (str.startsWith("<html")) cnt
 | 
			
		||||
    else cnt match {
 | 
			
		||||
      case BodyContent.StringContent(s) =>
 | 
			
		||||
        BodyContent(head + s + htmlHeaderEnd)
 | 
			
		||||
      case BodyContent.ByteContent(bv, cs) =>
 | 
			
		||||
        val begin = ByteVector.view(head.getBytes(cnt.charsetOrUtf8))
 | 
			
		||||
        val end = ByteVector.view(htmlHeaderEnd.getBytes(cnt.charsetOrUtf8))
 | 
			
		||||
        BodyContent(begin ++ bv ++ end, cs)
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  implicit class MimeTypeConv(m: emil.MimeType) {
 | 
			
		||||
    def toDocspell: MimeType =
 | 
			
		||||
      MimeType(m.primary, m.sub, m.params)
 | 
			
		||||
@@ -60,4 +85,16 @@ object ReadMail {
 | 
			
		||||
      _ => "html-body",
 | 
			
		||||
      _ => "text-and-html-body"
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
  private def htmlHeader(cs: Charset): String =
 | 
			
		||||
    s"""<!DOCTYPE html>
 | 
			
		||||
       |<html>
 | 
			
		||||
       |<head>
 | 
			
		||||
       |<meta charset="${cs.name}"/>
 | 
			
		||||
       |</head>
 | 
			
		||||
       |<body>
 | 
			
		||||
       """
 | 
			
		||||
 | 
			
		||||
  private def htmlHeaderEnd: String =
 | 
			
		||||
    "</body></html>"
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -126,7 +126,7 @@ object ExtractArchive {
 | 
			
		||||
      .lastOrError
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  def extractMail[F[_]: Sync](
 | 
			
		||||
  def extractMail[F[_]: ConcurrentEffect: ContextShift](
 | 
			
		||||
      ctx: Context[F, _],
 | 
			
		||||
      archive: Option[RAttachmentArchive]
 | 
			
		||||
  )(ra: RAttachment): F[Extracted] = {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user