Try to streamline the different impls for MimeType

This commit is contained in:
Eike Kettner 2020-05-25 09:23:44 +02:00
parent 7bbc41467c
commit ee394eae86
11 changed files with 85 additions and 43 deletions

View File

@ -5,11 +5,11 @@ import cats.effect._
import cats.implicits._
import cats.data.OptionT
import emil._
import emil.javamail.syntax._
import bitpeace.{FileMeta, RangeDef}
import docspell.common._
import docspell.store._
import docspell.store.syntax.MimeTypes._
import docspell.store.records._
import docspell.store.queries.QMails
import OMail.{ImapSettings, ItemMail, Sent, SmtpSettings}
@ -224,7 +224,7 @@ object OMail {
Stream.emit(a._2).through(store.bitpeace.fetchData2(RangeDef.all))
).withFilename(a._1.name)
.withLength(a._2.length)
.withMimeType(_root_.emil.MimeType.parse(a._2.mimetype.asString).toOption)
.withMimeType(a._2.mimetype.toLocal.toEmil)
}
val fields: Seq[Trans[F]] = Seq(
From(sett.mailFrom),

View File

@ -4,6 +4,7 @@ import docspell.common.syntax.all._
import io.circe.{Decoder, Encoder}
import java.nio.charset.StandardCharsets
import java.nio.charset.Charset
import cats.data.NonEmptyList
/** A MIME Type impl with just enough features for the use here.
*/
@ -96,18 +97,51 @@ object MimeType {
val tiff = image("tiff")
val html = text("html")
val plain = text("plain")
val eml = MimeType("message", "rfc822", Map.empty)
/** The mime types that `EmailMatch` tests its input against; held in
  * a `NonEmptyList`, so there is always at least one entry.
  */
val emls = NonEmptyList.of(
MimeType("message", "rfc822", Map.empty),
application("mbox")
)
/** Extractor that yields the given mime type unchanged when
  * `mt.matches(pdf)` holds, and does not match otherwise.
  */
object PdfMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(pdf)) Some(mt) else None
}
/** Extractor that accepts every mime type whose primary part is
  * exactly "text", regardless of its sub type.
  */
object TextAllMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.primary == "text") Some(mt) else None
}
/** Extractor that yields the given mime type unchanged when
  * `mt.matches(html)` holds, and does not match otherwise.
  */
object HtmlMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(html)) Some(mt) else None
}
/** Extractor for textual mime types that are not html: the primary
  * part must be "text" and the sub type must not contain the
  * substring "html".
  */
object NonHtmlText {
  def unapply(mt: MimeType): Option[MimeType] =
    Some(mt).filter(t => t.primary == "text" && !t.sub.contains("html"))
}
/** Extractor that yields the given mime type unchanged when
  * `mt.matches(zip)` holds, and does not match otherwise.
  */
object ZipMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(zip)) Some(mt) else None
}
/** Only jpeg, png and tiff.
  *
  * The comparison is done on the `baseType` of the input, and it is
  * this base type (not the original value) that a successful match
  * yields.
  */
object ImageMatch {
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  def unapply(m: MimeType): Option[MimeType] = {
    val base = m.baseType
    if (all.contains(base)) Some(base) else None
  }
}
/** Extractor that yields the given mime type unchanged when it
  * matches at least one of the entries in `emls`.
  */
object EmailMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    Some(mt).filter(candidate => emls.exists(e => candidate.matches(e)))
}
/** Encodes a mime type as a JSON string, using its `asString` form. */
implicit val jsonEncoder: Encoder[MimeType] =
Encoder.encodeString.contramap(_.asString)

View File

@ -16,7 +16,7 @@ case class ProcessItemArgs(meta: ProcessMeta, files: List[File]) {
def makeSubject: String =
files.flatMap(_.name) match {
case Nil => s"${meta.sourceAbbrev}: No files"
case Nil => s"${meta.sourceAbbrev}: No files supplied"
case n :: Nil => n
case n1 :: n2 :: Nil => s"$n1, $n2"
case _ => s"${files.size} files from ${meta.sourceAbbrev}"

View File

@ -33,10 +33,10 @@ object Conversion {
in: Stream[F, Byte]
): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case Pdfs(_) =>
case MimeType.PdfMatch(_) =>
handler.run(ConversionResult.successPdf(in))
case mt @ MimeType(_, "html", _) =>
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
@ -44,7 +44,7 @@ object Conversion {
handler
)
case mt @ Texts(_) =>
case MimeType.TextAllMatch(mt) =>
val cs = mt.charsetOrUtf8
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
val bytes = Stream
@ -60,7 +60,7 @@ object Conversion {
)(bytes, handler)
}
case Images(mt) =>
case MimeType.ImageMatch(mt) =>
ImageSize.get(in).flatMap {
case Some(dim) =>
if (dim.product > cfg.maxImageSize)
@ -98,23 +98,6 @@ object Conversion {
}
})
/** Extractor for jpeg, png and tiff images. The check is done on the
  * `baseType` of the input and that base type is what gets returned.
  */
object Images {
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  def unapply(m: MimeType): Option[MimeType] = {
    val base = m.baseType
    if (all.contains(base)) Some(base) else None
  }
}
/** Extractor accepting every mime type whose primary part is "text". */
object Texts {
  def unapply(m: MimeType): Option[MimeType] =
    if (m.primary == "text") Some(m) else None
}
/** Extractor accepting mime types for which `matches(MimeType.pdf)` holds. */
object Pdfs {
  def unapply(m: MimeType): Option[MimeType] =
    if (m.matches(MimeType.pdf)) Some(m) else None
}
object Office {
val odt = MimeType.application("vnd.oasis.opendocument.text")
@ -158,8 +141,8 @@ object Conversion {
def unapply(mt: MimeType): Option[MimeType] =
mt match {
case Office(_) => Some(mt)
case Texts(_) => Some(mt)
case Images(_) => Some(mt)
case MimeType.TextAllMatch(_) => Some(mt)
case MimeType.ImageMatch(_) => Some(mt)
case _ => None
}
}

View File

@ -87,7 +87,7 @@ object Extraction {
) *>
OdfExtract.get(data).map(ExtractResult.fromEither)
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
case MimeType.NonHtmlText(mt) =>
val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>

View File

@ -10,6 +10,7 @@ import emil.markdown._
import emil.jsoup._
import docspell.common._
import docspell.store.syntax.MimeTypes._
object ReadMail {
@ -51,18 +52,13 @@ object ReadMail {
.eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all))
.map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toDocspell, a.content)
Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
))
}
/** Creates an html [[Binary]] named "mail.html" from the given body
  * content, passing on `cnt.bytes` and `cnt.charsetOrUtf8`.
  */
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)
/** Adds `toDocspell` to emil's mime type; the conversion copies the
  * `primary`, `sub` and `params` values into a docspell `MimeType`.
  */
implicit class MimeTypeConv(m: emil.MimeType) {
  def toDocspell: MimeType = {
    val (primary, sub, params) = (m.primary, m.sub, m.params)
    MimeType(primary, sub, params)
  }
}
private def bodyType[F[_]](body: MailBody[F]): String =
body.fold(
_ => "empty-body",

View File

@ -10,6 +10,7 @@ import docspell.common._
import docspell.convert._
import docspell.joex.scheduler._
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import docspell.convert.ConversionResult.Handler
import docspell.convert.SanitizeHtml
import docspell.joex.extract.JsoupSanitizer
@ -60,17 +61,16 @@ object ConvertPdf {
item: ItemData
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
mime match {
case mt if mt.baseEqual(Mimetype.`application/pdf`) =>
mime.toLocal match {
case MimeType.PdfMatch(_) =>
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F]
case _ =>
case mt =>
val data = ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val mt = MimeType(mime.primary, mime.sub, mime.params)
val handler = conversionHandler[F](ctx, cfg, ra, item)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(

View File

@ -10,6 +10,7 @@ import docspell.common._
import docspell.joex.mail._
import docspell.joex.scheduler._
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import docspell.files.Zip
import cats.kernel.Monoid
import emil.Mail
@ -88,13 +89,13 @@ object ExtractArchive {
ctx: Context[F, ProcessItemArgs],
archive: Option[RAttachmentArchive]
)(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] =
mime match {
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) =>
mime.toLocal match {
case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) =>
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
extractZip(ctx, archive)(ra, pos)
.flatTap(_ => cleanupParents(ctx, ra, archive))
case Mimetype("message", "rfc822", _) =>
case MimeType.EmailMatch(_) =>
ctx.logger.info(s"Reading e-mail ${ra.name.getOrElse("<noname>")}") *>
extractMail(ctx, archive)(ra, pos)
.flatTap(_ => cleanupParents(ctx, ra, archive))

View File

@ -8,6 +8,7 @@ import docspell.common._
import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
import docspell.store.syntax.MimeTypes._
object TextExtraction {
@ -82,7 +83,7 @@ object TextExtraction {
findMime
.flatMap(mt =>
extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang)
extr.extractText(data, DataType(mt.toLocal), lang)
)
}

View File

@ -241,7 +241,7 @@ object ScanMailboxTask {
def submitMail(upload: OUpload[F])(mail: Mail[F]): F[OUpload.UploadResult] = {
val file = OUpload.File(
Some(mail.header.subject + ".eml"),
Some(MimeType.eml),
Some(MimeType.emls.head),
mail.toByteStream
)
for {

View File

@ -0,0 +1,27 @@
package docspell.store.syntax
import bitpeace.Mimetype
import docspell.common._
/** Extension methods for converting between the three mime type
  * representations used in this code base: bitpeace's `Mimetype`,
  * emil's `MimeType` and docspell's own `MimeType`.
  *
  * Every conversion passes the `primary`, `sub` and `params` values
  * on unchanged. The wrappers are value classes (`extends AnyVal`),
  * the usual Scala 2 encoding for extension methods, so a call like
  * `x.toLocal` normally needs no wrapper allocation.
  */
object MimeTypes {

  /** Adds `toLocal` to bitpeace's `Mimetype`. */
  implicit final class BitpeaceMimeTypeOps(private val bmt: Mimetype) extends AnyVal {

    /** Converts to docspell's `MimeType`. */
    def toLocal: MimeType =
      MimeType(bmt.primary, bmt.sub, bmt.params)
  }

  /** Adds `toLocal` to emil's `MimeType`. */
  implicit final class EmilMimeTypeOps(private val emt: emil.MimeType) extends AnyVal {

    /** Converts to docspell's `MimeType`. */
    def toLocal: MimeType =
      MimeType(emt.primary, emt.sub, emt.params)
  }

  /** Adds conversions from docspell's `MimeType` to the library types. */
  implicit final class DocspellMimeTypeOps(private val mt: MimeType) extends AnyVal {

    /** Converts to emil's `MimeType`. */
    def toEmil: emil.MimeType =
      emil.MimeType(mt.primary, mt.sub, mt.params)

    /** Converts to bitpeace's `Mimetype`. */
    def toBitpeace: Mimetype =
      Mimetype(mt.primary, mt.sub, mt.params)
  }
}