Try to streamline the different impls for MimeType

This commit is contained in:
Eike Kettner 2020-05-25 09:23:44 +02:00
parent 7bbc41467c
commit ee394eae86
11 changed files with 85 additions and 43 deletions

View File

@ -5,11 +5,11 @@ import cats.effect._
import cats.implicits._ import cats.implicits._
import cats.data.OptionT import cats.data.OptionT
import emil._ import emil._
import emil.javamail.syntax._
import bitpeace.{FileMeta, RangeDef} import bitpeace.{FileMeta, RangeDef}
import docspell.common._ import docspell.common._
import docspell.store._ import docspell.store._
import docspell.store.syntax.MimeTypes._
import docspell.store.records._ import docspell.store.records._
import docspell.store.queries.QMails import docspell.store.queries.QMails
import OMail.{ImapSettings, ItemMail, Sent, SmtpSettings} import OMail.{ImapSettings, ItemMail, Sent, SmtpSettings}
@ -224,7 +224,7 @@ object OMail {
Stream.emit(a._2).through(store.bitpeace.fetchData2(RangeDef.all)) Stream.emit(a._2).through(store.bitpeace.fetchData2(RangeDef.all))
).withFilename(a._1.name) ).withFilename(a._1.name)
.withLength(a._2.length) .withLength(a._2.length)
.withMimeType(_root_.emil.MimeType.parse(a._2.mimetype.asString).toOption) .withMimeType(a._2.mimetype.toLocal.toEmil)
} }
val fields: Seq[Trans[F]] = Seq( val fields: Seq[Trans[F]] = Seq(
From(sett.mailFrom), From(sett.mailFrom),

View File

@ -4,6 +4,7 @@ import docspell.common.syntax.all._
import io.circe.{Decoder, Encoder} import io.circe.{Decoder, Encoder}
import java.nio.charset.StandardCharsets import java.nio.charset.StandardCharsets
import java.nio.charset.Charset import java.nio.charset.Charset
import cats.data.NonEmptyList
/** A MIME Type impl with just enough features for the use here. /** A MIME Type impl with just enough features for the use here.
*/ */
@ -96,18 +97,51 @@ object MimeType {
val tiff = image("tiff") val tiff = image("tiff")
val html = text("html") val html = text("html")
val plain = text("plain") val plain = text("plain")
val eml = MimeType("message", "rfc822", Map.empty) val emls = NonEmptyList.of(
MimeType("message", "rfc822", Map.empty),
application("mbox")
)
object PdfMatch { object PdfMatch {
def unapply(mt: MimeType): Option[MimeType] = def unapply(mt: MimeType): Option[MimeType] =
Some(mt).filter(_.matches(pdf)) Some(mt).filter(_.matches(pdf))
} }
/** Extractor accepting every mime type whose primary part is `text`. */
object TextAllMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.primary == "text") Some(mt)
    else None
}
object HtmlMatch { object HtmlMatch {
def unapply(mt: MimeType): Option[MimeType] = def unapply(mt: MimeType): Option[MimeType] =
Some(mt).filter(_.matches(html)) Some(mt).filter(_.matches(html))
} }
/** Extractor accepting mime types whose primary part is `text` and
  * whose sub type does not contain the string `html`.
  */
object NonHtmlText {
  def unapply(mt: MimeType): Option[MimeType] =
    Some(mt).filter(t => t.primary == "text" && !t.sub.contains("html"))
}
/** Extractor accepting mime types that match `zip`. */
object ZipMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    if (mt.matches(zip)) Some(mt)
    else None
}
/** Only jpeg, png and tiff.
  *
  * The check is done on the `baseType` of the given mime type, and
  * that base type is also the value handed out on a successful match.
  */
object ImageMatch {
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  def unapply(m: MimeType): Option[MimeType] = {
    val base = m.baseType
    if (all.contains(base)) Some(base)
    else None
  }
}
/** Extractor accepting a mime type that matches any entry of `emls`. */
object EmailMatch {
  def unapply(mt: MimeType): Option[MimeType] =
    Some(mt).filter(t => emls.exists(e => t.matches(e)))
}
implicit val jsonEncoder: Encoder[MimeType] = implicit val jsonEncoder: Encoder[MimeType] =
Encoder.encodeString.contramap(_.asString) Encoder.encodeString.contramap(_.asString)

View File

@ -16,7 +16,7 @@ case class ProcessItemArgs(meta: ProcessMeta, files: List[File]) {
def makeSubject: String = def makeSubject: String =
files.flatMap(_.name) match { files.flatMap(_.name) match {
case Nil => s"${meta.sourceAbbrev}: No files" case Nil => s"${meta.sourceAbbrev}: No files supplied"
case n :: Nil => n case n :: Nil => n
case n1 :: n2 :: Nil => s"$n1, $n2" case n1 :: n2 :: Nil => s"$n1, $n2"
case _ => s"${files.size} files from ${meta.sourceAbbrev}" case _ => s"${files.size} files from ${meta.sourceAbbrev}"

View File

@ -33,10 +33,10 @@ object Conversion {
in: Stream[F, Byte] in: Stream[F, Byte]
): F[A] = ): F[A] =
TikaMimetype.resolve(dataType, in).flatMap { TikaMimetype.resolve(dataType, in).flatMap {
case Pdfs(_) => case MimeType.PdfMatch(_) =>
handler.run(ConversionResult.successPdf(in)) handler.run(ConversionResult.successPdf(in))
case mt @ MimeType(_, "html", _) => case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
WkHtmlPdf WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)( .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
@ -44,7 +44,7 @@ object Conversion {
handler handler
) )
case mt @ Texts(_) => case MimeType.TextAllMatch(mt) =>
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html => Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
val bytes = Stream val bytes = Stream
@ -60,7 +60,7 @@ object Conversion {
)(bytes, handler) )(bytes, handler)
} }
case Images(mt) => case MimeType.ImageMatch(mt) =>
ImageSize.get(in).flatMap { ImageSize.get(in).flatMap {
case Some(dim) => case Some(dim) =>
if (dim.product > cfg.maxImageSize) if (dim.product > cfg.maxImageSize)
@ -98,23 +98,6 @@ object Conversion {
} }
}) })
object Images {
val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)
def unapply(m: MimeType): Option[MimeType] =
Some(m).map(_.baseType).filter(all.contains)
}
object Texts {
def unapply(m: MimeType): Option[MimeType] =
Some(m).filter(_.primary == "text")
}
object Pdfs {
def unapply(m: MimeType): Option[MimeType] =
Some(m).filter(_.matches(MimeType.pdf))
}
object Office { object Office {
val odt = MimeType.application("vnd.oasis.opendocument.text") val odt = MimeType.application("vnd.oasis.opendocument.text")
@ -158,8 +141,8 @@ object Conversion {
def unapply(mt: MimeType): Option[MimeType] = def unapply(mt: MimeType): Option[MimeType] =
mt match { mt match {
case Office(_) => Some(mt) case Office(_) => Some(mt)
case Texts(_) => Some(mt) case MimeType.TextAllMatch(_) => Some(mt)
case Images(_) => Some(mt) case MimeType.ImageMatch(_) => Some(mt)
case _ => None case _ => None
} }
} }

View File

@ -87,7 +87,7 @@ object Extraction {
) *> ) *>
OdfExtract.get(data).map(ExtractResult.fromEither) OdfExtract.get(data).map(ExtractResult.fromEither)
case mt @ MimeType("text", sub, _) if !sub.contains("html") => case MimeType.NonHtmlText(mt) =>
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>

View File

@ -10,6 +10,7 @@ import emil.markdown._
import emil.jsoup._ import emil.jsoup._
import docspell.common._ import docspell.common._
import docspell.store.syntax.MimeTypes._
object ReadMail { object ReadMail {
@ -51,18 +52,13 @@ object ReadMail {
.eval(TnefExtract.replace(mail)) .eval(TnefExtract.replace(mail))
.flatMap(m => Stream.emits(m.attachments.all)) .flatMap(m => Stream.emits(m.attachments.all))
.map(a => .map(a =>
Binary(a.filename.getOrElse("noname"), a.mimeType.toDocspell, a.content) Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content)
)) ))
} }
private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] =
Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8) Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8)
implicit class MimeTypeConv(m: emil.MimeType) {
def toDocspell: MimeType =
MimeType(m.primary, m.sub, m.params)
}
private def bodyType[F[_]](body: MailBody[F]): String = private def bodyType[F[_]](body: MailBody[F]): String =
body.fold( body.fold(
_ => "empty-body", _ => "empty-body",

View File

@ -10,6 +10,7 @@ import docspell.common._
import docspell.convert._ import docspell.convert._
import docspell.joex.scheduler._ import docspell.joex.scheduler._
import docspell.store.records._ import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import docspell.convert.ConversionResult.Handler import docspell.convert.ConversionResult.Handler
import docspell.convert.SanitizeHtml import docspell.convert.SanitizeHtml
import docspell.joex.extract.JsoupSanitizer import docspell.joex.extract.JsoupSanitizer
@ -60,17 +61,16 @@ object ConvertPdf {
item: ItemData item: ItemData
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv => Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
mime match { mime.toLocal match {
case mt if mt.baseEqual(Mimetype.`application/pdf`) => case MimeType.PdfMatch(_) =>
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *> ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
(ra, None: Option[RAttachmentMeta]).pure[F] (ra, None: Option[RAttachmentMeta]).pure[F]
case _ => case mt =>
val data = ctx.store.bitpeace val data = ctx.store.bitpeace
.get(ra.fileId.id) .get(ra.fileId.id)
.unNoneTerminate .unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all)) .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val mt = MimeType(mime.primary, mime.sub, mime.params)
val handler = conversionHandler[F](ctx, cfg, ra, item) val handler = conversionHandler[F](ctx, cfg, ra, item)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(

View File

@ -10,6 +10,7 @@ import docspell.common._
import docspell.joex.mail._ import docspell.joex.mail._
import docspell.joex.scheduler._ import docspell.joex.scheduler._
import docspell.store.records._ import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import docspell.files.Zip import docspell.files.Zip
import cats.kernel.Monoid import cats.kernel.Monoid
import emil.Mail import emil.Mail
@ -88,13 +89,13 @@ object ExtractArchive {
ctx: Context[F, ProcessItemArgs], ctx: Context[F, ProcessItemArgs],
archive: Option[RAttachmentArchive] archive: Option[RAttachmentArchive]
)(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] = )(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] =
mime match { mime.toLocal match {
case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) => case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) =>
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *> ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
extractZip(ctx, archive)(ra, pos) extractZip(ctx, archive)(ra, pos)
.flatTap(_ => cleanupParents(ctx, ra, archive)) .flatTap(_ => cleanupParents(ctx, ra, archive))
case Mimetype("message", "rfc822", _) => case MimeType.EmailMatch(_) =>
ctx.logger.info(s"Reading e-mail ${ra.name.getOrElse("<noname>")}") *> ctx.logger.info(s"Reading e-mail ${ra.name.getOrElse("<noname>")}") *>
extractMail(ctx, archive)(ra, pos) extractMail(ctx, archive)(ra, pos)
.flatTap(_ => cleanupParents(ctx, ra, archive)) .flatTap(_ => cleanupParents(ctx, ra, archive))

View File

@ -8,6 +8,7 @@ import docspell.common._
import docspell.extract.{ExtractConfig, ExtractResult, Extraction} import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
import docspell.joex.scheduler.{Context, Task} import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta} import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
import docspell.store.syntax.MimeTypes._
object TextExtraction { object TextExtraction {
@ -82,7 +83,7 @@ object TextExtraction {
findMime findMime
.flatMap(mt => .flatMap(mt =>
extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang) extr.extractText(data, DataType(mt.toLocal), lang)
) )
} }

View File

@ -241,7 +241,7 @@ object ScanMailboxTask {
def submitMail(upload: OUpload[F])(mail: Mail[F]): F[OUpload.UploadResult] = { def submitMail(upload: OUpload[F])(mail: Mail[F]): F[OUpload.UploadResult] = {
val file = OUpload.File( val file = OUpload.File(
Some(mail.header.subject + ".eml"), Some(mail.header.subject + ".eml"),
Some(MimeType.eml), Some(MimeType.emls.head),
mail.toByteStream mail.toByteStream
) )
for { for {

View File

@ -0,0 +1,27 @@
package docspell.store.syntax
import bitpeace.Mimetype
import docspell.common._
/** Syntax extensions for converting between the three mime type
  * representations used here: bitpeace's `Mimetype`, emil's `MimeType`
  * and the docspell `MimeType`.
  *
  * Every conversion copies the `primary`, `sub` and `params` values
  * into the target type. The wrappers are value classes, so invoking a
  * conversion normally does not need to allocate a wrapper instance.
  */
object MimeTypes {

  /** Adds `toLocal` to a bitpeace `Mimetype`. */
  implicit final class BitpeaceMimeTypeOps(private val bmt: Mimetype) extends AnyVal {

    /** Builds a docspell `MimeType` from the primary, sub and params values. */
    def toLocal: MimeType =
      MimeType(bmt.primary, bmt.sub, bmt.params)
  }

  /** Adds `toLocal` to an emil `MimeType`. */
  implicit final class EmilMimeTypeOps(private val emt: emil.MimeType) extends AnyVal {

    /** Builds a docspell `MimeType` from the primary, sub and params values. */
    def toLocal: MimeType =
      MimeType(emt.primary, emt.sub, emt.params)
  }

  /** Adds `toEmil` and `toBitpeace` to a docspell `MimeType`. */
  implicit final class DocspellMimeTypeOps(private val mt: MimeType) extends AnyVal {

    /** Builds an emil `MimeType` from the primary, sub and params values. */
    def toEmil: emil.MimeType =
      emil.MimeType(mt.primary, mt.sub, mt.params)

    /** Builds a bitpeace `Mimetype` from the primary, sub and params values. */
    def toBitpeace: Mimetype =
      Mimetype(mt.primary, mt.sub, mt.params)
  }
}