From ee394eae86ba15075f4c2372d8f5bc679ba7e73f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 25 May 2020 09:23:44 +0200 Subject: [PATCH] Try streamline the different impls for `MimeType` --- .../scala/docspell/backend/ops/OMail.scala | 4 +-- .../main/scala/docspell/common/MimeType.scala | 36 ++++++++++++++++++- .../docspell/common/ProcessItemArgs.scala | 2 +- .../scala/docspell/convert/Conversion.scala | 29 ++++----------- .../scala/docspell/extract/Extraction.scala | 2 +- .../scala/docspell/joex/mail/ReadMail.scala | 8 ++--- .../docspell/joex/process/ConvertPdf.scala | 8 ++--- .../joex/process/ExtractArchive.scala | 7 ++-- .../joex/process/TextExtraction.scala | 3 +- .../joex/scanmailbox/ScanMailboxTask.scala | 2 +- .../docspell/store/syntax/MimeTypes.scala | 27 ++++++++++++++ 11 files changed, 85 insertions(+), 43 deletions(-) create mode 100644 modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala b/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala index 4c4ae045..18b59f93 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala @@ -5,11 +5,11 @@ import cats.effect._ import cats.implicits._ import cats.data.OptionT import emil._ -import emil.javamail.syntax._ import bitpeace.{FileMeta, RangeDef} import docspell.common._ import docspell.store._ +import docspell.store.syntax.MimeTypes._ import docspell.store.records._ import docspell.store.queries.QMails import OMail.{ImapSettings, ItemMail, Sent, SmtpSettings} @@ -224,7 +224,7 @@ object OMail { Stream.emit(a._2).through(store.bitpeace.fetchData2(RangeDef.all)) ).withFilename(a._1.name) .withLength(a._2.length) - .withMimeType(_root_.emil.MimeType.parse(a._2.mimetype.asString).toOption) + .withMimeType(a._2.mimetype.toLocal.toEmil) } val fields: Seq[Trans[F]] = Seq( From(sett.mailFrom), diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index f5230196..a73a7a53 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -4,6 +4,7 @@ import docspell.common.syntax.all._ import io.circe.{Decoder, Encoder} import java.nio.charset.StandardCharsets import java.nio.charset.Charset +import cats.data.NonEmptyList /** A MIME Type impl with just enough features for the use here. */ @@ -96,18 +97,51 @@ object MimeType { val tiff = image("tiff") val html = text("html") val plain = text("plain") - val eml = MimeType("message", "rfc822", Map.empty) + val emls = NonEmptyList.of( + MimeType("message", "rfc822", Map.empty), + application("mbox") + ) object PdfMatch { def unapply(mt: MimeType): Option[MimeType] = Some(mt).filter(_.matches(pdf)) } + object TextAllMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.primary == "text") + } + object HtmlMatch { def unapply(mt: MimeType): Option[MimeType] = Some(mt).filter(_.matches(html)) } + object NonHtmlText { + def unapply(mt: MimeType): Option[MimeType] = + if (mt.primary == "text" && !mt.sub.contains("html")) Some(mt) + else None + } + + object ZipMatch { + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(_.matches(zip)) + } + + /** Only jpeg, png and tiff */ + object ImageMatch { + val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff) + + def unapply(m: MimeType): Option[MimeType] = + Some(m).map(_.baseType).filter(all.contains) + } + + object EmailMatch { + def unapply(mt: MimeType): Option[MimeType] = + if (emls.exists(mt.matches(_))) Some(mt) + else None + } + implicit val jsonEncoder: Encoder[MimeType] = Encoder.encodeString.contramap(_.asString) diff --git a/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala b/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala index d170eae0..799831f9 100644 --- a/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala +++ b/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala @@ -16,7 +16,7 @@ case class ProcessItemArgs(meta: ProcessMeta, files: List[File]) { def makeSubject: String = files.flatMap(_.name) match { - case Nil => s"${meta.sourceAbbrev}: No files" + case Nil => s"${meta.sourceAbbrev}: No files supplied" case n :: Nil => n case n1 :: n2 :: Nil => s"$n1, $n2" case _ => s"${files.size} files from ${meta.sourceAbbrev}" diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 819910b1..97ed1b98 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -33,10 +33,10 @@ object Conversion { in: Stream[F, Byte] ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { - case Pdfs(_) => + case MimeType.PdfMatch(_) => handler.run(ConversionResult.successPdf(in)) - case mt @ MimeType(_, "html", _) => + case MimeType.HtmlMatch(mt) => val cs = mt.charsetOrUtf8 WkHtmlPdf .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)( @@ -44,7 +44,7 @@ object Conversion { handler ) - case mt @ Texts(_) => + case MimeType.TextAllMatch(mt) => val cs = mt.charsetOrUtf8 Markdown.toHtml(in, cfg.markdown, cs).flatMap { html => val bytes = Stream @@ -60,7 +60,7 @@ object Conversion { )(bytes, handler) } - case Images(mt) => + case MimeType.ImageMatch(mt) => ImageSize.get(in).flatMap { case Some(dim) => if (dim.product > cfg.maxImageSize) @@ -98,23 +98,6 @@ object Conversion { } }) - object Images { - - val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff) - - def unapply(m: MimeType): Option[MimeType] = - Some(m).map(_.baseType).filter(all.contains) - } - - object Texts { - def unapply(m: MimeType): Option[MimeType] = - Some(m).filter(_.primary == "text") - } - - object Pdfs { - def unapply(m: MimeType): Option[MimeType] = - Some(m).filter(_.matches(MimeType.pdf)) - } object Office { val odt = MimeType.application("vnd.oasis.opendocument.text") @@ -158,8 +141,8 @@ object Conversion { def unapply(mt: MimeType): Option[MimeType] = mt match { case Office(_) => Some(mt) - case Texts(_) => Some(mt) - case Images(_) => Some(mt) + case MimeType.TextAllMatch(_) => Some(mt) + case MimeType.ImageMatch(_) => Some(mt) case _ => None } } diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index ade88cdb..d5604499 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -87,7 +87,7 @@ object Extraction { ) *> OdfExtract.get(data).map(ExtractResult.fromEither) - case mt @ MimeType("text", sub, _) if !sub.contains("html") => + case MimeType.NonHtmlText(mt) => val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 0cf60538..59b277e4 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -10,6 +10,7 @@ import emil.markdown._ import emil.jsoup._ import docspell.common._ +import docspell.store.syntax.MimeTypes._ object ReadMail { @@ -51,18 +52,13 @@ object ReadMail { .eval(TnefExtract.replace(mail)) .flatMap(m => Stream.emits(m.attachments.all)) .map(a => - Binary(a.filename.getOrElse("noname"), a.mimeType.toDocspell, a.content) + Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content) )) } private def makeHtmlBinary[F[_]](cnt: BodyContent): Binary[F] = Binary.html[F]("mail.html", cnt.bytes, cnt.charsetOrUtf8) - implicit class MimeTypeConv(m: emil.MimeType) { - def toDocspell: MimeType = - MimeType(m.primary, m.sub, m.params) - } - private def bodyType[F[_]](body: MailBody[F]): String = body.fold( _ => "empty-body", diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 3aba96dd..790b758d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -10,6 +10,7 @@ import docspell.common._ import docspell.convert._ import docspell.joex.scheduler._ import docspell.store.records._ +import docspell.store.syntax.MimeTypes._ import docspell.convert.ConversionResult.Handler import docspell.convert.SanitizeHtml import docspell.joex.extract.JsoupSanitizer @@ -60,17 +61,16 @@ object ConvertPdf { item: ItemData )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv => - mime match { - case mt if mt.baseEqual(Mimetype.`application/pdf`) => + mime.toLocal match { + case MimeType.PdfMatch(_) => ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *> (ra, None: Option[RAttachmentMeta]).pure[F] - case _ => + case mt => val data = ctx.store.bitpeace .get(ra.fileId.id) .unNoneTerminate .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - val mt = MimeType(mime.primary, mime.sub, mime.params) val handler = conversionHandler[F](ctx, cfg, ra, item) ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index 06cbba72..5717ade2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -10,6 +10,7 @@ import docspell.common._ import docspell.joex.mail._ import docspell.joex.scheduler._ import docspell.store.records._ +import docspell.store.syntax.MimeTypes._ import docspell.files.Zip import cats.kernel.Monoid import emil.Mail @@ -88,13 +89,13 @@ object ExtractArchive { ctx: Context[F, ProcessItemArgs], archive: Option[RAttachmentArchive] )(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] = - mime match { - case Mimetype("application", "zip", _) if ra.name.exists(_.endsWith(".zip")) => + mime.toLocal match { + case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) => ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("")}.") *> extractZip(ctx, archive)(ra, pos) .flatTap(_ => cleanupParents(ctx, ra, archive)) - case Mimetype("message", "rfc822", _) => + case MimeType.EmailMatch(_) => ctx.logger.info(s"Reading e-mail ${ra.name.getOrElse("")}") *> extractMail(ctx, archive)(ra, pos) .flatTap(_ => cleanupParents(ctx, ra, archive)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 0df63258..539c816e 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -8,6 +8,7 @@ import docspell.common._ import docspell.extract.{ExtractConfig, ExtractResult, Extraction} import docspell.joex.scheduler.{Context, Task} import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta} +import docspell.store.syntax.MimeTypes._ object TextExtraction { @@ -82,7 +83,7 @@ object TextExtraction { findMime .flatMap(mt => - extr.extractText(data, DataType(MimeType(mt.primary, mt.sub, mt.params)), lang) + extr.extractText(data, DataType(mt.toLocal), lang) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala index 8d59b481..96648995 100644 --- a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala @@ -241,7 +241,7 @@ object ScanMailboxTask { def submitMail(upload: OUpload[F])(mail: Mail[F]): F[OUpload.UploadResult] = { val file = OUpload.File( Some(mail.header.subject + ".eml"), - Some(MimeType.eml), + Some(MimeType.emls.head), mail.toByteStream ) for { diff --git a/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala b/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala new file mode 100644 index 00000000..ff36e600 --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/syntax/MimeTypes.scala @@ -0,0 +1,27 @@ +package docspell.store.syntax + +import bitpeace.Mimetype +import docspell.common._ + +object MimeTypes { + + + implicit final class BitpeaceMimeTypeOps(bmt: Mimetype) { + + def toLocal: MimeType = + MimeType(bmt.primary, bmt.sub, bmt.params) + } + + implicit final class EmilMimeTypeOps(emt: emil.MimeType) { + def toLocal: MimeType = + MimeType(emt.primary, emt.sub, emt.params) + } + + implicit final class DocspellMimeTypeOps(mt: MimeType) { + def toEmil: emil.MimeType = + emil.MimeType(mt.primary, mt.sub, mt.params) + + def toBitpeace: Mimetype = + Mimetype(mt.primary, mt.sub, mt.params) + } +}