From ba3865ef5efd64b451b7d8e83e6d73051076809e Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 9 Feb 2020 19:42:49 +0100 Subject: [PATCH] Starting to support more file types First, files are converted to PDF for archiving. It is also easier to create a preview. This is done via the `ConvertPdf` processing task (which is not yet implemented). Text extraction then tries first with the original file. If that fails, OCR is done on the (potentially) converted pdf file. To not lose information about the original file, it is saved using the table `attachment_source`. If the original file is already a pdf, or the conversion did not succeed, the `attachment` and `attachment_source` records point to the same file. --- .../main/scala/docspell/common/MimeType.scala | 4 +- .../docspell/joex/process/ConvertPdf.scala | 60 +++++++++++++++++++ .../docspell/joex/process/CreateItem.scala | 37 +++++++++--- .../docspell/joex/process/ItemData.scala | 3 +- .../docspell/joex/process/ProcessItem.scala | 3 +- .../joex/process/TextExtraction.scala | 41 +++++++++++-- .../mariadb/V1.2.0__origin_source.sql | 11 ++++ .../postgresql/V1.2.0__origin_source.sql | 11 ++++ .../docspell/store/records/RAttachment.scala | 14 +++++ .../store/records/RAttachmentSource.scala | 44 ++++++++++++++ .../docspell/store/records/RFileMeta.scala | 11 ++++ 11 files changed, 220 insertions(+), 19 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql create mode 100644 modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index 7e6e6647..e6aa6079 100644 --- 
a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -27,7 +27,7 @@ object MimeType { MimeType("image", partFromString(sub).throwLeft) private[this] val validChars: Set[Char] = - (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet + (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.").toSet def parse(str: String): Either[String, MimeType] = str.indexOf('/') match { @@ -44,7 +44,7 @@ object MimeType { private def partFromString(s: String): Either[String, String] = if (s.forall(validChars.contains)) Right(s) - else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}") + else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}") val octetStream = application("octet-stream") val pdf = application("pdf") diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala new file mode 100644 index 00000000..03bdba4a --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -0,0 +1,60 @@ +package docspell.joex.process + +import bitpeace.Mimetype +import cats.Functor +import cats.implicits._ +import cats.effect._ +import cats.data.OptionT + +import docspell.common._ +import docspell.joex.scheduler._ +import docspell.store.records._ + +/** Goes through all attachments and creates a PDF version of it where + * supported. + * + * The `attachment` record is updated with the PDF version while the + * original file has been stored in the `attachment_source` record. + * + * If pdf conversion is not possible or if the input is already a + * pdf, both files are identical. That is, the `file_id`s point to + * the same file. Since the name of an attachment may be changed by + * the user, the `attachment_origin` record keeps that, too. + * + * This step assumes an existing premature item, it traverses its + * attachments. 
+ */ +object ConvertPdf { + + def apply[F[_]: Sync: ContextShift]( + item: ItemData + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + + // get mimetype + // try to convert + // save to db + // update file_id of RAttachment + + def convert(ra: RAttachment) = + findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m)) + + for { + ras <- item.attachments.traverse(convert) + } yield item.copy(attachments = ras) + + } + + def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] = + OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId))) + .map(_.mimetype) + .getOrElse(Mimetype.`application/octet-stream`) + + def convertSafe[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs] + )(ra: RAttachment, mime: Mimetype): F[RAttachment] = { + + ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}"). + map(_ => ra) + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 916974d1..1ac90139 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -2,11 +2,12 @@ package docspell.joex.process import cats.implicits._ import cats.effect.Sync +import cats.data.OptionT import fs2.Stream import docspell.common._ import docspell.joex.scheduler.{Context, Task} import docspell.store.queries.QItem -import docspell.store.records.{RAttachment, RItem} +import docspell.store.records.{RAttachment, RAttachmentSource, RItem} /** * Task that creates the item. 
@@ -53,13 +54,21 @@ object CreateItem { n <- ctx.store.transact(RItem.insert(it)) _ <- if (n != 1) storeItemError[F](ctx) else ().pure[F] fm <- fileMetas(it.id, it.created) - k <- fm.traverse(a => ctx.store.transact(RAttachment.insert(a))) + k <- fm.traverse(insertAttachment(ctx)) _ <- logDifferences(ctx, fm, k.sum) dur <- time _ <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}") - } yield ItemData(it, fm, Vector.empty, Vector.empty) + } yield ItemData(it, fm, Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap) } + def insertAttachment[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Int] = { + val rs = RAttachmentSource.of(ra) + ctx.store.transact(for { + n <- RAttachment.insert(ra) + _ <- RAttachmentSource.insert(rs) + } yield n) + } + def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] = Task { ctx => for { @@ -69,12 +78,18 @@ object CreateItem { ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid)) _ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments") else ().pure[F] - rms <- cand.headOption.traverse(ri => - ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) - ) - } yield cand.headOption.map(ri => - ItemData(ri, rms.getOrElse(Vector.empty), Vector.empty, Vector.empty) - ) + rms <- OptionT( + cand.headOption.traverse(ri => + ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) + ) + ).getOrElse(Vector.empty) + orig <- rms.traverse(a => + ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s)) + ) + origMap = orig + .map(originFileTuple) + .toMap + } yield cand.headOption.map(ri => ItemData(ri, rms, Vector.empty, Vector.empty, origMap)) } private def logDifferences[F[_]: Sync]( @@ -94,4 +109,8 @@ object CreateItem { val msg = "Inserting item failed. DB returned 0 update count!" 
ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg)) } + + //TODO if no source is present, it must be saved! + private def originFileTuple(t: (RAttachment, Option[RAttachmentSource])): (Ident, Ident) = + t._2.map(s => s.id -> s.fileId).getOrElse(t._1.id -> t._1.fileId) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index a6f751f7..c5f474a5 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -8,7 +8,8 @@ case class ItemData( item: RItem, attachments: Vector[RAttachment], metas: Vector[RAttachmentMeta], - dateLabels: Vector[AttachmentDates] + dateLabels: Vector[AttachmentDates], + originFile: Map[Ident, Ident] ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 88d16892..679625a2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -10,7 +10,8 @@ object ProcessItem { def apply[F[_]: Sync: ContextShift]( cfg: OcrConfig )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextExtraction(cfg, item) + ConvertPdf(item) + .flatMap(TextExtraction(cfg, _)) .flatMap(Task.setProgress(25)) .flatMap(TextAnalysis[F]) .flatMap(Task.setProgress(50)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 157fdfee..478f6a91 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -3,7 +3,7 @@ package docspell.joex.process import bitpeace.RangeDef import cats.implicits._ import 
cats.effect.{Blocker, ContextShift, Sync} -import docspell.common.{Duration, Language, ProcessItemArgs} +import docspell.common._ import docspell.joex.scheduler.{Context, Task} import docspell.store.Store import docspell.store.records.{RAttachment, RAttachmentMeta} @@ -19,7 +19,7 @@ object TextExtraction { for { _ <- ctx.logger.info("Starting text extraction") start <- Duration.stopTime[F] - txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language)) + txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item)) _ <- ctx.logger.debug("Storing extracted texts") _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) dur <- start @@ -30,12 +30,13 @@ object TextExtraction { def extractTextToMeta[F[_]: Sync: ContextShift]( ctx: Context[F, _], cfg: OcrConfig, - lang: Language + lang: Language, + item: ItemData )(ra: RAttachment): F[RAttachmentMeta] = for { _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}") dst <- Duration.stopTime[F] - txt <- extractText(cfg, lang, ctx.store, ctx.blocker)(ra) + txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra)) meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty)) est <- dst _ <- ctx.logger.debug( @@ -48,12 +49,40 @@ object TextExtraction { lang: Language, store: Store[F], blocker: Blocker - )(ra: RAttachment): F[Option[String]] = { + )(fileId: Ident): F[Option[String]] = { val data = store.bitpeace - .get(ra.fileId.id) + .get(fileId.id) .unNoneTerminate .through(store.bitpeace.fetchData2(RangeDef.all)) TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last } + + private def extractTextFallback[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + ocrConfig: OcrConfig, + lang: Language, + )(fileIds: List[Ident]): F[Option[String]] = { + fileIds match { + case Nil => + ctx.logger.error(s"Cannot extract text").map(_ => None) + + case id :: rest => + extractText[F](ocrConfig, lang, 
ctx.store, ctx.blocker)(id). + recoverWith({ + case ex => + ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file"). + flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest)) + }) + } + } + + /** Returns the fileIds to extract text from. First, the source file + * is tried. If that fails, the converted file is tried. + */ + private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] = + item.originFile.get(ra.id) match { + case Some(sid) => List(sid, ra.fileId).distinct + case None => List(ra.fileId) + } } diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql new file mode 100644 index 00000000..6f93ca0c --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql @@ -0,0 +1,11 @@ +CREATE TABLE `attachment_source` ( + `id` varchar(254) not null primary key, + `file_id` varchar(254) not null, + `filename` varchar(254), + `created` timestamp not null, + foreign key (`file_id`) references `filemeta`(`id`), + foreign key (`id`) references `attachment`(`attachid`) +); + +INSERT INTO `attachment_source` + SELECT `attachid`,`filemetaid`,`name`,`created` FROM `attachment`; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql new file mode 100644 index 00000000..630ea05d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql @@ -0,0 +1,11 @@ +CREATE TABLE "attachment_source" ( + "id" varchar(254) not null primary key, + "file_id" varchar(254) not null, + "filename" varchar(254), + "created" timestamp not null, + foreign key ("file_id") references "filemeta"("id"), + foreign key ("id") references "attachment"("attachid") +); + +INSERT INTO "attachment_source" + SELECT 
"attachid","filemetaid","name","created" FROM "attachment"; diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index ee193e69..22ab8e89 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -41,6 +41,20 @@ object RAttachment { def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = selectSimple(all, table, id.is(attachId)).query[RAttachment].option + def findMeta(attachId: Ident): ConnectionIO[Option[FileMeta]] = { + import bitpeace.sql._ + + val cols = RFileMeta.Columns.all.map(_.prefix("m")) + val aId = id.prefix("a") + val aFileMeta = fileId.prefix("a") + val mId = RFileMeta.Columns.id.prefix("m") + + val from = table ++ fr"a INNER JOIN" ++ RFileMeta.table ++ fr"m ON" ++ aFileMeta.is(mId) + val cond = aId.is(attachId) + + selectSimple(cols, from, cond).query[FileMeta].option + } + def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] = selectSimple( all.map(_.prefix("a")), diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala new file mode 100644 index 00000000..447af3aa --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala @@ -0,0 +1,44 @@ +package docspell.store.records + +import doobie._ +import doobie.implicits._ +import docspell.common._ +import docspell.store.impl._ +import docspell.store.impl.Implicits._ + +/** The origin file of an attachment. The `id` is shared with the + * attachment, to create a 1-1 (or 0..1-1) relationship. 
+ */ +case class RAttachmentSource( + id: Ident, //same as RAttachment.id + fileId: Ident, + name: Option[String], + created: Timestamp +) + +object RAttachmentSource { + + val table = fr"attachment_source" + + object Columns { + val id = Column("id") + val fileId = Column("file_id") + val name = Column("filename") + val created = Column("created") + + val all = List(id, fileId, name, created) + } + + import Columns._ + + def of(ra: RAttachment): RAttachmentSource = + RAttachmentSource(ra.id, ra.fileId, ra.name, ra.created) + + def insert(v: RAttachmentSource): ConnectionIO[Int] = + insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run + + + def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] = + selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option + +} diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala index daa81029..1749a653 100644 --- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala @@ -1,7 +1,12 @@ package docspell.store.records +import bitpeace.FileMeta +import doobie._ import doobie.implicits._ + +import docspell.common._ import docspell.store.impl._ +import docspell.store.impl.Implicits._ object RFileMeta { @@ -19,4 +24,10 @@ object RFileMeta { val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize) } + + def findById(fid: Ident): ConnectionIO[Option[FileMeta]] = { + import bitpeace.sql._ + + selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option + } }