diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala
index 7e6e6647..e6aa6079 100644
--- a/modules/common/src/main/scala/docspell/common/MimeType.scala
+++ b/modules/common/src/main/scala/docspell/common/MimeType.scala
@@ -27,7 +27,7 @@ object MimeType {
     MimeType("image", partFromString(sub).throwLeft)
 
   private[this] val validChars: Set[Char] =
-    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet
+    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet
 
   def parse(str: String): Either[String, MimeType] =
     str.indexOf('/') match {
@@ -44,7 +44,7 @@ object MimeType {
 
   private def partFromString(s: String): Either[String, String] =
     if (s.forall(validChars.contains)) Right(s)
-    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}")
+    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
 
   val octetStream = application("octet-stream")
   val pdf = application("pdf")
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
new file mode 100644
index 00000000..03bdba4a
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala
@@ -0,0 +1,66 @@
+package docspell.joex.process
+
+import bitpeace.Mimetype
+import cats.Functor
+import cats.implicits._
+import cats.effect._
+import cats.data.OptionT
+
+import docspell.common._
+import docspell.joex.scheduler._
+import docspell.store.records._
+
+/** Goes through all attachments and creates a PDF version of it where
+  * supported.
+  *
+  * The `attachment` record is updated with the PDF version while the
+  * original file has been stored in the `attachment_source` record.
+  *
+  * If pdf conversion is not possible or if the input is already a
+  * pdf, both files are identical. That is, the `file_id`s point to
+  * the same file. Since the name of an attachment may be changed by
+  * the user, the `attachment_source` record keeps that, too.
+  *
+  * This step assumes an existing premature item, it traverses its
+  * attachments.
+  */
+object ConvertPdf {
+
+  def apply[F[_]: Sync: ContextShift](
+      item: ItemData
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+
+      // get mimetype
+      // try to convert
+      // save to db
+      // update file_id of RAttachment
+
+      def convert(ra: RAttachment) =
+        findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
+
+      for {
+        ras <- item.attachments.traverse(convert)
+      } yield item.copy(attachments = ras)
+
+    }
+
+  /** Looks up the mimetype of the attachment's file, falling back to
+    * `application/octet-stream` if no file metadata is found.
+    */
+  def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] =
+    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
+      .map(_.mimetype)
+      .getOrElse(Mimetype.`application/octet-stream`)
+
+  /** Placeholder for the conversion: currently only logs the detected
+    * mimetype and returns the attachment unchanged.
+    */
+  def convertSafe[F[_]: Sync](
+      ctx: Context[F, ProcessItemArgs]
+  )(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
+
+    ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
+      map(_ => ra)
+  }
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
index 916974d1..1ac90139 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -2,11 +2,12 @@ package docspell.joex.process
 
 import cats.implicits._
 import cats.effect.Sync
+import cats.data.OptionT
 import fs2.Stream
 import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.queries.QItem
-import docspell.store.records.{RAttachment, RItem}
+import docspell.store.records.{RAttachment, RAttachmentSource, RItem}
 
 /**
   * Task that creates the item.
@@ -53,13 +54,24 @@ object CreateItem {
         n <- ctx.store.transact(RItem.insert(it))
         _ <- if (n != 1) storeItemError[F](ctx) else ().pure[F]
         fm <- fileMetas(it.id, it.created)
-        k <- fm.traverse(a => ctx.store.transact(RAttachment.insert(a)))
+        k <- fm.traverse(insertAttachment(ctx))
         _ <- logDifferences(ctx, fm, k.sum)
         dur <- time
         _ <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}")
-      } yield ItemData(it, fm, Vector.empty, Vector.empty)
+      } yield ItemData(it, fm, Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap)
     }
 
+  /** Inserts the attachment together with its `attachment_source` record
+    * in one `transact` call. Returns the update count of the attachment insert.
+    */
+  def insertAttachment[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Int] = {
+    val rs = RAttachmentSource.of(ra)
+    ctx.store.transact(for {
+      n <- RAttachment.insert(ra)
+      _ <- RAttachmentSource.insert(rs)
+    } yield n)
+  }
+
   def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] =
     Task { ctx =>
       for {
@@ -69,12 +81,18 @@ object CreateItem {
         ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
         _ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
         else ().pure[F]
-        rms <- cand.headOption.traverse(ri =>
-          ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
-        )
-      } yield cand.headOption.map(ri =>
-        ItemData(ri, rms.getOrElse(Vector.empty), Vector.empty, Vector.empty)
-      )
+        rms <- OptionT(
+          cand.headOption.traverse(ri =>
+            ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
+          )
+        ).getOrElse(Vector.empty)
+        orig <- rms.traverse(a =>
+          ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
+        )
+        origMap = orig
+          .map(originFileTuple)
+          .toMap
+      } yield cand.headOption.map(ri => ItemData(ri, rms, Vector.empty, Vector.empty, origMap))
     }
 
   private def logDifferences[F[_]: Sync](
@@ -94,4 +112,8 @@ object CreateItem {
     val msg = "Inserting item failed. DB returned 0 update count!"
     ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg))
   }
+
+  //TODO if no source is present, it must be saved!
+  private def originFileTuple(t: (RAttachment, Option[RAttachmentSource])): (Ident, Ident) =
+    t._2.map(s => s.id -> s.fileId).getOrElse(t._1.id -> t._1.fileId)
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
index a6f751f7..c5f474a5 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -8,7 +8,8 @@ case class ItemData(
     item: RItem,
     attachments: Vector[RAttachment],
     metas: Vector[RAttachmentMeta],
-    dateLabels: Vector[AttachmentDates]
+    dateLabels: Vector[AttachmentDates],
+    originFile: Map[Ident, Ident] // attachment id -> file id of its original (source) file
 ) {
 
   def findMeta(attachId: Ident): Option[RAttachmentMeta] =
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index 88d16892..679625a2 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -10,7 +10,8 @@ object ProcessItem {
   def apply[F[_]: Sync: ContextShift](
       cfg: OcrConfig
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextExtraction(cfg, item)
+    ConvertPdf(item)
+      .flatMap(TextExtraction(cfg, _))
       .flatMap(Task.setProgress(25))
       .flatMap(TextAnalysis[F])
       .flatMap(Task.setProgress(50))
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
index 157fdfee..478f6a91 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -3,7 +3,7 @@ package docspell.joex.process
 import bitpeace.RangeDef
 import cats.implicits._
 import cats.effect.{Blocker, ContextShift, Sync}
-import docspell.common.{Duration, Language, ProcessItemArgs}
+import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.Store
 import docspell.store.records.{RAttachment, RAttachmentMeta}
@@ -19,7 +19,7 @@ object TextExtraction {
       for {
         _ <- ctx.logger.info("Starting text extraction")
         start <- Duration.stopTime[F]
-        txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language))
+        txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item))
         _ <- ctx.logger.debug("Storing extracted texts")
         _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
         dur <- start
@@ -30,12 +30,13 @@ object TextExtraction {
   def extractTextToMeta[F[_]: Sync: ContextShift](
       ctx: Context[F, _],
       cfg: OcrConfig,
-      lang: Language
+      lang: Language,
+      item: ItemData
   )(ra: RAttachment): F[RAttachmentMeta] =
     for {
       _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}")
       dst <- Duration.stopTime[F]
-      txt <- extractText(cfg, lang, ctx.store, ctx.blocker)(ra)
+      txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra))
       meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty))
       est <- dst
       _ <- ctx.logger.debug(
@@ -48,12 +49,44 @@ object TextExtraction {
       lang: Language,
       store: Store[F],
       blocker: Blocker
-  )(ra: RAttachment): F[Option[String]] = {
+  )(fileId: Ident): F[Option[String]] = {
     val data = store.bitpeace
-      .get(ra.fileId.id)
+      .get(fileId.id)
       .unNoneTerminate
       .through(store.bitpeace.fetchData2(RangeDef.all))
 
     TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last
   }
+
+  /** Tries to extract text from each file id in order and returns the
+    * result of the first extraction that doesn't fail. Failures are
+    * logged; if no id is left, an error is logged and `None` returned.
+    */
+  private def extractTextFallback[F[_]: Sync: ContextShift](
+      ctx: Context[F, _],
+      ocrConfig: OcrConfig,
+      lang: Language,
+  )(fileIds: List[Ident]): F[Option[String]] = {
+    fileIds match {
+      case Nil =>
+        ctx.logger.error("Cannot extract text").map(_ => None)
+
+      case id :: rest =>
+        extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(id).
+          recoverWith({
+            case ex =>
+              ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
+                flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest))
+          })
+    }
+  }
+
+  /** Returns the fileIds to extract text from. First, the source file
+    * is tried. If that fails, the converted file is tried.
+    */
+  private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
+    item.originFile.get(ra.id) match {
+      case Some(sid) => List(sid, ra.fileId).distinct
+      case None => List(ra.fileId)
+    }
 }
diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql
new file mode 100644
index 00000000..6f93ca0c
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql
@@ -0,0 +1,11 @@
+CREATE TABLE `attachment_source` (
+  `id` varchar(254) not null primary key,
+  `file_id` varchar(254) not null,
+  `filename` varchar(254),
+  `created` timestamp not null,
+  foreign key (`file_id`) references `filemeta`(`id`),
+  foreign key (`id`) references `attachment`(`attachid`)
+);
+
+INSERT INTO `attachment_source` (`id`,`file_id`,`filename`,`created`)
+  SELECT `attachid`,`filemetaid`,`name`,`created` FROM `attachment`;
diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql
new file mode 100644
index 00000000..630ea05d
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql
@@ -0,0 +1,11 @@
+CREATE TABLE "attachment_source" (
+  "id" varchar(254) not null primary key,
+  "file_id" varchar(254) not null,
+  "filename" varchar(254),
+  "created" timestamp not null,
+  foreign key ("file_id") references "filemeta"("id"),
+  foreign key ("id") references "attachment"("attachid")
+);
+
+INSERT INTO "attachment_source" ("id","file_id","filename","created")
+  SELECT "attachid","filemetaid","name","created" FROM "attachment";
diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala
index ee193e69..22ab8e89 100644
--- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala
@@ -41,6 +41,21 @@ object RAttachment {
   def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] =
     selectSimple(all, table, id.is(attachId)).query[RAttachment].option
 
+  /** Finds the file metadata of an attachment's file by joining this table with the file metadata table. */
+  def findMeta(attachId: Ident): ConnectionIO[Option[FileMeta]] = {
+    import bitpeace.sql._
+
+    val cols = RFileMeta.Columns.all.map(_.prefix("m"))
+    val aId = id.prefix("a")
+    val aFileMeta = fileId.prefix("a")
+    val mId = RFileMeta.Columns.id.prefix("m")
+
+    val from = table ++ fr"a INNER JOIN" ++ RFileMeta.table ++ fr"m ON" ++ aFileMeta.is(mId)
+    val cond = aId.is(attachId)
+
+    selectSimple(cols, from, cond).query[FileMeta].option
+  }
+
   def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] =
     selectSimple(
       all.map(_.prefix("a")),
diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala
new file mode 100644
index 00000000..447af3aa
--- /dev/null
+++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala
@@ -0,0 +1,47 @@
+package docspell.store.records
+
+import doobie._
+import doobie.implicits._
+import docspell.common._
+import docspell.store.impl._
+import docspell.store.impl.Implicits._
+
+/** The origin file of an attachment. The `id` is shared with the
+  * attachment, to create a 1-1 (or 0..1-1) relationship.
+  */
+case class RAttachmentSource(
+    id: Ident, //same as RAttachment.id
+    fileId: Ident,
+    name: Option[String],
+    created: Timestamp
+)
+
+object RAttachmentSource {
+
+  val table = fr"attachment_source"
+
+  object Columns {
+    val id = Column("id")
+    val fileId = Column("file_id")
+    val name = Column("filename")
+    val created = Column("created")
+
+    val all = List(id, fileId, name, created)
+  }
+
+  import Columns._
+
+  /** Creates the source record for an attachment, copying its id, file id, name and creation time. */
+  def of(ra: RAttachment): RAttachmentSource =
+    RAttachmentSource(ra.id, ra.fileId, ra.name, ra.created)
+
+  /** Inserts the given record and returns the update count. */
+  def insert(v: RAttachmentSource): ConnectionIO[Int] =
+    insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run
+
+
+  /** Finds the source record that shares its id with the given attachment id. */
+  def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
+    selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option
+
+}
diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
index daa81029..1749a653 100644
--- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
@@ -1,7 +1,12 @@
 package docspell.store.records
 
+import bitpeace.FileMeta
+import doobie._
 import doobie.implicits._
+
+import docspell.common._
 import docspell.store.impl._
+import docspell.store.impl.Implicits._
 
 object RFileMeta {
 
@@ -19,4 +24,11 @@ object RFileMeta {
 
     val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize)
   }
+
+  /** Finds the file metadata with the given id. */
+  def findById(fid: Ident): ConnectionIO[Option[FileMeta]] = {
+    import bitpeace.sql._
+
+    selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
+  }
 }