From f6f63000bea7b76b782c92e8ded9dfdc65d02be4 Mon Sep 17 00:00:00 2001
From: Eike Kettner
Date: Wed, 23 Sep 2020 23:00:19 +0200
Subject: [PATCH] Prepend a duplicate check when uploading files

Add a `skipDuplicates` flag to the upload metadata and carry it into the
job arguments as `ProcessMeta.skipDuplicate`. When the flag is set, the
new DuplicateCheck task runs before CreateItem: every file for which
QItem.findByChecksum reports a match within the collective is deleted via
the bitpeace store and removed from the job arguments. If no files remain
afterwards, a warning is logged and CreateItem is not run.

The REST upload defaults the flag to false when it is absent,
ReProcessItem passes false and ScanMailboxTask always passes true.

The "Deleting duplicate file" log line prints the plain file name and
falls back to the file id when the upload carries no name.

---
Notes (text up to the first "diff --git" line is ignored when applying):
  Line breaks were restored from the hunk headers. The indentation of
  context lines is reconstructed; if a hunk does not match, apply with
  `git apply --ignore-whitespace`.

 .../scala/docspell/backend/ops/OUpload.scala  |  6 +-
 .../docspell/common/ProcessItemArgs.scala     |  3 +-
 .../joex/process/DuplicateCheck.scala         | 79 +++++++++++++++++++
 .../docspell/joex/process/ItemHandler.scala   | 22 +++++--
 .../docspell/joex/process/ReProcessItem.scala |  3 +-
 .../joex/scanmailbox/ScanMailboxTask.scala    |  3 +-
 .../src/main/resources/docspell-openapi.yml   |  6 ++
 .../restserver/conv/Conversions.scala         | 15 ++++-
 .../docspell/store/records/RFileMeta.scala    | 14 +++++
 9 files changed, 140 insertions(+), 11 deletions(-)
 create mode 100644 modules/joex/src/main/scala/docspell/joex/process/DuplicateCheck.scala

diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
index a9145f72..e71a131f 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
@@ -59,7 +59,8 @@ object OUpload {
       direction: Option[Direction],
       sourceAbbrev: String,
       folderId: Option[Ident],
-      validFileTypes: Seq[MimeType]
+      validFileTypes: Seq[MimeType],
+      skipDuplicates: Boolean
   )
 
   case class UploadData[F[_]](
@@ -125,7 +126,8 @@ object OUpload {
             data.meta.direction,
             data.meta.sourceAbbrev,
             data.meta.folderId,
-            data.meta.validFileTypes
+            data.meta.validFileTypes,
+            data.meta.skipDuplicates
           )
           args =
             if (data.multiple) files.map(f => ProcessItemArgs(meta, List(f)))
diff --git a/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala b/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala
index 9e3faf2b..6e5427be 100644
--- a/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala
+++ b/modules/common/src/main/scala/docspell/common/ProcessItemArgs.scala
@@ -37,7 +37,8 @@ object ProcessItemArgs {
       direction: Option[Direction],
       sourceAbbrev: String,
       folderId: Option[Ident],
-      validFileTypes: Seq[MimeType]
+      validFileTypes: Seq[MimeType],
+      skipDuplicate: Boolean
   )
 
   object ProcessMeta {
diff --git a/modules/joex/src/main/scala/docspell/joex/process/DuplicateCheck.scala b/modules/joex/src/main/scala/docspell/joex/process/DuplicateCheck.scala
new file mode 100644
index 00000000..069bf9f5
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/process/DuplicateCheck.scala
@@ -0,0 +1,79 @@
+package docspell.joex.process
+
+import cats.effect._
+import cats.implicits._
+
+import docspell.common._
+import docspell.joex.scheduler.{Context, Task}
+import docspell.store.queries.QItem
+import docspell.store.records.RFileMeta
+
+import bitpeace.FileMeta
+import doobie._
+
+/** Job step that removes files from the job arguments when
+  * `QItem.findByChecksum` reports a match for their checksum within the
+  * collective.
+  */
+object DuplicateCheck {
+  type Args = ProcessItemArgs
+
+  /** Runs the check only if `meta.skipDuplicate` is set; otherwise the
+    * arguments are returned unchanged.
+    */
+  def apply[F[_]: Sync]: Task[F, Args, Args] =
+    Task { ctx =>
+      if (ctx.args.meta.skipDuplicate)
+        ctx.logger.debug("Checking for duplicate files") *> removeDuplicates(ctx)
+      else ctx.logger.debug("Not checking for duplicates") *> ctx.args.pure[F]
+    }
+
+  /** Deletes all files flagged as duplicates from the file store and
+    * returns the arguments without them. Files without a metadata record
+    * are never flagged and therefore stay in the arguments.
+    */
+  def removeDuplicates[F[_]: Sync](ctx: Context[F, Args]): F[ProcessItemArgs] =
+    for {
+      fileMetas <- findDuplicates(ctx)
+      _         <- fileMetas.traverse_(deleteDuplicate(ctx))
+      ids = fileMetas.filter(_.exists).map(_.fm.id).toSet
+    } yield ctx.args.copy(files =
+      ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
+    )
+
+  private def deleteDuplicate[F[_]: Sync](
+      ctx: Context[F, Args]
+  )(fd: FileMetaDupes): F[Unit] = {
+    // Resolve the Option here: interpolating it directly would log
+    // "Some(name)" or "None". Unnamed files are reported by their id.
+    val fname = ctx.args.files
+      .find(_.fileMetaId.id == fd.fm.id)
+      .flatMap(_.name)
+      .getOrElse(fd.fm.id)
+    if (fd.exists)
+      ctx.logger
+        .info(s"Deleting duplicate file ${fname}!") *> ctx.store.bitpeace
+        .delete(fd.fm.id)
+        .compile
+        .drain
+    else ().pure[F]
+  }
+
+  private def findDuplicates[F[_]: Sync](
+      ctx: Context[F, Args]
+  ): F[Vector[FileMetaDupes]] =
+    ctx.store.transact(for {
+      fileMetas <- RFileMeta.findByIds(ctx.args.files.map(_.fileMetaId))
+      dupes     <- fileMetas.traverse(checkDuplicate(ctx))
+    } yield dupes)
+
+  private def checkDuplicate[F[_]](
+      ctx: Context[F, Args]
+  )(fm: FileMeta): ConnectionIO[FileMetaDupes] =
+    QItem
+      .findByChecksum(fm.checksum, ctx.args.meta.collective)
+      .map(v => FileMetaDupes(fm, v.nonEmpty))
+
+  /** A file's metadata paired with the result of the duplicate lookup. */
+  case class FileMetaDupes(fm: FileMeta, exists: Boolean)
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
index acbf810b..a5ef178b 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
@@ -35,10 +35,18 @@ object ItemHandler {
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
-    CreateItem[F]
-      .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
-      .map(_ => ())
+    DuplicateCheck[F]
+      .flatMap(args =>
+        if (args.files.isEmpty) logNoFiles
+        else {
+          val create: Task[F, Args, ItemData] =
+            CreateItem[F].contramap(_ => args.pure[F])
+          create
+            .flatMap(itemStateTask(ItemState.Processing))
+            .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
+            .map(_ => ())
+        }
+      )
 
   def itemStateTask[F[_]: Sync, A](
       state: ItemState
@@ -121,4 +129,10 @@ object ItemHandler {
 
   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
     Task(_.logger.warn(msg))
+
+  private def logNoFiles[F[_]]: Task[F, Args, Unit] =
+    logWarn(
+      "No files to process! Either no files were given or duplicate check removed all."
+    )
+
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
index bf6d2467..dd7747db 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala
@@ -90,7 +90,8 @@ object ReProcessItem {
           None, //direction
           "", //source-id
           None, //folder
-          Seq.empty
+          Seq.empty,
+          false //skip-duplicates
         ),
         Nil
       ).pure[F]
diff --git a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala
index e98ef3ea..0fee001a 100644
--- a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala
@@ -254,7 +254,8 @@ object ScanMailboxTask {
           Some(dir),
           s"mailbox-${ctx.args.account.user.id}",
           args.itemFolder,
-          Seq.empty
+          Seq.empty,
+          true //skip-duplicates
         )
         data = OUpload.UploadData(
           multiple = false,
diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml
index f2ed8688..2d239b01 100644
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@@ -3660,6 +3660,7 @@ components:
       description: DateTime
       type: integer
       format: date-time
+
     ItemUploadMeta:
       description: |
         Meta information for an item upload. The user can specify some
@@ -3674,6 +3675,7 @@ components:
 
         A folderId can be given, the item is placed into this folder
         after creation.
+
       required:
         - multiple
       properties:
@@ -3686,6 +3688,10 @@ components:
         folder:
           type: string
           format: ident
+        skipDuplicates:
+          type: boolean
+          default: false
+
     Collective:
       description: |
         Information about a collective.
diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
index 85013d42..1def28d4 100644
--- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
@@ -294,10 +294,21 @@ trait Conversions {
       .map(p => parseMeta(p.body))
       .map(fm =>
         fm.map(m =>
-          (m.multiple, UploadMeta(m.direction, "webapp", m.folder, validFileTypes))
+          (
+            m.multiple,
+            UploadMeta(
+              m.direction,
+              "webapp",
+              m.folder,
+              validFileTypes,
+              m.skipDuplicates.getOrElse(false)
+            )
+          )
         )
       )
-      .getOrElse((true, UploadMeta(None, "webapp", None, validFileTypes)).pure[F])
+      .getOrElse(
+        (true, UploadMeta(None, "webapp", None, validFileTypes, false)).pure[F]
+      )
 
     val files = mp.parts
       .filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
index 076bfd68..b9e73f77 100644
--- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
@@ -1,5 +1,8 @@
 package docspell.store.records
 
+import cats.data.NonEmptyList
+import cats.implicits._
+
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
@@ -33,6 +36,17 @@ object RFileMeta {
     selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
   }
 
+  def findByIds(ids: List[Ident]): ConnectionIO[Vector[FileMeta]] = {
+    import bitpeace.sql._
+
+    NonEmptyList.fromList(ids) match {
+      case Some(nel) =>
+        selectSimple(Columns.all, table, Columns.id.isIn(nel)).query[FileMeta].to[Vector]
+      case None =>
+        Vector.empty[FileMeta].pure[ConnectionIO]
+    }
+  }
+
   def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
     import bitpeace.sql._
 