Prepend a duplicate check when uploading files

Eike Kettner 2020-09-23 23:00:19 +02:00
parent 10c16bf319
commit f6f63000be
9 changed files with 123 additions and 11 deletions

View File

@@ -59,7 +59,8 @@ object OUpload {
direction: Option[Direction],
sourceAbbrev: String,
folderId: Option[Ident],
validFileTypes: Seq[MimeType]
validFileTypes: Seq[MimeType],
skipDuplicates: Boolean
)
case class UploadData[F[_]](
@@ -125,7 +126,8 @@
data.meta.direction,
data.meta.sourceAbbrev,
data.meta.folderId,
data.meta.validFileTypes
data.meta.validFileTypes,
data.meta.skipDuplicates
)
args =
if (data.multiple) files.map(f => ProcessItemArgs(meta, List(f)))
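
For context, not part of this commit: a caller that submits uploads from code now has to set the extra flag when building the metadata. A minimal sketch, assuming only the UploadMeta fields visible in the hunk above; every value other than skipDuplicates is a placeholder.

    val meta = OUpload.UploadMeta(
      direction = None,           // let processing detect the direction
      sourceAbbrev = "api",       // placeholder source label
      folderId = None,
      validFileTypes = Seq.empty, // assumed here to mean "no restriction"
      skipDuplicates = true       // new flag: drop files whose checksum already exists for the collective
    )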

View File

@@ -37,7 +37,8 @@ object ProcessItemArgs {
direction: Option[Direction],
sourceAbbrev: String,
folderId: Option[Ident],
validFileTypes: Seq[MimeType]
validFileTypes: Seq[MimeType],
skipDuplicate: Boolean
)
object ProcessMeta {

View File

@@ -0,0 +1,62 @@
package docspell.joex.process

import cats.effect._
import cats.implicits._

import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.queries.QItem
import docspell.store.records.RFileMeta

import bitpeace.FileMeta
import doobie._

object DuplicateCheck {
  type Args = ProcessItemArgs

  def apply[F[_]: Sync]: Task[F, Args, Args] =
    Task { ctx =>
      if (ctx.args.meta.skipDuplicate)
        ctx.logger.debug("Checking for duplicate files") *> removeDuplicates(ctx)
      else ctx.logger.debug("Not checking for duplicates") *> ctx.args.pure[F]
    }

  def removeDuplicates[F[_]: Sync](ctx: Context[F, Args]): F[ProcessItemArgs] =
    for {
      fileMetas <- findDuplicates(ctx)
      _ <- fileMetas.traverse(deleteDuplicate(ctx))
      ids = fileMetas.filter(_.exists).map(_.fm.id).toSet
    } yield ctx.args.copy(files =
      ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
    )

  private def deleteDuplicate[F[_]: Sync](
      ctx: Context[F, Args]
  )(fd: FileMetaDupes): F[Unit] = {
    val fname = ctx.args.files.find(_.fileMetaId.id == fd.fm.id).flatMap(_.name)
    if (fd.exists)
      ctx.logger
        .info(s"Deleting duplicate file ${fname}!") *> ctx.store.bitpeace
        .delete(fd.fm.id)
        .compile
        .drain
    else ().pure[F]
  }

  private def findDuplicates[F[_]: Sync](
      ctx: Context[F, Args]
  ): F[Vector[FileMetaDupes]] =
    ctx.store.transact(for {
      fileMetas <- RFileMeta.findByIds(ctx.args.files.map(_.fileMetaId))
      dupes <- fileMetas.traverse(checkDuplicate(ctx))
    } yield dupes)

  private def checkDuplicate[F[_]](
      ctx: Context[F, Args]
  )(fm: FileMeta): ConnectionIO[FileMetaDupes] =
    QItem
      .findByChecksum(fm.checksum, ctx.args.meta.collective)
      .map(v => FileMetaDupes(fm, v.nonEmpty))

  case class FileMetaDupes(fm: FileMeta, exists: Boolean)
}

View File

@@ -35,10 +35,18 @@ object ItemHandler {
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
): Task[F, Args, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
.map(_ => ())
DuplicateCheck[F]
.flatMap(args =>
if (args.files.isEmpty) logNoFiles
else {
val create: Task[F, Args, ItemData] =
CreateItem[F].contramap(_ => args.pure[F])
create
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
.map(_ => ())
}
)
def itemStateTask[F[_]: Sync, A](
state: ItemState
@@ -121,4 +129,10 @@
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
Task(_.logger.warn(msg))
private def logNoFiles[F[_]]: Task[F, Args, Unit] =
logWarn(
"No files to process! Either no files were given or duplicate check removed all."
)
}
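
For context, not part of this commit: DuplicateCheck returns a filtered copy of the job arguments rather than changing what the Context stores, which is presumably why the handler above feeds that copy into CreateItem via contramap instead of letting CreateItem read ctx.args again. A sketch of the same hand-off as a hypothetical helper, assuming Task's flatMap and contramap have the shapes used in the hunk above:

    import cats.effect.Sync
    import cats.implicits._
    import docspell.common.ProcessItemArgs
    import docspell.joex.process.DuplicateCheck
    import docspell.joex.scheduler.Task

    def prependDuplicateCheck[F[_]: Sync](
        rest: Task[F, ProcessItemArgs, Unit]
    ): Task[F, ProcessItemArgs, Unit] =
      DuplicateCheck[F].flatMap(filtered =>
        // pass the de-duplicated args on to the remaining chain
        rest.contramap((_: ProcessItemArgs) => filtered.pure[F])
      )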

View File

@@ -90,7 +90,8 @@ object ReProcessItem {
None, //direction
"", //source-id
None, //folder
Seq.empty
Seq.empty,
false
),
Nil
).pure[F]

View File

@@ -254,7 +254,8 @@ object ScanMailboxTask {
Some(dir),
s"mailbox-${ctx.args.account.user.id}",
args.itemFolder,
Seq.empty
Seq.empty,
true
)
data = OUpload.UploadData(
multiple = false,

View File

@@ -3660,6 +3660,7 @@ components:
description: DateTime
type: integer
format: date-time
ItemUploadMeta:
description: |
Meta information for an item upload. The user can specify some
@@ -3674,6 +3675,7 @@
A folderId can be given, the item is placed into this folder
after creation.
required:
- multiple
properties:
@@ -3686,6 +3688,10 @@
folder:
type: string
format: ident
skipDuplicates:
type: boolean
default: false
Collective:
description: |
Information about a collective.

View File

@@ -294,10 +294,21 @@ trait Conversions {
.map(p => parseMeta(p.body))
.map(fm =>
fm.map(m =>
(m.multiple, UploadMeta(m.direction, "webapp", m.folder, validFileTypes))
(
m.multiple,
UploadMeta(
m.direction,
"webapp",
m.folder,
validFileTypes,
m.skipDuplicates.getOrElse(false)
)
)
)
)
.getOrElse((true, UploadMeta(None, "webapp", None, validFileTypes)).pure[F])
.getOrElse(
(true, UploadMeta(None, "webapp", None, validFileTypes, false)).pure[F]
)
val files = mp.parts
.filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
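
A note on the getOrElse(false) above, not part of this commit: the spec change marks skipDuplicates as optional with a default of false, so the restapi model is assumed to expose it as Option[Boolean]; a missing value then falls back to the old behaviour. A sketch of just that mapping, with a hypothetical stand-in for the generated class:

    // Hypothetical stand-in for the generated restapi model; only the field relevant here.
    final case class ItemUploadMetaSketch(skipDuplicates: Option[Boolean])

    def skipFlag(m: ItemUploadMetaSketch): Boolean =
      m.skipDuplicates.getOrElse(false) // absent means "do not skip", matching the spec default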

View File

@@ -1,5 +1,8 @@
package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._
import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._
@@ -33,6 +36,17 @@ object RFileMeta {
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
}
def findByIds(ids: List[Ident]): ConnectionIO[Vector[FileMeta]] = {
import bitpeace.sql._
NonEmptyList.fromList(ids) match {
case Some(nel) =>
selectSimple(Columns.all, table, Columns.id.isIn(nel)).query[FileMeta].to[Vector]
case None =>
Vector.empty[FileMeta].pure[ConnectionIO]
}
}
def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
import bitpeace.sql._