From 4fd6e02ec0a9ca2f71d568e9166d37174ea63d05 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 11 Nov 2020 08:52:21 +0100 Subject: [PATCH] Improve glob and filter archive entries --- .../scala/docspell/backend/ops/OUpload.scala | 4 +- .../src/main/scala/docspell/common/Glob.scala | 78 ++++++++++++++++--- .../test/scala/docspell/common/GlobTest.scala | 62 +++++++++++---- .../src/main/scala/docspell/files/Zip.scala | 20 +++-- .../test/scala/docspell/files/ZipTest.scala | 3 +- .../scala/docspell/joex/mail/ReadMail.scala | 12 ++- .../joex/process/ExtractArchive.scala | 70 +++++++++-------- .../docspell/joex/process/ProcessItem.scala | 1 + .../joex/process/RemoveEmptyItem.scala | 26 +++++++ .../joex/scanmailbox/ScanMailboxTask.scala | 2 +- .../restserver/conv/Conversions.scala | 5 +- 11 files changed, 211 insertions(+), 72 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/process/RemoveEmptyItem.scala diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala index 8723ed40..efe84d60 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala @@ -61,7 +61,7 @@ object OUpload { folderId: Option[Ident], validFileTypes: Seq[MimeType], skipDuplicates: Boolean, - fileFilter: Option[Glob], + fileFilter: Glob, tags: List[String] ) @@ -130,7 +130,7 @@ object OUpload { data.meta.folderId, data.meta.validFileTypes, data.meta.skipDuplicates, - data.meta.fileFilter, + data.meta.fileFilter.some, data.meta.tags.some ) args = diff --git a/modules/common/src/main/scala/docspell/common/Glob.scala b/modules/common/src/main/scala/docspell/common/Glob.scala index 5eba3f73..afa04a02 100644 --- a/modules/common/src/main/scala/docspell/common/Glob.scala +++ b/modules/common/src/main/scala/docspell/common/Glob.scala @@ -1,27 +1,81 @@ package docspell.common -import cats.implicits._ import cats.data.NonEmptyList +import cats.implicits._ + import io.circe.{Decoder, Encoder} -/** A very simple glob supporting only `*` and `?`. */ -final case class Glob(pattern: Glob.Pattern) { - def matches(in: String): Boolean = - pattern.parts - .zipWith(Glob.split(in, Glob.separator))(_.matches(_)) - .forall(identity) +trait Glob { - def asString: String = - pattern.asString + /** Matches the input string against this glob. */ + def matches(in: String): Boolean + + /** If this glob consists of multiple segments, it is the same as + * `matches`. If it is only a single segment, it is matched against + * the last segment of the input string that is assumed to be a + * pathname separated by slash. + * + * Example: + * test.* <> "/a/b/test.txt" => true + * /test.* <> "/a/b/test.txt" => false + */ + def matchFilenameOrPath(in: String): Boolean + + def asString: String } object Glob { private val separator = '/' + private val anyChar = '|' - def apply(str: String): Glob = - Glob(Pattern(split(str, separator).map(makeSegment))) + val all = new Glob { + def matches(in: String) = true + def matchFilenameOrPath(in: String) = true + val asString = "*" + } - case class Pattern(parts: NonEmptyList[Segment]) { + def pattern(pattern: Pattern): Glob = + PatternGlob(pattern) + + /** A simple glob supporting `*` and `?`. */ + final private case class PatternGlob(pattern: Pattern) extends Glob { + def matches(in: String): Boolean = + pattern.parts + .zipWith(Glob.split(in, Glob.separator))(_.matches(_)) + .forall(identity) + + def matchFilenameOrPath(in: String): Boolean = + if (pattern.parts.tail.isEmpty) matches(split(in, separator).last) + else matches(in) + + def asString: String = + pattern.asString + } + + final private case class AnyGlob(globs: NonEmptyList[Glob]) extends Glob { + def matches(in: String) = + globs.exists(_.matches(in)) + def matchFilenameOrPath(in: String) = + globs.exists(_.matchFilenameOrPath(in)) + def asString = + globs.toList.map(_.asString).mkString(anyChar.toString) + } + + def apply(in: String): Glob = { + def single(str: String) = + PatternGlob(Pattern(split(str, separator).map(makeSegment))) + + if (in == "*") all + else + split(in, anyChar) match { + case NonEmptyList(_, Nil) => + single(in) + case nel => + AnyGlob(nel.map(_.trim).map(single)) + } + } + + case class Pattern(parts: NonEmptyList[Segment]) { def asString = parts.map(_.asString).toList.mkString(separator.toString) } diff --git a/modules/common/src/test/scala/docspell/common/GlobTest.scala b/modules/common/src/test/scala/docspell/common/GlobTest.scala index 05df0699..8f228851 100644 --- a/modules/common/src/test/scala/docspell/common/GlobTest.scala +++ b/modules/common/src/test/scala/docspell/common/GlobTest.scala @@ -6,13 +6,15 @@ import Glob._ object GlobTest extends SimpleTestSuite { test("literals") { - assert(Glob(Pattern(Segment(Token.Literal("hello")))).matches("hello")) - assert(!Glob(Pattern(Segment(Token.Literal("hello")))).matches("hello1")) + assert(Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches("hello")) + assert(!Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches("hello1")) } test("single wildcards 1") { val glob = - Glob(Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t")))) + Glob.pattern( + Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t"))) + ) assert(glob.matches("snapshot")) assert(!glob.matches("snapshots")) @@ -20,7 +22,7 @@ object GlobTest extends SimpleTestSuite { test("single wildcards 2") { val glob = - Glob(Pattern(Segment(Token.Literal("test."), Token.Until("")))) + Glob.pattern(Pattern(Segment(Token.Literal("test."), Token.Until("")))) assert(glob.matches("test.txt")) assert(glob.matches("test.pdf")) @@ -32,28 +34,29 @@ object GlobTest extends SimpleTestSuite { test("single parsing") { assertEquals( Glob("s*p*t"), - Glob(Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t")))) + Glob.pattern( + Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t"))) + ) ) assertEquals( Glob("s***p*t"), - Glob(Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t")))) + Glob.pattern( + Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t"))) + ) ) assertEquals( Glob("test.*"), - Glob(Pattern(Segment(Token.Literal("test."), Token.Until("")))) + Glob.pattern(Pattern(Segment(Token.Literal("test."), Token.Until("")))) ) assertEquals( Glob("stop"), - Glob(Pattern(Segment(Token.Literal("stop")))) + Glob.pattern(Pattern(Segment(Token.Literal("stop")))) ) assertEquals( Glob("*stop"), - Glob(Pattern(Segment(Token.Until("stop")))) - ) - assertEquals( - Glob("*"), - Glob(Pattern(Segment(Token.Until("")))) + Glob.pattern(Pattern(Segment(Token.Until("stop")))) ) + assertEquals(Glob("*"), Glob.all) } test("with splitting") { @@ -71,5 +74,38 @@ object GlobTest extends SimpleTestSuite { assertEquals(Glob("stop").asString, "stop") assertEquals(Glob("*stop").asString, "*stop") assertEquals(Glob("/a/b/*").asString, "/a/b/*") + assertEquals(Glob("*").asString, "*") + assertEquals(Glob.all.asString, "*") + } + + test("simple matches") { + assert(Glob("/test.*").matches("/test.pdf")) + assert(!Glob("/test.*").matches("test.pdf")) + assert(!Glob("test.*").matches("/test.pdf")) + } + + test("matchFilenameOrPath") { + assert(Glob("test.*").matchFilenameOrPath("/a/b/test.pdf")) + assert(!Glob("/test.*").matchFilenameOrPath("/a/b/test.pdf")) + assert(Glob("s*p*t").matchFilenameOrPath("snapshot")) + assert(Glob("s*p*t").matchFilenameOrPath("/tmp/snapshot")) + assert(Glob("/tmp/s*p*t").matchFilenameOrPath("/tmp/snapshot")) + + assert(Glob("a/b/*").matchFilenameOrPath("a/b/hello")) + assert(!Glob("a/b/*").matchFilenameOrPath("/a/b/hello")) + assert(Glob("/a/b/*").matchFilenameOrPath("/a/b/hello")) + assert(!Glob("/a/b/*").matchFilenameOrPath("a/b/hello")) + assert(!Glob("*/a/b/*").matchFilenameOrPath("a/b/hello")) + assert(Glob("*/a/b/*").matchFilenameOrPath("test/a/b/hello")) + } + + test("anyglob") { + assert(Glob("*.pdf|*.txt").matches("test.pdf")) + assert(Glob("*.pdf|*.txt").matches("test.txt")) + assert(!Glob("*.pdf|*.txt").matches("test.xls")) + assert(Glob("*.pdf | *.txt").matches("test.pdf")) + assert(Glob("*.pdf | mail.html").matches("test.pdf")) + assert(Glob("*.pdf | mail.html").matches("mail.html")) + assert(!Glob("*.pdf | mail.html").matches("test.docx")) } } diff --git a/modules/files/src/main/scala/docspell/files/Zip.scala b/modules/files/src/main/scala/docspell/files/Zip.scala index f09bd66d..5450cbf7 100644 --- a/modules/files/src/main/scala/docspell/files/Zip.scala +++ b/modules/files/src/main/scala/docspell/files/Zip.scala @@ -9,24 +9,33 @@ import cats.implicits._ import fs2.{Pipe, Stream} import docspell.common.Binary +import docspell.common.Glob object Zip { def unzipP[F[_]: ConcurrentEffect: ContextShift]( chunkSize: Int, - blocker: Blocker + blocker: Blocker, + glob: Glob ): Pipe[F, Byte, Binary[F]] = - s => unzip[F](chunkSize, blocker)(s) + s => unzip[F](chunkSize, blocker, glob)(s) - def unzip[F[_]: ConcurrentEffect: ContextShift](chunkSize: Int, blocker: Blocker)( + def unzip[F[_]: ConcurrentEffect: ContextShift]( + chunkSize: Int, + blocker: Blocker, + glob: Glob + )( data: Stream[F, Byte] ): Stream[F, Binary[F]] = - data.through(fs2.io.toInputStream[F]).flatMap(in => unzipJava(in, chunkSize, blocker)) + data + .through(fs2.io.toInputStream[F]) + .flatMap(in => unzipJava(in, chunkSize, blocker, glob)) def unzipJava[F[_]: Sync: ContextShift]( in: InputStream, chunkSize: Int, - blocker: Blocker + blocker: Blocker, + glob: Glob ): Stream[F, Binary[F]] = { val zin = new ZipInputStream(in) @@ -39,6 +48,7 @@ object Zip { .resource(nextEntry) .repeat .unNoneTerminate + .filter(ze => glob.matchFilenameOrPath(ze.getName())) .map { ze => val name = Paths.get(ze.getName()).getFileName.toString val data = diff --git a/modules/files/src/test/scala/docspell/files/ZipTest.scala b/modules/files/src/test/scala/docspell/files/ZipTest.scala index 7fce9d50..05cf1866 100644 --- a/modules/files/src/test/scala/docspell/files/ZipTest.scala +++ b/modules/files/src/test/scala/docspell/files/ZipTest.scala @@ -4,6 +4,7 @@ import minitest._ import cats.effect._ import cats.implicits._ import scala.concurrent.ExecutionContext +import docspell.common.Glob object ZipTest extends SimpleTestSuite { @@ -12,7 +13,7 @@ object ZipTest extends SimpleTestSuite { test("unzip") { val zipFile = ExampleFiles.letters_zip.readURL[IO](8192, blocker) - val uncomp = zipFile.through(Zip.unzip(8192, blocker)) + val uncomp = zipFile.through(Zip.unzip(8192, blocker, Glob.all)) uncomp .evalMap { entry => diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 0f144ab7..8242df94 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -16,9 +16,10 @@ import emil.{MimeType => _, _} object ReadMail { def readBytesP[F[_]: ConcurrentEffect: ContextShift]( - logger: Logger[F] + logger: Logger[F], + glob: Glob ): Pipe[F, Byte, Binary[F]] = - _.through(bytesToMail(logger)).flatMap(mailToEntries[F](logger)) + _.through(bytesToMail(logger)).flatMap(mailToEntries[F](logger, glob)) def bytesToMail[F[_]: Sync](logger: Logger[F]): Pipe[F, Byte, Mail[F]] = s => @@ -26,7 +27,8 @@ object ReadMail { s.through(Mail.readBytes[F]) def mailToEntries[F[_]: ConcurrentEffect: ContextShift]( - logger: Logger[F] + logger: Logger[F], + glob: Glob )(mail: Mail[F]): Stream[F, Binary[F]] = { val bodyEntry: F[Option[Binary[F]]] = if (mail.body.isEmpty) (None: Option[Binary[F]]).pure[F] @@ -48,10 +50,12 @@ object ReadMail { ) >> (Stream .eval(bodyEntry) - .flatMap(e => Stream.emits(e.toSeq)) ++ + .flatMap(e => Stream.emits(e.toSeq)) + .filter(a => glob.matches(a.name)) ++ Stream .eval(TnefExtract.replace(mail)) .flatMap(m => Stream.emits(m.attachments.all)) + .filter(a => a.filename.exists(glob.matches)) .map(a => Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content) )) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index f489ae3c..b684ded9 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -95,12 +95,12 @@ object ExtractArchive { case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) => ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("")}.") *> extractZip(ctx, archive)(ra, pos) - .flatTap(_ => cleanupParents(ctx, ra, archive)) + .flatMap(cleanupParents(ctx, ra, archive)) case MimeType.EmailMatch(_) => ctx.logger.info(s"Reading e-mail ${ra.name.getOrElse("")}") *> extractMail(ctx, archive)(ra, pos) - .flatTap(_ => cleanupParents(ctx, ra, archive)) + .flatMap(cleanupParents(ctx, ra, archive)) case _ => ctx.logger.debug(s"Not an archive: ${mime.asString}") *> @@ -111,7 +111,7 @@ object ExtractArchive { ctx: Context[F, _], ra: RAttachment, archive: Option[RAttachmentArchive] - ): F[Unit] = + )(extracted: Extracted): F[Extracted] = archive match { case Some(_) => for { @@ -121,36 +121,37 @@ object ExtractArchive { _ <- ctx.store.transact(RAttachmentArchive.delete(ra.id)) _ <- ctx.store.transact(RAttachment.delete(ra.id)) _ <- ctx.store.bitpeace.delete(ra.fileId.id).compile.drain - } yield () + } yield extracted case None => for { _ <- ctx.logger.debug( s"Extracted attachment ${ra.name}. Remove it from the item." ) _ <- ctx.store.transact(RAttachment.delete(ra.id)) - } yield () + } yield extracted.copy(files = extracted.files.filter(_.id != ra.id)) } def extractZip[F[_]: ConcurrentEffect: ContextShift]( - ctx: Context[F, _], + ctx: Context[F, ProcessItemArgs], archive: Option[RAttachmentArchive] )(ra: RAttachment, pos: Int): F[Extracted] = { val zipData = ctx.store.bitpeace .get(ra.fileId.id) .unNoneTerminate .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - - zipData - .through(Zip.unzipP[F](8192, ctx.blocker)) - .zipWithIndex - .flatMap(handleEntry(ctx, ra, pos, archive, None)) - .foldMonoid - .compile - .lastOrError + val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all) + ctx.logger.debug(s"Filtering zip entries with '${glob.asString}'") *> + zipData + .through(Zip.unzipP[F](8192, ctx.blocker, glob)) + .zipWithIndex + .flatMap(handleEntry(ctx, ra, pos, archive, None)) + .foldMonoid + .compile + .lastOrError } def extractMail[F[_]: ConcurrentEffect: ContextShift]( - ctx: Context[F, _], + ctx: Context[F, ProcessItemArgs], archive: Option[RAttachmentArchive] )(ra: RAttachment, pos: Int): F[Extracted] = { val email: Stream[F, Byte] = ctx.store.bitpeace @@ -158,24 +159,26 @@ object ExtractArchive { .unNoneTerminate .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - email - .through(ReadMail.bytesToMail[F](ctx.logger)) - .flatMap { mail => - val mId = mail.header.messageId - val givenMeta = - for { - _ <- ctx.logger.debug(s"Use mail date for item date: ${mail.header.date}") - s <- Sync[F].delay(extractMailMeta(mail)) - } yield s + val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all) + ctx.logger.debug(s"Filtering email attachments with '${glob.asString}'") *> + email + .through(ReadMail.bytesToMail[F](ctx.logger)) + .flatMap { mail => + val mId = mail.header.messageId + val givenMeta = + for { + _ <- ctx.logger.debug(s"Use mail date for item date: ${mail.header.date}") + s <- Sync[F].delay(extractMailMeta(mail)) + } yield s - ReadMail - .mailToEntries(ctx.logger)(mail) - .zipWithIndex - .flatMap(handleEntry(ctx, ra, pos, archive, mId)) ++ Stream.eval(givenMeta) - } - .foldMonoid - .compile - .lastOrError + ReadMail + .mailToEntries(ctx.logger, glob)(mail) + .zipWithIndex + .flatMap(handleEntry(ctx, ra, pos, archive, mId)) ++ Stream.eval(givenMeta) + } + .foldMonoid + .compile + .lastOrError } def extractMailMeta[F[_]](mail: Mail[F]): Extracted = @@ -239,6 +242,9 @@ object ExtractArchive { positions ++ e.positions ) + def filterNames(filter: Glob): Extracted = + copy(files = files.filter(ra => filter.matches(ra.name.getOrElse("")))) + def setMeta(m: MetaProposal): Extracted = setMeta(MetaProposalList.of(m)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 56f3cd33..b6cc493e 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -25,6 +25,7 @@ object ProcessItem { .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) + .flatMap(RemoveEmptyItem(itemOps)) def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, diff --git a/modules/joex/src/main/scala/docspell/joex/process/RemoveEmptyItem.scala b/modules/joex/src/main/scala/docspell/joex/process/RemoveEmptyItem.scala new file mode 100644 index 00000000..75751672 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/RemoveEmptyItem.scala @@ -0,0 +1,26 @@ +package docspell.joex.process + +import cats.effect._ +import cats.implicits._ + +import docspell.backend.ops.OItem +import docspell.common._ +import docspell.joex.scheduler.Task + +object RemoveEmptyItem { + + def apply[F[_]: Sync]( + ops: OItem[F] + )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = + if (data.item.state.isInvalid && data.attachments.isEmpty) + Task { ctx => + for { + _ <- ctx.logger.warn(s"Removing item as it doesn't have any attachments!") + n <- ops.deleteItem(data.item.id, data.item.cid) + _ <- ctx.logger.warn(s"Removed item ($n). No item has been created!") + } yield data + } + else + Task.pure(data) + +} diff --git a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala index a01b1676..7a746b9f 100644 --- a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala @@ -256,7 +256,7 @@ object ScanMailboxTask { args.itemFolder, Seq.empty, true, - args.fileFilter, + args.fileFilter.getOrElse(Glob.all), args.tags.getOrElse(Nil) ) data = OUpload.UploadData( diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index e5191e07..2638dc28 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -311,14 +311,15 @@ trait Conversions { m.folder, validFileTypes, m.skipDuplicates.getOrElse(false), - m.fileFilter, + m.fileFilter.getOrElse(Glob.all), m.tags.map(_.items).getOrElse(Nil) ) ) ) ) .getOrElse( - (true, UploadMeta(None, sourceName, None, validFileTypes, false, None, Nil)).pure[F] + (true, UploadMeta(None, sourceName, None, validFileTypes, false, Glob.all, Nil)) + .pure[F] ) val files = mp.parts