From d712f8303d76d6ef38eb87b21c15dd607920f224 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sat, 9 Jan 2021 13:23:15 +0100 Subject: [PATCH 1/2] Make glob matching case-insensitive by default --- .../src/main/scala/docspell/common/Glob.scala | 116 ++++++++++-------- .../test/scala/docspell/common/GlobTest.scala | 64 ++++++---- .../scala/docspell/joex/mail/ReadMail.scala | 4 +- .../joex/process/ExtractArchive.scala | 4 +- .../joex/scanmailbox/ScanMailboxTask.scala | 2 +- 5 files changed, 110 insertions(+), 80 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/Glob.scala b/modules/common/src/main/scala/docspell/common/Glob.scala index afa04a02..6a41057e 100644 --- a/modules/common/src/main/scala/docspell/common/Glob.scala +++ b/modules/common/src/main/scala/docspell/common/Glob.scala @@ -8,7 +8,7 @@ import io.circe.{Decoder, Encoder} trait Glob { /** Matches the input string against this glob. */ - def matches(in: String): Boolean + def matches(caseSensitive: Boolean)(in: String): Boolean /** If this glob consists of multiple segments, it is the same as * `matches`. If it is only a single segment, it is matched against @@ -25,42 +25,6 @@ trait Glob { } object Glob { - private val separator = '/' - private val anyChar = '|' - - val all = new Glob { - def matches(in: String) = true - def matchFilenameOrPath(in: String) = true - val asString = "*" - } - - def pattern(pattern: Pattern): Glob = - PatternGlob(pattern) - - /** A simple glob supporting `*` and `?`. */ - final private case class PatternGlob(pattern: Pattern) extends Glob { - def matches(in: String): Boolean = - pattern.parts - .zipWith(Glob.split(in, Glob.separator))(_.matches(_)) - .forall(identity) - - def matchFilenameOrPath(in: String): Boolean = - if (pattern.parts.tail.isEmpty) matches(split(in, separator).last) - else matches(in) - - def asString: String = - pattern.asString - } - - final private case class AnyGlob(globs: NonEmptyList[Glob]) extends Glob { - def matches(in: String) = - globs.exists(_.matches(in)) - def matchFilenameOrPath(in: String) = - globs.exists(_.matchFilenameOrPath(in)) - def asString = - globs.toList.map(_.asString).mkString(anyChar.toString) - } - def apply(in: String): Glob = { def single(str: String) = PatternGlob(Pattern(split(str, separator).map(makeSegment))) @@ -75,6 +39,42 @@ object Glob { } } + private val separator = '/' + private val anyChar = '|' + + val all = new Glob { + def matches(caseSensitive: Boolean)(in: String) = true + def matchFilenameOrPath(in: String) = true + val asString = "*" + } + + def pattern(pattern: Pattern): Glob = + PatternGlob(pattern) + + /** A simple glob supporting `*` and `?`. */ + final private case class PatternGlob(pattern: Pattern) extends Glob { + def matches(caseSensitive: Boolean)(in: String): Boolean = + pattern.parts + .zipWith(Glob.split(in, Glob.separator))(_.matches(caseSensitive)(_)) + .forall(identity) + + def matchFilenameOrPath(in: String): Boolean = + if (pattern.parts.tail.isEmpty) matches(true)(split(in, separator).last) + else matches(true)(in) + + def asString: String = + pattern.asString + } + + final private case class AnyGlob(globs: NonEmptyList[Glob]) extends Glob { + def matches(caseSensitive: Boolean)(in: String) = + globs.exists(_.matches(caseSensitive)(in)) + def matchFilenameOrPath(in: String) = + globs.exists(_.matchFilenameOrPath(in)) + def asString = + globs.toList.map(_.asString).mkString(anyChar.toString) + } + case class Pattern(parts: NonEmptyList[Segment]) { def asString = parts.map(_.asString).toList.mkString(separator.toString) @@ -86,12 +86,12 @@ object Glob { } case class Segment(tokens: NonEmptyList[Token]) { - def matches(in: String): Boolean = - consume(in).exists(_.isEmpty) + def matches(caseSensitive: Boolean)(in: String): Boolean = + consume(in, caseSensitive).exists(_.isEmpty) - def consume(in: String): Option[String] = + def consume(in: String, caseSensitive: Boolean): Option[String] = tokens.foldLeft(in.some) { (rem, token) => - rem.flatMap(token.consume) + rem.flatMap(token.consume(caseSensitive)) } def asString: String = @@ -103,34 +103,47 @@ object Glob { } sealed trait Token { - def consume(str: String): Option[String] + def consume(caseSensitive: Boolean)(str: String): Option[String] def asString: String } object Token { case class Literal(asString: String) extends Token { - def consume(str: String): Option[String] = - if (str.startsWith(asString)) str.drop(asString.length).some + def consume(caseSensitive: Boolean)(str: String): Option[String] = + if (str.startsWith(asString, caseSensitive)) str.drop(asString.length).some else None } case class Until(value: String) extends Token { - def consume(str: String): Option[String] = + def consume(caseSensitive: Boolean)(str: String): Option[String] = if (value.isEmpty) Some("") else - str.indexOf(value) match { - case -1 => None - case n => str.substring(n + value.length).some - } + str + .findFirst(value, caseSensitive) + .map(n => str.substring(n + value.length)) val asString = s"*$value" } case object Single extends Token { - def consume(str: String): Option[String] = - if (str.isEmpty()) None + def consume(caseSensitive: Boolean)(str: String): Option[String] = + if (str.isEmpty) None else Some(str.drop(1)) val asString = "?" } + + implicit final class StringHelper(val str: String) extends AnyVal { + def findFirst(sub: String, caseSensitive: Boolean): Option[Int] = { + val vstr = if (caseSensitive) str else str.toLowerCase + val vsub = if (caseSensitive) sub else sub.toLowerCase + Option(vstr.indexOf(vsub)).filter(_ >= 0) + } + + def startsWith(prefix: String, caseSensitive: Boolean): Boolean = { + val vstr = if (caseSensitive) str else str.toLowerCase + val vprefix = if (caseSensitive) prefix else prefix.toLowerCase + vstr.startsWith(vprefix) + } + } } private def split(str: String, sep: Char): NonEmptyList[String] = @@ -139,6 +152,7 @@ object Glob { .getOrElse(NonEmptyList.of(str)) private def makeSegment(str: String): Segment = { + @annotation.tailrec def loop(rem: String, res: List[Token]): List[Token] = if (rem.isEmpty) res else diff --git a/modules/common/src/test/scala/docspell/common/GlobTest.scala b/modules/common/src/test/scala/docspell/common/GlobTest.scala index 8f228851..2e650174 100644 --- a/modules/common/src/test/scala/docspell/common/GlobTest.scala +++ b/modules/common/src/test/scala/docspell/common/GlobTest.scala @@ -6,8 +6,10 @@ import Glob._ object GlobTest extends SimpleTestSuite { test("literals") { - assert(Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches("hello")) - assert(!Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches("hello1")) + assert(Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches(true)("hello")) + assert( + !Glob.pattern(Pattern(Segment(Token.Literal("hello")))).matches(true)("hello1") + ) } test("single wildcards 1") { @@ -16,19 +18,19 @@ object GlobTest extends SimpleTestSuite { Pattern(Segment(Token.Literal("s"), Token.Until("p"), Token.Until("t"))) ) - assert(glob.matches("snapshot")) - assert(!glob.matches("snapshots")) + assert(glob.matches(true)("snapshot")) + assert(!glob.matches(true)("snapshots")) } test("single wildcards 2") { val glob = Glob.pattern(Pattern(Segment(Token.Literal("test."), Token.Until("")))) - assert(glob.matches("test.txt")) - assert(glob.matches("test.pdf")) - assert(glob.matches("test.converted.pdf")) - assert(!glob.matches("test1.txt")) - assert(!glob.matches("atest.txt")) + assert(glob.matches(true)("test.txt")) + assert(glob.matches(true)("test.pdf")) + assert(glob.matches(true)("test.converted.pdf")) + assert(!glob.matches(true)("test1.txt")) + assert(!glob.matches(true)("atest.txt")) } test("single parsing") { @@ -60,12 +62,12 @@ object GlobTest extends SimpleTestSuite { } test("with splitting") { - assert(Glob("a/b/*").matches("a/b/hello")) - assert(!Glob("a/b/*").matches("/a/b/hello")) - assert(Glob("/a/b/*").matches("/a/b/hello")) - assert(!Glob("/a/b/*").matches("a/b/hello")) - assert(!Glob("*/a/b/*").matches("a/b/hello")) - assert(Glob("*/a/b/*").matches("test/a/b/hello")) + assert(Glob("a/b/*").matches(true)("a/b/hello")) + assert(!Glob("a/b/*").matches(true)("/a/b/hello")) + assert(Glob("/a/b/*").matches(true)("/a/b/hello")) + assert(!Glob("/a/b/*").matches(true)("a/b/hello")) + assert(!Glob("*/a/b/*").matches(true)("a/b/hello")) + assert(Glob("*/a/b/*").matches(true)("test/a/b/hello")) } test("asString") { @@ -79,9 +81,9 @@ object GlobTest extends SimpleTestSuite { } test("simple matches") { - assert(Glob("/test.*").matches("/test.pdf")) - assert(!Glob("/test.*").matches("test.pdf")) - assert(!Glob("test.*").matches("/test.pdf")) + assert(Glob("/test.*").matches(true)("/test.pdf")) + assert(!Glob("/test.*").matches(true)("test.pdf")) + assert(!Glob("test.*").matches(true)("/test.pdf")) } test("matchFilenameOrPath") { @@ -100,12 +102,24 @@ object GlobTest extends SimpleTestSuite { } test("anyglob") { - assert(Glob("*.pdf|*.txt").matches("test.pdf")) - assert(Glob("*.pdf|*.txt").matches("test.txt")) - assert(!Glob("*.pdf|*.txt").matches("test.xls")) - assert(Glob("*.pdf | *.txt").matches("test.pdf")) - assert(Glob("*.pdf | mail.html").matches("test.pdf")) - assert(Glob("*.pdf | mail.html").matches("mail.html")) - assert(!Glob("*.pdf | mail.html").matches("test.docx")) + assert(Glob("*.pdf|*.txt").matches(true)("test.pdf")) + assert(Glob("*.pdf|*.txt").matches(true)("test.txt")) + assert(!Glob("*.pdf|*.txt").matches(true)("test.xls")) + assert(Glob("*.pdf | *.txt").matches(true)("test.pdf")) + assert(Glob("*.pdf | mail.html").matches(true)("test.pdf")) + assert(Glob("*.pdf | mail.html").matches(true)("mail.html")) + assert(!Glob("*.pdf | mail.html").matches(true)("test.docx")) + } + + test("case insensitive") { + assert(Glob("*hello*").matches(false)("hello world")) + assert(Glob("*hello*").matches(false)("world hello")) + assert(Glob("*hello*").matches(false)("Hello world")) + assert(Glob("*hello*").matches(false)("world Hello")) + assert(Glob("*hello*").matches(false)("World Hello")) + assert(Glob("*hello*").matches(false)("Hello World")) + assert(Glob("*Hello*").matches(false)("world hello")) + assert(Glob("*heLLo*").matches(false)("Hello world")) + assert(Glob("*hellO*").matches(false)("world Hello")) } } diff --git a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala index 8242df94..ce5bd3ca 100644 --- a/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala +++ b/modules/joex/src/main/scala/docspell/joex/mail/ReadMail.scala @@ -51,11 +51,11 @@ object ReadMail { (Stream .eval(bodyEntry) .flatMap(e => Stream.emits(e.toSeq)) - .filter(a => glob.matches(a.name)) ++ + .filter(a => glob.matches(caseSensitive = false)(a.name)) ++ Stream .eval(TnefExtract.replace(mail)) .flatMap(m => Stream.emits(m.attachments.all)) - .filter(a => a.filename.exists(glob.matches)) + .filter(a => a.filename.exists(glob.matches(caseSensitive = false))) .map(a => Binary(a.filename.getOrElse("noname"), a.mimeType.toLocal, a.content) )) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index b684ded9..c48952e2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -243,7 +243,9 @@ object ExtractArchive { ) def filterNames(filter: Glob): Extracted = - copy(files = files.filter(ra => filter.matches(ra.name.getOrElse("")))) + copy(files = + files.filter(ra => filter.matches(caseSensitive = false)(ra.name.getOrElse(""))) + ) def setMeta(m: MetaProposal): Extracted = setMeta(MetaProposalList.of(m)) diff --git a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala index 4390da3a..ed038926 100644 --- a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala @@ -182,7 +182,7 @@ object ScanMailboxTask { ctx.args.subjectFilter match { case Some(sf) => def check(mh: MailHeader): F[Option[MailHeader]] = - if (sf.matches(mh.subject)) + if (sf.matches(caseSensitive = false)(mh.subject)) ctx.logger.debug( s"Including mail '${mh.subject}', it matches the filter." ) *> Option(mh).pure[F] From bddafa7d28e59b90637b09f77541c2cb2d5ead7f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sat, 9 Jan 2021 14:13:02 +0100 Subject: [PATCH 2/2] Fix looping over already seen mails when they are skipped When skipping mails due to a filter, it must still enter the post-handling step. Otherwise it will be seen again on next run. Issue: #551 --- .../joex/scanmailbox/ScanMailboxTask.scala | 97 +++++++++++-------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala index ed038926..71bf3902 100644 --- a/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/scanmailbox/ScanMailboxTask.scala @@ -135,15 +135,21 @@ object ScanMailboxTask { final private class Impl[F[_]: Sync](cfg: Config.ScanMailbox, ctx: Context[F, Args]) { + private def logOp[C](f: Logger[F] => F[Unit]): MailOp[F, C, Unit] = + MailOp(_ => f(ctx.logger)) + def handleFolder[C](a: Access[F, C], upload: OUpload[F])( name: String ): MailOp[F, C, ScanResult] = for { - _ <- Kleisli.liftF(ctx.logger.info(s"Processing folder $name")) - folder <- requireFolder(a)(name) - search <- searchMails(a)(folder) - headers <- Kleisli.liftF(filterSubjects(search.mails).flatMap(filterMessageIds)) - _ <- headers.traverse(handleOne(ctx.args, a, upload)) + _ <- Kleisli.liftF(ctx.logger.info(s"Processing folder $name")) + folder <- requireFolder(a)(name) + search <- searchMails(a)(folder) + items = search.mails.map(MailHeaderItem(_)) + headers <- Kleisli.liftF( + filterSubjects(items).flatMap(filterMessageIds) + ) + _ <- headers.traverse(handleOne(ctx.args, a, upload)) } yield ScanResult(name, search.mails.size, search.count - search.mails.size) def requireFolder[C](a: Access[F, C])(name: String): MailOp[F, C, MailFolder] = @@ -163,43 +169,39 @@ object ScanMailboxTask { } for { - _ <- Kleisli.liftF( - ctx.logger.debug( - s"Searching next ${cfg.mailBatchSize} mails in ${folder.name}." - ) + _ <- logOp( + _.debug(s"Searching next ${cfg.mailBatchSize} mails in ${folder.name}.") ) query <- Kleisli.liftF(q) mails <- a.search(folder, cfg.mailBatchSize)(query) - _ <- Kleisli.liftF( - ctx.logger.debug( + _ <- logOp( + _.debug( s"Found ${mails.count} mails in folder. Reading first ${mails.mails.size}" ) ) } yield mails } - def filterSubjects(headers: Vector[MailHeader]): F[Vector[MailHeader]] = + def filterSubjects(headers: Vector[MailHeaderItem]): F[Vector[MailHeaderItem]] = ctx.args.subjectFilter match { case Some(sf) => - def check(mh: MailHeader): F[Option[MailHeader]] = - if (sf.matches(caseSensitive = false)(mh.subject)) - ctx.logger.debug( - s"Including mail '${mh.subject}', it matches the filter." - ) *> Option(mh).pure[F] + def check(mh: MailHeaderItem): F[MailHeaderItem] = + if (mh.notProcess || sf.matches(caseSensitive = false)(mh.mh.subject)) + mh.pure[F] else ctx.logger.debug( - s"Excluding mail '${mh.subject}', it doesn't match the filter." - ) *> (None: Option[MailHeader]).pure[F] + s"Excluding mail '${mh.mh.subject}', it doesn't match the subject filter." + ) *> mh.skip.pure[F] ctx.logger.info( s"Filtering mails on subject using filter: ${sf.asString}" - ) *> headers.traverseFilter(check) + ) *> headers.traverse(check) case None => ctx.logger.debug("Not matching on subjects. No filter given") *> headers.pure[F] } - def filterMessageIds(headers: Vector[MailHeader]): F[Vector[MailHeader]] = - NonEmptyList.fromFoldable(headers.flatMap(_.messageId)) match { + def filterMessageIds[C](headers: Vector[MailHeaderItem]): F[Vector[MailHeaderItem]] = + NonEmptyList.fromFoldable(headers.flatMap(_.mh.messageId)) match { case Some(nl) => for { archives <- ctx.store.transact( @@ -207,12 +209,14 @@ object ScanMailboxTask { .findByMessageIdAndCollective(nl, ctx.args.account.collective) ) existing = archives.flatMap(_.messageId).toSet - mails = headers.filterNot(mh => mh.messageId.forall(existing.contains)) - _ <- headers.size - mails.size match { - case 0 => ().pure[F] - case n => - ctx.logger.info(s"Excluded $n mails since items for them already exist.") - } + mails <- headers + .traverse(mh => + if (mh.process && mh.mh.messageId.forall(existing.contains)) + ctx.logger.debug( + s"Excluding mail '${mh.mh.subject}' it has been imported in the past.'" + ) *> mh.skip.pure[F] + else mh.pure[F] + ) } yield mails case None => @@ -248,15 +252,21 @@ object ScanMailboxTask { def postHandle[C](a: Access[F, C])(mh: MailHeader): MailOp[F, C, Unit] = ctx.args.targetFolder match { case Some(tf) => - a.getOrCreateFolder(None, tf).flatMap(folder => a.moveMail(mh, folder)) + logOp(_.debug(s"Post handling mail: ${mh.subject} - moving to folder: $tf")) + .flatMap(_ => + a.getOrCreateFolder(None, tf).flatMap(folder => a.moveMail(mh, folder)) + ) case None if ctx.args.deleteMail => - a.deleteMail(mh).flatMapF { r => - if (r.count == 0) - ctx.logger.warn(s"Mail '${mh.subject}' could not be deleted") - else ().pure[F] - } + logOp(_.debug(s"Post handling mail: ${mh.subject} - deleting mail.")).flatMap( + _ => + a.deleteMail(mh).flatMapF { r => + if (r.count == 0) + ctx.logger.warn(s"Mail could not be deleted!") + else ().pure[F] + } + ) case None => - MailOp.pure(()) + logOp(_.debug(s"Post handling mail: ${mh.subject} - no handling defined!")) } def submitMail(upload: OUpload[F], args: Args)( @@ -292,13 +302,15 @@ object ScanMailboxTask { } def handleOne[C](args: Args, a: Access[F, C], upload: OUpload[F])( - mh: MailHeader + mh: MailHeaderItem ): MailOp[F, C, Unit] = for { - mail <- a.loadMail(mh) + mail <- a.loadMail(mh.mh) res <- mail match { - case Some(m) => + case Some(m) if mh.process => Kleisli.liftF(submitMail(upload, args)(m).attempt) + case Some(_) => + Kleisli.liftF(Either.right(mh).pure[F]) case None => MailOp.pure[F, C, Either[Throwable, OUpload.UploadResult]]( Either.left(new Exception(s"Mail not found")) @@ -307,10 +319,15 @@ object ScanMailboxTask { _ <- res.fold( ex => Kleisli.liftF( - ctx.logger.warn(s"Error submitting '${mh.subject}': ${ex.getMessage}") + ctx.logger.warn(s"Error submitting '${mh.mh.subject}': ${ex.getMessage}") ), - _ => postHandle(a)(mh) + _ => postHandle(a)(mh.mh) ) } yield () } + + case class MailHeaderItem(mh: MailHeader, process: Boolean = true) { + def skip = copy(process = false) + def notProcess = !process + } }