Improve performance of zip/unzip

Adds tests and includes some cleanup
This commit is contained in:
eikek
2022-06-18 16:37:38 +02:00
parent 483dbf5d2b
commit 6cef9d4f07
24 changed files with 711 additions and 293 deletions

View File

@ -7,19 +7,16 @@
package docspell.files
import cats.data.OptionT
import cats.effect.Sync
import cats.effect.{Async, Sync}
import cats.syntax.all._
import fs2.Stream
import fs2.Pipe
import fs2.io.file.{Files, Path}
import docspell.common.{MimeType, MimeTypeHint}
import io.circe.Encoder
import io.circe.syntax._
import docspell.common.{Binary, MimeType, MimeTypeHint}
trait FileSupport {
implicit final class FileOps[F[_]: Files: Sync](self: Path) {
def detectMime: F[Option[MimeType]] =
implicit final class FileOps(self: Path) {
def detectMime[F[_]: Files: Sync]: F[Option[MimeType]] =
Files[F].isReadable(self).flatMap { flag =>
OptionT
.whenF(flag) {
@ -32,30 +29,18 @@ trait FileSupport {
.value
}
def asTextFile(alt: MimeType => F[Unit]): F[Option[Path]] =
OptionT(detectMime).flatMapF { mime =>
if (mime.matches(MimeType.text("plain"))) self.some.pure[F]
else alt(mime).as(None: Option[Path])
}.value
def readText: F[String] =
Files[F]
.readAll(self)
.through(fs2.text.utf8.decode)
.compile
.string
def readAll: Stream[F, Byte] =
Files[F].readAll(self)
def writeJson[A: Encoder](value: A): F[Unit] =
Stream
.emit(value.asJson.noSpaces)
.through(fs2.text.utf8.encode)
.through(Files[F].writeAll(self))
.compile
.drain
def mimeType[F[_]: Files: Sync]: F[MimeType] =
detectMime.map(_.getOrElse(MimeType.octetStream))
}
def detectMime[F[_]: Sync]: Pipe[F, Binary[F], Binary[F]] =
_.evalMap { bin =>
val hint = MimeTypeHint.filename(bin.name).withAdvertised(bin.mime.asString)
TikaMimetype.detect[F](bin.data, hint).map(mt => bin.copy(mime = mt))
}
def toBinaryWithMime[F[_]: Async]: Pipe[F, Path, Binary[F]] =
_.evalMap(file => file.mimeType.map(mt => Binary(file).copy(mime = mt)))
}
object FileSupport extends FileSupport

View File

@ -1,180 +0,0 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.files
import java.io.InputStream
import java.nio.charset.StandardCharsets
import java.util.zip.{ZipEntry, ZipInputStream, ZipOutputStream}
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.io.file.{Files, Path}
import fs2.{Pipe, Stream}
import docspell.common.Binary
import docspell.common.Glob
import docspell.logging.Logger
/** Stream-based helpers for creating and extracting ZIP archives.
  *
  * Zipping is backed by `java.util.zip.ZipOutputStream` and unzipping by
  * `java.util.zip.ZipInputStream`; both are bridged into fs2 via
  * `fs2.io.readOutputStream` / `fs2.io.toInputStream`.
  */
object Zip {

  /** Pipe turning `(entryName, bytes)` pairs into a single ZIP byte stream.
    * Entry names are first made unique via [[deduplicate]], since a ZIP
    * archive must not contain two entries with the same name.
    */
  def zip[F[_]: Async](
      logger: Logger[F],
      chunkSize: Int
  ): Pipe[F, (String, Stream[F, Byte]), Byte] =
    in => zipJava(logger, chunkSize, in.through(deduplicate))

  /** Pipe decoding a ZIP byte stream into its entries, keeping only the
    * entries whose name matches `glob`.
    */
  def unzip[F[_]: Async](
      chunkSize: Int,
      glob: Glob
  ): Pipe[F, Byte, Binary[F]] =
    s => unzipStream[F](chunkSize, glob)(s)

  /** Like [[unzip]], but applied directly to a byte stream: the bytes are
    * adapted to a (blocking) `InputStream` and handed to [[unzipJava]].
    */
  def unzipStream[F[_]: Async](chunkSize: Int, glob: Glob)(
      data: Stream[F, Byte]
  ): Stream[F, Binary[F]] =
    data
      .through(fs2.io.toInputStream[F])
      .flatMap(in => unzipJava(in, chunkSize, glob))

  /** Writes every binary into `targetDir`, creating missing parent
    * directories as needed. Entries whose name ends in "/" (directory
    * markers) are skipped. When `moveUp` is true and the archive extracted
    * to a single top-level directory, that directory's contents are moved up
    * into `targetDir` afterwards. Emits `targetDir` once, after all entries
    * have been written.
    */
  def saveTo[F[_]: Async](
      logger: Logger[F],
      targetDir: Path,
      moveUp: Boolean
  ): Pipe[F, Binary[F], Path] =
    binaries =>
      binaries
        .filter(e => !e.name.endsWith("/"))
        .evalMap { entry =>
          val out = targetDir / entry.name
          // create the parent directory only when it does not exist yet
          val createParent =
            OptionT
              .fromOption[F](out.parent)
              .flatMapF(parent =>
                Files[F]
                  .exists(parent)
                  .map(flag => Option.when(!flag)(parent))
              )
              .semiflatMap(p => Files[F].createDirectories(p))
              .getOrElse(())
          logger.trace(s"Unzip ${entry.name} -> $out") *>
            createParent *>
            entry.data.through(Files[F].writeAll(out)).compile.drain
        }
        .drain ++ Stream
        .eval(if (moveUp) moveContentsUp(logger)(targetDir) else ().pure[F])
        .as(targetDir)

  /** If `dir` contains exactly one entry and that entry is a directory,
    * moves the children of that directory directly into `dir`; otherwise a
    * no-op. Only the first two listed entries are inspected (`take(2)`) to
    * decide whether there is exactly one.
    */
  private def moveContentsUp[F[_]: Sync: Files](logger: Logger[F])(dir: Path): F[Unit] =
    Files[F]
      .list(dir)
      .take(2)
      .compile
      .toList
      .flatMap {
        case subdir :: Nil =>
          Files[F].isDirectory(subdir).flatMap {
            case false => ().pure[F]
            case true =>
              Files[F]
                .list(subdir)
                .filter(p => p != dir)
                .evalTap(c => logger.trace(s"Move $c -> ${dir / c.fileName}"))
                .evalMap(child => Files[F].move(child, dir / child.fileName))
                .compile
                .drain
          }
        case _ =>
          ().pure[F]
      }

  /** Reads ZIP entries from `in`, emitting one [[Binary]] per entry whose
    * name matches `glob`. Each entry is bracketed by a `Resource` so
    * `closeEntry` runs after use; the stream terminates when `getNextEntry`
    * returns null (lifted to `None` by `Option(...)`).
    *
    * NOTE(review): every emitted `Binary` reads lazily from the single
    * shared `ZipInputStream` (`closeAfterUse = false`), so an entry's data
    * must be consumed before the next entry is pulled.
    */
  def unzipJava[F[_]: Async](
      in: InputStream,
      chunkSize: Int,
      glob: Glob
  ): Stream[F, Binary[F]] = {
    val zin = new ZipInputStream(in)

    val nextEntry = Resource.make(Sync[F].delay(Option(zin.getNextEntry))) {
      case Some(_) => Sync[F].delay(zin.closeEntry())
      case None => ().pure[F]
    }

    Stream
      .resource(nextEntry)
      .repeat
      .unNoneTerminate
      .filter(ze => glob.matchFilenameOrPath(ze.getName()))
      .map { ze =>
        val name = ze.getName()
        val data =
          fs2.io.readInputStream[F]((zin: InputStream).pure[F], chunkSize, false)
        Binary(name, data)
      }
  }

  /** Rewrites entry names so duplicates become unique by appending a counter
    * before the file extension (e.g. a second "a.txt" becomes "a_1.txt", a
    * name without an extension gets "_1" appended). Names already emitted
    * are tracked in a `Ref[F, Set[String]]`.
    */
  private def deduplicate[F[_]: Sync, A]: Pipe[F, (String, A), (String, A)] = {
    // insert `count` before the last '.' (only if the dot is not the first
    // character); count <= 0 means "first occurrence", name is unchanged
    def makeName(name: String, count: Int): String =
      if (count <= 0) name
      else
        name.lastIndexOf('.') match {
          case n if n > 0 =>
            s"${name.substring(0, n)}_$count${name.substring(n)}"
          case _ =>
            s"${name}_$count"
        }

    // bump the counter until the candidate name is unused, then record it
    def unique(
        current: Set[String],
        name: String,
        counter: Int
    ): (Set[String], String) = {
      val nextName = makeName(name, counter)
      if (current.contains(nextName))
        unique(current, name, counter + 1)
      else (current + nextName, nextName)
    }

    in =>
      Stream
        .eval(Ref.of[F, Set[String]](Set.empty[String]))
        .flatMap { ref =>
          in.evalMap { element =>
            ref
              .modify(names => unique(names, element._1, 0))
              .map(n => (n, element._2))
          }
        }
  }

  /** Writes the given entries through a `ZipOutputStream` (UTF-8 entry
    * names) and exposes the produced bytes as a stream via
    * `fs2.io.readOutputStream` with the given chunk size. Each entry is
    * opened with `putNextEntry` and bracketed by a `Resource` so
    * `closeEntry` runs even if writing the entry's bytes fails; the
    * underlying `ZipOutputStream` is closed when the entry stream finishes.
    */
  def zipJava[F[_]: Async](
      logger: Logger[F],
      chunkSize: Int,
      entries: Stream[F, (String, Stream[F, Byte])]
  ): Stream[F, Byte] =
    fs2.io.readOutputStream(chunkSize) { out =>
      val zip = new ZipOutputStream(out, StandardCharsets.UTF_8)
      val writeEntries =
        entries.evalMap { case (name, bytes) =>
          // closeAfterUse = false: the shared ZipOutputStream must stay open
          // across entries; it is closed once at the end via closeStream
          val javaOut =
            bytes.through(
              fs2.io.writeOutputStream[F](Sync[F].pure(zip), closeAfterUse = false)
            )
          val nextEntry =
            logger.debug(s"Adding $name to zip file…") *>
              Sync[F].delay(zip.putNextEntry(new ZipEntry(name)))
          Resource
            .make(nextEntry)(_ => Sync[F].delay(zip.closeEntry()))
            .use(_ => javaOut.compile.drain)
        }
      val closeStream = Sync[F].delay(zip.close())

      writeEntries.onFinalize(closeStream).compile.drain
    }
}

View File

@ -7,10 +7,12 @@
package docspell.files
import cats.effect._
import cats.implicits._
import cats.syntax.option._
import fs2.Stream
import fs2.io.file.{Files, Path}
import docspell.common.Glob
import docspell.common.syntax.file._
import docspell.common.util.Zip
import docspell.logging.TestLoggingConfig
import munit._
@ -21,29 +23,101 @@ class ZipTest extends CatsEffectSuite with TestLoggingConfig {
Files[IO].tempDirectory(Path("target").some, "zip-test-", None)
)
test("unzip") {
tempDir.test("unzip") { dir =>
val zipFile = ExampleFiles.letters_zip.readURL[IO](8192)
val unzip = zipFile.through(Zip.unzip(8192, Glob.all))
val unzip: Stream[IO, Path] = zipFile
.through(Zip[IO](logger.some, dir.some).unzip(8192))
unzip
.evalMap { entry =>
val x = entry.data.map(_ => 1).foldMonoid.compile.lastOrError
x.map { size =>
if (entry.name.endsWith(".pdf")) {
assertEquals(entry.name, "letter-de.pdf")
assertEquals(size, 34815)
} else {
assertEquals(entry.name, "letter-en.txt")
assertEquals(size, 1131)
}
(for {
file <- unzip
length <- Stream.eval(Files[IO].size(file))
sha <- Stream.eval(file.sha256Hex[IO])
_ = {
if (file.name == "letter-de.pdf") {
assertEquals(length, 34815L)
assertEquals(
sha,
"299c15429ce327099c322b36caaec56e7a6034106531c5d1b3fd085467a8d495"
)
} else {
assertEquals(file.name, "letter-en.txt")
assertEquals(length, 1131L)
assertEquals(
sha,
"55eca47c65084126d7c3bbce941cadff0f642a7287ff8e0f3fc9c2c33a4bb7f0"
)
}
}
} yield ()).compile.drain
}
tempDir.test("unzip directories and files") { dir =>
val zipFile = ExampleFiles.zip_dirs_zip.readURL[IO](8192)
val unzip: Stream[IO, Path] = zipFile
.through(Zip[IO](logger.some, dir.some).unzip(8192))
val entries =
for {
file <- unzip
sha <- Stream.eval(file.sha256Hex[IO])
} yield (file.name, file, sha)
val expectedSha =
"5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03"
entries
.map {
case ("file1.txt", file, realSha) =>
assertEquals(realSha, expectedSha)
val relFile = dir.relativize(file).dropLeft(1)
assertEquals(relFile.toString, "file1.txt")
case ("file2.txt", file, realSha) =>
assertEquals(realSha, expectedSha)
val relFile = dir.relativize(file).dropLeft(1)
assertEquals(relFile.toString, "dir1/file2.txt")
case ("file3.txt", file, realSha) =>
assertEquals(realSha, expectedSha)
val relFile = dir.relativize(file).dropLeft(1)
assertEquals(relFile.toString, "dir1/dir11/file3.txt")
case ("file4.txt", file, realSha) =>
assertEquals(realSha, expectedSha)
val relFile = dir.relativize(file).dropLeft(1)
assertEquals(relFile.toString, "dir2/file4.txt")
case (name, _, _) =>
fail(s"Unexpected file: $name")
}
.compile
.drain
}
tempDir.test("unzipTo directory tree") { _ =>
// val zipFile = ExampleFiles.zip_dirs_zip.readURL[IO](8192)
// zipFile.through(Zip.unzip(G))
}
// tempDir.test("test runtime") { _ =>
// val archive = Path("./local/large-archive.zip")
//
// for {
//
// timer1 <- Duration.stopTime[IO]
// es1 <- Files[IO]
// .readAll(archive)
// .through(Zip[IO]().unzip(64 * 1024))
// .compile
// .toVector
// duration1 <- timer1
//
// timer2 <- Duration.stopTime[IO]
// es2 <- fs2.Stream
// .emit(archive)
// .covary[IO]
// .through(Zip[IO]().unzipFiles(64 * 1024))
// .compile
// .toVector
// duration2 <- timer2
//
// _ <- IO.println(s">>>>1. ${duration1.formatExact}, entries: $es1")
// _ <- IO.println(s">>>>2. ${duration2.formatExact}, entries: $es2")
// } yield ()
// }
}