Improve performance of zip/unzip
Adds tests and includes some cleanup
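The diffs below show the shape of the change: the old docspell.files.Zip exposed static pipes that emitted in-memory Binary entries, while the reworked docspell.common.util.Zip (as exercised by the updated tests) is constructed with a logger and a target directory and emits extracted file paths. A condensed sketch of the two call styles; zipBytes, logger, and dir are illustrative placeholders, and the new Zip's constructor signature is inferred from the test code only:

import cats.effect.IO
import cats.syntax.option._
import fs2.Stream
import fs2.io.file.Path

import docspell.common.{Binary, Glob}
import docspell.logging.Logger

// Old API: a static pipe; entries are emitted as in-memory Binary values.
def before(zipBytes: Stream[IO, Byte]): Stream[IO, Binary[IO]] =
  zipBytes.through(docspell.files.Zip.unzip[IO](8192, Glob.all))

// New API (per the updated tests): a configured instance that unpacks
// into `dir` and emits the extracted file paths.
def after(zipBytes: Stream[IO, Byte], logger: Logger[IO], dir: Path): Stream[IO, Path] =
  zipBytes.through(docspell.common.util.Zip[IO](logger.some, dir.some).unzip(8192))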
@@ -7,19 +7,16 @@
 package docspell.files
 
 import cats.data.OptionT
-import cats.effect.Sync
+import cats.effect.{Async, Sync}
 import cats.syntax.all._
-import fs2.Stream
+import fs2.Pipe
 import fs2.io.file.{Files, Path}
 
-import docspell.common.{MimeType, MimeTypeHint}
-
-import io.circe.Encoder
-import io.circe.syntax._
+import docspell.common.{Binary, MimeType, MimeTypeHint}
 
 trait FileSupport {
-  implicit final class FileOps[F[_]: Files: Sync](self: Path) {
-    def detectMime: F[Option[MimeType]] =
+  implicit final class FileOps(self: Path) {
+    def detectMime[F[_]: Files: Sync]: F[Option[MimeType]] =
       Files[F].isReadable(self).flatMap { flag =>
         OptionT
           .whenF(flag) {
@@ -32,30 +29,18 @@ trait FileSupport {
           .value
       }
 
-    def asTextFile(alt: MimeType => F[Unit]): F[Option[Path]] =
-      OptionT(detectMime).flatMapF { mime =>
-        if (mime.matches(MimeType.text("plain"))) self.some.pure[F]
-        else alt(mime).as(None: Option[Path])
-      }.value
-
-    def readText: F[String] =
-      Files[F]
-        .readAll(self)
-        .through(fs2.text.utf8.decode)
-        .compile
-        .string
-
-    def readAll: Stream[F, Byte] =
-      Files[F].readAll(self)
-
-    def writeJson[A: Encoder](value: A): F[Unit] =
-      Stream
-        .emit(value.asJson.noSpaces)
-        .through(fs2.text.utf8.encode)
-        .through(Files[F].writeAll(self))
-        .compile
-        .drain
+    def mimeType[F[_]: Files: Sync]: F[MimeType] =
+      detectMime.map(_.getOrElse(MimeType.octetStream))
   }
+
+  def detectMime[F[_]: Sync]: Pipe[F, Binary[F], Binary[F]] =
+    _.evalMap { bin =>
+      val hint = MimeTypeHint.filename(bin.name).withAdvertised(bin.mime.asString)
+      TikaMimetype.detect[F](bin.data, hint).map(mt => bin.copy(mime = mt))
+    }
+
+  def toBinaryWithMime[F[_]: Async]: Pipe[F, Path, Binary[F]] =
+    _.evalMap(file => file.mimeType.map(mt => Binary(file).copy(mime = mt)))
 }
 
 object FileSupport extends FileSupport
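As a usage sketch of the reworked extension (not part of the commit; the helper name is hypothetical, and fs2's Files[F].list is assumed for the directory walk):

import cats.effect.Async
import fs2.Stream
import fs2.io.file.{Files, Path}

import docspell.common.MimeType
import docspell.files.FileSupport._

// Hypothetical helper: list a directory and pair each file with its
// detected MIME type via the new toBinaryWithMime pipe.
def detectAll[F[_]: Async: Files](dir: Path): Stream[F, (String, MimeType)] =
  Files[F]
    .list(dir)
    .through(toBinaryWithMime[F])
    .map(bin => (bin.name, bin.mime))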

@@ -1,180 +0,0 @@
/*
 * Copyright 2020 Eike K. & Contributors
 *
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

package docspell.files

import java.io.InputStream
import java.nio.charset.StandardCharsets
import java.util.zip.{ZipEntry, ZipInputStream, ZipOutputStream}

import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.io.file.{Files, Path}
import fs2.{Pipe, Stream}

import docspell.common.Binary
import docspell.common.Glob
import docspell.logging.Logger

object Zip {

  def zip[F[_]: Async](
      logger: Logger[F],
      chunkSize: Int
  ): Pipe[F, (String, Stream[F, Byte]), Byte] =
    in => zipJava(logger, chunkSize, in.through(deduplicate))
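    // (Names are deduplicated first because java.util.zip.ZipOutputStream
    // rejects duplicate entry names.)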

  def unzip[F[_]: Async](
      chunkSize: Int,
      glob: Glob
  ): Pipe[F, Byte, Binary[F]] =
    s => unzipStream[F](chunkSize, glob)(s)

  def unzipStream[F[_]: Async](chunkSize: Int, glob: Glob)(
      data: Stream[F, Byte]
  ): Stream[F, Binary[F]] =
    data
      .through(fs2.io.toInputStream[F])
      .flatMap(in => unzipJava(in, chunkSize, glob))

  def saveTo[F[_]: Async](
      logger: Logger[F],
      targetDir: Path,
      moveUp: Boolean
  ): Pipe[F, Binary[F], Path] =
    binaries =>
      binaries
        .filter(e => !e.name.endsWith("/"))
        .evalMap { entry =>
          val out = targetDir / entry.name
          val createParent =
            OptionT
              .fromOption[F](out.parent)
              .flatMapF(parent =>
                Files[F]
                  .exists(parent)
                  .map(flag => Option.when(!flag)(parent))
              )
              .semiflatMap(p => Files[F].createDirectories(p))
              .getOrElse(())

          logger.trace(s"Unzip ${entry.name} -> $out") *>
            createParent *>
            entry.data.through(Files[F].writeAll(out)).compile.drain
        }
        .drain ++ Stream
        .eval(if (moveUp) moveContentsUp(logger)(targetDir) else ().pure[F])
        .as(targetDir)
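
  // If the directory contains exactly one entry and that entry is a directory,
  // move its children up into `dir` (flattens archives wrapped in one folder).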
  private def moveContentsUp[F[_]: Sync: Files](logger: Logger[F])(dir: Path): F[Unit] =
    Files[F]
      .list(dir)
      .take(2)
      .compile
      .toList
      .flatMap {
        case subdir :: Nil =>
          Files[F].isDirectory(subdir).flatMap {
            case false => ().pure[F]
            case true =>
              Files[F]
                .list(subdir)
                .filter(p => p != dir)
                .evalTap(c => logger.trace(s"Move $c -> ${dir / c.fileName}"))
                .evalMap(child => Files[F].move(child, dir / child.fileName))
                .compile
                .drain
          }

        case _ =>
          ().pure[F]
      }

  def unzipJava[F[_]: Async](
      in: InputStream,
      chunkSize: Int,
      glob: Glob
  ): Stream[F, Binary[F]] = {
    val zin = new ZipInputStream(in)

    val nextEntry = Resource.make(Sync[F].delay(Option(zin.getNextEntry))) {
      case Some(_) => Sync[F].delay(zin.closeEntry())
      case None    => ().pure[F]
    }
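
    // Each entry's bytes are read from the one shared ZipInputStream, so the
    // stream of entries must be consumed sequentially, draining an entry's
    // data before the next entry is requested (closeAfterUse = false below
    // keeps `zin` open across entries).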

    Stream
      .resource(nextEntry)
      .repeat
      .unNoneTerminate
      .filter(ze => glob.matchFilenameOrPath(ze.getName()))
      .map { ze =>
        val name = ze.getName()
        val data =
          fs2.io.readInputStream[F]((zin: InputStream).pure[F], chunkSize, false)
        Binary(name, data)
      }
  }

  private def deduplicate[F[_]: Sync, A]: Pipe[F, (String, A), (String, A)] = {
    def makeName(name: String, count: Int): String =
      if (count <= 0) name
      else
        name.lastIndexOf('.') match {
          case n if n > 0 =>
            s"${name.substring(0, n)}_$count${name.substring(n)}"
          case _ =>
            s"${name}_$count"
        }
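    // e.g. makeName("letter.pdf", 1) == "letter_1.pdf" (the counter is
    // inserted before the extension), and makeName("archive", 2) == "archive_2".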

    def unique(
        current: Set[String],
        name: String,
        counter: Int
    ): (Set[String], String) = {
      val nextName = makeName(name, counter)
      if (current.contains(nextName))
        unique(current, name, counter + 1)
      else (current + nextName, nextName)
    }

    in =>
      Stream
        .eval(Ref.of[F, Set[String]](Set.empty[String]))
        .flatMap { ref =>
          in.evalMap { element =>
            ref
              .modify(names => unique(names, element._1, 0))
              .map(n => (n, element._2))
          }
        }
  }

  def zipJava[F[_]: Async](
      logger: Logger[F],
      chunkSize: Int,
      entries: Stream[F, (String, Stream[F, Byte])]
  ): Stream[F, Byte] =
    fs2.io.readOutputStream(chunkSize) { out =>
      val zip = new ZipOutputStream(out, StandardCharsets.UTF_8)
      val writeEntries =
        entries.evalMap { case (name, bytes) =>
          val javaOut =
            bytes.through(
              fs2.io.writeOutputStream[F](Sync[F].pure(zip), closeAfterUse = false)
            )
          val nextEntry =
            logger.debug(s"Adding $name to zip file…") *>
              Sync[F].delay(zip.putNextEntry(new ZipEntry(name)))
          Resource
            .make(nextEntry)(_ => Sync[F].delay(zip.closeEntry()))
            .use(_ => javaOut.compile.drain)
        }
      val closeStream = Sync[F].delay(zip.close())

      writeEntries.onFinalize(closeStream).compile.drain
    }
}
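Taken together, a sketch of how these (now deleted) pipes composed; the wiring and names below are illustrative, not from the commit:

import cats.effect.IO
import fs2.Stream
import fs2.io.file.Path

import docspell.common.Glob
import docspell.logging.Logger

// Create an archive from named byte streams; duplicate names are renamed
// by the deduplicate pipe (the second "a.txt" becomes "a_1.txt").
def makeArchive(logger: Logger[IO]): Stream[IO, Byte] =
  Stream(
    "a.txt" -> Stream.emits("hello".getBytes.toList).covary[IO],
    "a.txt" -> Stream.emits("world".getBytes.toList).covary[IO]
  ).through(Zip.zip[IO](logger, chunkSize = 8192))

// Unpack a zip byte stream below ./out, flattening a single wrapping
// directory; saveTo emits the target directory once.
def extractTo(logger: Logger[IO], zipBytes: Stream[IO, Byte]): IO[Path] =
  zipBytes
    .through(Zip.unzip[IO](8192, Glob.all))
    .through(Zip.saveTo[IO](logger, Path("out"), moveUp = true))
    .compile
    .lastOrError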

@@ -7,10 +7,12 @@
 package docspell.files
 
 import cats.effect._
-import cats.implicits._
+import cats.syntax.option._
+import fs2.Stream
 import fs2.io.file.{Files, Path}
 
 import docspell.common.Glob
+import docspell.common.syntax.file._
+import docspell.common.util.Zip
 import docspell.logging.TestLoggingConfig
 
 import munit._
@@ -21,29 +23,101 @@ class ZipTest extends CatsEffectSuite with TestLoggingConfig {
     Files[IO].tempDirectory(Path("target").some, "zip-test-", None)
   )
 
-  test("unzip") {
+  tempDir.test("unzip") { dir =>
     val zipFile = ExampleFiles.letters_zip.readURL[IO](8192)
-    val unzip = zipFile.through(Zip.unzip(8192, Glob.all))
+    val unzip: Stream[IO, Path] = zipFile
+      .through(Zip[IO](logger.some, dir.some).unzip(8192))
 
-    unzip
-      .evalMap { entry =>
-        val x = entry.data.map(_ => 1).foldMonoid.compile.lastOrError
-        x.map { size =>
-          if (entry.name.endsWith(".pdf")) {
-            assertEquals(entry.name, "letter-de.pdf")
-            assertEquals(size, 34815)
-          } else {
-            assertEquals(entry.name, "letter-en.txt")
-            assertEquals(size, 1131)
-          }
-        }
-      }
-      .compile
-      .drain
+    (for {
+      file <- unzip
+      length <- Stream.eval(Files[IO].size(file))
+      sha <- Stream.eval(file.sha256Hex[IO])
+      _ = {
+        if (file.name == "letter-de.pdf") {
+          assertEquals(length, 34815L)
+          assertEquals(
+            sha,
+            "299c15429ce327099c322b36caaec56e7a6034106531c5d1b3fd085467a8d495"
+          )
+        } else {
+          assertEquals(file.name, "letter-en.txt")
+          assertEquals(length, 1131L)
+          assertEquals(
+            sha,
+            "55eca47c65084126d7c3bbce941cadff0f642a7287ff8e0f3fc9c2c33a4bb7f0"
+          )
+        }
+      }
+    } yield ()).compile.drain
   }
 
+  tempDir.test("unzip directories and files") { dir =>
+    val zipFile = ExampleFiles.zip_dirs_zip.readURL[IO](8192)
+    val unzip: Stream[IO, Path] = zipFile
+      .through(Zip[IO](logger.some, dir.some).unzip(8192))
+
+    val entries =
+      for {
+        file <- unzip
+        sha <- Stream.eval(file.sha256Hex[IO])
+      } yield (file.name, file, sha)
+
+    val expectedSha =
+      "5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03"
+
+    entries
+      .map {
+        case ("file1.txt", file, realSha) =>
+          assertEquals(realSha, expectedSha)
+          val relFile = dir.relativize(file).dropLeft(1)
+          assertEquals(relFile.toString, "file1.txt")
+
+        case ("file2.txt", file, realSha) =>
+          assertEquals(realSha, expectedSha)
+          val relFile = dir.relativize(file).dropLeft(1)
+          assertEquals(relFile.toString, "dir1/file2.txt")
+
+        case ("file3.txt", file, realSha) =>
+          assertEquals(realSha, expectedSha)
+          val relFile = dir.relativize(file).dropLeft(1)
+          assertEquals(relFile.toString, "dir1/dir11/file3.txt")
+
+        case ("file4.txt", file, realSha) =>
+          assertEquals(realSha, expectedSha)
+          val relFile = dir.relativize(file).dropLeft(1)
+          assertEquals(relFile.toString, "dir2/file4.txt")
+
+        case (name, _, _) =>
+          fail(s"Unexpected file: $name")
+      }
+      .compile
+      .drain
+  }
+
   tempDir.test("unzipTo directory tree") { _ =>
     // val zipFile = ExampleFiles.zip_dirs_zip.readURL[IO](8192)
     // zipFile.through(Zip.unzip(G))
   }
+  // tempDir.test("test runtime") { _ =>
+  //   val archive = Path("./local/large-archive.zip")
+  //
+  //   for {
+  //
+  //     timer1 <- Duration.stopTime[IO]
+  //     es1 <- Files[IO]
+  //       .readAll(archive)
+  //       .through(Zip[IO]().unzip(64 * 1024))
+  //       .compile
+  //       .toVector
+  //     duration1 <- timer1
+  //
+  //     timer2 <- Duration.stopTime[IO]
+  //     es2 <- fs2.Stream
+  //       .emit(archive)
+  //       .covary[IO]
+  //       .through(Zip[IO]().unzipFiles(64 * 1024))
+  //       .compile
+  //       .toVector
+  //     duration2 <- timer2
+  //
+  //     _ <- IO.println(s">>>>1. ${duration1.formatExact}, entries: $es1")
+  //     _ <- IO.println(s">>>>2. ${duration2.formatExact}, entries: $es2")
+  //   } yield ()
+  // }
 }
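Two details in the directory test are easy to miss. The shared expectedSha ("5891b5...") is the SHA-256 digest of the six-byte text "hello\n", so all four fixture files carry identical content. And dir.relativize(file).dropLeft(1), with dropLeft coming from the docspell.common.syntax.file._ import, strips the first path segment, which suggests entries land beneath a per-archive subdirectory of the temp dir. Under that assumption (paths illustrative):

// dir  == target/zip-test-123
// file == target/zip-test-123/<archive>/dir1/file2.txt
// dir.relativize(file)             == <archive>/dir1/file2.txt
// dir.relativize(file).dropLeft(1) == dir1/file2.txt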