mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-20 17:39:54 +00:00
Add support for archive files
As the first step of processing, each attachment that is recognized as an archive is now extracted into (potentially multiple) separate attachments. The original archive file is also stored, and the resulting attachments are associated with the archive they came from. Initial support is implemented for zip files.
This commit is contained in:
48
modules/files/src/main/scala/docspell/files/Zip.scala
Normal file
48
modules/files/src/main/scala/docspell/files/Zip.scala
Normal file
@ -0,0 +1,48 @@
|
||||
package docspell.files
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.{Pipe, Stream}
|
||||
import java.io.InputStream
|
||||
import java.util.zip.ZipInputStream
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Helpers for reading zip archives as fs2 streams. */
object Zip {

  /** A single file found inside an archive.
    *
    * @param name the last path segment of the entry's name inside the archive
    * @param data the entry's bytes; they are read from the one `ZipInputStream`
    *             shared by all entries, so they have to be consumed while this
    *             entry is the current one (i.e. before the next entry is pulled)
    */
  case class Entry[F[_]](name: String, data: Stream[F, Byte])

  /** Pipe variant of [[unzip]].
    *
    * @param chunkSize size of the chunks used when reading entry data
    * @param blocker   execution context for the blocking reads
    */
  def unzipP[F[_]: ConcurrentEffect: ContextShift](
      chunkSize: Int,
      blocker: Blocker
  ): Pipe[F, Byte, Entry[F]] =
    s => unzip[F](chunkSize, blocker)(s)

  /** Interprets `data` as a zip archive and emits one [[Entry]] per file in it.
    *
    * The byte stream is bridged to a `java.io.InputStream` and handed to
    * [[unzipJava]].
    *
    * @param chunkSize size of the chunks used when reading entry data
    * @param blocker   execution context for the blocking reads
    * @param data      the raw bytes of the zip archive
    */
  def unzip[F[_]: ConcurrentEffect: ContextShift](chunkSize: Int, blocker: Blocker)(
      data: Stream[F, Byte]
  ): Stream[F, Entry[F]] =
    data.through(fs2.io.toInputStream[F]).flatMap(in => unzipJava(in, chunkSize, blocker))

  /** Reads a zip archive from `in` and emits one [[Entry]] per file in it.
    *
    * Directory entries carry no data and are skipped. The given input stream
    * is not closed here; only the individual zip entries are closed once they
    * have been processed.
    *
    * @param in        input stream positioned at the start of a zip archive
    * @param chunkSize size of the chunks used when reading entry data
    * @param blocker   execution context for the blocking reads
    */
  def unzipJava[F[_]: Sync: ContextShift](
      in: InputStream,
      chunkSize: Int,
      blocker: Blocker
  ): Stream[F, Entry[F]] = {
    val zin = new ZipInputStream(in)

    // Acquire moves on to the next entry (`None` once the archive is exhausted);
    // release closes that entry again when its scope ends.
    val nextEntry = Resource.make(Sync[F].delay(Option(zin.getNextEntry))) {
      case Some(_) => Sync[F].delay(zin.closeEntry())
      case None    => ().pure[F]
    }

    Stream
      .resource(nextEntry)
      .repeat
      .unNoneTerminate
      // A directory entry would otherwise show up as an empty "file".
      .filter(ze => !ze.isDirectory)
      .map { ze =>
        val fullName = ze.getName()
        // `getFileName` returns null for paths without name elements; fall back
        // to the raw entry name in that case instead of failing with an NPE.
        val name =
          Option(Paths.get(fullName).getFileName).map(_.toString).getOrElse(fullName)
        // `zin` is shared by all entries, so it must stay open after this
        // entry's data has been read.
        val data =
          fs2.io.readInputStream[F](
            (zin: InputStream).pure[F],
            chunkSize,
            blocker,
            closeAfterUse = false
          )
        Entry(name, data)
      }
  }
}
|
BIN
modules/files/src/test/resources/letters.zip
Normal file
BIN
modules/files/src/test/resources/letters.zip
Normal file
Binary file not shown.
30
modules/files/src/test/scala/docspell/files/ZipTest.scala
Normal file
30
modules/files/src/test/scala/docspell/files/ZipTest.scala
Normal file
@ -0,0 +1,30 @@
|
||||
package docspell.files
|
||||
|
||||
import minitest._
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import scala.concurrent.ExecutionContext
|
||||
|
||||
/** Checks that [[Zip.unzip]] yields the files of the `letters.zip` test resource. */
object ZipTest extends SimpleTestSuite {

  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS: ContextShift[IO] = IO.contextShift(ExecutionContext.global)

  test("unzip") {
    val zipFile = ExampleFiles.letters_zip.readURL[IO](8192, blocker)
    val uncomp  = zipFile.through(Zip.unzip(8192, blocker))

    // The size of each entry is computed inside `evalMap`, i.e. while that
    // entry is still the current one in the underlying zip stream.
    val entries = uncomp
      .evalMap { entry =>
        entry.data
          .map(_ => 1)
          .foldMonoid
          .compile
          .lastOrError
          .map(size => entry.name -> size)
      }
      .compile
      .toList
      .unsafeRunSync()

    // Comparing the complete result makes the test fail when entries are
    // missing, instead of passing vacuously on an empty stream.
    assertEquals(
      entries.sortBy(_._1),
      List("letter-de.pdf" -> 34815, "letter-en.txt" -> 1131)
    )
  }
}
|
Reference in New Issue
Block a user