Add support for archive files

If an attachment is recognized as an archive, it is first extracted
into potentially multiple attachments. This extraction is the first
step in processing. The original archive file is stored as well, and
the resulting attachments are linked to the archive they came from.

First support is implemented for zip files.
This commit is contained in:
Eike Kettner
2020-03-19 22:42:27 +01:00
parent 2a7066650f
commit 4ed7a137f7
12 changed files with 419 additions and 27 deletions

View File

@ -0,0 +1,48 @@
package docspell.files
import cats.effect._
import cats.implicits._
import fs2.{Pipe, Stream}
import java.io.InputStream
import java.util.zip.ZipInputStream
import java.nio.file.Paths
/** Reads zip archives as fs2 streams of their file entries. */
object Zip {

  /** One file found inside an archive.
    *
    * `data` reads from the `ZipInputStream` that is shared by all entries
    * of the same archive. It therefore yields the bytes of this entry only
    * while this entry is the current element of the surrounding stream:
    * consume it before the next entry is pulled.
    *
    * @param name the entry's file name without its directory part
    * @param data the uncompressed contents of the entry
    */
  case class Entry[F[_]](name: String, data: Stream[F, Byte])

  /** [[unzip]] expressed as a `Pipe`, for use with `Stream.through`.
    *
    * @param chunkSize size of the chunks read from each entry
    * @param blocker   where the blocking reads are run
    */
  def unzipP[F[_]: ConcurrentEffect: ContextShift](
      chunkSize: Int,
      blocker: Blocker
  ): Pipe[F, Byte, Entry[F]] =
    s => unzip[F](chunkSize, blocker)(s)

  /** Interprets `data` as a zip archive and emits its file entries.
    *
    * The bytes are exposed as a `java.io.InputStream` via
    * `fs2.io.toInputStream` and then handed to [[unzipJava]].
    *
    * @param chunkSize size of the chunks read from each entry
    * @param blocker   where the blocking reads are run
    * @param data      the raw bytes of the archive
    */
  def unzip[F[_]: ConcurrentEffect: ContextShift](chunkSize: Int, blocker: Blocker)(
      data: Stream[F, Byte]
  ): Stream[F, Entry[F]] =
    data.through(fs2.io.toInputStream[F]).flatMap(in => unzipJava(in, chunkSize, blocker))

  /** Emits the file entries of the zip archive read from `in`.
    *
    * Entries are emitted one at a time in archive order. Directory entries
    * carry no data and are skipped. Neither `in` nor the `ZipInputStream`
    * wrapped around it is closed here; closing `in` is left to the caller.
    *
    * @param in        stream positioned at the start of a zip archive
    * @param chunkSize size of the chunks read from each entry
    * @param blocker   where the blocking reads are run
    */
  def unzipJava[F[_]: Sync: ContextShift](
      in: InputStream,
      chunkSize: Int,
      blocker: Blocker
  ): Stream[F, Entry[F]] = {
    val zin = new ZipInputStream(in)

    // Acquire = advance to the next entry (None once the archive is
    // exhausted), release = close that entry again.
    val nextEntry = Resource.make(Sync[F].delay(Option(zin.getNextEntry))) {
      case Some(_) => Sync[F].delay(zin.closeEntry())
      case None    => ().pure[F]
    }

    Stream
      .resource(nextEntry)
      .repeat
      .unNoneTerminate
      // A directory is not a file; emitting it would look like an empty file.
      .filter(ze => !ze.isDirectory)
      .map { ze =>
        // `false`: reading one entry to its end must not close `zin`,
        // which is still needed for the entries that follow.
        val data =
          fs2.io.readInputStream[F]((zin: InputStream).pure[F], chunkSize, blocker, false)
        Entry(baseName(ze.getName), data)
      }
  }

  /** Returns the last path segment of a zip entry name.
    *
    * The name is parsed with `Paths.get`. Entry names come from the archive
    * and need not be valid paths on the host, so if parsing throws or the
    * path has no file name, the text after the last `/` is used instead.
    */
  private def baseName(entryName: String): String =
    scala.util
      .Try(Option(Paths.get(entryName).getFileName).map(_.toString))
      .toOption
      .flatten
      .getOrElse(entryName.substring(entryName.lastIndexOf('/') + 1))
}

Binary file not shown.

View File

@ -0,0 +1,30 @@
package docspell.files
import minitest._
import cats.effect._
import cats.implicits._
import scala.concurrent.ExecutionContext
/** Checks [[Zip.unzip]] against the bundled example archive. */
object ZipTest extends SimpleTestSuite {

  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  test("unzip") {
    val entries =
      ExampleFiles.letters_zip
        .readURL[IO](8192, blocker)
        .through(Zip.unzip(8192, blocker))

    val verified = entries.evalMap { entry =>
      // Size of the entry in bytes: one `1` per byte, summed up.
      val byteCount = entry.data.map(_ => 1).foldMonoid.compile.lastOrError

      byteCount.map { size =>
        // The archive holds exactly one pdf and one text file.
        val (expectedName, expectedSize) =
          if (entry.name.endsWith(".pdf")) ("letter-de.pdf", 34815)
          else ("letter-en.txt", 1131)

        assertEquals(entry.name, expectedName)
        assertEquals(size, expectedSize)
      }
    }

    verified.compile.drain.unsafeRunSync()
  }
}