Use passwords when reading PDFs

This commit is contained in:
eikek 2021-09-30 11:11:08 +02:00
parent f74624485f
commit aa8f3b82fc
9 changed files with 73 additions and 37 deletions

View File

@ -33,6 +33,7 @@ object Conversion {
def create[F[_]: Async]( def create[F[_]: Async](
cfg: ConvertConfig, cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml, sanitizeHtml: SanitizeHtml,
additionalPasswords: List[Password],
logger: Logger[F] logger: Logger[F]
): Resource[F, Conversion[F]] = ): Resource[F, Conversion[F]] =
Resource.pure[F, Conversion[F]](new Conversion[F] { Resource.pure[F, Conversion[F]](new Conversion[F] {
@ -42,10 +43,14 @@ object Conversion {
): F[A] = ): F[A] =
TikaMimetype.resolve(dataType, in).flatMap { TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.PdfMatch(_) => case MimeType.PdfMatch(_) =>
val allPass = cfg.decryptPdf.passwords ++ additionalPasswords
val pdfStream = val pdfStream =
if (cfg.decryptPdf.enabled) if (cfg.decryptPdf.enabled) {
in.through(RemovePdfEncryption(logger, cfg.decryptPdf.passwords)) logger.s
else in .debug(s"Trying to read the PDF using ${allPass.size} passwords")
.drain ++
in.through(RemovePdfEncryption(logger, allPass))
} else in
OcrMyPdf OcrMyPdf
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler) .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler)

View File

@ -6,6 +6,7 @@
package docspell.convert package docspell.convert
import docspell.common.Password
import docspell.convert.ConvertConfig.DecryptPdf import docspell.convert.ConvertConfig.DecryptPdf
import docspell.convert.extern.OcrMyPdfConfig import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
@ -25,5 +26,5 @@ final case class ConvertConfig(
object ConvertConfig { object ConvertConfig {
final case class DecryptPdf(enabled: Boolean, passwords: List[String]) final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
} }

View File

@ -11,7 +11,7 @@ import java.io.ByteArrayOutputStream
import cats.effect._ import cats.effect._
import fs2.{Chunk, Pipe, Stream} import fs2.{Chunk, Pipe, Stream}
import docspell.common.Logger import docspell.common._
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
@ -21,15 +21,15 @@ object RemovePdfEncryption {
def apply[F[_]: Sync]( def apply[F[_]: Sync](
logger: Logger[F], logger: Logger[F],
passwords: List[String] passwords: List[Password]
): Pipe[F, Byte, Byte] = ): Pipe[F, Byte, Byte] =
apply(logger, Stream.emits(passwords)) apply(logger, Stream.emits(passwords))
def apply[F[_]: Sync]( def apply[F[_]: Sync](
logger: Logger[F], logger: Logger[F],
passwords: Stream[F, String] passwords: Stream[F, Password]
): Pipe[F, Byte, Byte] = { ): Pipe[F, Byte, Byte] = {
val pws = passwords.cons1("") val pws = passwords.cons1(Password.empty)
in => in =>
pws pws
.flatMap(pw => in.through(openPdf[F](logger, pw))) .flatMap(pw => in.through(openPdf[F](logger, pw)))
@ -54,7 +54,7 @@ object RemovePdfEncryption {
private def openPdf[F[_]: Sync]( private def openPdf[F[_]: Sync](
logger: Logger[F], logger: Logger[F],
pw: String pw: Password
): Pipe[F, Byte, PDDocument] = { ): Pipe[F, Byte, PDDocument] = {
def alloc(bytes: Array[Byte]): F[Option[PDDocument]] = def alloc(bytes: Array[Byte]): F[Option[PDDocument]] =
Sync[F].delay(load(bytes, pw)) Sync[F].delay(load(bytes, pw))
@ -64,7 +64,7 @@ object RemovePdfEncryption {
val log = val log =
if (pw.isEmpty) Stream.empty if (pw.isEmpty) Stream.empty
else logger.s.debug(s"Try opening PDF with password: ${pw.take(2)}***").drain else logger.s.debug(s"Try opening PDF with password: ${pw.pass.take(2)}***").drain
in => in =>
Stream Stream
@ -73,8 +73,8 @@ object RemovePdfEncryption {
.flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty)) .flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty))
} }
private def load(bytes: Array[Byte], pw: String): Option[PDDocument] = private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] =
try Option(PDDocument.load(bytes, pw)) try Option(PDDocument.load(bytes, pw.pass))
catch { catch {
case _: InvalidPasswordException => case _: InvalidPasswordException =>
None None

View File

@ -79,7 +79,7 @@ class ConversionTest extends FunSuite with FileChecks {
) )
val conversion = val conversion =
Conversion.create[IO](convertConfig, SanitizeHtml.none, logger) Conversion.create[IO](convertConfig, SanitizeHtml.none, Nil, logger)
val bombs = List( val bombs = List(
ExampleFiles.bombs_20K_gray_jpeg, ExampleFiles.bombs_20K_gray_jpeg,

View File

@ -9,7 +9,7 @@ package docspell.convert
import cats.effect.IO import cats.effect.IO
import fs2.Stream import fs2.Stream
import docspell.common.Logger import docspell.common._
import docspell.files.ExampleFiles import docspell.files.ExampleFiles
import munit.CatsEffectSuite import munit.CatsEffectSuite
@ -17,9 +17,11 @@ import munit.CatsEffectSuite
class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger) val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger)
val protectedPdf = ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024) private val protectedPdf =
val encryptedPdf = ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024) ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024)
val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024) private val encryptedPdf =
ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024)
private val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024)
test("have encrypted pdfs") { test("have encrypted pdfs") {
for { for {
@ -30,14 +32,19 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
test("decrypt pdf") { test("decrypt pdf") {
encryptedPdf encryptedPdf
.through(RemovePdfEncryption(logger, List("test123"))) .through(RemovePdfEncryption(logger, List(Password("test123"))))
.isUnencryptedPDF .isUnencryptedPDF
.map(assert(_)) .map(assert(_))
} }
test("decrypt pdf with multiple passwords") { test("decrypt pdf with multiple passwords") {
encryptedPdf encryptedPdf
.through(RemovePdfEncryption(logger, List("xy123", "123xy", "test123", "abc123"))) .through(
RemovePdfEncryption(
logger,
List("xy123", "123xy", "test123", "abc123").map(Password(_))
)
)
.isUnencryptedPDF .isUnencryptedPDF
.map(assert(_)) .map(assert(_))
} }
@ -59,7 +66,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
test("decrypt with multiple passwords, stop on first") { test("decrypt with multiple passwords, stop on first") {
val passwords: Stream[IO, String] = val passwords: Stream[IO, String] =
Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called")) Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called"))
val decrypt = RemovePdfEncryption(logger, passwords) val decrypt = RemovePdfEncryption(logger, passwords.map(Password(_)))
encryptedPdf encryptedPdf
.through(decrypt) .through(decrypt)
.isUnencryptedPDF .isUnencryptedPDF
@ -68,7 +75,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
test("return input stream if nothing helps") { test("return input stream if nothing helps") {
encryptedPdf encryptedPdf
.through(RemovePdfEncryption(logger, List("a", "b"))) .through(RemovePdfEncryption(logger, List("a", "b").map(Password(_))))
.isEncryptedPDF .isEncryptedPDF
.map(assert(_)) .map(assert(_))
} }

View File

@ -595,8 +595,12 @@ Docspell Update Check
# docspell needs to read it. It also requires specifying a # docspell needs to read it. It also requires specifying a
# password here. All passwords are tried when reading a PDF. # password here. All passwords are tried when reading a PDF.
# #
# This is enabled by default, using an empty password list. This # This is enabled by default with an empty password list. This
# removes protection from PDFs, which is better for processing. # removes protection from PDFs, which is better for processing.
#
# Passwords can be given here, and each collective can maintain
# its own passwords as well. However, if the `enabled` setting
# below is `false`, no decryption is attempted at all.
decrypt-pdf = { decrypt-pdf = {
enabled = true enabled = true
passwords = [] passwords = []

View File

@ -77,17 +77,27 @@ object ConvertPdf {
ctx: Context[F, ProcessItemArgs], ctx: Context[F, ProcessItemArgs],
item: ItemData item: ItemData
)(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] = )(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.logger).use { conv => loadCollectivePasswords(ctx).flatMap(collPass =>
mime match { Conversion.create[F](cfg, sanitizeHtml, collPass, ctx.logger).use { conv =>
case mt => mime match {
val data = ctx.store.fileStore.getBytes(ra.fileId) case mt =>
val handler = conversionHandler[F](ctx, cfg, ra, item) val data = ctx.store.fileStore.getBytes(ra.fileId)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> val handler = conversionHandler[F](ctx, cfg, ra, item)
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( ctx.logger
data .info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
) conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
data
)
}
} }
} )
/** Loads the PDF passwords stored for the collective named in the job
  * arguments (`ctx.args.meta.collective`).
  *
  * Runs `RCollectivePassword.findAll` inside a store transaction, keeps only
  * the `password` of each returned record and drops duplicates.
  *
  * @param ctx the processing context providing the store and the job arguments
  * @return the distinct passwords found for that collective
  */
private def loadCollectivePasswords[F[_]: Async](
ctx: Context[F, ProcessItemArgs]
): F[List[Password]] =
ctx.store
.transact(RCollectivePassword.findAll(ctx.args.meta.collective))
.map(_.map(_.password).distinct)
private def conversionHandler[F[_]: Sync]( private def conversionHandler[F[_]: Sync](
ctx: Context[F, ProcessItemArgs], ctx: Context[F, ProcessItemArgs],

View File

@ -1,14 +1,22 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList import cats.data.NonEmptyList
import docspell.common._
import docspell.store.qb._
import docspell.store.qb.DSL._
import doobie._
import doobie.implicits._
import cats.effect._ import cats.effect._
import cats.implicits._ import cats.implicits._
import docspell.common._
import docspell.store.qb.DSL._
import docspell.store.qb._
import doobie._
import doobie.implicits._
final case class RCollectivePassword( final case class RCollectivePassword(
id: Ident, id: Ident,
cid: Ident, cid: Ident,

View File

@ -56,6 +56,7 @@ description = "A list of features and limitations."
- Everything stored in a SQL database: PostgreSQL, MariaDB or H2 - Everything stored in a SQL database: PostgreSQL, MariaDB or H2
- H2 is embedded, a "one-file-only" database, avoids installing db - H2 is embedded, a "one-file-only" database, avoids installing db
servers servers
- Support for encrypted PDFs
- Files supported: - Files supported:
- Documents: - Documents:
- PDF - PDF