From aa8f3b82fc9338a4a258a323e3f587b5e557396c Mon Sep 17 00:00:00 2001 From: eikek Date: Thu, 30 Sep 2021 11:11:08 +0200 Subject: [PATCH] Use passwords when reading PDFs --- .../scala/docspell/convert/Conversion.scala | 11 +++++-- .../docspell/convert/ConvertConfig.scala | 3 +- .../convert/RemovePdfEncryption.scala | 16 +++++----- .../docspell/convert/ConversionTest.scala | 2 +- .../convert/RemovePdfEncryptionTest.scala | 23 +++++++++----- .../joex/src/main/resources/reference.conf | 6 +++- .../docspell/joex/process/ConvertPdf.scala | 30 ++++++++++++------- .../store/records/RCollectivePassword.scala | 18 +++++++---- website/site/content/docs/features/_index.md | 1 + 9 files changed, 73 insertions(+), 37 deletions(-) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index b27be23b..b1a05aa4 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -33,6 +33,7 @@ object Conversion { def create[F[_]: Async]( cfg: ConvertConfig, sanitizeHtml: SanitizeHtml, + additionalPasswords: List[Password], logger: Logger[F] ): Resource[F, Conversion[F]] = Resource.pure[F, Conversion[F]](new Conversion[F] { @@ -42,10 +43,14 @@ object Conversion { ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { case MimeType.PdfMatch(_) => + val allPass = cfg.decryptPdf.passwords ++ additionalPasswords val pdfStream = - if (cfg.decryptPdf.enabled) - in.through(RemovePdfEncryption(logger, cfg.decryptPdf.passwords)) - else in + if (cfg.decryptPdf.enabled) { + logger.s + .debug(s"Trying to read the PDF using ${allPass.size} passwords") + .drain ++ + in.through(RemovePdfEncryption(logger, allPass)) + } else in OcrMyPdf .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler) diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index 74a136b1..a4f3c224 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -6,6 +6,7 @@ package docspell.convert +import docspell.common.Password import docspell.convert.ConvertConfig.DecryptPdf import docspell.convert.extern.OcrMyPdfConfig import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} @@ -25,5 +26,5 @@ final case class ConvertConfig( object ConvertConfig { - final case class DecryptPdf(enabled: Boolean, passwords: List[String]) + final case class DecryptPdf(enabled: Boolean, passwords: List[Password]) } diff --git a/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala index 34f93054..4d7a469f 100644 --- a/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala +++ b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala @@ -11,7 +11,7 @@ import java.io.ByteArrayOutputStream import cats.effect._ import fs2.{Chunk, Pipe, Stream} -import docspell.common.Logger +import docspell.common._ import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException @@ -21,15 +21,15 @@ object RemovePdfEncryption { def apply[F[_]: Sync]( logger: Logger[F], - passwords: List[String] + passwords: List[Password] ): Pipe[F, Byte, Byte] = apply(logger, Stream.emits(passwords)) def apply[F[_]: Sync]( logger: Logger[F], - passwords: Stream[F, String] + passwords: Stream[F, Password] ): Pipe[F, Byte, Byte] = { - val pws = passwords.cons1("") + val pws = passwords.cons1(Password.empty) in => pws .flatMap(pw => in.through(openPdf[F](logger, pw))) @@ -54,7 +54,7 @@ object RemovePdfEncryption { private def openPdf[F[_]: Sync]( logger: Logger[F], - pw: String + pw: Password ): Pipe[F, Byte, PDDocument] = { def alloc(bytes: Array[Byte]): F[Option[PDDocument]] = Sync[F].delay(load(bytes, pw)) @@ -64,7 +64,7 @@ object RemovePdfEncryption { val log = if (pw.isEmpty) Stream.empty - else logger.s.debug(s"Try opening PDF with password: ${pw.take(2)}***").drain + else logger.s.debug(s"Try opening PDF with password: ${pw.pass.take(2)}***").drain in => Stream @@ -73,8 +73,8 @@ object RemovePdfEncryption { .flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty)) } - private def load(bytes: Array[Byte], pw: String): Option[PDDocument] = - try Option(PDDocument.load(bytes, pw)) + private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] = + try Option(PDDocument.load(bytes, pw.pass)) catch { case _: InvalidPasswordException => None diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index 3bd1de4a..8f9f191f 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -79,7 +79,7 @@ class ConversionTest extends FunSuite with FileChecks { ) val conversion = - Conversion.create[IO](convertConfig, SanitizeHtml.none, logger) + Conversion.create[IO](convertConfig, SanitizeHtml.none, Nil, logger) val bombs = List( ExampleFiles.bombs_20K_gray_jpeg, diff --git a/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala b/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala index 805c44ad..803f3174 100644 --- a/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala @@ -9,7 +9,7 @@ package docspell.convert import cats.effect.IO import fs2.Stream -import docspell.common.Logger +import docspell.common._ import docspell.files.ExampleFiles import munit.CatsEffectSuite @@ -17,9 +17,11 @@ import munit.CatsEffectSuite class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger) - val protectedPdf = ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024) - val encryptedPdf = ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024) - val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024) + private val protectedPdf = + ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024) + private val encryptedPdf = + ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024) + private val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024) test("have encrypted pdfs") { for { @@ -30,14 +32,19 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { test("decrypt pdf") { encryptedPdf - .through(RemovePdfEncryption(logger, List("test123"))) + .through(RemovePdfEncryption(logger, List(Password("test123")))) .isUnencryptedPDF .map(assert(_)) } test("decrypt pdf with multiple passwords") { encryptedPdf - .through(RemovePdfEncryption(logger, List("xy123", "123xy", "test123", "abc123"))) + .through( + RemovePdfEncryption( + logger, + List("xy123", "123xy", "test123", "abc123").map(Password(_)) + ) + ) .isUnencryptedPDF .map(assert(_)) } @@ -59,7 +66,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { test("decrypt with multiple passwords, stop on first") { val passwords: Stream[IO, String] = Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called")) - val decrypt = RemovePdfEncryption(logger, passwords) + val decrypt = RemovePdfEncryption(logger, passwords.map(Password(_))) encryptedPdf .through(decrypt) .isUnencryptedPDF @@ -68,7 +75,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { test("return input stream if nothing helps") { encryptedPdf - .through(RemovePdfEncryption(logger, List("a", "b"))) + .through(RemovePdfEncryption(logger, List("a", "b").map(Password(_)))) .isEncryptedPDF .map(assert(_)) } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 3e2ce177..4313771a 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -595,8 +595,12 @@ Docpell Update Check # docspell needs to read it. It also requires to specify a # password here. All passwords are tried when reading a PDF. # - # This is enabled by default, using an empty password list. This + # This is enabled by default with an empty password list. This # removes protection from PDFs, which is better for processing. + # + # Passwords can be given here and each collective can maintain + # their passwords as well. But if the `enabled` setting below is + # `false`, then no attempt at decrypting is done. decrypt-pdf = { enabled = true passwords = [] diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 44e2613f..0108ef98 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -77,17 +77,27 @@ object ConvertPdf { ctx: Context[F, ProcessItemArgs], item: ItemData )(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] = - Conversion.create[F](cfg, sanitizeHtml, ctx.logger).use { conv => - mime match { - case mt => - val data = ctx.store.fileStore.getBytes(ra.fileId) - val handler = conversionHandler[F](ctx, cfg, ra, item) - ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> - conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( - data - ) + loadCollectivePasswords(ctx).flatMap(collPass => + Conversion.create[F](cfg, sanitizeHtml, collPass, ctx.logger).use { conv => + mime match { + case mt => + val data = ctx.store.fileStore.getBytes(ra.fileId) + val handler = conversionHandler[F](ctx, cfg, ra, item) + ctx.logger + .info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> + conv.toPDF(DataType(mt), ctx.args.meta.language, handler)( + data + ) + } } - } + ) + + private def loadCollectivePasswords[F[_]: Async]( + ctx: Context[F, ProcessItemArgs] + ): F[List[Password]] = + ctx.store + .transact(RCollectivePassword.findAll(ctx.args.meta.collective)) + .map(_.map(_.password).distinct) private def conversionHandler[F[_]: Sync]( ctx: Context[F, ProcessItemArgs], diff --git a/modules/store/src/main/scala/docspell/store/records/RCollectivePassword.scala b/modules/store/src/main/scala/docspell/store/records/RCollectivePassword.scala index 53726572..c7931d20 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollectivePassword.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollectivePassword.scala @@ -1,14 +1,22 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.store.records import cats.data.NonEmptyList -import docspell.common._ -import docspell.store.qb._ -import docspell.store.qb.DSL._ -import doobie._ -import doobie.implicits._ import cats.effect._ import cats.implicits._ +import docspell.common._ +import docspell.store.qb.DSL._ +import docspell.store.qb._ + +import doobie._ +import doobie.implicits._ + final case class RCollectivePassword( id: Ident, cid: Ident, diff --git a/website/site/content/docs/features/_index.md b/website/site/content/docs/features/_index.md index b5855066..03da2d7d 100644 --- a/website/site/content/docs/features/_index.md +++ b/website/site/content/docs/features/_index.md @@ -56,6 +56,7 @@ description = "A list of features and limitations." - Everything stored in a SQL database: PostgreSQL, MariaDB or H2 - H2 is embedded, a "one-file-only" database, avoids installing db servers +- Support for encrypted PDFs - Files supported: - Documents: - PDF