mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-02-15 20:33:26 +00:00
Use passwords when reading PDFs
This commit is contained in:
parent
f74624485f
commit
aa8f3b82fc
@ -33,6 +33,7 @@ object Conversion {
|
|||||||
def create[F[_]: Async](
|
def create[F[_]: Async](
|
||||||
cfg: ConvertConfig,
|
cfg: ConvertConfig,
|
||||||
sanitizeHtml: SanitizeHtml,
|
sanitizeHtml: SanitizeHtml,
|
||||||
|
additionalPasswords: List[Password],
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Resource[F, Conversion[F]] =
|
): Resource[F, Conversion[F]] =
|
||||||
Resource.pure[F, Conversion[F]](new Conversion[F] {
|
Resource.pure[F, Conversion[F]](new Conversion[F] {
|
||||||
@ -42,10 +43,14 @@ object Conversion {
|
|||||||
): F[A] =
|
): F[A] =
|
||||||
TikaMimetype.resolve(dataType, in).flatMap {
|
TikaMimetype.resolve(dataType, in).flatMap {
|
||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
|
val allPass = cfg.decryptPdf.passwords ++ additionalPasswords
|
||||||
val pdfStream =
|
val pdfStream =
|
||||||
if (cfg.decryptPdf.enabled)
|
if (cfg.decryptPdf.enabled) {
|
||||||
in.through(RemovePdfEncryption(logger, cfg.decryptPdf.passwords))
|
logger.s
|
||||||
else in
|
.debug(s"Trying to read the PDF using ${allPass.size} passwords")
|
||||||
|
.drain ++
|
||||||
|
in.through(RemovePdfEncryption(logger, allPass))
|
||||||
|
} else in
|
||||||
OcrMyPdf
|
OcrMyPdf
|
||||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler)
|
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler)
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import docspell.common.Password
|
||||||
import docspell.convert.ConvertConfig.DecryptPdf
|
import docspell.convert.ConvertConfig.DecryptPdf
|
||||||
import docspell.convert.extern.OcrMyPdfConfig
|
import docspell.convert.extern.OcrMyPdfConfig
|
||||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||||
@ -25,5 +26,5 @@ final case class ConvertConfig(
|
|||||||
|
|
||||||
object ConvertConfig {
|
object ConvertConfig {
|
||||||
|
|
||||||
final case class DecryptPdf(enabled: Boolean, passwords: List[String])
|
final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ import java.io.ByteArrayOutputStream
|
|||||||
import cats.effect._
|
import cats.effect._
|
||||||
import fs2.{Chunk, Pipe, Stream}
|
import fs2.{Chunk, Pipe, Stream}
|
||||||
|
|
||||||
import docspell.common.Logger
|
import docspell.common._
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
||||||
@ -21,15 +21,15 @@ object RemovePdfEncryption {
|
|||||||
|
|
||||||
def apply[F[_]: Sync](
|
def apply[F[_]: Sync](
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
passwords: List[String]
|
passwords: List[Password]
|
||||||
): Pipe[F, Byte, Byte] =
|
): Pipe[F, Byte, Byte] =
|
||||||
apply(logger, Stream.emits(passwords))
|
apply(logger, Stream.emits(passwords))
|
||||||
|
|
||||||
def apply[F[_]: Sync](
|
def apply[F[_]: Sync](
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
passwords: Stream[F, String]
|
passwords: Stream[F, Password]
|
||||||
): Pipe[F, Byte, Byte] = {
|
): Pipe[F, Byte, Byte] = {
|
||||||
val pws = passwords.cons1("")
|
val pws = passwords.cons1(Password.empty)
|
||||||
in =>
|
in =>
|
||||||
pws
|
pws
|
||||||
.flatMap(pw => in.through(openPdf[F](logger, pw)))
|
.flatMap(pw => in.through(openPdf[F](logger, pw)))
|
||||||
@ -54,7 +54,7 @@ object RemovePdfEncryption {
|
|||||||
|
|
||||||
private def openPdf[F[_]: Sync](
|
private def openPdf[F[_]: Sync](
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
pw: String
|
pw: Password
|
||||||
): Pipe[F, Byte, PDDocument] = {
|
): Pipe[F, Byte, PDDocument] = {
|
||||||
def alloc(bytes: Array[Byte]): F[Option[PDDocument]] =
|
def alloc(bytes: Array[Byte]): F[Option[PDDocument]] =
|
||||||
Sync[F].delay(load(bytes, pw))
|
Sync[F].delay(load(bytes, pw))
|
||||||
@ -64,7 +64,7 @@ object RemovePdfEncryption {
|
|||||||
|
|
||||||
val log =
|
val log =
|
||||||
if (pw.isEmpty) Stream.empty
|
if (pw.isEmpty) Stream.empty
|
||||||
else logger.s.debug(s"Try opening PDF with password: ${pw.take(2)}***").drain
|
else logger.s.debug(s"Try opening PDF with password: ${pw.pass.take(2)}***").drain
|
||||||
|
|
||||||
in =>
|
in =>
|
||||||
Stream
|
Stream
|
||||||
@ -73,8 +73,8 @@ object RemovePdfEncryption {
|
|||||||
.flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty))
|
.flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def load(bytes: Array[Byte], pw: String): Option[PDDocument] =
|
private def load(bytes: Array[Byte], pw: Password): Option[PDDocument] =
|
||||||
try Option(PDDocument.load(bytes, pw))
|
try Option(PDDocument.load(bytes, pw.pass))
|
||||||
catch {
|
catch {
|
||||||
case _: InvalidPasswordException =>
|
case _: InvalidPasswordException =>
|
||||||
None
|
None
|
||||||
|
@ -79,7 +79,7 @@ class ConversionTest extends FunSuite with FileChecks {
|
|||||||
)
|
)
|
||||||
|
|
||||||
val conversion =
|
val conversion =
|
||||||
Conversion.create[IO](convertConfig, SanitizeHtml.none, logger)
|
Conversion.create[IO](convertConfig, SanitizeHtml.none, Nil, logger)
|
||||||
|
|
||||||
val bombs = List(
|
val bombs = List(
|
||||||
ExampleFiles.bombs_20K_gray_jpeg,
|
ExampleFiles.bombs_20K_gray_jpeg,
|
||||||
|
@ -9,7 +9,7 @@ package docspell.convert
|
|||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
import docspell.common.Logger
|
import docspell.common._
|
||||||
import docspell.files.ExampleFiles
|
import docspell.files.ExampleFiles
|
||||||
|
|
||||||
import munit.CatsEffectSuite
|
import munit.CatsEffectSuite
|
||||||
@ -17,9 +17,11 @@ import munit.CatsEffectSuite
|
|||||||
class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
|
class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
|
||||||
val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger)
|
val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger)
|
||||||
|
|
||||||
val protectedPdf = ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024)
|
private val protectedPdf =
|
||||||
val encryptedPdf = ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024)
|
ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024)
|
||||||
val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024)
|
private val encryptedPdf =
|
||||||
|
ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024)
|
||||||
|
private val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024)
|
||||||
|
|
||||||
test("have encrypted pdfs") {
|
test("have encrypted pdfs") {
|
||||||
for {
|
for {
|
||||||
@ -30,14 +32,19 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
|
|||||||
|
|
||||||
test("decrypt pdf") {
|
test("decrypt pdf") {
|
||||||
encryptedPdf
|
encryptedPdf
|
||||||
.through(RemovePdfEncryption(logger, List("test123")))
|
.through(RemovePdfEncryption(logger, List(Password("test123"))))
|
||||||
.isUnencryptedPDF
|
.isUnencryptedPDF
|
||||||
.map(assert(_))
|
.map(assert(_))
|
||||||
}
|
}
|
||||||
|
|
||||||
test("decrypt pdf with multiple passwords") {
|
test("decrypt pdf with multiple passwords") {
|
||||||
encryptedPdf
|
encryptedPdf
|
||||||
.through(RemovePdfEncryption(logger, List("xy123", "123xy", "test123", "abc123")))
|
.through(
|
||||||
|
RemovePdfEncryption(
|
||||||
|
logger,
|
||||||
|
List("xy123", "123xy", "test123", "abc123").map(Password(_))
|
||||||
|
)
|
||||||
|
)
|
||||||
.isUnencryptedPDF
|
.isUnencryptedPDF
|
||||||
.map(assert(_))
|
.map(assert(_))
|
||||||
}
|
}
|
||||||
@ -59,7 +66,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
|
|||||||
test("decrypt with multiple passwords, stop on first") {
|
test("decrypt with multiple passwords, stop on first") {
|
||||||
val passwords: Stream[IO, String] =
|
val passwords: Stream[IO, String] =
|
||||||
Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called"))
|
Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called"))
|
||||||
val decrypt = RemovePdfEncryption(logger, passwords)
|
val decrypt = RemovePdfEncryption(logger, passwords.map(Password(_)))
|
||||||
encryptedPdf
|
encryptedPdf
|
||||||
.through(decrypt)
|
.through(decrypt)
|
||||||
.isUnencryptedPDF
|
.isUnencryptedPDF
|
||||||
@ -68,7 +75,7 @@ class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
|
|||||||
|
|
||||||
test("return input stream if nothing helps") {
|
test("return input stream if nothing helps") {
|
||||||
encryptedPdf
|
encryptedPdf
|
||||||
.through(RemovePdfEncryption(logger, List("a", "b")))
|
.through(RemovePdfEncryption(logger, List("a", "b").map(Password(_))))
|
||||||
.isEncryptedPDF
|
.isEncryptedPDF
|
||||||
.map(assert(_))
|
.map(assert(_))
|
||||||
}
|
}
|
||||||
|
@ -595,8 +595,12 @@ Docpell Update Check
|
|||||||
# docspell needs to read it. It also requires to specify a
|
# docspell needs to read it. It also requires to specify a
|
||||||
# password here. All passwords are tried when reading a PDF.
|
# password here. All passwords are tried when reading a PDF.
|
||||||
#
|
#
|
||||||
# This is enabled by default, using an empty password list. This
|
# This is enabled by default with an empty password list. This
|
||||||
# removes protection from PDFs, which is better for processing.
|
# removes protection from PDFs, which is better for processing.
|
||||||
|
#
|
||||||
|
# Passwords can be given here and each collective can maintain
|
||||||
|
# their passwords as well. But if the `enabled` setting below is
|
||||||
|
# `false`, then no attempt at decrypting is done.
|
||||||
decrypt-pdf = {
|
decrypt-pdf = {
|
||||||
enabled = true
|
enabled = true
|
||||||
passwords = []
|
passwords = []
|
||||||
|
@ -77,17 +77,27 @@ object ConvertPdf {
|
|||||||
ctx: Context[F, ProcessItemArgs],
|
ctx: Context[F, ProcessItemArgs],
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] =
|
)(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||||
Conversion.create[F](cfg, sanitizeHtml, ctx.logger).use { conv =>
|
loadCollectivePasswords(ctx).flatMap(collPass =>
|
||||||
mime match {
|
Conversion.create[F](cfg, sanitizeHtml, collPass, ctx.logger).use { conv =>
|
||||||
case mt =>
|
mime match {
|
||||||
val data = ctx.store.fileStore.getBytes(ra.fileId)
|
case mt =>
|
||||||
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
val data = ctx.store.fileStore.getBytes(ra.fileId)
|
||||||
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||||
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
|
ctx.logger
|
||||||
data
|
.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||||
)
|
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
|
||||||
|
data
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
)
|
||||||
|
|
||||||
|
private def loadCollectivePasswords[F[_]: Async](
|
||||||
|
ctx: Context[F, ProcessItemArgs]
|
||||||
|
): F[List[Password]] =
|
||||||
|
ctx.store
|
||||||
|
.transact(RCollectivePassword.findAll(ctx.args.meta.collective))
|
||||||
|
.map(_.map(_.password).distinct)
|
||||||
|
|
||||||
private def conversionHandler[F[_]: Sync](
|
private def conversionHandler[F[_]: Sync](
|
||||||
ctx: Context[F, ProcessItemArgs],
|
ctx: Context[F, ProcessItemArgs],
|
||||||
|
@ -1,14 +1,22 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
import cats.data.NonEmptyList
|
import cats.data.NonEmptyList
|
||||||
import docspell.common._
|
|
||||||
import docspell.store.qb._
|
|
||||||
import docspell.store.qb.DSL._
|
|
||||||
import doobie._
|
|
||||||
import doobie.implicits._
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.store.qb.DSL._
|
||||||
|
import docspell.store.qb._
|
||||||
|
|
||||||
|
import doobie._
|
||||||
|
import doobie.implicits._
|
||||||
|
|
||||||
final case class RCollectivePassword(
|
final case class RCollectivePassword(
|
||||||
id: Ident,
|
id: Ident,
|
||||||
cid: Ident,
|
cid: Ident,
|
||||||
|
@ -56,6 +56,7 @@ description = "A list of features and limitations."
|
|||||||
- Everything stored in a SQL database: PostgreSQL, MariaDB or H2
|
- Everything stored in a SQL database: PostgreSQL, MariaDB or H2
|
||||||
- H2 is embedded, a "one-file-only" database, avoids installing db
|
- H2 is embedded, a "one-file-only" database, avoids installing db
|
||||||
servers
|
servers
|
||||||
|
- Support for encrypted PDFs
|
||||||
- Files supported:
|
- Files supported:
|
||||||
- Documents:
|
- Documents:
|
||||||
- PDF
|
- PDF
|
||||||
|
Loading…
Reference in New Issue
Block a user