diff --git a/build.sbt b/build.sbt index bd15be3f..c4837e71 100644 --- a/build.sbt +++ b/build.sbt @@ -275,6 +275,7 @@ val common = project .settings(testSettingsMUnit) .settings( name := "docspell-common", + addCompilerPlugin(Dependencies.kindProjectorPlugin), libraryDependencies ++= Dependencies.fs2 ++ Dependencies.circe ++ @@ -409,7 +410,8 @@ val convert = project name := "docspell-convert", libraryDependencies ++= Dependencies.flexmark ++ - Dependencies.twelvemonkeys + Dependencies.twelvemonkeys ++ + Dependencies.pdfbox ) .dependsOn(common, files % "compile->compile;test->test") diff --git a/modules/common/src/main/scala/docspell/common/Logger.scala b/modules/common/src/main/scala/docspell/common/Logger.scala index df1dba26..936c9d34 100644 --- a/modules/common/src/main/scala/docspell/common/Logger.scala +++ b/modules/common/src/main/scala/docspell/common/Logger.scala @@ -7,12 +7,13 @@ package docspell.common import cats.effect.Sync +import fs2.Stream import docspell.common.syntax.all._ import org.log4s.{Logger => Log4sLogger} -trait Logger[F[_]] { +trait Logger[F[_]] { self => /* `self` names the outer logger so the adapter returned by `s` can delegate to it */ def trace(msg: => String): F[Unit] def debug(msg: => String): F[Unit] @@ -21,6 +22,25 @@ trait Logger[F[_]] { def error(ex: Throwable)(msg: => String): F[Unit] def error(msg: => String): F[Unit] + final def s: Logger[Stream[F, *]] = new Logger[Stream[F, *]] { /* view of this logger for use inside fs2 streams: every method wraps the corresponding call on `self` with Stream.eval, yielding a Stream[F, Unit] instead of F[Unit] */ + def trace(msg: => String): Stream[F, Unit] = + Stream.eval(self.trace(msg)) + + def debug(msg: => String): Stream[F, Unit] = + Stream.eval(self.debug(msg)) + + def info(msg: => String): Stream[F, Unit] = + Stream.eval(self.info(msg)) + + def warn(msg: => String): Stream[F, Unit] = + Stream.eval(self.warn(msg)) + + def error(msg: => String): Stream[F, Unit] = + Stream.eval(self.error(msg)) + + def error(ex: Throwable)(msg: => String): Stream[F, Unit] = + Stream.eval(self.error(ex)(msg)) + } } object Logger { diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala 
b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 54ee526a..b27be23b 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -42,8 +42,12 @@ object Conversion { ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { case MimeType.PdfMatch(_) => + val pdfStream = /* bytes handed to OcrMyPdf: piped through RemovePdfEncryption when cfg.decryptPdf.enabled is set, otherwise the input as-is */ + if (cfg.decryptPdf.enabled) + in.through(RemovePdfEncryption(logger, cfg.decryptPdf.passwords)) + else in OcrMyPdf - .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(in, handler) + .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler) case MimeType.HtmlMatch(mt) => val cs = mt.charsetOrUtf8 diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index fc4e4cd6..74a136b1 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -6,11 +6,12 @@ package docspell.convert +import docspell.convert.ConvertConfig.DecryptPdf import docspell.convert.extern.OcrMyPdfConfig import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} import docspell.convert.flexmark.MarkdownConfig -case class ConvertConfig( +final case class ConvertConfig( chunkSize: Int, convertedFilenamePart: String, maxImageSize: Int, @@ -18,5 +19,11 @@ case class ConvertConfig( wkhtmlpdf: WkHtmlPdfConfig, tesseract: TesseractConfig, unoconv: UnoconvConfig, - ocrmypdf: OcrMyPdfConfig + ocrmypdf: OcrMyPdfConfig, + decryptPdf: DecryptPdf ) + +object ConvertConfig { + + final case class DecryptPdf(enabled: Boolean, passwords: List[String]) /* `enabled` switches the decryption step in Conversion on or off; `passwords` are the candidates passed to RemovePdfEncryption */ +} diff --git a/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala new file mode 100644 index 00000000..34f93054 --- /dev/null +++ 
b/modules/convert/src/main/scala/docspell/convert/RemovePdfEncryption.scala @@ -0,0 +1,110 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.convert + +import java.io.ByteArrayOutputStream + +import cats.effect._ +import fs2.{Chunk, Pipe, Stream} + +import docspell.common.Logger + +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException + +/** Removes encryption/protection from a PDF using PDFBox. + * + * The incoming bytes are loaded while trying the empty password first and then the + * given passwords in order. If no password opens the document, the input is passed + * on unchanged. + */ +object RemovePdfEncryption { + + /** Variant of the pipe below that takes the candidate passwords as a list. */ + def apply[F[_]: Sync]( + logger: Logger[F], + passwords: List[String] + ): Pipe[F, Byte, Byte] = + apply(logger, Stream.emits(passwords)) + + /** Creates a pipe that opens the PDF with the first working password. An encrypted + * document is re-saved with all security removed; a document that is not encrypted + * is passed through as the original bytes. + */ + def apply[F[_]: Sync]( + logger: Logger[F], + passwords: Stream[F, String] + ): Pipe[F, Byte, Byte] = { + val pws = passwords.cons1("") + in => + pws + .flatMap(pw => in.through(openPdf[F](logger, pw))) + .head + .flatMap { doc => + if (doc.isEncrypted) { + logger.s.debug("Removing protection/encryption from PDF").drain ++ + Stream.eval(Sync[F].delay(doc.setAllSecurityToBeRemoved(true))).drain ++ + toStream[F](doc) + } else { + in + } + } + .ifEmpty( + logger.s + .info( + "None of the passwords helped to read the given PDF!" 
+ ) + .drain ++ in + ) + } + + /** Collects the input into memory and tries to open it with the password `pw`. Emits + * the document if that worked and nothing if the password was wrong. The document + * is acquired with `Stream.bracket` and closed by its finalizer. + */ + private def openPdf[F[_]: Sync]( + logger: Logger[F], + pw: String + ): Pipe[F, Byte, PDDocument] = { + def alloc(bytes: Array[Byte]): F[Option[PDDocument]] = + Sync[F].delay(load(bytes, pw)) + + def free(doc: Option[PDDocument]): F[Unit] = + Sync[F].delay(doc.foreach(_.close())) + + val log = + // Never log the password itself, not even partially. + if (pw.isEmpty) Stream.empty + else logger.s.debug("Try opening PDF with the next configured password").drain + + in => + Stream + .eval(in.compile.to(Array)) + .flatMap(bytes => log ++ Stream.bracket(alloc(bytes))(free)) + .flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty)) + } + + /** Loads the PDF; a wrong password yields `None`, any other error is thrown. */ + private def load(bytes: Array[Byte], pw: String): Option[PDDocument] = + try Option(PDDocument.load(bytes, pw)) + catch { + case _: InvalidPasswordException => + None + } + + /** Serializes the document into memory. Saving is suspended in `F`, so it runs when + * the stream is evaluated and not while the stream is being constructed. + */ + private def toStream[F[_]: Sync](doc: PDDocument): Stream[F, Byte] = + Stream + .eval(Sync[F].delay { + val baos = new ByteArrayOutputStream() + doc.save(baos) + Chunk.array(baos.toByteArray) + }) + .flatMap(bytes => Stream.chunk(bytes)) +} diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index fa4360e8..3bd1de4a 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -74,7 +74,8 @@ class ConversionTest extends FunSuite with FileChecks { Duration.seconds(20) ), target - ) + ), + ConvertConfig.DecryptPdf(true, Nil) ) val conversion = diff --git a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala index a6a62462..96f251ff 100644 --- a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala +++ b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala @@ -9,6 +9,8 @@ package docspell.convert import java.nio.charset.StandardCharsets import java.nio.file.Files +import scala.util.Try + import cats.data.Kleisli import cats.effect.IO import 
cats.effect.unsafe.implicits.global @@ -19,6 +21,9 @@ import docspell.common._ import docspell.convert.ConversionResult.Handler import docspell.files.TikaMimetype +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException + trait FileChecks { implicit class FileCheckOps(p: Path) { @@ -34,15 +39,46 @@ trait FileChecks { def isPlainText: Boolean = isType(MimeType.text("plain")) + + def isUnencryptedPDF: Boolean = /* true when PDFBox can load and close the file without being given a password */ + Try(PDDocument.load(p.toNioPath.toFile)).map(_.close()).isSuccess + } + + implicit class ByteStreamOps(delegate: Stream[IO, Byte]) { /* the same kind of checks, for byte streams instead of files */ + def isNonEmpty: IO[Boolean] = + delegate.head.compile.last.map(_.isDefined) + + def isType(mime: MimeType): IO[Boolean] = + TikaMimetype.detect(delegate, MimeTypeHint.none).map(_ == mime) + + def isPDF: IO[Boolean] = + isType(MimeType.pdf) + + def isUnencryptedPDF: IO[Boolean] = /* yields true when PDFBox loads the collected bytes without a password; a load error fails the IO rather than yielding false */ + delegate.compile + .to(Array) + .map(PDDocument.load(_)) + .map(_.close()) + .map(_ => true) + + def isEncryptedPDF: IO[Boolean] = /* true only when loading without a password fails with InvalidPasswordException; any other error, or a successful load, yields false */ + delegate.compile + .to(Array) + .map(PDDocument.load(_)) + .attempt + .map(e => + e.fold( + _.isInstanceOf[InvalidPasswordException], + doc => { + doc.close(); + false + } + ) + ) } def storeFile(file: Path): Pipe[IO, Byte, Path] = - in => - Stream - .eval( - in.compile.to(Array).flatMap(bytes => IO(Files.write(file.toNioPath, bytes))) - ) - .map(p => File.path(p)) + fs2.io.file.Files[IO].writeAll(file).andThen(s => s ++ Stream.emit(file)) /* writes the incoming bytes to `file`, then emits the path */ def storePdfHandler(file: Path): Handler[IO, Path] = storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1) diff --git a/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala b/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala new file mode 100644 index 00000000..805c44ad --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/RemovePdfEncryptionTest.scala @@ -0,0 +1,75 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.convert + +import cats.effect.IO +import fs2.Stream + +import docspell.common.Logger +import docspell.files.ExampleFiles + +import munit.CatsEffectSuite + +class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks { /* exercises RemovePdfEncryption with an encrypted, a protected and a plain example PDF */ + val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger) + + val protectedPdf = ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024) + val encryptedPdf = ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024) + val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024) + + test("have encrypted pdfs") { + for { + _ <- assertIO(encryptedPdf.isEncryptedPDF, true) + _ <- assertIO(plainPdf.isEncryptedPDF, false) /* negative control: the plain PDF must not be reported as encrypted */ + } yield () + } + + test("decrypt pdf") { + encryptedPdf + .through(RemovePdfEncryption(logger, List("test123"))) + .isUnencryptedPDF + .map(assert(_)) + } + + test("decrypt pdf with multiple passwords") { + encryptedPdf + .through(RemovePdfEncryption(logger, List("xy123", "123xy", "test123", "abc123"))) + .isUnencryptedPDF + .map(assert(_)) + } + + test("remove protection") { + protectedPdf + .through(RemovePdfEncryption(logger, Nil)) + .isUnencryptedPDF + .map(assert(_)) + } + + test("read unprotected pdf") { + plainPdf + .through(RemovePdfEncryption(logger, Nil)) + .isUnencryptedPDF + .map(assert(_)) + } + + test("decrypt with multiple passwords, stop on first") { + val passwords: Stream[IO, String] = /* fails if it is pulled beyond the first (matching) password */ + Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called")) + val decrypt = RemovePdfEncryption(logger, passwords) + encryptedPdf + .through(decrypt) + .isUnencryptedPDF + .map(assert(_)) + } + + test("return input stream if nothing helps") { + encryptedPdf + .through(RemovePdfEncryption(logger, List("a", "b"))) + .isEncryptedPDF + .map(assert(_)) + } +} diff --git a/modules/files/src/test/resources/secured/encrypted-test123.pdf b/modules/files/src/test/resources/secured/encrypted-test123.pdf 
new file mode 100644 index 00000000..2750d634 Binary files /dev/null and b/modules/files/src/test/resources/secured/encrypted-test123.pdf differ diff --git a/modules/files/src/test/resources/secured/protected-test123.pdf b/modules/files/src/test/resources/secured/protected-test123.pdf new file mode 100644 index 00000000..6261e2e8 Binary files /dev/null and b/modules/files/src/test/resources/secured/protected-test123.pdf differ diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index c6ad2cdd..3e2ce177 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -586,6 +586,21 @@ Docpell Update Check } working-dir = ${java.io.tmpdir}"/docspell-convert" } + + # Tries to decrypt PDFs that are encrypted or protected. If + # enabled, a PDF's encryption or protection will be removed during + # conversion. + # + # Encrypted PDFs can only be processed if docspell is able to read + # them, which requires specifying their passwords here. All + # passwords are tried when reading a PDF. + # + # This is enabled by default, using an empty password list. This + # removes protection from PDFs, which is better for processing. + decrypt-pdf = { + enabled = true + passwords = [] + } } # The same section is also present in the rest-server config. It is