mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 15:15:58 +00:00
parent
87c756df0a
commit
3c93b63c8a
@ -275,6 +275,7 @@ val common = project
|
|||||||
.settings(testSettingsMUnit)
|
.settings(testSettingsMUnit)
|
||||||
.settings(
|
.settings(
|
||||||
name := "docspell-common",
|
name := "docspell-common",
|
||||||
|
addCompilerPlugin(Dependencies.kindProjectorPlugin),
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.fs2 ++
|
Dependencies.fs2 ++
|
||||||
Dependencies.circe ++
|
Dependencies.circe ++
|
||||||
@ -409,7 +410,8 @@ val convert = project
|
|||||||
name := "docspell-convert",
|
name := "docspell-convert",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.flexmark ++
|
Dependencies.flexmark ++
|
||||||
Dependencies.twelvemonkeys
|
Dependencies.twelvemonkeys ++
|
||||||
|
Dependencies.pdfbox
|
||||||
)
|
)
|
||||||
.dependsOn(common, files % "compile->compile;test->test")
|
.dependsOn(common, files % "compile->compile;test->test")
|
||||||
|
|
||||||
|
@ -7,12 +7,13 @@
|
|||||||
package docspell.common
|
package docspell.common
|
||||||
|
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
import docspell.common.syntax.all._
|
import docspell.common.syntax.all._
|
||||||
|
|
||||||
import org.log4s.{Logger => Log4sLogger}
|
import org.log4s.{Logger => Log4sLogger}
|
||||||
|
|
||||||
trait Logger[F[_]] {
|
trait Logger[F[_]] { self =>
|
||||||
|
|
||||||
def trace(msg: => String): F[Unit]
|
def trace(msg: => String): F[Unit]
|
||||||
def debug(msg: => String): F[Unit]
|
def debug(msg: => String): F[Unit]
|
||||||
@ -21,6 +22,25 @@ trait Logger[F[_]] {
|
|||||||
def error(ex: Throwable)(msg: => String): F[Unit]
|
def error(ex: Throwable)(msg: => String): F[Unit]
|
||||||
def error(msg: => String): F[Unit]
|
def error(msg: => String): F[Unit]
|
||||||
|
|
||||||
|
/** A view of this logger whose methods return `Stream` instead of `F`.
  *
  * Each method delegates to the same-named method of the enclosing logger
  * and wraps the resulting effect with `Stream.eval`.
  */
final def s: Logger[Stream[F, *]] = {
  // All methods differ only in the effect that gets evaluated.
  def lift(effect: F[Unit]): Stream[F, Unit] =
    Stream.eval(effect)

  new Logger[Stream[F, *]] {
    def trace(msg: => String): Stream[F, Unit] = lift(self.trace(msg))

    def debug(msg: => String): Stream[F, Unit] = lift(self.debug(msg))

    def info(msg: => String): Stream[F, Unit] = lift(self.info(msg))

    def warn(msg: => String): Stream[F, Unit] = lift(self.warn(msg))

    def error(msg: => String): Stream[F, Unit] = lift(self.error(msg))

    def error(ex: Throwable)(msg: => String): Stream[F, Unit] =
      lift(self.error(ex)(msg))
  }
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object Logger {
|
object Logger {
|
||||||
|
@ -42,8 +42,12 @@ object Conversion {
|
|||||||
): F[A] =
|
): F[A] =
|
||||||
TikaMimetype.resolve(dataType, in).flatMap {
|
TikaMimetype.resolve(dataType, in).flatMap {
|
||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
|
val pdfStream =
|
||||||
|
if (cfg.decryptPdf.enabled)
|
||||||
|
in.through(RemovePdfEncryption(logger, cfg.decryptPdf.passwords))
|
||||||
|
else in
|
||||||
OcrMyPdf
|
OcrMyPdf
|
||||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(in, handler)
|
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(pdfStream, handler)
|
||||||
|
|
||||||
case MimeType.HtmlMatch(mt) =>
|
case MimeType.HtmlMatch(mt) =>
|
||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
|
@ -6,11 +6,12 @@
|
|||||||
|
|
||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import docspell.convert.ConvertConfig.DecryptPdf
|
||||||
import docspell.convert.extern.OcrMyPdfConfig
|
import docspell.convert.extern.OcrMyPdfConfig
|
||||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
|
||||||
case class ConvertConfig(
|
final case class ConvertConfig(
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
convertedFilenamePart: String,
|
convertedFilenamePart: String,
|
||||||
maxImageSize: Int,
|
maxImageSize: Int,
|
||||||
@ -18,5 +19,11 @@ case class ConvertConfig(
|
|||||||
wkhtmlpdf: WkHtmlPdfConfig,
|
wkhtmlpdf: WkHtmlPdfConfig,
|
||||||
tesseract: TesseractConfig,
|
tesseract: TesseractConfig,
|
||||||
unoconv: UnoconvConfig,
|
unoconv: UnoconvConfig,
|
||||||
ocrmypdf: OcrMyPdfConfig
|
ocrmypdf: OcrMyPdfConfig,
|
||||||
|
decryptPdf: DecryptPdf
|
||||||
)
|
)
|
||||||
|
|
||||||
|
object ConvertConfig {

  /** Settings for removing encryption/protection from PDFs during conversion.
    *
    * @param enabled
    *   when `true`, PDF input is piped through `RemovePdfEncryption` before
    *   it is converted
    * @param passwords
    *   the passwords handed to `RemovePdfEncryption` for opening the PDF
    */
  final case class DecryptPdf(
      enabled: Boolean,
      passwords: List[String]
  )
}
|
||||||
|
@ -0,0 +1,88 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import fs2.{Chunk, Pipe, Stream}
|
||||||
|
|
||||||
|
import docspell.common.Logger
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
|
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
||||||
|
|
||||||
|
/** Removes encryption or protection from a PDF using PDFBox.
  *
  * The incoming bytes are loaded with the empty password first and then with
  * each of the given passwords, until one of them opens the document.
  */
object RemovePdfEncryption {

  /** Same as the stream based variant, for a fixed list of passwords. */
  def apply[F[_]: Sync](
      logger: Logger[F],
      passwords: List[String]
  ): Pipe[F, Byte, Byte] =
    apply(logger, Stream.emits(passwords))

  /** Creates a pipe that emits the PDF without encryption.
    *
    *   - If a password opens the document and it is encrypted, all security
    *     is removed and the re-saved document is emitted.
    *   - If the document opens and is not encrypted, the input is emitted
    *     unchanged.
    *   - If no password opens the document, this is logged at info level and
    *     the input is emitted unchanged.
    *
    * Only the first document that opens is used (`head`), so passwords after
    * the first working one are not evaluated.
    *
    * @param logger
    *   receives debug/info messages about the attempts
    * @param passwords
    *   the passwords to try, after the empty password
    */
  def apply[F[_]: Sync](
      logger: Logger[F],
      passwords: Stream[F, String]
  ): Pipe[F, Byte, Byte] = {
    // The empty password is always tried first.
    val pws = passwords.cons1("")
    in =>
      pws
        .flatMap(pw => in.through(openPdf[F](logger, pw)))
        .head
        .flatMap { doc =>
          if (doc.isEncrypted) {
            logger.s.debug("Removing protection/encryption from PDF").drain ++
              Stream.eval(Sync[F].delay(doc.setAllSecurityToBeRemoved(true))).drain ++
              toStream[F](doc)
          } else {
            in
          }
        }
        .ifEmpty(
          logger.s
            .info("None of the passwords helped to read the given PDF!")
            .drain ++ in
        )
  }

  /** Reads the whole input into memory and tries to load it with `pw`.
    *
    * Emits the opened document, or nothing if the password was rejected. The
    * document is closed when the bracketed scope is released.
    */
  private def openPdf[F[_]: Sync](
      logger: Logger[F],
      pw: String
  ): Pipe[F, Byte, PDDocument] = {
    def alloc(bytes: Array[Byte]): F[Option[PDDocument]] =
      Sync[F].delay(load(bytes, pw))

    def free(doc: Option[PDDocument]): F[Unit] =
      Sync[F].delay(doc.foreach(_.close()))

    val log =
      if (pw.isEmpty) Stream.empty
      else logger.s.debug(s"Try opening PDF with password: ${maskPassword(pw)}").drain

    in =>
      Stream
        .eval(in.compile.to(Array))
        .flatMap(bytes => log ++ Stream.bracket(alloc(bytes))(free))
        .flatMap(opt => opt.map(Stream.emit).getOrElse(Stream.empty))
  }

  /** Renders a password for log output.
    *
    * Shows the first two characters followed by `***`. Passwords of at most
    * two characters are masked completely, because showing a two character
    * prefix would write the entire password to the log.
    */
  private def maskPassword(pw: String): String =
    if (pw.length > 2) s"${pw.take(2)}***" else "***"

  /** Loads the document, mapping a rejected password to `None`. Any other
    * exception thrown by PDFBox is not caught here.
    */
  private def load(bytes: Array[Byte], pw: String): Option[PDDocument] =
    try Option(PDDocument.load(bytes, pw))
    catch {
      case _: InvalidPasswordException =>
        None
    }

  /** Saves the document into memory and emits its bytes as a single chunk.
    *
    * Saving is a side effect that may throw, so it is suspended in `F` and
    * runs only when the stream is evaluated; failures are then raised in the
    * stream.
    */
  private def toStream[F[_]: Sync](doc: PDDocument): Stream[F, Byte] =
    Stream
      .eval(Sync[F].delay {
        val baos = new ByteArrayOutputStream()
        doc.save(baos)
        Chunk.array(baos.toByteArray)
      })
      .flatMap(bytes => Stream.chunk(bytes))
}
|
@ -74,7 +74,8 @@ class ConversionTest extends FunSuite with FileChecks {
|
|||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
),
|
),
|
||||||
target
|
target
|
||||||
)
|
),
|
||||||
|
ConvertConfig.DecryptPdf(true, Nil)
|
||||||
)
|
)
|
||||||
|
|
||||||
val conversion =
|
val conversion =
|
||||||
|
@ -9,6 +9,8 @@ package docspell.convert
|
|||||||
import java.nio.charset.StandardCharsets
|
import java.nio.charset.StandardCharsets
|
||||||
import java.nio.file.Files
|
import java.nio.file.Files
|
||||||
|
|
||||||
|
import scala.util.Try
|
||||||
|
|
||||||
import cats.data.Kleisli
|
import cats.data.Kleisli
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
import cats.effect.unsafe.implicits.global
|
import cats.effect.unsafe.implicits.global
|
||||||
@ -19,6 +21,9 @@ import docspell.common._
|
|||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.files.TikaMimetype
|
import docspell.files.TikaMimetype
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
|
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
|
||||||
|
|
||||||
trait FileChecks {
|
trait FileChecks {
|
||||||
|
|
||||||
implicit class FileCheckOps(p: Path) {
|
implicit class FileCheckOps(p: Path) {
|
||||||
@ -34,15 +39,46 @@ trait FileChecks {
|
|||||||
|
|
||||||
def isPlainText: Boolean =
|
def isPlainText: Boolean =
|
||||||
isType(MimeType.text("plain"))
|
isType(MimeType.text("plain"))
|
||||||
|
|
||||||
|
/** `true` if PDFBox can load and then close the file without a password;
  * `false` if either step throws.
  */
def isUnencryptedPDF: Boolean = {
  val loaded = Try(PDDocument.load(p.toNioPath.toFile))
  loaded.map(doc => doc.close()).isSuccess
}
|
||||||
|
}
|
||||||
|
|
||||||
|
implicit class ByteStreamOps(delegate: Stream[IO, Byte]) {
|
||||||
|
/** Yields `true` if the stream emits at least one byte. Only the first
  * element is requested from the stream.
  */
def isNonEmpty: IO[Boolean] = {
  val firstByte = delegate.take(1).compile.last
  firstByte.map(_.nonEmpty)
}
|
||||||
|
|
||||||
|
/** Detects the mime type of the bytes via `TikaMimetype.detect` (without a
  * hint) and compares it for equality with `mime`.
  */
def isType(mime: MimeType): IO[Boolean] =
  for {
    detected <- TikaMimetype.detect(delegate, MimeTypeHint.none)
  } yield detected == mime
|
||||||
|
|
||||||
|
/** Yields `true` if the detected mime type equals `MimeType.pdf`. */
def isPDF: IO[Boolean] = {
  val expected = MimeType.pdf
  isType(expected)
}
|
||||||
|
|
||||||
|
/** Yields `true` if PDFBox can load (and close) the bytes without a
  * password, `false` otherwise.
  *
  * A failed load is turned into `false` instead of a failed `IO`, so the
  * result is a real boolean, like the `Path` based `isUnencryptedPDF`.
  */
def isUnencryptedPDF: IO[Boolean] =
  delegate.compile
    .to(Array)
    .map(bytes => PDDocument.load(bytes).close())
    .attempt
    .map(_.isRight)
|
||||||
|
|
||||||
|
/** Yields `true` only if loading the bytes without a password fails with an
  * `InvalidPasswordException`. Any other failure, and a successful load,
  * yield `false`; a loaded document is closed again.
  */
def isEncryptedPDF: IO[Boolean] =
  delegate.compile
    .to(Array)
    .map(PDDocument.load(_))
    .attempt
    .map {
      case Left(_: InvalidPasswordException) =>
        true
      case Left(_) =>
        false
      case Right(pdf) =>
        pdf.close()
        false
    }
|
||||||
}
|
}
|
||||||
|
|
||||||
def storeFile(file: Path): Pipe[IO, Byte, Path] =
|
def storeFile(file: Path): Pipe[IO, Byte, Path] =
|
||||||
in =>
|
fs2.io.file.Files[IO].writeAll(file).andThen(s => s ++ Stream.emit(file))
|
||||||
Stream
|
|
||||||
.eval(
|
|
||||||
in.compile.to(Array).flatMap(bytes => IO(Files.write(file.toNioPath, bytes)))
|
|
||||||
)
|
|
||||||
.map(p => File.path(p))
|
|
||||||
|
|
||||||
def storePdfHandler(file: Path): Handler[IO, Path] =
|
def storePdfHandler(file: Path): Handler[IO, Path] =
|
||||||
storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1)
|
storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1)
|
||||||
|
@ -0,0 +1,75 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import cats.effect.IO
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common.Logger
|
||||||
|
import docspell.files.ExampleFiles
|
||||||
|
|
||||||
|
import munit.CatsEffectSuite
|
||||||
|
|
||||||
|
/** Tests for [[RemovePdfEncryption]] against the example PDFs.
  *
  * All checks use `assertIO`, so a failing test reports the expected and the
  * obtained value.
  */
class RemovePdfEncryptionTest extends CatsEffectSuite with FileChecks {
  val logger: Logger[IO] = Logger.log4s(org.log4s.getLogger)

  val protectedPdf = ExampleFiles.secured_protected_test123_pdf.readURL[IO](16 * 1024)
  val encryptedPdf = ExampleFiles.secured_encrypted_test123_pdf.readURL[IO](16 * 1024)
  val plainPdf = ExampleFiles.letter_en_pdf.readURL[IO](16 * 1024)

  // Sanity check of the fixture: without a password it must not be readable.
  test("have encrypted pdfs") {
    assertIO(encryptedPdf.isEncryptedPDF, true)
  }

  test("decrypt pdf") {
    val decrypted =
      encryptedPdf.through(RemovePdfEncryption(logger, List("test123")))
    assertIO(decrypted.isUnencryptedPDF, true)
  }

  test("decrypt pdf with multiple passwords") {
    val decrypted = encryptedPdf
      .through(RemovePdfEncryption(logger, List("xy123", "123xy", "test123", "abc123")))
    assertIO(decrypted.isUnencryptedPDF, true)
  }

  test("remove protection") {
    val unprotected = protectedPdf.through(RemovePdfEncryption(logger, Nil))
    assertIO(unprotected.isUnencryptedPDF, true)
  }

  test("read unprotected pdf") {
    val unchanged = plainPdf.through(RemovePdfEncryption(logger, Nil))
    assertIO(unchanged.isUnencryptedPDF, true)
  }

  test("decrypt with multiple passwords, stop on first") {
    // The stream fails if it is pulled beyond the first (working) password.
    val passwords: Stream[IO, String] =
      Stream("test123") ++ Stream.raiseError[IO](new Exception("is not called"))
    val decrypt = RemovePdfEncryption(logger, passwords)
    assertIO(encryptedPdf.through(decrypt).isUnencryptedPDF, true)
  }

  test("return input stream if nothing helps") {
    val stillEncrypted =
      encryptedPdf.through(RemovePdfEncryption(logger, List("a", "b")))
    assertIO(stillEncrypted.isEncryptedPDF, true)
  }
}
|
BIN
modules/files/src/test/resources/secured/encrypted-test123.pdf
Normal file
BIN
modules/files/src/test/resources/secured/encrypted-test123.pdf
Normal file
Binary file not shown.
BIN
modules/files/src/test/resources/secured/protected-test123.pdf
Normal file
BIN
modules/files/src/test/resources/secured/protected-test123.pdf
Normal file
Binary file not shown.
@ -586,6 +586,21 @@ Docpell Update Check
|
|||||||
}
|
}
|
||||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Tries to decrypt PDFs that are encrypted or protected. If
# enabled, a PDF's encryption or protection is removed during
# conversion.
#
# Encrypted PDFs can only be processed when this is enabled,
# because docspell needs to read them. This also requires
# specifying the passwords here. All passwords are tried when
# reading a PDF.
#
# This is enabled by default, using an empty password list. This
# removes protection from PDFs, which is better for processing.
|
||||||
|
decrypt-pdf = {
|
||||||
|
enabled = true
|
||||||
|
passwords = []
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# The same section is also present in the rest-server config. It is
|
# The same section is also present in the rest-server config. It is
|
||||||
|
Loading…
x
Reference in New Issue
Block a user