diff --git a/build.sbt b/build.sbt index 9ddd50a1..3de1afe3 100644 --- a/build.sbt +++ b/build.sbt @@ -152,7 +152,7 @@ val files = project.in(file("modules/files")). settings( name := "docspell-files", libraryDependencies ++= - Dependencies.tika , + Dependencies.tika, Test / sourceGenerators += Def.task { val base = (Test/resourceDirectory).value val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base) @@ -204,6 +204,7 @@ val extract = project.in(file("modules/extract")). name := "docspell-extract", libraryDependencies ++= Dependencies.fs2 ++ + Dependencies.twelvemonkeys ++ Dependencies.pdfbox ++ Dependencies.poi ++ Dependencies.commonsIO ++ @@ -217,7 +218,8 @@ val convert = project.in(file("modules/convert")). settings( name := "docspell-convert", libraryDependencies ++= - Dependencies.flexmark + Dependencies.flexmark ++ + Dependencies.twelvemonkeys ).dependsOn(common, files % "compile->compile;test->test") val analysis = project.in(file("modules/analysis")). diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 006f9098..f85845c7 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -6,8 +6,9 @@ import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor} import java.util.concurrent.atomic.AtomicInteger import scala.jdk.CollectionConverters._ +import fs2.Stream import cats.implicits._ -import cats.effect.{Blocker, ContextShift, Resource, Sync} +import cats.effect._ object File { @@ -42,6 +43,9 @@ object File { count.get } + def exists[F[_]: Sync](file: Path): F[Boolean] = + Sync[F].delay(Files.exists(file)) + def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] = Sync[F].delay(Files.exists(file) && Files.size(file) > minSize) @@ -61,6 +65,11 @@ object File { javaList.asScala.toList.sortBy(_.getFileName.toString) } - def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) = + def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] = fs2.io.file.readAll(file, blocker, chunkSize) + + def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = + readAll[F](file, blocker, 8192). + through(fs2.text.utf8Decode). + compile.foldMonoid } diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index a4605280..e106b844 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -1,24 +1,112 @@ package docspell.convert +import java.nio.charset.StandardCharsets + import fs2._ import cats.effect._ +import cats.implicits._ import docspell.common._ +import docspell.convert.ConversionResult.Handler +import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf} +import docspell.convert.flexmark.Markdown +import docspell.files.{ImageSize, TikaMimetype} trait Conversion[F[_]] { - def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] + def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] } object Conversion { - def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] = + def create[F[_]: Sync: ContextShift]( + cfg: ConvertConfig, + blocker: Blocker, + logger: Logger[F] + ): Resource[F, Conversion[F]] = Resource.pure(new Conversion[F] { - def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = { - println(s"$cfg $blocker $logger") - ??? - } + def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = + TikaMimetype.resolve(dataType, in).flatMap { + case MimeType.pdf => + handler.run(ConversionResult.successPdf(in)) + case MimeType.html => + WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler) + + case Texts(_) => + Markdown.toHtml(in, cfg.markdown).flatMap { html => + val bytes = Stream + .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8))) + .covary[F] + WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler) + } + + case Images(mt) => + ImageSize.get(in).flatMap { + case Some(dim) => + if (dim.product > cfg.maxImageSize) { + logger + .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> + handler.run( + ConversionResult.inputMalformed( + mt, + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})." + ) + ) + } else { + Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + } + + case None => + logger.info( + s"Cannot read image when determining size for ${mt.asString}. Converting anyways." + ) *> + Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + } + + case Office(_) => + Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler) + + case mt => + handler.run(ConversionResult.unsupportedFormat(mt)) + } }) + + object Images { + + val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff) + + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(all.contains) + } + + object Texts { + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(_.primary == "text") + } + + object Office { + val odt = MimeType.application("vnd.oasis.opendocument.text") + val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") + val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text") + val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") + val msoffice = MimeType.application("x-tika-msoffice") + val ooxml = MimeType.application("x-tika-ooxml") + val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") + val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") + val xls = MimeType.application("vnd.ms-excel") + val doc = MimeType.application("msword") + val rtf = MimeType.application("rtf") + + // without a filename, tika returns application/zip for odt/ods files, since + // they are just zip files + val odfContainer = MimeType.zip + + val all = + Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer) + + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(all.contains) + } } diff --git a/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala b/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala new file mode 100644 index 00000000..dee9e9e0 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala @@ -0,0 +1,53 @@ +package docspell.convert + +import cats.data.Kleisli +import fs2.Stream +import docspell.common.MimeType + +sealed trait ConversionResult[F[_]] { + + def pdfData: Stream[F, Byte] + +} + +object ConversionResult { + + /** The conversion is done by external tools that write files to the + * file system. These are temporary files and they will be deleted + * once the process finishes. This handler is used to do something + * relevant with the resulting files. + */ + type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A] + + def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] = + UnsupportedFormat[F](mime) + + def failure[F[_]](ex: Throwable): ConversionResult[F] = + Failure[F](ex) + + def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] = + SuccessPdf[F](pdf) + + def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] = + SuccessPdfTxt[F](pdf, txt) + + def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] = + InputMalformed(mimeType, reason) + + case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] { + val pdfData = Stream.empty + } + case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] { + val pdfData = Stream.empty + } + case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] { + val pdfData = pdf + } + case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]) extends ConversionResult[F] { + val pdfData = pdf + } + + case class InputMalformed[F[_]](mimeType: MimeType, reason: String) extends ConversionResult[F] { + val pdfData = Stream.empty + } +} diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index ddebaa23..887fe218 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -1,5 +1,11 @@ package docspell.convert +import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} import docspell.convert.flexmark.MarkdownConfig -case class ConvertConfig(markdown: MarkdownConfig) +case class ConvertConfig(chunkSize: Int, + maxImageSize: Int, + markdown: MarkdownConfig, + wkhtmlpdf: WkHtmlPdfConfig, + tesseract: TesseractConfig, + unoconv: UnoconvConfig) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala index ebc96be1..bf682287 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -2,30 +2,34 @@ package docspell.convert.extern import java.nio.file.Path +import cats.implicits._ import cats.effect._ import fs2.{Pipe, Stream} import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt} -object ExternConv { +private[extern] object ExternConv { - def toPDF[F[_]: Sync: ContextShift]( + def toPDF[F[_]: Sync: ContextShift, A]( name: String, cmdCfg: SystemCommand.Config, wd: Path, - chunkSize: Int, useStdin: Boolean, blocker: Blocker, - logger: Logger[F] - ): Pipe[F, Byte, Byte] = - in => - Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => + logger: Logger[F], + reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = + Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => val inFile = dir.resolve("infile").toAbsolutePath.normalize - val out = dir.resolve("out.pdf").toAbsolutePath.normalize + val out = dir.resolve("out.pdf").toAbsolutePath.normalize val sysCfg = cmdCfg.replace( - Map("{{outfile}}" -> out.toString) ++ + Map( + "{{outfile}}" -> out.toString + ) ++ (if (!useStdin) Map("{{infile}}" -> inFile.toString) - else Map.empty) + else Map.empty) ) val createInput: Pipe[F, Byte, Unit] = @@ -35,41 +39,66 @@ object ExternConv { in.through(createInput).flatMap { _ => SystemCommand .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty) - .flatMap(result => - logResult(name, result, logger) ++ readResult[F]( - out, - result, - blocker, - chunkSize, - logger - ) + .evalMap(result => + logResult(name, result, logger). + flatMap(_ => reader(out, result)). + flatMap(handler.run) ) } - } + }.compile.lastOrError def readResult[F[_]: Sync: ContextShift]( - out: Path, - result: SystemCommand.Result, blocker: Blocker, chunkSize: Int, logger: Logger[F] - ): Stream[F, Byte] = - Stream.eval(File.existsNonEmpty[F](out)).flatMap { + )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = + File.existsNonEmpty[F](out).flatMap { case true => - if (result.rc == 0) File.readAll(out, blocker, chunkSize) + if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F] else - Stream - .eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists.")) - .drain ++ - File.readAll(out, blocker, chunkSize) + logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + successPdf(File.readAll(out, blocker, chunkSize)).pure[F] case false => - Stream.raiseError[F]( + ConversionResult.failure[F]( new Exception(s"Command result=${result.rc}. No output file found.") - ) + ).pure[F] } - private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] = + def readResultTesseract[F[_]: Sync: ContextShift]( + outPrefix: String, + blocker: Blocker, + chunkSize: Int, + logger: Logger[F] + )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = { + val outPdf = out.resolveSibling(s"$outPrefix.pdf") + File.existsNonEmpty[F](outPdf).flatMap { + case true => + val outTxt = out.resolveSibling(s"$outPrefix.txt") + File.exists(outTxt).flatMap(txtExists => { + val pdfData = File.readAll(out, blocker, chunkSize) + if (result.rc == 0) { + if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F] + else successPdf(pdfData).pure[F] + } else { + logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + successPdf(pdfData).pure[F] + } + }) + + case false => + ConversionResult.failure[F]( + new Exception(s"Command result=${result.rc}. No output file found.") + ).pure[F] + } + } + + private def storeDataToFile[F[_]: Sync: ContextShift]( + name: String, + blocker: Blocker, + logger: Logger[F], + inFile: Path + ): Pipe[F, Byte, Unit] = in => Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ Stream.eval(storeFile(in, inFile, blocker)) @@ -78,12 +107,12 @@ object ExternConv { name: String, result: SystemCommand.Result, logger: Logger[F] - ): Stream[F, Nothing] = - Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++ - Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain + ): F[Unit] = + logger.debug(s"$name stdout: ${result.stdout}") *> + logger.debug(s"$name stderr: ${result.stderr}") private def storeFile[F[_]: Sync: ContextShift]( - in: Stream[F, Byte], + in: Stream[F, Byte], target: Path, blocker: Blocker ): F[Unit] = diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala index f7cd017d..7c04608e 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -1,5 +1,26 @@ package docspell.convert.extern +import java.nio.file.Path + +import cats.effect._ +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler + object Tesseract { + def toPDF[F[_]: Sync: ContextShift, A]( + cfg: TesseractConfig, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val outBase = cfg.cmd.args.tail.headOption.getOrElse("out") + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger) + + ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler) + } + } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala new file mode 100644 index 00000000..f5bef831 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala @@ -0,0 +1,7 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class TesseractConfig (cmd: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala index 7ce10109..a6bb5b04 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -1,18 +1,25 @@ package docspell.convert.extern +import java.nio.file.Path + import cats.effect._ -import fs2.Pipe +import fs2.Stream import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler object Unoconv { - def toPDF[F[_]: Sync: ContextShift]( + def toPDF[F[_]: Sync: ContextShift, A]( cfg: UnoconvConfig, chunkSize: Int, blocker: Blocker, - logger: Logger[F], - ): Pipe[F, Byte, Byte] = - ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger) + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](blocker, chunkSize, logger) + ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler) + } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 11a7ccda..0c5657c1 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -1,18 +1,25 @@ package docspell.convert.extern +import java.nio.file.Path + import cats.effect._ -import fs2.Pipe +import fs2.Stream import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler object WkHtmlPdf { - def toPDF[F[_]: Sync: ContextShift]( + def toPDF[F[_]: Sync: ContextShift, A]( cfg: WkHtmlPdfConfig, chunkSize: Int, blocker: Blocker, logger: Logger[F], - ): Pipe[F, Byte, Byte] = - ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger) + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](blocker, chunkSize, logger) + ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler) + } } diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala new file mode 100644 index 00000000..dc158a31 --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -0,0 +1,160 @@ +package docspell.convert + +import java.nio.file.Paths + +import cats.data.Kleisli +import cats.implicits._ +import cats.effect.IO +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult.Handler +import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} +import docspell.convert.flexmark.MarkdownConfig +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object ConversionTest extends SimpleTestSuite with FileChecks { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val logger = Logger.log4s[IO](org.log4s.getLogger) + val target = Paths.get("target") + + val convertConfig = ConvertConfig( + 8192, + 3000 * 3000, + MarkdownConfig("body { padding: 2em 5em; }"), + WkHtmlPdfConfig( + SystemCommand.Config( + "wkhtmltopdf", + Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), + Duration.seconds(20) + ), + target + ), + TesseractConfig( + SystemCommand.Config( + "tesseract", + Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"), + Duration.seconds(20) + ), + target + ), + UnoconvConfig( + SystemCommand.Config( + "unoconv", + Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), + Duration.seconds(20) + ), + target + ) + ) + + val conversion = Conversion.create[IO](convertConfig, blocker, logger) + + val bombs = List( + ExampleFiles.bombs_20K_gray_jpeg, + ExampleFiles.bombs_20K_gray_png, + ExampleFiles.bombs_20K_rgb_jpeg, + ExampleFiles.bombs_20K_rgb_png + ) + val pdfOnly = List( + ExampleFiles.examples_sample_ods, + ExampleFiles.examples_sample_doc, + ExampleFiles.examples_sample_docx, + ExampleFiles.examples_sample_ods, + ExampleFiles.examples_sample_odt, + ExampleFiles.examples_sample_rtf, + ExampleFiles.examples_sample_xls, + ExampleFiles.examples_sample_xlsx, + ExampleFiles.letter_de_md, + ExampleFiles.letter_de_txt, + ExampleFiles.letter_en_txt, + ExampleFiles.letter_de_html + ) + val pdfAndTxt = List( + ExampleFiles.camera_letter_en_jpg, + ExampleFiles.camera_letter_en_png, + ExampleFiles.camera_letter_en_tiff, + ExampleFiles.scanner_jfif_jpg + ) + + test("convert to pdf") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + File + .withTempDir[IO](target, "convpdf") + .use { dir => + conversion.use { conv => + def check(n: Long): Handler[IO, Unit] = + storePdfHandler(dir.resolve(s"test-$n.pdf")).map { p => + assert(p.isNonEmpty && p.isPDF) + } + + runConversion(pdfOnly, check, conv).compile.drain + } + } + .unsafeRunSync() + } + + test("convert image to pdf and txt") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + File + .withTempDir[IO](target, "convimgpdf") + .use { dir => + conversion.use { conv => + def check(n: Long): Handler[IO, Unit] = + storePdfTxtHandler(dir.resolve(s"test-$n.pdf"), dir.resolve(s"test-$n.txt")) + .map { + case (p, t) => + assert(p.isNonEmpty && p.isPDF) + assert(t.isNonEmpty && t.isPlainText) + } + + runConversion(pdfAndTxt, check, conv).compile.drain + } + } + .unsafeRunSync() + } + + test("do not convert image bombs") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + conversion + .use { conv => + def check: Handler[IO, Unit] = + Kleisli({ + case ConversionResult.InputMalformed(_, _) => + ().pure[IO] + case cr => + IO.raiseError(new Exception(s"Unexpected result: $cr")) + }) + + runConversion(bombs, _ => check, conv).compile.drain + } + .unsafeRunSync() + } + + def runConversion[A]( + uris: List[LenientUri], + handler: Long => Handler[IO, A], + conv: Conversion[IO] + ) = + Stream + .emits(uris) + .covary[IO] + .zipWithIndex + .evalMap({ + case (uri, index) => + val load = uri.readURL[IO](8192, blocker) + val dataType = DataType.filename(uri.path.segments.last) + logger.info(s"Processing file ${uri.path.asString}") *> + conv.toPDF(dataType, handler(index))(load) + }) + + def commandsExist: Boolean = + commandExists(convertConfig.unoconv.cmd.program) && + commandExists(convertConfig.wkhtmlpdf.cmd.program) && + commandExists(convertConfig.tesseract.cmd.program) +} diff --git a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala new file mode 100644 index 00000000..52254fbb --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala @@ -0,0 +1,59 @@ +package docspell.convert + +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path} + +import cats.data.Kleisli +import cats.effect.IO +import fs2.{Pipe, Stream} +import docspell.common.MimeType +import docspell.convert.ConversionResult.Handler +import docspell.files.TikaMimetype + +trait FileChecks { + + implicit class FileCheckOps(p: Path) { + + def isNonEmpty: Boolean = + Files.exists(p) && Files.size(p) > 0 + + def isType(mime: MimeType): Boolean = + TikaMimetype.detect[IO](p).map(_ == mime).unsafeRunSync + + def isPDF: Boolean = + isType(MimeType.pdf) + + def isPlainText: Boolean = + isType(MimeType.text("plain")) + } + + def storeFile(file: Path): Pipe[IO, Byte, Path] = + in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes)))) + + def storePdfHandler(file: Path): Handler[IO, Path] = + storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1) + + def storePdfTxtHandler(filePdf: Path, fileTxt: Path): Handler[IO, (Path, Path)] = + Kleisli({ + case ConversionResult.SuccessPdfTxt(pdf, txt) => + for { + pout <- pdf.through(storeFile(filePdf)).compile.lastOrError + str <- txt + tout <- IO(Files.write(fileTxt, str.getBytes(StandardCharsets.UTF_8))) + } yield (pout, tout) + + case ConversionResult.SuccessPdf(pdf) => + pdf.through(storeFile(filePdf)).compile.lastOrError.map(p => (p, fileTxt)) + + case ConversionResult.Failure(ex) => + throw new Exception(s"Unexpected result (failure: ${ex.getMessage})", ex) + + case cr => + throw new Exception(s"Unexpected result: $cr") + }) + + def commandExists(cmd: String): Boolean = + Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0 + + +} diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 7f81b694..9db0588c 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -1,22 +1,20 @@ package docspell.convert.extern -import java.nio.file.{Files, Path, Paths} +import java.nio.file.{Path, Paths} -import fs2.Stream import cats.effect._ import docspell.common._ +import docspell.convert.FileChecks import docspell.files.{ExampleFiles, TestFiles} -import fs2.Pipe import minitest.SimpleTestSuite -object ExternConvTest extends SimpleTestSuite { +object ExternConvTest extends SimpleTestSuite with FileChecks { val blocker = TestFiles.blocker implicit val CS = TestFiles.CS val logger = Logger.log4s[IO](org.log4s.getLogger) val target = Paths.get("target") - test("convert html to pdf") { val cfg = SystemCommand.Config( "wkhtmltopdf", @@ -28,18 +26,20 @@ object ExternConvTest extends SimpleTestSuite { else { File .withTempDir[IO](target, "wkhtmltopdf") - .use(dir => IO { - val wkCfg = WkHtmlPdfConfig(cfg, target) - val p = ExampleFiles.letter_de_html - .readURL[IO](8192, blocker) - .through(WkHtmlPdf.toPDF[IO](wkCfg, 8192, blocker, logger)) - .through(storeFile(dir.resolve("test.pdf"))) - .compile - .lastOrError - .unsafeRunSync() + .use(dir => + IO { + val wkCfg = WkHtmlPdfConfig(cfg, target) + val p = + WkHtmlPdf + .toPDF[IO, Path](wkCfg, 8192, blocker, logger)( + ExampleFiles.letter_de_html.readURL[IO](8192, blocker), + storePdfHandler(dir.resolve("test.pdf")) + ) + .unsafeRunSync() - assert(Files.exists(p) && Files.size(p) > 0) - }) + assert(p.isNonEmpty && p.isPDF) + } + ) .unsafeRunSync } } @@ -55,26 +55,53 @@ object ExternConvTest extends SimpleTestSuite { else { File .withTempDir[IO](target, "unoconv") - .use(dir => IO { - val ucCfg = UnoconvConfig(cfg, target) - val p = ExampleFiles.examples_sample_docx - .readURL[IO](8192, blocker) - .through(Unoconv.toPDF[IO](ucCfg, 8192, blocker, logger)) - .through(storeFile(dir.resolve("test.pdf"))) - .compile - .lastOrError - .unsafeRunSync() + .use(dir => + IO { + val ucCfg = UnoconvConfig(cfg, target) + val p = + Unoconv + .toPDF[IO, Path](ucCfg, 8192, blocker, logger)( + ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker), + storePdfHandler(dir.resolve("test.pdf")) + ) + .unsafeRunSync() - assert(Files.exists(p) && Files.size(p) > 0) - }) + assert(p.isNonEmpty && p.isPDF) + } + ) + .unsafeRunSync + } + } + + test("convert image to pdf") { + val cfg = SystemCommand.Config( + "tesseract", + Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "tesseract") + .use(dir => + IO { + val tessCfg = TesseractConfig(cfg, target) + val (pdf, txt) = + Tesseract + .toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)( + ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker), + storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt")) + ) + .unsafeRunSync() + + assert(pdf.isNonEmpty && pdf.isPDF) + assert(txt.isNonEmpty && txt.isPlainText) + } + ) .unsafeRunSync } } - def storeFile(file: Path): Pipe[IO, Byte, Path] = - in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes)))) - - def commandExists(cmd: String): Boolean = - Runtime.getRuntime().exec(Array("which", cmd)).waitFor() == 0 } diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala index b4951686..ae35cb5f 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -2,4 +2,4 @@ package docspell.extract import docspell.extract.ocr.OcrConfig -case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) +case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig) diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 892ef54d..81a61e7a 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -9,6 +9,7 @@ import docspell.extract.poi.{PoiExtract, PoiType} import docspell.extract.rtf.RtfExtract import fs2.Stream import docspell.files.TikaMimetype +import docspell.files.ImageSize trait Extraction[F[_]] { @@ -44,14 +45,29 @@ object Extraction { case OdfType(_) => OdfExtract.get(data).map(ExtractResult.fromEither) - case OcrType(_) => - TextExtract + case OcrType(mt) => + val doExtract = TextExtract .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) .compile .lastOrError .attempt .map(ExtractResult.fromEither) + ImageSize.get(data).flatMap { + case Some(dim) => + if (dim.product > cfg.maxImageSize) { + logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> + ExtractResult.failure(new Exception( + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).") + ).pure[F] + } else { + doExtract + } + case None => + logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *> + doExtract + } + case OdfType.container => logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *> OdfExtract.get(data).map(ExtractResult.fromEither) diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala index 5519ddeb..3e935ef4 100644 --- a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala @@ -4,10 +4,10 @@ import docspell.common.MimeType object OdfType { - val odt = MimeType.application("application/vnd.oasis.opendocument.text") - val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet") - val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text") - val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet") + val odt = MimeType.application("vnd.oasis.opendocument.text") + val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") + val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text") + val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") val container = MimeType.zip diff --git a/modules/files/src/main/scala/docspell/files/Dimension.scala b/modules/files/src/main/scala/docspell/files/Dimension.scala index 2d1a1f4b..44025311 100644 --- a/modules/files/src/main/scala/docspell/files/Dimension.scala +++ b/modules/files/src/main/scala/docspell/files/Dimension.scala @@ -2,6 +2,8 @@ package docspell.files case class Dimension(width: Int, height: Int) { + def product = width * height + def toAwtDimension: java.awt.Dimension = new java.awt.Dimension(width, height) } diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala index b828b8fe..88b95874 100644 --- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -1,5 +1,8 @@ package docspell.files +import java.io.BufferedInputStream +import java.nio.file.{Files, Path} + import cats.implicits._ import cats.effect.Sync import docspell.common._ @@ -8,6 +11,8 @@ import org.apache.tika.config.TikaConfig import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} import org.apache.tika.mime.MediaType +import scala.util.Using + object TikaMimetype { private val tika = new TikaConfig().getDetector @@ -43,4 +48,12 @@ object TikaMimetype { case DataType.Exact(mt) => mt.pure[F] case DataType.Hint(hint) => TikaMimetype.detect(data, hint) } + + def detect[F[_]: Sync](file: Path): F[MimeType] = + Sync[F].delay { + val hint = MimeTypeHint.filename(file.getFileName.toString) + Using(new BufferedInputStream(Files.newInputStream(file), 64))({ in => + convert(tika.detect(in, makeMetadata(hint))) + }).toEither + }.rethrow } diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 12c7d6c5..9e782c70 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -32,6 +32,7 @@ object Dependencies { val YamuscaVersion = "0.6.1" val SwaggerUIVersion = "3.25.0" val SemanticUIVersion = "2.4.1" + val TwelveMonkeysVersion = "3.5" val JQueryVersion = "3.4.1" val ViewerJSVersion = "0.5.8" @@ -62,10 +63,10 @@ object Dependencies { ExclusionRule("hamcrest-core") )) - // val twelvemonkeys = Seq( - // "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5", - // "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5" - // ) + val twelvemonkeys = Seq( + "com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion, + "com.twelvemonkeys.imageio" % "imageio-tiff" % TwelveMonkeysVersion + ) val pdfbox = Seq( "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (