Convert some files to pdf

This commit is contained in:
Eike Kettner 2020-02-18 21:32:21 +01:00
parent 5869e2ee6e
commit 9b1349734e
19 changed files with 605 additions and 98 deletions

View File

@ -152,7 +152,7 @@ val files = project.in(file("modules/files")).
settings(
name := "docspell-files",
libraryDependencies ++=
Dependencies.tika ,
Dependencies.tika,
Test / sourceGenerators += Def.task {
val base = (Test/resourceDirectory).value
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
@ -204,6 +204,7 @@ val extract = project.in(file("modules/extract")).
name := "docspell-extract",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.twelvemonkeys ++
Dependencies.pdfbox ++
Dependencies.poi ++
Dependencies.commonsIO ++
@ -217,7 +218,8 @@ val convert = project.in(file("modules/convert")).
settings(
name := "docspell-convert",
libraryDependencies ++=
Dependencies.flexmark
Dependencies.flexmark ++
Dependencies.twelvemonkeys
).dependsOn(common, files % "compile->compile;test->test")
val analysis = project.in(file("modules/analysis")).

View File

@ -6,8 +6,9 @@ import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
import java.util.concurrent.atomic.AtomicInteger
import scala.jdk.CollectionConverters._
import fs2.Stream
import cats.implicits._
import cats.effect.{Blocker, ContextShift, Resource, Sync}
import cats.effect._
object File {
@ -42,6 +43,9 @@ object File {
count.get
}
/** Checks whether `file` exists on the file system.
  *
  * The call to `Files.exists` is suspended in `F`, so nothing touches the
  * file system until the returned effect is run.
  */
def exists[F[_]: Sync](file: Path): F[Boolean] =
Sync[F].delay(Files.exists(file))
/** Checks whether `file` exists and is larger than `minSize` bytes.
  *
  * The comparison is strict: with the default `minSize` of 0 an existing but
  * empty file yields `false`. `Files.size` is only evaluated if the file
  * exists.
  */
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
@ -61,6 +65,11 @@ object File {
javaList.asScala.toList.sortBy(_.getFileName.toString)
}
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) =
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] =
fs2.io.file.readAll(file, blocker, chunkSize)
/** Reads the complete contents of `file` into a single string.
  *
  * The file is read in chunks of 8192 bytes via `readAll`, decoded as UTF-8
  * and all decoded pieces are concatenated, so the whole text is held in
  * memory.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
readAll[F](file, blocker, 8192).
through(fs2.text.utf8Decode).
compile.foldMonoid
}

View File

@ -1,24 +1,112 @@
package docspell.convert
import java.nio.charset.StandardCharsets
import fs2._
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
trait Conversion[F[_]] {
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]]
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
}
object Conversion {
def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] =
def create[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
blocker: Blocker,
logger: Logger[F]
): Resource[F, Conversion[F]] =
Resource.pure(new Conversion[F] {
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = {
println(s"$cfg $blocker $logger")
???
}
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.pdf =>
handler.run(ConversionResult.successPdf(in))
case MimeType.html =>
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
case Texts(_) =>
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
val bytes = Stream
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
.covary[F]
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
}
case Images(mt) =>
ImageSize.get(in).flatMap {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
handler.run(
ConversionResult.inputMalformed(
mt,
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})."
)
)
} else {
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
}
case None =>
logger.info(
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
) *>
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
}
case Office(_) =>
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)
case mt =>
handler.run(ConversionResult.unsupportedFormat(mt))
}
})
object Images {

  /** The image mime types that are handed to the image conversion. */
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  /** Extractor that matches only mime types contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
object Texts {

  /** Extractor that matches every mime type whose primary part is `text`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (m.primary == "text") Some(m) else None
}
object Office {
  // OpenDocument types and their `x-` prefixed aliases
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  // generic container types as reported by tika
  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")

  // concrete word processor / spreadsheet types
  val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls  = MimeType.application("vnd.ms-excel")
  val doc  = MimeType.application("msword")
  val rtf  = MimeType.application("rtf")

  // without a filename, tika returns application/zip for odt/ods files, since
  // they are just zip files
  val odfContainer = MimeType.zip

  /** Every mime type that is treated as an office document. */
  val all =
    Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)

  /** Extractor that matches only mime types contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
}

View File

@ -0,0 +1,53 @@
package docspell.convert
import cats.data.Kleisli
import fs2.Stream
import docspell.common.MimeType
/** The outcome of an attempt to convert some input into a PDF.
  *
  * The hierarchy is sealed; the concrete outcomes are defined in the
  * companion object.
  */
sealed trait ConversionResult[F[_]] {
/** The bytes of the resulting PDF. */
def pdfData: Stream[F, Byte]
}
object ConversionResult {

  /** The conversion is done by external tools that write files to the
    * file system. These are temporary files and they will be deleted
    * once the process finishes. This handler is used to do something
    * relevant with the resulting files.
    */
  type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A]

  /** Result for an input whose mime type cannot be converted. */
  def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] =
    UnsupportedFormat[F](mime)

  /** Result for a conversion that failed with the given error. */
  def failure[F[_]](ex: Throwable): ConversionResult[F] =
    Failure[F](ex)

  /** Result for a successful conversion that produced only a PDF. */
  def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] =
    SuccessPdf[F](pdf)

  /** Result for a successful conversion that produced a PDF and its text. */
  def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] =
    SuccessPdfTxt[F](pdf, txt)

  /** Result for an input that is rejected for the given reason. */
  def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] =
    InputMalformed[F](mimeType, reason)

  // All non-success results carry no PDF data; `pdfData` is annotated
  // explicitly so that the member type does not depend on inference.

  final case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  final case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  final case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  final case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String])
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  final case class InputMalformed[F[_]](mimeType: MimeType, reason: String)
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }
}

View File

@ -1,5 +1,11 @@
package docspell.convert
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
case class ConvertConfig(markdown: MarkdownConfig)
/** Settings for converting files to PDF.
  *
  * @param chunkSize passed as `chunkSize` to the external converters
  * @param maxImageSize upper bound for the pixel count (`width * height`)
  *   of an image; larger images are not converted
  * @param markdown settings for rendering text/markdown to HTML
  * @param wkhtmlpdf settings for the HTML to PDF converter
  * @param tesseract settings for the image to PDF converter
  * @param unoconv settings for the office document to PDF converter
  */
case class ConvertConfig(chunkSize: Int,
maxImageSize: Int,
markdown: MarkdownConfig,
wkhtmlpdf: WkHtmlPdfConfig,
tesseract: TesseractConfig,
unoconv: UnoconvConfig)

View File

@ -2,30 +2,34 @@ package docspell.convert.extern
import java.nio.file.Path
import cats.implicits._
import cats.effect._
import fs2.{Pipe, Stream}
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
object ExternConv {
private[extern] object ExternConv {
def toPDF[F[_]: Sync: ContextShift](
def toPDF[F[_]: Sync: ContextShift, A](
name: String,
cmdCfg: SystemCommand.Config,
wd: Path,
chunkSize: Int,
useStdin: Boolean,
blocker: Blocker,
logger: Logger[F]
): Pipe[F, Byte, Byte] =
in =>
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
logger: Logger[F],
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
val inFile = dir.resolve("infile").toAbsolutePath.normalize
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
val sysCfg =
cmdCfg.replace(
Map("{{outfile}}" -> out.toString) ++
Map(
"{{outfile}}" -> out.toString
) ++
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
else Map.empty)
else Map.empty)
)
val createInput: Pipe[F, Byte, Unit] =
@ -35,41 +39,66 @@ object ExternConv {
in.through(createInput).flatMap { _ =>
SystemCommand
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
.flatMap(result =>
logResult(name, result, logger) ++ readResult[F](
out,
result,
blocker,
chunkSize,
logger
)
.evalMap(result =>
logResult(name, result, logger).
flatMap(_ => reader(out, result)).
flatMap(handler.run)
)
}
}
}.compile.lastOrError
def readResult[F[_]: Sync: ContextShift](
out: Path,
result: SystemCommand.Result,
blocker: Blocker,
chunkSize: Int,
logger: Logger[F]
): Stream[F, Byte] =
Stream.eval(File.existsNonEmpty[F](out)).flatMap {
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
File.existsNonEmpty[F](out).flatMap {
case true =>
if (result.rc == 0) File.readAll(out, blocker, chunkSize)
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
else
Stream
.eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists."))
.drain ++
File.readAll(out, blocker, chunkSize)
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
case false =>
Stream.raiseError[F](
ConversionResult.failure[F](
new Exception(s"Command result=${result.rc}. No output file found.")
)
).pure[F]
}
private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] =
/** Builds the conversion result from the files written by tesseract.
  *
  * The output files are expected next to `out`, named `<outPrefix>.pdf` and
  * (optionally) `<outPrefix>.txt`.
  *
  * - If the PDF is missing or empty, a `Failure` result is returned.
  * - If the PDF exists and the command returned 0, the result contains the
  *   PDF and, if the text file exists, its contents.
  * - If the PDF exists but the command returned non-zero, a warning is
  *   logged and only the PDF is returned.
  *
  * @param outPrefix base name (without extension) of tesseract's output files
  * @param chunkSize chunk size used when reading the PDF
  * @param out path whose parent directory contains the output files
  * @param result the outcome of running the external command
  */
def readResultTesseract[F[_]: Sync: ContextShift](
    outPrefix: String,
    blocker: Blocker,
    chunkSize: Int,
    logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
  val outPdf = out.resolveSibling(s"$outPrefix.pdf")
  File.existsNonEmpty[F](outPdf).flatMap {
    case true =>
      val outTxt = out.resolveSibling(s"$outPrefix.txt")
      File.exists(outTxt).flatMap { txtExists =>
        // Read the very file whose existence was verified above. Reading
        // `out` instead would only work if the prefix happens to be "out".
        val pdfData = File.readAll(outPdf, blocker, chunkSize)
        if (result.rc == 0) {
          if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
          else successPdf(pdfData).pure[F]
        } else {
          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
            successPdf(pdfData).pure[F]
        }
      }

    case false =>
      ConversionResult
        .failure[F](
          new Exception(s"Command result=${result.rc}. No output file found.")
        )
        .pure[F]
  }
}
private def storeDataToFile[F[_]: Sync: ContextShift](
name: String,
blocker: Blocker,
logger: Logger[F],
inFile: Path
): Pipe[F, Byte, Unit] =
in =>
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
Stream.eval(storeFile(in, inFile, blocker))
@ -78,12 +107,12 @@ object ExternConv {
name: String,
result: SystemCommand.Result,
logger: Logger[F]
): Stream[F, Nothing] =
Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++
Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain
): F[Unit] =
logger.debug(s"$name stdout: ${result.stdout}") *>
logger.debug(s"$name stderr: ${result.stderr}")
private def storeFile[F[_]: Sync: ContextShift](
in: Stream[F, Byte],
in: Stream[F, Byte],
target: Path,
blocker: Blocker
): F[Unit] =

View File

@ -1,5 +1,26 @@
package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
/** Converts images to PDF by running the external tesseract command. */
object Tesseract {

  /** Runs tesseract on `in` and passes the conversion result to `handler`.
    *
    * The second argument of the configured command is taken as the base
    * name of the output files; if there is no second argument, `out` is
    * used. The input is always written to a file (stdin is not used).
    *
    * @param cfg the command and working directory to use
    * @param chunkSize chunk size used when reading the resulting PDF
    * @param in the image data
    * @param handler consumes the conversion result
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: TesseractConfig,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    // `drop(1)` instead of `tail`: `tail` throws on an empty argument list
    val outBase = cfg.cmd.args.drop(1).headOption.getOrElse("out")
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)

    ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(
      in,
      handler
    )
  }
}

View File

@ -0,0 +1,7 @@
package docspell.convert.extern
import java.nio.file.Path
import docspell.common.SystemCommand
/** Settings for running tesseract.
  *
  * @param cmd the system command to execute
  * @param workingDir the working directory for the conversion
  */
case class TesseractConfig (cmd: SystemCommand.Config, workingDir: Path)

View File

@ -1,18 +1,25 @@
package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Pipe
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
object Unoconv {
def toPDF[F[_]: Sync: ContextShift](
def toPDF[F[_]: Sync: ContextShift, A](
cfg: UnoconvConfig,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F],
): Pipe[F, Byte, Byte] =
ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger)
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
}
}

View File

@ -1,18 +1,25 @@
package docspell.convert.extern
import java.nio.file.Path
import cats.effect._
import fs2.Pipe
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult
import docspell.convert.ConversionResult.Handler
object WkHtmlPdf {
def toPDF[F[_]: Sync: ContextShift](
def toPDF[F[_]: Sync: ContextShift, A](
cfg: WkHtmlPdfConfig,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F],
): Pipe[F, Byte, Byte] =
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger)
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler)
}
}

View File

@ -0,0 +1,160 @@
package docspell.convert
import java.nio.file.Paths
import cats.data.Kleisli
import cats.implicits._
import cats.effect.IO
import fs2.Stream
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object ConversionTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val logger = Logger.log4s[IO](org.log4s.getLogger)
val target = Paths.get("target")
val convertConfig = ConvertConfig(
8192,
3000 * 3000,
MarkdownConfig("body { padding: 2em 5em; }"),
WkHtmlPdfConfig(
SystemCommand.Config(
"wkhtmltopdf",
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
Duration.seconds(20)
),
target
),
TesseractConfig(
SystemCommand.Config(
"tesseract",
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
Duration.seconds(20)
),
target
),
UnoconvConfig(
SystemCommand.Config(
"unoconv",
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
Duration.seconds(20)
),
target
)
)
val conversion = Conversion.create[IO](convertConfig, blocker, logger)
val bombs = List(
ExampleFiles.bombs_20K_gray_jpeg,
ExampleFiles.bombs_20K_gray_png,
ExampleFiles.bombs_20K_rgb_jpeg,
ExampleFiles.bombs_20K_rgb_png
)
// Example files for which the conversion is expected to yield a PDF only.
// Each file is listed once so that no conversion is run twice.
val pdfOnly = List(
  ExampleFiles.examples_sample_ods,
  ExampleFiles.examples_sample_doc,
  ExampleFiles.examples_sample_docx,
  ExampleFiles.examples_sample_odt,
  ExampleFiles.examples_sample_rtf,
  ExampleFiles.examples_sample_xls,
  ExampleFiles.examples_sample_xlsx,
  ExampleFiles.letter_de_md,
  ExampleFiles.letter_de_txt,
  ExampleFiles.letter_en_txt,
  ExampleFiles.letter_de_html
)
val pdfAndTxt = List(
ExampleFiles.camera_letter_en_jpg,
ExampleFiles.camera_letter_en_png,
ExampleFiles.camera_letter_en_tiff,
ExampleFiles.scanner_jfif_jpg
)
test("convert to pdf") {
if (!commandsExist) ignore("At least one of the conversion programs not found")
else
File
.withTempDir[IO](target, "convpdf")
.use { dir =>
conversion.use { conv =>
def check(n: Long): Handler[IO, Unit] =
storePdfHandler(dir.resolve(s"test-$n.pdf")).map { p =>
assert(p.isNonEmpty && p.isPDF)
}
runConversion(pdfOnly, check, conv).compile.drain
}
}
.unsafeRunSync()
}
test("convert image to pdf and txt") {
if (!commandsExist) ignore("At least one of the conversion programs not found")
else
File
.withTempDir[IO](target, "convimgpdf")
.use { dir =>
conversion.use { conv =>
def check(n: Long): Handler[IO, Unit] =
storePdfTxtHandler(dir.resolve(s"test-$n.pdf"), dir.resolve(s"test-$n.txt"))
.map {
case (p, t) =>
assert(p.isNonEmpty && p.isPDF)
assert(t.isNonEmpty && t.isPlainText)
}
runConversion(pdfAndTxt, check, conv).compile.drain
}
}
.unsafeRunSync()
}
test("do not convert image bombs") {
if (!commandsExist) ignore("At least one of the conversion programs not found")
else
conversion
.use { conv =>
def check: Handler[IO, Unit] =
Kleisli({
case ConversionResult.InputMalformed(_, _) =>
().pure[IO]
case cr =>
IO.raiseError(new Exception(s"Unexpected result: $cr"))
})
runConversion(bombs, _ => check, conv).compile.drain
}
.unsafeRunSync()
}
/** Converts every given file to PDF, one after another.
  *
  * For each uri the data type is derived from the last path segment, a log
  * line is written and the conversion is run with the handler created for
  * the position of the uri in the list.
  */
def runConversion[A](
    uris: List[LenientUri],
    handler: Long => Handler[IO, A],
    conv: Conversion[IO]
) = {
  def convertOne(uri: LenientUri, index: Long) = {
    val load     = uri.readURL[IO](8192, blocker)
    val dataType = DataType.filename(uri.path.segments.last)
    logger.info(s"Processing file ${uri.path.asString}") *>
      conv.toPDF(dataType, handler(index))(load)
  }

  Stream
    .emits(uris)
    .covary[IO]
    .zipWithIndex
    .evalMap { case (uri, index) => convertOne(uri, index) }
}
def commandsExist: Boolean =
commandExists(convertConfig.unoconv.cmd.program) &&
commandExists(convertConfig.wkhtmlpdf.cmd.program) &&
commandExists(convertConfig.tesseract.cmd.program)
}

View File

@ -0,0 +1,59 @@
package docspell.convert
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Path}
import cats.data.Kleisli
import cats.effect.IO
import fs2.{Pipe, Stream}
import docspell.common.MimeType
import docspell.convert.ConversionResult.Handler
import docspell.files.TikaMimetype
/** Helpers for tests that check files produced by a conversion. */
trait FileChecks {

  /** Adds checks to a `Path`. All of them access the file system. */
  implicit class FileCheckOps(p: Path) {

    /** True if the file exists and contains at least one byte. */
    def isNonEmpty: Boolean =
      Files.exists(p) && Files.size(p) > 0

    /** True if the mime type detected for the file equals `mime`. */
    def isType(mime: MimeType): Boolean =
      TikaMimetype.detect[IO](p).map(_ == mime).unsafeRunSync

    def isPDF: Boolean =
      isType(MimeType.pdf)

    def isPlainText: Boolean =
      isType(MimeType.text("plain"))
  }

  /** Collects all bytes of the input in memory, writes them to `file` and
    * emits the written path.
    */
  def storeFile(file: Path): Pipe[IO, Byte, Path] =
    in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))

  /** Like `storePdfTxtHandler`, but returns only the path of the PDF. A text
    * result, if any, is written to a sibling file named `unexpected.txt`.
    */
  def storePdfHandler(file: Path): Handler[IO, Path] =
    storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1)

  /** Stores the PDF (and the text, if available) of a successful conversion
    * and returns both paths. If the result has no text, `fileTxt` is returned
    * without being written. Every other result fails the returned `IO`.
    */
  def storePdfTxtHandler(filePdf: Path, fileTxt: Path): Handler[IO, (Path, Path)] =
    Kleisli({
      case ConversionResult.SuccessPdfTxt(pdf, txt) =>
        for {
          pout <- pdf.through(storeFile(filePdf)).compile.lastOrError
          str  <- txt
          tout <- IO(Files.write(fileTxt, str.getBytes(StandardCharsets.UTF_8)))
        } yield (pout, tout)

      case ConversionResult.SuccessPdf(pdf) =>
        pdf.through(storeFile(filePdf)).compile.lastOrError.map(p => (p, fileTxt))

      // Errors are raised inside IO rather than thrown while the handler
      // function is being applied.
      case ConversionResult.Failure(ex) =>
        IO.raiseError[(Path, Path)](
          new Exception(s"Unexpected result (failure: ${ex.getMessage})", ex)
        )

      case cr =>
        IO.raiseError[(Path, Path)](new Exception(s"Unexpected result: $cr"))
    })

  /** True if `which cmd` exits with 0. If the `which` program itself cannot
    * be started, the command is reported as not existing instead of failing
    * the calling test with an exception.
    */
  def commandExists(cmd: String): Boolean =
    scala.util
      .Try(Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0)
      .getOrElse(false)
}

View File

@ -1,22 +1,20 @@
package docspell.convert.extern
import java.nio.file.{Files, Path, Paths}
import java.nio.file.{Path, Paths}
import fs2.Stream
import cats.effect._
import docspell.common._
import docspell.convert.FileChecks
import docspell.files.{ExampleFiles, TestFiles}
import fs2.Pipe
import minitest.SimpleTestSuite
object ExternConvTest extends SimpleTestSuite {
object ExternConvTest extends SimpleTestSuite with FileChecks {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val logger = Logger.log4s[IO](org.log4s.getLogger)
val target = Paths.get("target")
test("convert html to pdf") {
val cfg = SystemCommand.Config(
"wkhtmltopdf",
@ -28,18 +26,20 @@ object ExternConvTest extends SimpleTestSuite {
else {
File
.withTempDir[IO](target, "wkhtmltopdf")
.use(dir => IO {
val wkCfg = WkHtmlPdfConfig(cfg, target)
val p = ExampleFiles.letter_de_html
.readURL[IO](8192, blocker)
.through(WkHtmlPdf.toPDF[IO](wkCfg, 8192, blocker, logger))
.through(storeFile(dir.resolve("test.pdf")))
.compile
.lastOrError
.unsafeRunSync()
.use(dir =>
IO {
val wkCfg = WkHtmlPdfConfig(cfg, target)
val p =
WkHtmlPdf
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
storePdfHandler(dir.resolve("test.pdf"))
)
.unsafeRunSync()
assert(Files.exists(p) && Files.size(p) > 0)
})
assert(p.isNonEmpty && p.isPDF)
}
)
.unsafeRunSync
}
}
@ -55,26 +55,53 @@ object ExternConvTest extends SimpleTestSuite {
else {
File
.withTempDir[IO](target, "unoconv")
.use(dir => IO {
val ucCfg = UnoconvConfig(cfg, target)
val p = ExampleFiles.examples_sample_docx
.readURL[IO](8192, blocker)
.through(Unoconv.toPDF[IO](ucCfg, 8192, blocker, logger))
.through(storeFile(dir.resolve("test.pdf")))
.compile
.lastOrError
.unsafeRunSync()
.use(dir =>
IO {
val ucCfg = UnoconvConfig(cfg, target)
val p =
Unoconv
.toPDF[IO, Path](ucCfg, 8192, blocker, logger)(
ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker),
storePdfHandler(dir.resolve("test.pdf"))
)
.unsafeRunSync()
assert(Files.exists(p) && Files.size(p) > 0)
})
assert(p.isNonEmpty && p.isPDF)
}
)
.unsafeRunSync
}
}
test("convert image to pdf") {
val cfg = SystemCommand.Config(
"tesseract",
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
Duration.seconds(20)
)
if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found")
else {
File
.withTempDir[IO](target, "tesseract")
.use(dir =>
IO {
val tessCfg = TesseractConfig(cfg, target)
val (pdf, txt) =
Tesseract
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
)
.unsafeRunSync()
assert(pdf.isNonEmpty && pdf.isPDF)
assert(txt.isNonEmpty && txt.isPlainText)
}
)
.unsafeRunSync
}
}
def storeFile(file: Path): Pipe[IO, Byte, Path] =
in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))
def commandExists(cmd: String): Boolean =
Runtime.getRuntime().exec(Array("which", cmd)).waitFor() == 0
}

View File

@ -2,4 +2,4 @@ package docspell.extract
import docspell.extract.ocr.OcrConfig
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)

View File

@ -9,6 +9,7 @@ import docspell.extract.poi.{PoiExtract, PoiType}
import docspell.extract.rtf.RtfExtract
import fs2.Stream
import docspell.files.TikaMimetype
import docspell.files.ImageSize
trait Extraction[F[_]] {
@ -44,14 +45,29 @@ object Extraction {
case OdfType(_) =>
OdfExtract.get(data).map(ExtractResult.fromEither)
case OcrType(_) =>
TextExtract
case OcrType(mt) =>
val doExtract = TextExtract
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.compile
.lastOrError
.attempt
.map(ExtractResult.fromEither)
ImageSize.get(data).flatMap {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
ExtractResult.failure(new Exception(
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
).pure[F]
} else {
doExtract
}
case None =>
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
doExtract
}
case OdfType.container =>
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
OdfExtract.get(data).map(ExtractResult.fromEither)

View File

@ -4,10 +4,10 @@ import docspell.common.MimeType
object OdfType {
val odt = MimeType.application("application/vnd.oasis.opendocument.text")
val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet")
val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text")
val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet")
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
val container = MimeType.zip

View File

@ -2,6 +2,8 @@ package docspell.files
/** The size of an image in pixels. */
case class Dimension(width: Int, height: Int) {

  /** The number of pixels, `width * height`.
    *
    * The multiplication is done with `Long` values and the result is clamped
    * to the `Int` range. A plain `Int` multiplication wraps around for large
    * images (65536 x 65536 yields 0), which would let such images pass an
    * upper-bound check on this value.
    */
  def product: Int = {
    val exact = width.toLong * height.toLong
    if (exact > Int.MaxValue) Int.MaxValue
    else if (exact < Int.MinValue) Int.MinValue
    else exact.toInt
  }

  /** This dimension as a `java.awt.Dimension`. */
  def toAwtDimension: java.awt.Dimension =
    new java.awt.Dimension(width, height)
}

View File

@ -1,5 +1,8 @@
package docspell.files
import java.io.BufferedInputStream
import java.nio.file.{Files, Path}
import cats.implicits._
import cats.effect.Sync
import docspell.common._
@ -8,6 +11,8 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.mime.MediaType
import scala.util.Using
object TikaMimetype {
private val tika = new TikaConfig().getDetector
@ -43,4 +48,12 @@ object TikaMimetype {
case DataType.Exact(mt) => mt.pure[F]
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
}
/** Detects the mime type of the given file.
  *
  * The file name is used as a hint for the detection. The file is opened
  * through a buffered stream that is closed again once the detection is
  * done; any error raised while opening or reading it is surfaced as a
  * failure of the returned effect.
  */
def detect[F[_]: Sync](file: Path): F[MimeType] = {
  val attempt: F[Either[Throwable, MimeType]] = Sync[F].delay {
    val nameHint = MimeTypeHint.filename(file.getFileName.toString)
    val detected = Using(new BufferedInputStream(Files.newInputStream(file), 64)) { stream =>
      convert(tika.detect(stream, makeMetadata(nameHint)))
    }
    detected.toEither
  }
  attempt.rethrow
}
}

View File

@ -32,6 +32,7 @@ object Dependencies {
val YamuscaVersion = "0.6.1"
val SwaggerUIVersion = "3.25.0"
val SemanticUIVersion = "2.4.1"
val TwelveMonkeysVersion = "3.5"
val JQueryVersion = "3.4.1"
val ViewerJSVersion = "0.5.8"
@ -62,10 +63,10 @@ object Dependencies {
ExclusionRule("hamcrest-core")
))
// val twelvemonkeys = Seq(
// "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
// "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
// )
// TwelveMonkeys ImageIO modules for JPEG and TIFF; both are pinned to
// TwelveMonkeysVersion.
val twelvemonkeys = Seq(
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
"com.twelvemonkeys.imageio" % "imageio-tiff" % TwelveMonkeysVersion
)
val pdfbox = Seq(
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (