mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-28 17:55:06 +00:00
Convert some files to pdf
This commit is contained in:
parent
5869e2ee6e
commit
9b1349734e
@ -152,7 +152,7 @@ val files = project.in(file("modules/files")).
|
||||
settings(
|
||||
name := "docspell-files",
|
||||
libraryDependencies ++=
|
||||
Dependencies.tika ,
|
||||
Dependencies.tika,
|
||||
Test / sourceGenerators += Def.task {
|
||||
val base = (Test/resourceDirectory).value
|
||||
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
|
||||
@ -204,6 +204,7 @@ val extract = project.in(file("modules/extract")).
|
||||
name := "docspell-extract",
|
||||
libraryDependencies ++=
|
||||
Dependencies.fs2 ++
|
||||
Dependencies.twelvemonkeys ++
|
||||
Dependencies.pdfbox ++
|
||||
Dependencies.poi ++
|
||||
Dependencies.commonsIO ++
|
||||
@ -217,7 +218,8 @@ val convert = project.in(file("modules/convert")).
|
||||
settings(
|
||||
name := "docspell-convert",
|
||||
libraryDependencies ++=
|
||||
Dependencies.flexmark
|
||||
Dependencies.flexmark ++
|
||||
Dependencies.twelvemonkeys
|
||||
).dependsOn(common, files % "compile->compile;test->test")
|
||||
|
||||
val analysis = project.in(file("modules/analysis")).
|
||||
|
@ -6,8 +6,9 @@ import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
import fs2.Stream
|
||||
import cats.implicits._
|
||||
import cats.effect.{Blocker, ContextShift, Resource, Sync}
|
||||
import cats.effect._
|
||||
|
||||
object File {
|
||||
|
||||
@ -42,6 +43,9 @@ object File {
|
||||
count.get
|
||||
}
|
||||
|
||||
/** Checks whether `file` exists.
  *
  * The file system lookup is suspended in `F` and only performed when
  * the returned effect is run.
  */
def exists[F[_]: Sync](file: Path): F[Boolean] =
  Sync[F].delay {
    Files.exists(file)
  }
|
||||
|
||||
/** Checks whether `file` exists and its size is strictly greater than
  * `minSize` bytes.
  *
  * With the default `minSize` of 0 this means "exists and is not
  * empty". The size is only queried if the file exists.
  */
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
  Sync[F].delay {
    if (Files.exists(file)) Files.size(file) > minSize
    else false
  }
|
||||
|
||||
@ -61,6 +65,11 @@ object File {
|
||||
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
||||
}
|
||||
|
||||
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) =
|
||||
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] =
|
||||
fs2.io.file.readAll(file, blocker, chunkSize)
|
||||
|
||||
/** Reads the whole file and decodes its bytes as UTF-8 into one string.
  *
  * The file is read through `readAll` in chunks of 8192 bytes and the
  * decoded pieces are concatenated.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = {
  val bytes = readAll[F](file, blocker, 8192)
  val text  = bytes.through(fs2.text.utf8Decode)
  text.compile.foldMonoid
}
|
||||
}
|
||||
|
@ -1,24 +1,112 @@
|
||||
package docspell.convert
|
||||
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
import fs2._
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
|
||||
import docspell.convert.flexmark.Markdown
|
||||
import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
trait Conversion[F[_]] {
|
||||
|
||||
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]]
|
||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||
|
||||
}
|
||||
|
||||
object Conversion {
|
||||
|
||||
def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] =
|
||||
def create[F[_]: Sync: ContextShift](
|
||||
cfg: ConvertConfig,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Resource[F, Conversion[F]] =
|
||||
Resource.pure(new Conversion[F] {
|
||||
|
||||
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = {
|
||||
println(s"$cfg $blocker $logger")
|
||||
???
|
||||
}
|
||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.pdf =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
|
||||
case MimeType.html =>
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
|
||||
case Texts(_) =>
|
||||
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
|
||||
val bytes = Stream
|
||||
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
|
||||
.covary[F]
|
||||
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
|
||||
}
|
||||
|
||||
case Images(mt) =>
|
||||
ImageSize.get(in).flatMap {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.maxImageSize) {
|
||||
logger
|
||||
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||
handler.run(
|
||||
ConversionResult.inputMalformed(
|
||||
mt,
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})."
|
||||
)
|
||||
)
|
||||
} else {
|
||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
}
|
||||
|
||||
case None =>
|
||||
logger.info(
|
||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||
) *>
|
||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
}
|
||||
|
||||
case Office(_) =>
|
||||
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
|
||||
case mt =>
|
||||
handler.run(ConversionResult.unsupportedFormat(mt))
|
||||
}
|
||||
})
|
||||
|
||||
/** Extractor that matches the image mime types listed in `all`. */
object Images {

  /** The image mime types this extractor accepts. */
  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  /** Returns the given mime type if it is contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
|
||||
|
||||
/** Extractor that matches every mime type whose primary part is `text`. */
object Texts {

  /** Returns the given mime type if its primary part equals "text". */
  def unapply(m: MimeType): Option[MimeType] =
    if (m.primary == "text") Some(m) else None
}
|
||||
|
||||
/** Extractor that matches the office document mime types listed in `all`. */
object Office {

  // OpenDocument formats and their `x-` prefixed aliases
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  // Microsoft Office formats, including the generic types reported by tika
  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")
  val docx     = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx     = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls      = MimeType.application("vnd.ms-excel")
  val doc      = MimeType.application("msword")
  val rtf      = MimeType.application("rtf")

  // odt/ods files are plain zip archives, so tika reports application/zip
  // for them when no filename is available
  val odfContainer = MimeType.zip

  /** All mime types this extractor accepts. */
  val all =
    Set(
      odt,
      ods,
      odtAlias,
      odsAlias,
      msoffice,
      ooxml,
      docx,
      xlsx,
      xls,
      doc,
      rtf,
      odfContainer
    )

  /** Returns the given mime type if it is contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
|
||||
}
|
||||
|
@ -0,0 +1,53 @@
|
||||
package docspell.convert
|
||||
|
||||
import cats.data.Kleisli
|
||||
import fs2.Stream
|
||||
import docspell.common.MimeType
|
||||
|
||||
/** Outcome of an attempt to convert some input into a PDF. */
sealed trait ConversionResult[F[_]] {

  /** The bytes of the resulting PDF. The variants that carry no PDF
    * return an empty stream.
    */
  def pdfData: Stream[F, Byte]

}

object ConversionResult {

  /** A function from a conversion result to some value in `F`.
    *
    * External tools do the conversion and write their results as
    * temporary files to the file system; those files are deleted once
    * the process finishes. The handler is the place to do something
    * relevant with the results while they are still available.
    */
  type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A]

  /** The mime type of the input is not supported. */
  case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  /** The conversion failed with the given error. */
  case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  /** The conversion produced a PDF. */
  case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] {
    val pdfData = pdf
  }

  /** The conversion produced a PDF and additionally some text. */
  case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]) extends ConversionResult[F] {
    val pdfData = pdf
  }

  /** The input was rejected for the given reason. */
  case class InputMalformed[F[_]](mimeType: MimeType, reason: String) extends ConversionResult[F] {
    val pdfData = Stream.empty
  }

  // Constructors returning the variants typed as `ConversionResult[F]`.

  def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] =
    UnsupportedFormat[F](mime)

  def failure[F[_]](ex: Throwable): ConversionResult[F] =
    Failure[F](ex)

  def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] =
    SuccessPdf[F](pdf)

  def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] =
    SuccessPdfTxt[F](pdf, txt)

  def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] =
    InputMalformed[F](mimeType, reason)

}
|
@ -1,5 +1,11 @@
|
||||
package docspell.convert
|
||||
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
|
||||
case class ConvertConfig(markdown: MarkdownConfig)
|
||||
case class ConvertConfig(chunkSize: Int,
|
||||
maxImageSize: Int,
|
||||
markdown: MarkdownConfig,
|
||||
wkhtmlpdf: WkHtmlPdfConfig,
|
||||
tesseract: TesseractConfig,
|
||||
unoconv: UnoconvConfig)
|
||||
|
@ -2,30 +2,34 @@ package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect._
|
||||
import fs2.{Pipe, Stream}
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
||||
|
||||
object ExternConv {
|
||||
private[extern] object ExternConv {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift](
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
name: String,
|
||||
cmdCfg: SystemCommand.Config,
|
||||
wd: Path,
|
||||
chunkSize: Int,
|
||||
useStdin: Boolean,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Pipe[F, Byte, Byte] =
|
||||
in =>
|
||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
||||
logger: Logger[F],
|
||||
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
||||
val inFile = dir.resolve("infile").toAbsolutePath.normalize
|
||||
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
||||
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
||||
val sysCfg =
|
||||
cmdCfg.replace(
|
||||
Map("{{outfile}}" -> out.toString) ++
|
||||
Map(
|
||||
"{{outfile}}" -> out.toString
|
||||
) ++
|
||||
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
||||
else Map.empty)
|
||||
else Map.empty)
|
||||
)
|
||||
|
||||
val createInput: Pipe[F, Byte, Unit] =
|
||||
@ -35,41 +39,66 @@ object ExternConv {
|
||||
in.through(createInput).flatMap { _ =>
|
||||
SystemCommand
|
||||
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
|
||||
.flatMap(result =>
|
||||
logResult(name, result, logger) ++ readResult[F](
|
||||
out,
|
||||
result,
|
||||
blocker,
|
||||
chunkSize,
|
||||
logger
|
||||
)
|
||||
.evalMap(result =>
|
||||
logResult(name, result, logger).
|
||||
flatMap(_ => reader(out, result)).
|
||||
flatMap(handler.run)
|
||||
)
|
||||
}
|
||||
}
|
||||
}.compile.lastOrError
|
||||
|
||||
def readResult[F[_]: Sync: ContextShift](
|
||||
out: Path,
|
||||
result: SystemCommand.Result,
|
||||
blocker: Blocker,
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Byte] =
|
||||
Stream.eval(File.existsNonEmpty[F](out)).flatMap {
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||
File.existsNonEmpty[F](out).flatMap {
|
||||
case true =>
|
||||
if (result.rc == 0) File.readAll(out, blocker, chunkSize)
|
||||
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
else
|
||||
Stream
|
||||
.eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists."))
|
||||
.drain ++
|
||||
File.readAll(out, blocker, chunkSize)
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
Stream.raiseError[F](
|
||||
ConversionResult.failure[F](
|
||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||
)
|
||||
).pure[F]
|
||||
}
|
||||
|
||||
private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] =
|
||||
/** Creates the [[ConversionResult]] from the files written by a
  * tesseract run.
  *
  * The files `<outPrefix>.pdf` and `<outPrefix>.txt` are looked up next
  * to `out`, i.e. in the same directory.
  *
  *  - If the pdf is missing or empty, a `failure` result is returned.
  *  - If the pdf exists and the return code is 0, the result contains
  *    the pdf and, if the txt file exists, also its text.
  *  - If the pdf exists but the return code is not 0, a warning is
  *    logged and only the pdf is returned.
  *
  * @param outPrefix the base name (without extension) of the output files
  * @param blocker used for reading the files
  * @param chunkSize chunk size for reading the pdf
  * @param logger used to report a non-zero return code
  * @param out a path inside the output directory; only its parent is used
  * @param result the result of running the command
  */
def readResultTesseract[F[_]: Sync: ContextShift](
    outPrefix: String,
    blocker: Blocker,
    chunkSize: Int,
    logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
  val outPdf = out.resolveSibling(s"$outPrefix.pdf")
  File.existsNonEmpty[F](outPdf).flatMap {
    case true =>
      val outTxt = out.resolveSibling(s"$outPrefix.txt")
      File.exists[F](outTxt).flatMap { txtExists =>
        // Read the file whose existence was just verified. Reading `out`
        // is only correct when `outPrefix` happens to match its name.
        val pdfData = File.readAll[F](outPdf, blocker, chunkSize)
        if (result.rc == 0) {
          if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
          else successPdf(pdfData).pure[F]
        } else {
          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
            successPdf(pdfData).pure[F]
        }
      }

    case false =>
      ConversionResult
        .failure[F](
          new Exception(s"Command result=${result.rc}. No output file found.")
        )
        .pure[F]
  }
}
|
||||
|
||||
private def storeDataToFile[F[_]: Sync: ContextShift](
|
||||
name: String,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
inFile: Path
|
||||
): Pipe[F, Byte, Unit] =
|
||||
in =>
|
||||
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
||||
Stream.eval(storeFile(in, inFile, blocker))
|
||||
@ -78,12 +107,12 @@ object ExternConv {
|
||||
name: String,
|
||||
result: SystemCommand.Result,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Nothing] =
|
||||
Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++
|
||||
Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain
|
||||
): F[Unit] =
|
||||
logger.debug(s"$name stdout: ${result.stdout}") *>
|
||||
logger.debug(s"$name stderr: ${result.stderr}")
|
||||
|
||||
private def storeFile[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
in: Stream[F, Byte],
|
||||
target: Path,
|
||||
blocker: Blocker
|
||||
): F[Unit] =
|
||||
|
@ -1,5 +1,26 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
|
||||
/** Converts images to PDF by running the tesseract command. */
object Tesseract {

  /** Runs tesseract on the input and passes the result to `handler`.
    *
    * The second argument of the configured command is used as the base
    * name of the output files. If the command has fewer than two
    * arguments, "out" is used.
    *
    * @param cfg the command and working directory to use
    * @param chunkSize chunk size for reading the resulting pdf
    * @param blocker used for file operations
    * @param logger used for logging
    * @param in the image data
    * @param handler applied to the conversion result
    */
  def toPDF[F[_]: Sync: ContextShift, A](
      cfg: TesseractConfig,
      chunkSize: Int,
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    // `drop(1)` instead of `tail`: `tail` throws on an empty argument list
    val outBase = cfg.cmd.args.drop(1).headOption.getOrElse("out")
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)

    ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
  }

}
|
||||
|
7
modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
vendored
Normal file
7
modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
|
||||
/** Settings for the tesseract conversion.
  *
  * @param cmd the system command to run
  * @param workingDir the working directory handed to the conversion
  */
case class TesseractConfig(cmd: SystemCommand.Config, workingDir: Path)
|
@ -1,18 +1,25 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Pipe
|
||||
import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
|
||||
object Unoconv {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift](
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
cfg: UnoconvConfig,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
): Pipe[F, Byte, Byte] =
|
||||
ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger)
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,18 +1,25 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import fs2.Pipe
|
||||
import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
|
||||
object WkHtmlPdf {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift](
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
): Pipe[F, Byte, Byte] =
|
||||
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger)
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,160 @@
|
||||
package docspell.convert
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
import cats.data.Kleisli
|
||||
import cats.implicits._
|
||||
import cats.effect.IO
|
||||
import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
val target = Paths.get("target")
|
||||
|
||||
val convertConfig = ConvertConfig(
|
||||
8192,
|
||||
3000 * 3000,
|
||||
MarkdownConfig("body { padding: 2em 5em; }"),
|
||||
WkHtmlPdfConfig(
|
||||
SystemCommand.Config(
|
||||
"wkhtmltopdf",
|
||||
Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
),
|
||||
TesseractConfig(
|
||||
SystemCommand.Config(
|
||||
"tesseract",
|
||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
),
|
||||
UnoconvConfig(
|
||||
SystemCommand.Config(
|
||||
"unoconv",
|
||||
Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
)
|
||||
)
|
||||
|
||||
val conversion = Conversion.create[IO](convertConfig, blocker, logger)
|
||||
|
||||
val bombs = List(
|
||||
ExampleFiles.bombs_20K_gray_jpeg,
|
||||
ExampleFiles.bombs_20K_gray_png,
|
||||
ExampleFiles.bombs_20K_rgb_jpeg,
|
||||
ExampleFiles.bombs_20K_rgb_png
|
||||
)
|
||||
// Example files used by the "convert to pdf" test. Each file is listed
// once, so no file is converted twice.
val pdfOnly = List(
  ExampleFiles.examples_sample_ods,
  ExampleFiles.examples_sample_doc,
  ExampleFiles.examples_sample_docx,
  ExampleFiles.examples_sample_odt,
  ExampleFiles.examples_sample_rtf,
  ExampleFiles.examples_sample_xls,
  ExampleFiles.examples_sample_xlsx,
  ExampleFiles.letter_de_md,
  ExampleFiles.letter_de_txt,
  ExampleFiles.letter_en_txt,
  ExampleFiles.letter_de_html
)
|
||||
val pdfAndTxt = List(
|
||||
ExampleFiles.camera_letter_en_jpg,
|
||||
ExampleFiles.camera_letter_en_png,
|
||||
ExampleFiles.camera_letter_en_tiff,
|
||||
ExampleFiles.scanner_jfif_jpg
|
||||
)
|
||||
|
||||
test("convert to pdf") {
|
||||
if (!commandsExist) ignore("At least one of the conversion programs not found")
|
||||
else
|
||||
File
|
||||
.withTempDir[IO](target, "convpdf")
|
||||
.use { dir =>
|
||||
conversion.use { conv =>
|
||||
def check(n: Long): Handler[IO, Unit] =
|
||||
storePdfHandler(dir.resolve(s"test-$n.pdf")).map { p =>
|
||||
assert(p.isNonEmpty && p.isPDF)
|
||||
}
|
||||
|
||||
runConversion(pdfOnly, check, conv).compile.drain
|
||||
}
|
||||
}
|
||||
.unsafeRunSync()
|
||||
}
|
||||
|
||||
test("convert image to pdf and txt") {
|
||||
if (!commandsExist) ignore("At least one of the conversion programs not found")
|
||||
else
|
||||
File
|
||||
.withTempDir[IO](target, "convimgpdf")
|
||||
.use { dir =>
|
||||
conversion.use { conv =>
|
||||
def check(n: Long): Handler[IO, Unit] =
|
||||
storePdfTxtHandler(dir.resolve(s"test-$n.pdf"), dir.resolve(s"test-$n.txt"))
|
||||
.map {
|
||||
case (p, t) =>
|
||||
assert(p.isNonEmpty && p.isPDF)
|
||||
assert(t.isNonEmpty && t.isPlainText)
|
||||
}
|
||||
|
||||
runConversion(pdfAndTxt, check, conv).compile.drain
|
||||
}
|
||||
}
|
||||
.unsafeRunSync()
|
||||
}
|
||||
|
||||
test("do not convert image bombs") {
|
||||
if (!commandsExist) ignore("At least one of the conversion programs not found")
|
||||
else
|
||||
conversion
|
||||
.use { conv =>
|
||||
def check: Handler[IO, Unit] =
|
||||
Kleisli({
|
||||
case ConversionResult.InputMalformed(_, _) =>
|
||||
().pure[IO]
|
||||
case cr =>
|
||||
IO.raiseError(new Exception(s"Unexpected result: $cr"))
|
||||
})
|
||||
|
||||
runConversion(bombs, _ => check, conv).compile.drain
|
||||
}
|
||||
.unsafeRunSync()
|
||||
}
|
||||
|
||||
/** Converts every given file, one after the other.
  *
  * Each uri is paired with its position in the list; that index is
  * passed to `handler` to obtain the handler for the file. The data
  * type is a hint derived from the last path segment of the uri.
  */
def runConversion[A](
    uris: List[LenientUri],
    handler: Long => Handler[IO, A],
    conv: Conversion[IO]
) = {
  val indexed = Stream.emits(uris).covary[IO].zipWithIndex

  indexed.evalMap { pair =>
    val (uri, index) = pair
    val load         = uri.readURL[IO](8192, blocker)
    val dataType     = DataType.filename(uri.path.segments.last)
    val announce     = logger.info(s"Processing file ${uri.path.asString}")

    announce *> conv.toPDF(dataType, handler(index))(load)
  }
}
|
||||
|
||||
/** True if the programs of all three configured commands (unoconv,
  * wkhtmltopdf, tesseract) are found. Checking stops at the first
  * missing program.
  */
def commandsExist: Boolean = {
  val programs = List(
    convertConfig.unoconv.cmd.program,
    convertConfig.wkhtmlpdf.cmd.program,
    convertConfig.tesseract.cmd.program
  )
  programs.forall(commandExists)
}
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
package docspell.convert
|
||||
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.file.{Files, Path}
|
||||
|
||||
import cats.data.Kleisli
|
||||
import cats.effect.IO
|
||||
import fs2.{Pipe, Stream}
|
||||
import docspell.common.MimeType
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.files.TikaMimetype
|
||||
|
||||
/** Helpers for tests that check files produced by a conversion. */
trait FileChecks {

  /** Checks on a file path. */
  implicit class FileCheckOps(p: Path) {

    /** True if the file exists and has a size greater than 0. */
    def isNonEmpty: Boolean =
      Files.exists(p) && Files.size(p) > 0

    /** True if the mime type detected for the file equals `mime`. */
    def isType(mime: MimeType): Boolean =
      TikaMimetype.detect[IO](p).map(_ == mime).unsafeRunSync()

    def isPDF: Boolean =
      isType(MimeType.pdf)

    def isPlainText: Boolean =
      isType(MimeType.text("plain"))
  }

  /** Collects all bytes of the stream in memory, writes them to `file`
    * and emits the path returned by `Files.write`.
    */
  def storeFile(file: Path): Pipe[IO, Byte, Path] =
    in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))

  /** Like [[storePdfTxtHandler]], but returns the pdf path only. If the
    * result contains text, it is written to a sibling file named
    * "unexpected.txt".
    */
  def storePdfHandler(file: Path): Handler[IO, Path] =
    storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1)

  /** Stores the pdf (and the text, if present) of a successful result
    * and returns both paths. When the result has no text, `fileTxt` is
    * returned without being written. Every other result fails the `IO`.
    */
  def storePdfTxtHandler(filePdf: Path, fileTxt: Path): Handler[IO, (Path, Path)] =
    Kleisli({
      case ConversionResult.SuccessPdfTxt(pdf, txt) =>
        for {
          pout <- pdf.through(storeFile(filePdf)).compile.lastOrError
          str  <- txt
          tout <- IO(Files.write(fileTxt, str.getBytes(StandardCharsets.UTF_8)))
        } yield (pout, tout)

      case ConversionResult.SuccessPdf(pdf) =>
        pdf.through(storeFile(filePdf)).compile.lastOrError.map(p => (p, fileTxt))

      // Errors are raised inside IO rather than thrown, so that running
      // the handler always returns an IO value.
      case ConversionResult.Failure(ex) =>
        IO.raiseError(new Exception(s"Unexpected result (failure: ${ex.getMessage})", ex))

      case cr =>
        IO.raiseError(new Exception(s"Unexpected result: $cr"))
    })

  /** True if `which cmd` exits with code 0. If `which` itself cannot be
    * started, the command is treated as not existing.
    */
  def commandExists(cmd: String): Boolean =
    scala.util
      .Try(Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0)
      .getOrElse(false)

}
|
@ -1,22 +1,20 @@
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.file.{Files, Path, Paths}
|
||||
import java.nio.file.{Path, Paths}
|
||||
|
||||
import fs2.Stream
|
||||
import cats.effect._
|
||||
import docspell.common._
|
||||
import docspell.convert.FileChecks
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import fs2.Pipe
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object ExternConvTest extends SimpleTestSuite {
|
||||
object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
val target = Paths.get("target")
|
||||
|
||||
|
||||
test("convert html to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
"wkhtmltopdf",
|
||||
@ -28,18 +26,20 @@ object ExternConvTest extends SimpleTestSuite {
|
||||
else {
|
||||
File
|
||||
.withTempDir[IO](target, "wkhtmltopdf")
|
||||
.use(dir => IO {
|
||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||
val p = ExampleFiles.letter_de_html
|
||||
.readURL[IO](8192, blocker)
|
||||
.through(WkHtmlPdf.toPDF[IO](wkCfg, 8192, blocker, logger))
|
||||
.through(storeFile(dir.resolve("test.pdf")))
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
.use(dir =>
|
||||
IO {
|
||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||
val p =
|
||||
WkHtmlPdf
|
||||
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
|
||||
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
|
||||
assert(Files.exists(p) && Files.size(p) > 0)
|
||||
})
|
||||
assert(p.isNonEmpty && p.isPDF)
|
||||
}
|
||||
)
|
||||
.unsafeRunSync
|
||||
}
|
||||
}
|
||||
@ -55,26 +55,53 @@ object ExternConvTest extends SimpleTestSuite {
|
||||
else {
|
||||
File
|
||||
.withTempDir[IO](target, "unoconv")
|
||||
.use(dir => IO {
|
||||
val ucCfg = UnoconvConfig(cfg, target)
|
||||
val p = ExampleFiles.examples_sample_docx
|
||||
.readURL[IO](8192, blocker)
|
||||
.through(Unoconv.toPDF[IO](ucCfg, 8192, blocker, logger))
|
||||
.through(storeFile(dir.resolve("test.pdf")))
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
.use(dir =>
|
||||
IO {
|
||||
val ucCfg = UnoconvConfig(cfg, target)
|
||||
val p =
|
||||
Unoconv
|
||||
.toPDF[IO, Path](ucCfg, 8192, blocker, logger)(
|
||||
ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
|
||||
assert(Files.exists(p) && Files.size(p) > 0)
|
||||
})
|
||||
assert(p.isNonEmpty && p.isPDF)
|
||||
}
|
||||
)
|
||||
.unsafeRunSync
|
||||
}
|
||||
}
|
||||
|
||||
test("convert image to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
"tesseract",
|
||||
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||
Duration.seconds(20)
|
||||
)
|
||||
|
||||
if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found")
|
||||
else {
|
||||
File
|
||||
.withTempDir[IO](target, "tesseract")
|
||||
.use(dir =>
|
||||
IO {
|
||||
val tessCfg = TesseractConfig(cfg, target)
|
||||
val (pdf, txt) =
|
||||
Tesseract
|
||||
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
|
||||
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
||||
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
|
||||
assert(pdf.isNonEmpty && pdf.isPDF)
|
||||
assert(txt.isNonEmpty && txt.isPlainText)
|
||||
}
|
||||
)
|
||||
.unsafeRunSync
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def storeFile(file: Path): Pipe[IO, Byte, Path] =
|
||||
in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))
|
||||
|
||||
def commandExists(cmd: String): Boolean =
|
||||
Runtime.getRuntime().exec(Array("which", cmd)).waitFor() == 0
|
||||
}
|
||||
|
@ -2,4 +2,4 @@ package docspell.extract
|
||||
|
||||
import docspell.extract.ocr.OcrConfig
|
||||
|
||||
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
||||
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
|
||||
|
@ -9,6 +9,7 @@ import docspell.extract.poi.{PoiExtract, PoiType}
|
||||
import docspell.extract.rtf.RtfExtract
|
||||
import fs2.Stream
|
||||
import docspell.files.TikaMimetype
|
||||
import docspell.files.ImageSize
|
||||
|
||||
trait Extraction[F[_]] {
|
||||
|
||||
@ -44,14 +45,29 @@ object Extraction {
|
||||
case OdfType(_) =>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case OcrType(_) =>
|
||||
TextExtract
|
||||
case OcrType(mt) =>
|
||||
val doExtract = TextExtract
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.attempt
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
ImageSize.get(data).flatMap {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.maxImageSize) {
|
||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||
ExtractResult.failure(new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
|
||||
).pure[F]
|
||||
} else {
|
||||
doExtract
|
||||
}
|
||||
case None =>
|
||||
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
|
||||
doExtract
|
||||
}
|
||||
|
||||
case OdfType.container =>
|
||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
@ -4,10 +4,10 @@ import docspell.common.MimeType
|
||||
|
||||
object OdfType {
|
||||
|
||||
val odt = MimeType.application("application/vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet")
|
||||
val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text")
|
||||
val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet")
|
||||
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
|
||||
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
|
||||
|
||||
val container = MimeType.zip
|
||||
|
||||
|
@ -2,6 +2,8 @@ package docspell.files
|
||||
|
||||
/** A width and a height. */
case class Dimension(width: Int, height: Int) {

  /** The product of width and height.
    *
    * The multiplication is done with `Long` values and the result is
    * clamped to the `Int` range. A plain `Int` multiplication wraps
    * around for large values (65536 x 65536 would yield 0), which would
    * let oversized images pass a "product > maximum" check.
    */
  def product: Int = {
    val area = width.toLong * height.toLong
    math.max(Int.MinValue.toLong, math.min(Int.MaxValue.toLong, area)).toInt
  }

  /** Converts this into the corresponding `java.awt.Dimension`. */
  def toAwtDimension: java.awt.Dimension =
    new java.awt.Dimension(width, height)
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
package docspell.files
|
||||
|
||||
import java.io.BufferedInputStream
|
||||
import java.nio.file.{Files, Path}
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
@ -8,6 +11,8 @@ import org.apache.tika.config.TikaConfig
|
||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
||||
import org.apache.tika.mime.MediaType
|
||||
|
||||
import scala.util.Using
|
||||
|
||||
object TikaMimetype {
|
||||
private val tika = new TikaConfig().getDetector
|
||||
|
||||
@ -43,4 +48,12 @@ object TikaMimetype {
|
||||
case DataType.Exact(mt) => mt.pure[F]
|
||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||
}
|
||||
|
||||
/** Detects the mime type of the given file.
  *
  * The file name is used as a hint for the detection. The file is
  * opened through a buffered stream that is closed again when the
  * detection is done; errors are raised in `F`.
  */
def detect[F[_]: Sync](file: Path): F[MimeType] = {
  val attempt = Sync[F].delay {
    val hint = MimeTypeHint.filename(file.getFileName.toString)
    val detected =
      Using(new BufferedInputStream(Files.newInputStream(file), 64)) { in =>
        convert(tika.detect(in, makeMetadata(hint)))
      }
    detected.toEither
  }
  attempt.rethrow
}
|
||||
}
|
||||
|
@ -32,6 +32,7 @@ object Dependencies {
|
||||
val YamuscaVersion = "0.6.1"
|
||||
val SwaggerUIVersion = "3.25.0"
|
||||
val SemanticUIVersion = "2.4.1"
|
||||
val TwelveMonkeysVersion = "3.5"
|
||||
val JQueryVersion = "3.4.1"
|
||||
val ViewerJSVersion = "0.5.8"
|
||||
|
||||
@ -62,10 +63,10 @@ object Dependencies {
|
||||
ExclusionRule("hamcrest-core")
|
||||
))
|
||||
|
||||
// val twelvemonkeys = Seq(
|
||||
// "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
|
||||
// "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
|
||||
// )
|
||||
val twelvemonkeys = Seq(
|
||||
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
|
||||
"com.twelvemonkeys.imageio" % "imageio-tiff" % TwelveMonkeysVersion
|
||||
)
|
||||
|
||||
val pdfbox = Seq(
|
||||
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
|
||||
|
Loading…
x
Reference in New Issue
Block a user