Upgrade code base to CE3

This commit is contained in:
eikek
2021-06-21 21:33:54 +02:00
parent 903ec26e54
commit bd791b4593
146 changed files with 638 additions and 758 deletions

View File

@ -12,6 +12,8 @@ import docspell.convert.extern._
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
import scodec.bits.ByteVector
trait Conversion[F[_]] {
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
@ -22,10 +24,9 @@ trait Conversion[F[_]] {
object Conversion {
def create[F[_]: Sync: ContextShift](
def create[F[_]: Async](
cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
): Resource[F, Conversion[F]] =
Resource.pure[F, Conversion[F]](new Conversion[F] {
@ -36,12 +37,12 @@ object Conversion {
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.PdfMatch(_) =>
OcrMyPdf
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(in, handler)
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
in,
handler
)
@ -50,14 +51,15 @@ object Conversion {
val cs = mt.charsetOrUtf8
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
val bytes = Stream
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
.chunk(
Chunk.byteVector(ByteVector.view(html.getBytes(StandardCharsets.UTF_8)))
)
.covary[F]
WkHtmlPdf.toPDF(
cfg.wkhtmlpdf,
cfg.chunkSize,
StandardCharsets.UTF_8,
sanitizeHtml,
blocker,
logger
)(bytes, handler)
}
@ -77,7 +79,7 @@ object Conversion {
)
)
else
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, logger)(
in,
handler
)
@ -86,14 +88,14 @@ object Conversion {
logger.info(
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
) *>
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, logger)(
in,
handler
)
}
case Office(_) =>
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, logger)(in, handler)
case mt =>
handler.run(ConversionResult.unsupportedFormat(mt))

View File

@ -4,6 +4,7 @@ import java.nio.file.Path
import cats.effect._
import cats.implicits._
import fs2.io.file.Files
import fs2.{Pipe, Stream}
import docspell.common._
@ -12,12 +13,11 @@ import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
private[extern] object ExternConv {
def toPDF[F[_]: Sync: ContextShift, A](
def toPDF[F[_]: Async, A](
name: String,
cmdCfg: SystemCommand.Config,
wd: Path,
useStdin: Boolean,
blocker: Blocker,
logger: Logger[F],
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
@ -37,13 +37,12 @@ private[extern] object ExternConv {
val createInput: Pipe[F, Byte, Unit] =
if (useStdin) _ => Stream.emit(())
else storeDataToFile(name, blocker, logger, inFile)
else storeDataToFile(name, logger, inFile)
in.through(createInput).flatMap { _ =>
SystemCommand
.exec[F](
sysCfg,
blocker,
logger,
Some(dir),
if (useStdin) in
@ -66,8 +65,7 @@ private[extern] object ExternConv {
handler.run(ConversionResult.failure(ex))
}
def readResult[F[_]: Sync: ContextShift](
blocker: Blocker,
def readResult[F[_]: Async](
chunkSize: Int,
logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
@ -77,15 +75,15 @@ private[extern] object ExternConv {
File.existsNonEmpty[F](outTxt).flatMap {
case true =>
successPdfTxt(
File.readAll(out, blocker, chunkSize),
File.readText(outTxt, blocker)
File.readAll(out, chunkSize),
File.readText(outTxt)
).pure[F]
case false =>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
successPdf(File.readAll(out, chunkSize)).pure[F]
}
case true =>
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
successPdf(File.readAll(out, chunkSize)).pure[F]
case false =>
ConversionResult
@ -95,9 +93,8 @@ private[extern] object ExternConv {
.pure[F]
}
def readResultTesseract[F[_]: Sync: ContextShift](
def readResultTesseract[F[_]: Async](
outPrefix: String,
blocker: Blocker,
chunkSize: Int,
logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
@ -106,9 +103,9 @@ private[extern] object ExternConv {
case true =>
val outTxt = out.resolveSibling(s"$outPrefix.txt")
File.exists(outTxt).flatMap { txtExists =>
val pdfData = File.readAll(out, blocker, chunkSize)
val pdfData = File.readAll(out, chunkSize)
if (result.rc == 0)
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F]
else successPdf(pdfData).pure[F]
else
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
@ -124,9 +121,8 @@ private[extern] object ExternConv {
}
}
private def storeDataToFile[F[_]: Sync: ContextShift](
private def storeDataToFile[F[_]: Async](
name: String,
blocker: Blocker,
logger: Logger[F],
inFile: Path
): Pipe[F, Byte, Unit] =
@ -134,7 +130,7 @@ private[extern] object ExternConv {
Stream
.eval(logger.debug(s"Storing input to file ${inFile} for running $name"))
.drain ++
Stream.eval(storeFile(in, inFile, blocker))
Stream.eval(storeFile(in, inFile))
private def logResult[F[_]: Sync](
name: String,
@ -144,10 +140,9 @@ private[extern] object ExternConv {
logger.debug(s"$name stdout: ${result.stdout}") *>
logger.debug(s"$name stderr: ${result.stderr}")
private def storeFile[F[_]: Sync: ContextShift](
private def storeFile[F[_]: Async](
in: Stream[F, Byte],
target: Path,
blocker: Blocker
target: Path
): F[Unit] =
in.through(fs2.io.file.writeAll(target, blocker)).compile.drain
in.through(Files[F].writeAll(target)).compile.drain
}

View File

@ -11,23 +11,21 @@ import docspell.convert.ConversionResult.Handler
object OcrMyPdf {
def toPDF[F[_]: Sync: ContextShift, A](
def toPDF[F[_]: Async, A](
cfg: OcrMyPdfConfig,
lang: Language,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
if (cfg.enabled) {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
ExternConv.readResult[F](chunkSize, logger)
ExternConv.toPDF[F, A](
"ocrmypdf",
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
cfg.workingDir,
false,
blocker,
logger,
reader
)(in, handler)

View File

@ -11,23 +11,21 @@ import docspell.convert.ConversionResult.Handler
object Tesseract {
def toPDF[F[_]: Sync: ContextShift, A](
def toPDF[F[_]: Async, A](
cfg: TesseractConfig,
lang: Language,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val outBase = cfg.command.args.tail.headOption.getOrElse("out")
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
ExternConv.readResultTesseract[F](outBase, chunkSize, logger)
ExternConv.toPDF[F, A](
"tesseract",
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
cfg.workingDir,
false,
blocker,
logger,
reader
)(in, handler)

View File

@ -11,21 +11,19 @@ import docspell.convert.ConversionResult.Handler
object Unoconv {
def toPDF[F[_]: Sync: ContextShift, A](
def toPDF[F[_]: Async, A](
cfg: UnoconvConfig,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
ExternConv.readResult[F](chunkSize, logger)
ExternConv.toPDF[F, A](
"unoconv",
cfg.command,
cfg.workingDir,
false,
blocker,
logger,
reader
)(

View File

@ -13,16 +13,15 @@ import docspell.convert.{ConversionResult, SanitizeHtml}
object WkHtmlPdf {
def toPDF[F[_]: Sync: ContextShift, A](
def toPDF[F[_]: Async, A](
cfg: WkHtmlPdfConfig,
chunkSize: Int,
charset: Charset,
sanitizeHtml: SanitizeHtml,
blocker: Blocker,
logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResult[F](blocker, chunkSize, logger)
ExternConv.readResult[F](chunkSize, logger)
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
@ -40,7 +39,7 @@ object WkHtmlPdf {
)
ExternConv
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, logger, reader)(
inSane,
handler
)

View File

@ -4,6 +4,7 @@ import java.nio.file.Paths
import cats.data.Kleisli
import cats.effect.IO
import cats.effect.unsafe.implicits.global
import cats.implicits._
import fs2.Stream
@ -12,13 +13,11 @@ import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.{ExampleFiles, TestFiles}
import docspell.files.ExampleFiles
import munit._
class ConversionTest extends FunSuite with FileChecks {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val logger = Logger.log4s[IO](org.log4s.getLogger)
val target = Paths.get("target")
@ -73,7 +72,7 @@ class ConversionTest extends FunSuite with FileChecks {
)
val conversion =
Conversion.create[IO](convertConfig, SanitizeHtml.none, blocker, logger)
Conversion.create[IO](convertConfig, SanitizeHtml.none, logger)
val bombs = List(
ExampleFiles.bombs_20K_gray_jpeg,
@ -167,7 +166,7 @@ class ConversionTest extends FunSuite with FileChecks {
.covary[IO]
.zipWithIndex
.evalMap({ case (uri, index) =>
val load = uri.readURL[IO](8192, blocker)
val load = uri.readURL[IO](8192)
val dataType = DataType.filename(uri.path.segments.last)
logger.info(s"Processing file ${uri.path.asString}") *>
conv.toPDF(dataType, Language.German, handler(index))(load)

View File

@ -5,6 +5,7 @@ import java.nio.file.{Files, Path}
import cats.data.Kleisli
import cats.effect.IO
import cats.effect.unsafe.implicits.global
import fs2.{Pipe, Stream}
import docspell.common.MimeType

View File

@ -4,19 +4,18 @@ import java.nio.charset.StandardCharsets
import java.nio.file.{Path, Paths}
import cats.effect._
import cats.effect.unsafe.implicits.global
import docspell.common._
import docspell.convert._
import docspell.files.{ExampleFiles, TestFiles}
import docspell.files.ExampleFiles
import munit._
class ExternConvTest extends FunSuite with FileChecks {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val utf8 = StandardCharsets.UTF_8
val logger = Logger.log4s[IO](org.log4s.getLogger)
val target = Paths.get("target")
val utf8 = StandardCharsets.UTF_8
val logger = Logger.log4s[IO](org.log4s.getLogger)
val target = Paths.get("target")
test("convert html to pdf") {
val cfg = SystemCommand.Config(
@ -32,8 +31,8 @@ class ExternConvTest extends FunSuite with FileChecks {
val wkCfg = WkHtmlPdfConfig(cfg, target)
val p =
WkHtmlPdf
.toPDF[IO, Path](wkCfg, 8192, utf8, SanitizeHtml.none, blocker, logger)(
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
.toPDF[IO, Path](wkCfg, 8192, utf8, SanitizeHtml.none, logger)(
ExampleFiles.letter_de_html.readURL[IO](8192),
storePdfHandler(dir.resolve("test.pdf"))
)
.unsafeRunSync()
@ -59,8 +58,8 @@ class ExternConvTest extends FunSuite with FileChecks {
val ucCfg = UnoconvConfig(cfg, target)
val p =
Unoconv
.toPDF[IO, Path](ucCfg, 8192, blocker, logger)(
ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker),
.toPDF[IO, Path](ucCfg, 8192, logger)(
ExampleFiles.examples_sample_docx.readURL[IO](8192),
storePdfHandler(dir.resolve("test.pdf"))
)
.unsafeRunSync()
@ -85,8 +84,8 @@ class ExternConvTest extends FunSuite with FileChecks {
val tessCfg = TesseractConfig(cfg, target)
val (pdf, txt) =
Tesseract
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)(
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, logger)(
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192),
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
)
.unsafeRunSync()