mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 10:28:27 +00:00
Upgrade code base to CE3
This commit is contained in:
@ -12,6 +12,8 @@ import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.Markdown
|
||||
import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
import scodec.bits.ByteVector
|
||||
|
||||
trait Conversion[F[_]] {
|
||||
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
|
||||
@ -22,10 +24,9 @@ trait Conversion[F[_]] {
|
||||
|
||||
object Conversion {
|
||||
|
||||
def create[F[_]: Sync: ContextShift](
|
||||
def create[F[_]: Async](
|
||||
cfg: ConvertConfig,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Resource[F, Conversion[F]] =
|
||||
Resource.pure[F, Conversion[F]](new Conversion[F] {
|
||||
@ -36,12 +37,12 @@ object Conversion {
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
OcrMyPdf
|
||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, logger)(in, handler)
|
||||
|
||||
case MimeType.HtmlMatch(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, blocker, logger)(
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
@ -50,14 +51,15 @@ object Conversion {
|
||||
val cs = mt.charsetOrUtf8
|
||||
Markdown.toHtml(in, cfg.markdown, cs).flatMap { html =>
|
||||
val bytes = Stream
|
||||
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
|
||||
.chunk(
|
||||
Chunk.byteVector(ByteVector.view(html.getBytes(StandardCharsets.UTF_8)))
|
||||
)
|
||||
.covary[F]
|
||||
WkHtmlPdf.toPDF(
|
||||
cfg.wkhtmlpdf,
|
||||
cfg.chunkSize,
|
||||
StandardCharsets.UTF_8,
|
||||
sanitizeHtml,
|
||||
blocker,
|
||||
logger
|
||||
)(bytes, handler)
|
||||
}
|
||||
@ -77,7 +79,7 @@ object Conversion {
|
||||
)
|
||||
)
|
||||
else
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
@ -86,14 +88,14 @@ object Conversion {
|
||||
logger.info(
|
||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||
) *>
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case Office(_) =>
|
||||
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, logger)(in, handler)
|
||||
|
||||
case mt =>
|
||||
handler.run(ConversionResult.unsupportedFormat(mt))
|
||||
|
@ -4,6 +4,7 @@ import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.io.file.Files
|
||||
import fs2.{Pipe, Stream}
|
||||
|
||||
import docspell.common._
|
||||
@ -12,12 +13,11 @@ import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
||||
|
||||
private[extern] object ExternConv {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
def toPDF[F[_]: Async, A](
|
||||
name: String,
|
||||
cmdCfg: SystemCommand.Config,
|
||||
wd: Path,
|
||||
useStdin: Boolean,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
@ -37,13 +37,12 @@ private[extern] object ExternConv {
|
||||
|
||||
val createInput: Pipe[F, Byte, Unit] =
|
||||
if (useStdin) _ => Stream.emit(())
|
||||
else storeDataToFile(name, blocker, logger, inFile)
|
||||
else storeDataToFile(name, logger, inFile)
|
||||
|
||||
in.through(createInput).flatMap { _ =>
|
||||
SystemCommand
|
||||
.exec[F](
|
||||
sysCfg,
|
||||
blocker,
|
||||
logger,
|
||||
Some(dir),
|
||||
if (useStdin) in
|
||||
@ -66,8 +65,7 @@ private[extern] object ExternConv {
|
||||
handler.run(ConversionResult.failure(ex))
|
||||
}
|
||||
|
||||
def readResult[F[_]: Sync: ContextShift](
|
||||
blocker: Blocker,
|
||||
def readResult[F[_]: Async](
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||
@ -77,15 +75,15 @@ private[extern] object ExternConv {
|
||||
File.existsNonEmpty[F](outTxt).flatMap {
|
||||
case true =>
|
||||
successPdfTxt(
|
||||
File.readAll(out, blocker, chunkSize),
|
||||
File.readText(outTxt, blocker)
|
||||
File.readAll(out, chunkSize),
|
||||
File.readText(outTxt)
|
||||
).pure[F]
|
||||
case false =>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||
}
|
||||
case true =>
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
successPdf(File.readAll(out, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
ConversionResult
|
||||
@ -95,9 +93,8 @@ private[extern] object ExternConv {
|
||||
.pure[F]
|
||||
}
|
||||
|
||||
def readResultTesseract[F[_]: Sync: ContextShift](
|
||||
def readResultTesseract[F[_]: Async](
|
||||
outPrefix: String,
|
||||
blocker: Blocker,
|
||||
chunkSize: Int,
|
||||
logger: Logger[F]
|
||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
|
||||
@ -106,9 +103,9 @@ private[extern] object ExternConv {
|
||||
case true =>
|
||||
val outTxt = out.resolveSibling(s"$outPrefix.txt")
|
||||
File.exists(outTxt).flatMap { txtExists =>
|
||||
val pdfData = File.readAll(out, blocker, chunkSize)
|
||||
val pdfData = File.readAll(out, chunkSize)
|
||||
if (result.rc == 0)
|
||||
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
|
||||
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt)).pure[F]
|
||||
else successPdf(pdfData).pure[F]
|
||||
else
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
@ -124,9 +121,8 @@ private[extern] object ExternConv {
|
||||
}
|
||||
}
|
||||
|
||||
private def storeDataToFile[F[_]: Sync: ContextShift](
|
||||
private def storeDataToFile[F[_]: Async](
|
||||
name: String,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
inFile: Path
|
||||
): Pipe[F, Byte, Unit] =
|
||||
@ -134,7 +130,7 @@ private[extern] object ExternConv {
|
||||
Stream
|
||||
.eval(logger.debug(s"Storing input to file ${inFile} for running $name"))
|
||||
.drain ++
|
||||
Stream.eval(storeFile(in, inFile, blocker))
|
||||
Stream.eval(storeFile(in, inFile))
|
||||
|
||||
private def logResult[F[_]: Sync](
|
||||
name: String,
|
||||
@ -144,10 +140,9 @@ private[extern] object ExternConv {
|
||||
logger.debug(s"$name stdout: ${result.stdout}") *>
|
||||
logger.debug(s"$name stderr: ${result.stderr}")
|
||||
|
||||
private def storeFile[F[_]: Sync: ContextShift](
|
||||
private def storeFile[F[_]: Async](
|
||||
in: Stream[F, Byte],
|
||||
target: Path,
|
||||
blocker: Blocker
|
||||
target: Path
|
||||
): F[Unit] =
|
||||
in.through(fs2.io.file.writeAll(target, blocker)).compile.drain
|
||||
in.through(Files[F].writeAll(target)).compile.drain
|
||||
}
|
||||
|
@ -11,23 +11,21 @@ import docspell.convert.ConversionResult.Handler
|
||||
|
||||
object OcrMyPdf {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
def toPDF[F[_]: Async, A](
|
||||
cfg: OcrMyPdfConfig,
|
||||
lang: Language,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
if (cfg.enabled) {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"ocrmypdf",
|
||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||
cfg.workingDir,
|
||||
false,
|
||||
blocker,
|
||||
logger,
|
||||
reader
|
||||
)(in, handler)
|
||||
|
@ -11,23 +11,21 @@ import docspell.convert.ConversionResult.Handler
|
||||
|
||||
object Tesseract {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
def toPDF[F[_]: Async, A](
|
||||
cfg: TesseractConfig,
|
||||
lang: Language,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val outBase = cfg.command.args.tail.headOption.getOrElse("out")
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
|
||||
ExternConv.readResultTesseract[F](outBase, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"tesseract",
|
||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||
cfg.workingDir,
|
||||
false,
|
||||
blocker,
|
||||
logger,
|
||||
reader
|
||||
)(in, handler)
|
||||
|
@ -11,21 +11,19 @@ import docspell.convert.ConversionResult.Handler
|
||||
|
||||
object Unoconv {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
def toPDF[F[_]: Async, A](
|
||||
cfg: UnoconvConfig,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A](
|
||||
"unoconv",
|
||||
cfg.command,
|
||||
cfg.workingDir,
|
||||
false,
|
||||
blocker,
|
||||
logger,
|
||||
reader
|
||||
)(
|
||||
|
@ -13,16 +13,15 @@ import docspell.convert.{ConversionResult, SanitizeHtml}
|
||||
|
||||
object WkHtmlPdf {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
def toPDF[F[_]: Async, A](
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
charset: Charset,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
ExternConv.readResult[F](chunkSize, logger)
|
||||
|
||||
val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
|
||||
|
||||
@ -40,7 +39,7 @@ object WkHtmlPdf {
|
||||
)
|
||||
|
||||
ExternConv
|
||||
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, blocker, logger, reader)(
|
||||
.toPDF[F, A]("wkhtmltopdf", cmdCfg, cfg.workingDir, true, logger, reader)(
|
||||
inSane,
|
||||
handler
|
||||
)
|
||||
|
@ -4,6 +4,7 @@ import java.nio.file.Paths
|
||||
|
||||
import cats.data.Kleisli
|
||||
import cats.effect.IO
|
||||
import cats.effect.unsafe.implicits.global
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
@ -12,13 +13,11 @@ import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import docspell.files.ExampleFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class ConversionTest extends FunSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
val target = Paths.get("target")
|
||||
@ -73,7 +72,7 @@ class ConversionTest extends FunSuite with FileChecks {
|
||||
)
|
||||
|
||||
val conversion =
|
||||
Conversion.create[IO](convertConfig, SanitizeHtml.none, blocker, logger)
|
||||
Conversion.create[IO](convertConfig, SanitizeHtml.none, logger)
|
||||
|
||||
val bombs = List(
|
||||
ExampleFiles.bombs_20K_gray_jpeg,
|
||||
@ -167,7 +166,7 @@ class ConversionTest extends FunSuite with FileChecks {
|
||||
.covary[IO]
|
||||
.zipWithIndex
|
||||
.evalMap({ case (uri, index) =>
|
||||
val load = uri.readURL[IO](8192, blocker)
|
||||
val load = uri.readURL[IO](8192)
|
||||
val dataType = DataType.filename(uri.path.segments.last)
|
||||
logger.info(s"Processing file ${uri.path.asString}") *>
|
||||
conv.toPDF(dataType, Language.German, handler(index))(load)
|
||||
|
@ -5,6 +5,7 @@ import java.nio.file.{Files, Path}
|
||||
|
||||
import cats.data.Kleisli
|
||||
import cats.effect.IO
|
||||
import cats.effect.unsafe.implicits.global
|
||||
import fs2.{Pipe, Stream}
|
||||
|
||||
import docspell.common.MimeType
|
||||
|
@ -4,19 +4,18 @@ import java.nio.charset.StandardCharsets
|
||||
import java.nio.file.{Path, Paths}
|
||||
|
||||
import cats.effect._
|
||||
import cats.effect.unsafe.implicits.global
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert._
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import docspell.files.ExampleFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class ExternConvTest extends FunSuite with FileChecks {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
val utf8 = StandardCharsets.UTF_8
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
val target = Paths.get("target")
|
||||
val utf8 = StandardCharsets.UTF_8
|
||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||
val target = Paths.get("target")
|
||||
|
||||
test("convert html to pdf") {
|
||||
val cfg = SystemCommand.Config(
|
||||
@ -32,8 +31,8 @@ class ExternConvTest extends FunSuite with FileChecks {
|
||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||
val p =
|
||||
WkHtmlPdf
|
||||
.toPDF[IO, Path](wkCfg, 8192, utf8, SanitizeHtml.none, blocker, logger)(
|
||||
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
|
||||
.toPDF[IO, Path](wkCfg, 8192, utf8, SanitizeHtml.none, logger)(
|
||||
ExampleFiles.letter_de_html.readURL[IO](8192),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
@ -59,8 +58,8 @@ class ExternConvTest extends FunSuite with FileChecks {
|
||||
val ucCfg = UnoconvConfig(cfg, target)
|
||||
val p =
|
||||
Unoconv
|
||||
.toPDF[IO, Path](ucCfg, 8192, blocker, logger)(
|
||||
ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker),
|
||||
.toPDF[IO, Path](ucCfg, 8192, logger)(
|
||||
ExampleFiles.examples_sample_docx.readURL[IO](8192),
|
||||
storePdfHandler(dir.resolve("test.pdf"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
@ -85,8 +84,8 @@ class ExternConvTest extends FunSuite with FileChecks {
|
||||
val tessCfg = TesseractConfig(cfg, target)
|
||||
val (pdf, txt) =
|
||||
Tesseract
|
||||
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)(
|
||||
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
||||
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, logger)(
|
||||
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192),
|
||||
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
||||
)
|
||||
.unsafeRunSync()
|
||||
|
Reference in New Issue
Block a user