Allow converting html->pdf via weasyprint

This commit is contained in:
eikek
2022-11-05 00:48:32 +01:00
parent a5315f44ee
commit df75fbddcd
8 changed files with 146 additions and 10 deletions

View File

@ -66,6 +66,7 @@ services:
- DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname
- DOCSPELL_JOEX_JDBC_USER=dbuser
- DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial
- DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint
ports:
- "7878:7878"
depends_on:

View File

@ -14,6 +14,7 @@ import fs2._
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.ConvertConfig.HtmlConverter
import docspell.convert.extern._
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
@ -57,11 +58,21 @@ object Conversion {
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
in,
handler
)
cfg.htmlConverter match {
case HtmlConverter.Wkhtmltopdf =>
WkHtmlPdf
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
in,
handler
)
case HtmlConverter.Weasyprint =>
Weasyprint
.toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)(
in,
handler
)
}
case MimeType.TextAllMatch(mt) =>
val cs = mt.charsetOrUtf8

View File

@ -6,10 +6,11 @@
package docspell.convert
import cats.data.NonEmptyList
import docspell.common.Password
import docspell.convert.ConvertConfig.DecryptPdf
import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.extern._
import docspell.convert.flexmark.MarkdownConfig
final case class ConvertConfig(
@ -18,6 +19,8 @@ final case class ConvertConfig(
maxImageSize: Int,
markdown: MarkdownConfig,
wkhtmlpdf: WkHtmlPdfConfig,
weasyprint: WeasyprintConfig,
htmlConverter: ConvertConfig.HtmlConverter,
tesseract: TesseractConfig,
unoconv: UnoconvConfig,
ocrmypdf: OcrMyPdfConfig,
@ -27,4 +30,25 @@ final case class ConvertConfig(
object ConvertConfig {

  /** PDF decryption settings: an enabled flag and a list of passwords. */
  final case class DecryptPdf(enabled: Boolean, passwords: List[Password])

  /** Selects which external tool converts HTML into PDF. */
  sealed trait HtmlConverter {

    /** The identifier of this converter as written in the configuration. */
    def name: String
  }

  object HtmlConverter {
    case object Wkhtmltopdf extends HtmlConverter {
      val name = "wkhtmlpdf"
    }
    case object Weasyprint extends HtmlConverter {
      val name = "weasyprint"
    }

    /** All known converters. */
    val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint)

    /** Looks up a converter by its `name`, ignoring case.
      *
      * @param str
      *   the value to look up
      * @return
      *   the matching converter, or a `Left` with a message listing the accepted names
      */
    def fromString(str: String): Either[String, HtmlConverter] =
      all
        .find(_.name.equalsIgnoreCase(str))
        .toRight {
          // List the `name` values, not the case objects' toString: the latter
          // would print "Wkhtmltopdf", which is not an accepted value.
          val validNames = all.map(_.name).toList.mkString(", ")
          s"Invalid html-converter value: $str. Use one of: $validNames"
        }
  }
}

View File

@ -0,0 +1,55 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.convert.extern
import java.nio.charset.Charset
import cats.effect._
import cats.implicits._
import fs2.io.file.Path
import fs2.{Chunk, Stream}
import docspell.common._
import docspell.convert.ConversionResult.Handler
import docspell.convert.{ConversionResult, SanitizeHtml}
import docspell.logging.Logger
/** HTML to PDF conversion that delegates to the external `weasyprint` command. */
object Weasyprint {

  /** Sanitizes the incoming HTML and passes it to [[ExternConv.toPDF]] together with the
    * configured weasyprint command.
    *
    * @param cfg
    *   the command template and working directory
    * @param chunkSize
    *   forwarded to `ExternConv.readResult`
    * @param charset
    *   charset of the input; its name replaces `{{encoding}}` in the command and it is
    *   handed to `sanitizeHtml`
    * @param sanitizeHtml
    *   applied to the complete input before conversion
    * @param logger
    *   forwarded to `ExternConv`
    * @param in
    *   the HTML bytes
    * @param handler
    *   forwarded to `ExternConv.toPDF`, which produces the returned value
    */
  def toPDF[F[_]: Async, A](
      cfg: WeasyprintConfig,
      chunkSize: Int,
      charset: Charset,
      sanitizeHtml: SanitizeHtml,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](chunkSize, logger)

    val command = cfg.command.replace(Map("{{encoding}}" -> charset.name()))

    // Sanitizing the html should (among other things) drop links using
    // unsupported protocols such as cid:, which later processing steps
    // cannot handle.
    //
    // jsoup loads the whole document anyway, so streaming the input into
    // a java InputStream would not gain anything; all bytes are loaded
    // up front instead.
    val sanitizedHtml = Stream.evalUnChunk(
      Binary
        .loadAllBytes(in)
        .map(bytes => Chunk.byteVector(sanitizeHtml(bytes, charset.some)))
    )

    ExternConv.toPDF[F, A](
      "weasyprint",
      command,
      cfg.workingDir,
      true,
      logger,
      readOutput
    )(sanitizedHtml, handler)
  }
}

View File

@ -0,0 +1,13 @@
/*
* Copyright 2020 Eike K. & Contributors
*
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package docspell.convert.extern
import fs2.io.file.Path
import docspell.common.SystemCommand
/** Settings for the weasyprint based HTML to PDF conversion.
  *
  * @param command
  *   the external command to run
  * @param workingDir
  *   the working directory handed to the external conversion
  */
case class WeasyprintConfig(
    command: SystemCommand.Config,
    workingDir: Path
)

View File

@ -17,8 +17,8 @@ import fs2.Stream
import docspell.common._
import docspell.common.util.File
import docspell.convert.ConversionResult.Handler
import docspell.convert.extern.OcrMyPdfConfig
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
import docspell.convert.ConvertConfig.HtmlConverter
import docspell.convert.extern._
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.ExampleFiles
import docspell.logging.TestLoggingConfig
@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
),
target
),
WeasyprintConfig(
SystemCommand.Config(
"weasyprint",
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
Duration.seconds(20)
),
target
),
HtmlConverter.Wkhtmltopdf,
TesseractConfig(
SystemCommand.Config(
"tesseract",

View File

@ -551,6 +551,10 @@ Docpell Update Check
"""
}
# Which HTML->PDF converter command to use. One of: wkhtmlpdf,
# weasyprint.
html-converter = "wkhtmlpdf"
# To convert HTML files into PDF files, the external tool
# wkhtmltopdf is used.
wkhtmlpdf {
@ -568,7 +572,22 @@ Docpell Update Check
]
timeout = "2 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
}
# An alternative to wkhtmltopdf is weasyprint. It is used when
# `html-converter` is set to "weasyprint".
weasyprint {
command = {
program = "weasyprint"
args = [
"--optimize-size", "all",
"--encoding", "{{encoding}}",
"-",
"{{outfile}}"
]
timeout = "2 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
}
# To convert image files to PDF files, tesseract is used. This

View File

@ -10,6 +10,7 @@ import cats.effect.Async
import docspell.config.Implicits._
import docspell.config.{ConfigFactory, FtsType, Validation}
import docspell.convert.ConvertConfig.HtmlConverter
import docspell.scheduler.CountingScheme
import docspell.store.Db
@ -38,6 +39,9 @@ object ConfigFile {
// Reads a MailAddress from a string config value by passing it to
// MailAddress.parse.
// NOTE(review): `reason` presumably adapts the parse error for `emap` — confirm.
implicit val mailAddressReader: ConfigReader[MailAddress] =
ConfigReader[String].emap(reason(MailAddress.parse))
// Reads an HtmlConverter from a string config value by passing it to
// HtmlConverter.fromString.
// NOTE(review): `reason` presumably adapts the Left(String) for `emap` — confirm.
implicit val htmlConverterReader: ConfigReader[HtmlConverter] =
ConfigReader[String].emap(reason(HtmlConverter.fromString))
}
def validate: Validation[Config] =