mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Allow to convert html->pdf via weasyprint
This commit is contained in:
@ -66,6 +66,7 @@ services:
|
|||||||
- DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname
|
- DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname
|
||||||
- DOCSPELL_JOEX_JDBC_USER=dbuser
|
- DOCSPELL_JOEX_JDBC_USER=dbuser
|
||||||
- DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial
|
- DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial
|
||||||
|
- DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint
|
||||||
ports:
|
ports:
|
||||||
- "7878:7878"
|
- "7878:7878"
|
||||||
depends_on:
|
depends_on:
|
||||||
|
@ -14,6 +14,7 @@ import fs2._
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
import docspell.convert.ConvertConfig.HtmlConverter
|
||||||
import docspell.convert.extern._
|
import docspell.convert.extern._
|
||||||
import docspell.convert.flexmark.Markdown
|
import docspell.convert.flexmark.Markdown
|
||||||
import docspell.files.{ImageSize, TikaMimetype}
|
import docspell.files.{ImageSize, TikaMimetype}
|
||||||
@ -57,11 +58,21 @@ object Conversion {
|
|||||||
|
|
||||||
case MimeType.HtmlMatch(mt) =>
|
case MimeType.HtmlMatch(mt) =>
|
||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
WkHtmlPdf
|
cfg.htmlConverter match {
|
||||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
case HtmlConverter.Wkhtmltopdf =>
|
||||||
in,
|
WkHtmlPdf
|
||||||
handler
|
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||||
)
|
in,
|
||||||
|
handler
|
||||||
|
)
|
||||||
|
|
||||||
|
case HtmlConverter.Weasyprint =>
|
||||||
|
Weasyprint
|
||||||
|
.toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||||
|
in,
|
||||||
|
handler
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
case MimeType.TextAllMatch(mt) =>
|
case MimeType.TextAllMatch(mt) =>
|
||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
|
@ -6,10 +6,11 @@
|
|||||||
|
|
||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
|
||||||
import docspell.common.Password
|
import docspell.common.Password
|
||||||
import docspell.convert.ConvertConfig.DecryptPdf
|
import docspell.convert.ConvertConfig.DecryptPdf
|
||||||
import docspell.convert.extern.OcrMyPdfConfig
|
import docspell.convert.extern._
|
||||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
|
||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
|
||||||
final case class ConvertConfig(
|
final case class ConvertConfig(
|
||||||
@ -18,6 +19,8 @@ final case class ConvertConfig(
|
|||||||
maxImageSize: Int,
|
maxImageSize: Int,
|
||||||
markdown: MarkdownConfig,
|
markdown: MarkdownConfig,
|
||||||
wkhtmlpdf: WkHtmlPdfConfig,
|
wkhtmlpdf: WkHtmlPdfConfig,
|
||||||
|
weasyprint: WeasyprintConfig,
|
||||||
|
htmlConverter: ConvertConfig.HtmlConverter,
|
||||||
tesseract: TesseractConfig,
|
tesseract: TesseractConfig,
|
||||||
unoconv: UnoconvConfig,
|
unoconv: UnoconvConfig,
|
||||||
ocrmypdf: OcrMyPdfConfig,
|
ocrmypdf: OcrMyPdfConfig,
|
||||||
@ -27,4 +30,25 @@ final case class ConvertConfig(
|
|||||||
object ConvertConfig {
|
object ConvertConfig {
|
||||||
|
|
||||||
final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
|
final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
|
||||||
|
|
||||||
|
/** Selects which external tool is used to convert HTML into PDF.
  *
  * `name` is the textual identifier of a converter; it is what
  * [[HtmlConverter.fromString]] matches against (ignoring case).
  */
sealed trait HtmlConverter {
  def name: String
}

object HtmlConverter {

  /** Conversion via wkhtmltopdf. Note that the identifier is
    * `wkhtmlpdf` (without "to"), which differs from the object name.
    */
  case object Wkhtmltopdf extends HtmlConverter {
    val name = "wkhtmlpdf"
  }

  /** Conversion via weasyprint. */
  case object Weasyprint extends HtmlConverter {
    val name = "weasyprint"
  }

  /** All known converters. */
  val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint)

  /** Looks up a converter by its `name`, ignoring case.
    *
    * @param str
    *   the identifier to look up
    * @return
    *   the matching converter, or a `Left` with a message that lists
    *   the accepted identifiers
    */
  def fromString(str: String): Either[String, HtmlConverter] = {
    // List the `name`s, not the objects' `toString`: the latter yields
    // "Wkhtmltopdf", which is not an accepted value ("wkhtmlpdf" is).
    val validNames = all.map(_.name).toList.mkString(", ")
    all
      .find(_.name.equalsIgnoreCase(str))
      .toRight(s"Invalid html-converter value: $str. Use one of: $validNames")
  }
}
|
||||||
}
|
}
|
||||||
|
55
modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
vendored
Normal file
55
modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
vendored
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.charset.Charset
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
|
import fs2.io.file.Path
|
||||||
|
import fs2.{Chunk, Stream}
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
import docspell.convert.{ConversionResult, SanitizeHtml}
|
||||||
|
import docspell.logging.Logger
|
||||||
|
|
||||||
|
object Weasyprint {

  /** Converts HTML given as a byte stream into a PDF by delegating to
    * `ExternConv.toPDF` with the configured weasyprint command.
    *
    * @param cfg
    *   the command to run and its working directory
    * @param chunkSize
    *   passed on to `ExternConv.readResult` for reading the result
    * @param charset
    *   charset of the input; its name replaces the `{{encoding}}`
    *   placeholder in the command and it is handed to `sanitizeHtml`
    * @param sanitizeHtml
    *   applied to the complete input before it is converted
    * @param logger
    *   logger handed to the external conversion
    * @param in
    *   the HTML bytes
    * @param handler
    *   receives the conversion result
    */
  def toPDF[F[_]: Async, A](
      cfg: WeasyprintConfig,
      chunkSize: Int,
      charset: Charset,
      sanitizeHtml: SanitizeHtml,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    val command = cfg.command.replace(Map("{{encoding}}" -> charset.name()))

    val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](chunkSize, logger)

    // Sanitizing the html should (among other things) drop links using
    // protocols such as cid: that later processing cannot handle.
    //
    // The input is loaded completely into memory first: jsoup loads
    // everything anyways, so a stream-based conversion to java's
    // inputstream would not gain much.
    val sanitized = Stream.evalUnChunk(
      Binary
        .loadAllBytes(in)
        .map(bytes => Chunk.byteVector(sanitizeHtml(bytes, charset.some)))
    )

    // NOTE(review): the boolean argument presumably makes ExternConv feed
    // the input via stdin — confirm against ExternConv.toPDF.
    ExternConv.toPDF[F, A](
      "weasyprint",
      command,
      cfg.workingDir,
      true,
      logger,
      readOutput
    )(sanitized, handler)
  }

}
|
13
modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
vendored
Normal file
13
modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2020 Eike K. & Contributors
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
*/
|
||||||
|
|
||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import fs2.io.file.Path
|
||||||
|
|
||||||
|
import docspell.common.SystemCommand
|
||||||
|
|
||||||
|
/** Settings for running weasyprint as an external command.
  *
  * @param command
  *   the system command (program, arguments, timeout) to execute
  * @param workingDir
  *   the working directory handed to the external conversion
  */
final case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path)
|
@ -17,8 +17,8 @@ import fs2.Stream
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.common.util.File
|
import docspell.common.util.File
|
||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.convert.extern.OcrMyPdfConfig
|
import docspell.convert.ConvertConfig.HtmlConverter
|
||||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
import docspell.convert.extern._
|
||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
import docspell.files.ExampleFiles
|
import docspell.files.ExampleFiles
|
||||||
import docspell.logging.TestLoggingConfig
|
import docspell.logging.TestLoggingConfig
|
||||||
@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
|||||||
),
|
),
|
||||||
target
|
target
|
||||||
),
|
),
|
||||||
|
WeasyprintConfig(
|
||||||
|
SystemCommand.Config(
|
||||||
|
"weasyprint",
|
||||||
|
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||||
|
Duration.seconds(20)
|
||||||
|
),
|
||||||
|
target
|
||||||
|
),
|
||||||
|
HtmlConverter.Wkhtmltopdf,
|
||||||
TesseractConfig(
|
TesseractConfig(
|
||||||
SystemCommand.Config(
|
SystemCommand.Config(
|
||||||
"tesseract",
|
"tesseract",
|
||||||
|
@ -551,6 +551,10 @@ Docpell Update Check
|
|||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Which HTML->PDF converter command to use. One of: wkhtmlpdf,
|
||||||
|
# weasyprint.
|
||||||
|
html-converter = "wkhtmlpdf"
|
||||||
|
|
||||||
# To convert HTML files into PDF files, the external tool
|
# To convert HTML files into PDF files, the external tool
|
||||||
# wkhtmltopdf is used.
|
# wkhtmltopdf is used.
|
||||||
wkhtmlpdf {
|
wkhtmlpdf {
|
||||||
@ -568,7 +572,22 @@ Docpell Update Check
|
|||||||
]
|
]
|
||||||
timeout = "2 minutes"
|
timeout = "2 minutes"
|
||||||
}
|
}
|
||||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
|
||||||
|
}
|
||||||
|
|
||||||
|
# An alternative to wkhtmltopdf is weasyprint. It is only used if
# `html-converter` is set to "weasyprint".
|
||||||
|
weasyprint {
|
||||||
|
command = {
|
||||||
|
program = "weasyprint"
|
||||||
|
args = [
|
||||||
|
"--optimize-size", "all",
|
||||||
|
"--encoding", "{{encoding}}",
|
||||||
|
"-",
|
||||||
|
"{{outfile}}"
|
||||||
|
]
|
||||||
|
timeout = "2 minutes"
|
||||||
|
}
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
|
||||||
}
|
}
|
||||||
|
|
||||||
# To convert image files to PDF files, tesseract is used. This
|
# To convert image files to PDF files, tesseract is used. This
|
||||||
|
@ -10,6 +10,7 @@ import cats.effect.Async
|
|||||||
|
|
||||||
import docspell.config.Implicits._
|
import docspell.config.Implicits._
|
||||||
import docspell.config.{ConfigFactory, FtsType, Validation}
|
import docspell.config.{ConfigFactory, FtsType, Validation}
|
||||||
|
import docspell.convert.ConvertConfig.HtmlConverter
|
||||||
import docspell.scheduler.CountingScheme
|
import docspell.scheduler.CountingScheme
|
||||||
import docspell.store.Db
|
import docspell.store.Db
|
||||||
|
|
||||||
@ -38,6 +39,9 @@ object ConfigFile {
|
|||||||
|
|
||||||
implicit val mailAddressReader: ConfigReader[MailAddress] =
|
implicit val mailAddressReader: ConfigReader[MailAddress] =
|
||||||
ConfigReader[String].emap(reason(MailAddress.parse))
|
ConfigReader[String].emap(reason(MailAddress.parse))
|
||||||
|
|
||||||
|
// Reads an `HtmlConverter` from a string config value by delegating to
// `HtmlConverter.fromString`.
// NOTE(review): `reason` is defined outside this view; presumably it adapts
// the `Either[String, _]` result to what `emap` expects — confirm.
implicit val htmlConverterReader: ConfigReader[HtmlConverter] =
|
||||||
|
ConfigReader[String].emap(reason(HtmlConverter.fromString))
|
||||||
}
|
}
|
||||||
|
|
||||||
def validate: Validation[Config] =
|
def validate: Validation[Config] =
|
||||||
|
Reference in New Issue
Block a user