diff --git a/docker/docker-compose/docker-compose.yml b/docker/docker-compose/docker-compose.yml
index 5241be63..081244a1 100644
--- a/docker/docker-compose/docker-compose.yml
+++ b/docker/docker-compose/docker-compose.yml
@@ -66,6 +66,7 @@ services:
- DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname
- DOCSPELL_JOEX_JDBC_USER=dbuser
- DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial
+ - DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint
ports:
- "7878:7878"
depends_on:
diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile
index ccb34cbd..5dc385b3 100644
--- a/docker/dockerfiles/joex.dockerfile
+++ b/docker/dockerfiles/joex.dockerfile
@@ -1,14 +1,12 @@
-FROM alpine:3.14
+FROM alpine:3
ARG version=
ARG joex_url=
ARG UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv
ARG TARGETPLATFORM
-RUN JDKPKG="openjdk11-jre"; \
- if [[ $TARGETPLATFORM = linux/arm* ]]; then JDKPKG="openjdk8-jre"; fi; \
- apk update && \
- apk add --no-cache $JDKPKG \
+RUN apk update && \
+ apk add --no-cache openjdk17-jre \
tzdata \
bash \
curl \
@@ -35,7 +33,7 @@ RUN JDKPKG="openjdk11-jre"; \
tesseract-ocr-data-pol \
tesseract-ocr-data-est \
unpaper \
- wkhtmltopdf \
+ weasyprint \
libreoffice \
ttf-droid-nonlatin \
ttf-droid \
@@ -60,7 +58,7 @@ RUN JDKPKG="openjdk11-jre"; \
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
&& chmod +x /usr/local/bin/unoconv \
&& apk del libxml2-dev libxslt-dev zlib-dev g++ python3-dev py3-pip libffi-dev qpdf-dev openssl-dev \
- && ln -s /usr/bin/python3 /usr/bin/python
+ && ln -nfs /usr/bin/python3 /usr/bin/python
WORKDIR /opt
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
@@ -77,7 +75,7 @@ RUN \
COPY joex-entrypoint.sh /opt/joex-entrypoint.sh
-ENTRYPOINT ["/opt/joex-entrypoint.sh", "-J-XX:+UseG1GC"]
+ENTRYPOINT ["/opt/joex-entrypoint.sh"]
EXPOSE 7878
HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \
diff --git a/docker/dockerfiles/restserver.dockerfile b/docker/dockerfiles/restserver.dockerfile
index 01326e7a..038ed494 100644
--- a/docker/dockerfiles/restserver.dockerfile
+++ b/docker/dockerfiles/restserver.dockerfile
@@ -1,4 +1,4 @@
-FROM alpine:3.16
+FROM alpine:3
ARG version=
ARG restserver_url=
@@ -15,7 +15,7 @@ RUN curl -L -O ${restserver_url:-https://github.com/eikek/docspell/releases/down
ln -snf docspell-restserver-* docspell-restserver && \
rm docspell-restserver/conf/docspell-server.conf
-ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver", "-J-XX:+UseG1GC"]
+ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver"]
EXPOSE 7880
HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \
diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
index 19b1279c..bcacf3de 100644
--- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala
+++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala
@@ -14,6 +14,7 @@ import fs2._
import docspell.common._
import docspell.convert.ConversionResult.Handler
+import docspell.convert.ConvertConfig.HtmlConverter
import docspell.convert.extern._
import docspell.convert.flexmark.Markdown
import docspell.files.{ImageSize, TikaMimetype}
@@ -57,11 +58,21 @@ object Conversion {
case MimeType.HtmlMatch(mt) =>
val cs = mt.charsetOrUtf8
- WkHtmlPdf
- .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
- in,
- handler
- )
+ cfg.htmlConverter match {
+ case HtmlConverter.Wkhtmltopdf =>
+ WkHtmlPdf
+ .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
+ in,
+ handler
+ )
+
+ case HtmlConverter.Weasyprint =>
+ Weasyprint
+ .toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)(
+ in,
+ handler
+ )
+ }
case MimeType.TextAllMatch(mt) =>
val cs = mt.charsetOrUtf8
diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
index a4f3c224..fe30c11c 100644
--- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala
@@ -6,10 +6,11 @@
package docspell.convert
+import cats.data.NonEmptyList
+
import docspell.common.Password
import docspell.convert.ConvertConfig.DecryptPdf
-import docspell.convert.extern.OcrMyPdfConfig
-import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
+import docspell.convert.extern._
import docspell.convert.flexmark.MarkdownConfig
final case class ConvertConfig(
@@ -18,6 +19,8 @@ final case class ConvertConfig(
maxImageSize: Int,
markdown: MarkdownConfig,
wkhtmlpdf: WkHtmlPdfConfig,
+ weasyprint: WeasyprintConfig,
+ htmlConverter: ConvertConfig.HtmlConverter,
tesseract: TesseractConfig,
unoconv: UnoconvConfig,
ocrmypdf: OcrMyPdfConfig,
@@ -27,4 +30,25 @@ final case class ConvertConfig(
object ConvertConfig {
final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
+
+ sealed trait HtmlConverter {
+ def name: String
+ }
+
+ object HtmlConverter {
+ case object Wkhtmltopdf extends HtmlConverter {
+ val name = "wkhtmlpdf"
+ }
+ case object Weasyprint extends HtmlConverter {
+ val name = "weasyprint"
+ }
+ val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint)
+
+    // Case-insensitive lookup by `name`; the error lists the accepted names, not the objects' toString.
+    def fromString(str: String): Either[String, HtmlConverter] =
+      all.find(_.name.equalsIgnoreCase(str))
+        .toRight(
+          s"Invalid html-converter value: $str. Use one of: ${all.map(_.name).toList.mkString(", ")}"
+        )
+ }
}
diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
new file mode 100644
index 00000000..ba1d6ce7
--- /dev/null
+++ b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.convert.extern
+
+import java.nio.charset.Charset
+
+import cats.effect._
+import cats.implicits._
+import fs2.io.file.Path
+import fs2.{Chunk, Stream}
+
+import docspell.common._
+import docspell.convert.ConversionResult.Handler
+import docspell.convert.{ConversionResult, SanitizeHtml}
+import docspell.logging.Logger
+
+object Weasyprint {
+
+ def toPDF[F[_]: Async, A](
+ cfg: WeasyprintConfig,
+ chunkSize: Int,
+ charset: Charset,
+ sanitizeHtml: SanitizeHtml,
+ logger: Logger[F]
+ )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
+ val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
+ ExternConv.readResult[F](chunkSize, logger)
+
+ val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name()))
+
+    // Sanitizing the html should (among other things) remove links
+    // using unsupported protocols such as cid:, which the further
+    // processing cannot handle.
+    //
+    // Since jsoup will load everything anyway, a stream-based
+    // conversion to java's inputstream doesn't make much sense.
+ val inSane = Stream.evalUnChunk(
+ Binary
+ .loadAllBytes(in)
+ .map(bv => sanitizeHtml(bv, charset.some))
+ .map(bv => Chunk.byteVector(bv))
+ )
+
+ ExternConv
+ .toPDF[F, A]("weasyprint", cmdCfg, cfg.workingDir, true, logger, reader)(
+ inSane,
+ handler
+ )
+ }
+
+}
diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
new file mode 100644
index 00000000..2ce485cc
--- /dev/null
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
@@ -0,0 +1,13 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.convert.extern
+
+import fs2.io.file.Path
+
+import docspell.common.SystemCommand
+
+case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path)
diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
index 06ee81c1..35dfcfa6 100644
--- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
+++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
@@ -17,8 +17,8 @@ import fs2.Stream
import docspell.common._
import docspell.common.util.File
import docspell.convert.ConversionResult.Handler
-import docspell.convert.extern.OcrMyPdfConfig
-import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
+import docspell.convert.ConvertConfig.HtmlConverter
+import docspell.convert.extern._
import docspell.convert.flexmark.MarkdownConfig
import docspell.files.ExampleFiles
import docspell.logging.TestLoggingConfig
@@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
),
target
),
+ WeasyprintConfig(
+ SystemCommand.Config(
+ "weasyprint",
+ Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
+ Duration.seconds(20)
+ ),
+ target
+ ),
+ HtmlConverter.Wkhtmltopdf,
TesseractConfig(
SystemCommand.Config(
"tesseract",
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 3421ffef..8efc2a9a 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -551,6 +551,10 @@ Docpell Update Check
"""
}
+ # Which HTML->PDF converter command to use. One of: wkhtmlpdf,
+ # weasyprint.
+ html-converter = "wkhtmlpdf"
+
# To convert HTML files into PDF files, the external tool
# wkhtmltopdf is used.
wkhtmlpdf {
@@ -568,7 +572,22 @@ Docpell Update Check
]
timeout = "2 minutes"
}
- working-dir = ${java.io.tmpdir}"/docspell-convert"
+ working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
+ }
+
+ # An alternative to wkhtmltopdf is weasyprint.
+ weasyprint {
+ command = {
+ program = "weasyprint"
+ args = [
+ "--optimize-size", "all",
+ "--encoding", "{{encoding}}",
+ "-",
+ "{{outfile}}"
+ ]
+ timeout = "2 minutes"
+ }
+ working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
}
# To convert image files to PDF files, tesseract is used. This
diff --git a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala
index adcc928c..ac4c1b8a 100644
--- a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala
+++ b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala
@@ -10,6 +10,7 @@ import cats.effect.Async
import docspell.config.Implicits._
import docspell.config.{ConfigFactory, FtsType, Validation}
+import docspell.convert.ConvertConfig.HtmlConverter
import docspell.scheduler.CountingScheme
import docspell.store.Db
@@ -38,6 +39,9 @@ object ConfigFile {
implicit val mailAddressReader: ConfigReader[MailAddress] =
ConfigReader[String].emap(reason(MailAddress.parse))
+
+ implicit val htmlConverterReader: ConfigReader[HtmlConverter] =
+ ConfigReader[String].emap(reason(HtmlConverter.fromString))
}
def validate: Validation[Config] =
diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
index dc8817ed..ca675f02 100644
--- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
+++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala
@@ -13,9 +13,11 @@ import emil.jsoup._
import scodec.bits.ByteVector
object JsoupSanitizer {
+ private val whitelist =
+ EmailWhitelist.default.addAttributes(":all", "class")
private val change =
- BodyClean.whitelistClean(EmailWhitelist.default)
+ BodyClean.whitelistClean(whitelist)
def clean(html: String): String =
BodyClean.modifyContent(change)(BodyContent(html)).asString
diff --git a/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala
new file mode 100644
index 00000000..c893ef8a
--- /dev/null
+++ b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2020 Eike K. & Contributors
+ *
+ * SPDX-License-Identifier: AGPL-3.0-or-later
+ */
+
+package docspell.joex.extract
+
+import java.nio.charset.StandardCharsets
+
+import munit.FunSuite
+import org.jsoup.Jsoup
+
+class JsoupSanitizerTest extends FunSuite {
+
+ test("keep interesting tags and attributes") {
+ val cleaned = JsoupSanitizer.clean(html)
+ val doc = Jsoup.parse(cleaned)
+
+ assertEquals(doc.head().getElementsByTag("link").size(), 1)
+ assertEquals(doc.head().getElementsByTag("style").size(), 1)
+ assertEquals(doc.charset(), StandardCharsets.UTF_8)
+ assertEquals(doc.head().select("meta[charset]").attr("charset").toUpperCase, "UTF-8")
+ assert(doc.select("*[class]").size() > 0)
+ assert(doc.select("*[style]").size() > 0)
+ }
+
+ def html =
+ """
+ |
+ |
+ |
+ |
+ |
+ | A simple, clean, and responsive HTML invoice template
+ |
+ |
+ |
+ |
+ | Some html template for an invoice
+ | It is something simple.
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ |
+ | |
+ |
+ | Invoice #: 123
+ | Created: January 1, 2015
+ | Due: February 1, 2015
+ | |
+ |
+ |
+ | |
+ |
+ |
+ |
+ |
+ |
+ |
+ | Company, Inc.
+ | 456 Rosewood Road
+ | Flowerville, MI 12345
+ | |
+ |
+ | Acme Corp.
+ | John Doe
+ | john@example.com
+ | |
+ |
+ |
+ | |
+ |
+ |
+ | Payment Method |
+ | Check # |
+ |
+ |
+ | Check |
+ | 1000 |
+ |
+ |
+ | Item |
+ | Price |
+ |
+ |
+ | Website design |
+ | $300.00 |
+ |
+ |
+ | Domain name (1 year) |
+ | $10.00 |
+ |
+ |
+ | |
+ | Total: $385.00 |
+ |
+ |
+ |
+ |
+ |
+ |""".stripMargin
+}