diff --git a/docker/docker-compose/docker-compose.yml b/docker/docker-compose/docker-compose.yml index 5241be63..081244a1 100644 --- a/docker/docker-compose/docker-compose.yml +++ b/docker/docker-compose/docker-compose.yml @@ -66,6 +66,7 @@ services: - DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname - DOCSPELL_JOEX_JDBC_USER=dbuser - DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial + - DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint ports: - "7878:7878" depends_on: diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index ccb34cbd..5dc385b3 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -1,14 +1,12 @@ -FROM alpine:3.14 +FROM alpine:3 ARG version= ARG joex_url= ARG UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv ARG TARGETPLATFORM -RUN JDKPKG="openjdk11-jre"; \ - if [[ $TARGETPLATFORM = linux/arm* ]]; then JDKPKG="openjdk8-jre"; fi; \ - apk update && \ - apk add --no-cache $JDKPKG \ +RUN apk update && \ + apk add --no-cache openjdk17-jre \ tzdata \ bash \ curl \ @@ -35,7 +33,7 @@ RUN JDKPKG="openjdk11-jre"; \ tesseract-ocr-data-pol \ tesseract-ocr-data-est \ unpaper \ - wkhtmltopdf \ + weasyprint \ libreoffice \ ttf-droid-nonlatin \ ttf-droid \ @@ -60,7 +58,7 @@ RUN JDKPKG="openjdk11-jre"; \ && curl -Ls $UNO_URL -o /usr/local/bin/unoconv \ && chmod +x /usr/local/bin/unoconv \ && apk del libxml2-dev libxslt-dev zlib-dev g++ python3-dev py3-pip libffi-dev qpdf-dev openssl-dev \ - && ln -s /usr/bin/python3 /usr/bin/python + && ln -nfs /usr/bin/python3 /usr/bin/python WORKDIR /opt RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \ @@ -77,7 +75,7 @@ RUN \ COPY joex-entrypoint.sh /opt/joex-entrypoint.sh -ENTRYPOINT ["/opt/joex-entrypoint.sh", "-J-XX:+UseG1GC"] +ENTRYPOINT ["/opt/joex-entrypoint.sh"] EXPOSE 7878 HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \ diff 
--git a/docker/dockerfiles/restserver.dockerfile b/docker/dockerfiles/restserver.dockerfile index 01326e7a..038ed494 100644 --- a/docker/dockerfiles/restserver.dockerfile +++ b/docker/dockerfiles/restserver.dockerfile @@ -1,4 +1,4 @@ -FROM alpine:3.16 +FROM alpine:3 ARG version= ARG restserver_url= @@ -15,7 +15,7 @@ RUN curl -L -O ${restserver_url:-https://github.com/eikek/docspell/releases/down ln -snf docspell-restserver-* docspell-restserver && \ rm docspell-restserver/conf/docspell-server.conf -ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver", "-J-XX:+UseG1GC"] +ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver"] EXPOSE 7880 HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \ diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 19b1279c..bcacf3de 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -14,6 +14,7 @@ import fs2._ import docspell.common._ import docspell.convert.ConversionResult.Handler +import docspell.convert.ConvertConfig.HtmlConverter import docspell.convert.extern._ import docspell.convert.flexmark.Markdown import docspell.files.{ImageSize, TikaMimetype} @@ -57,11 +58,21 @@ object Conversion { case MimeType.HtmlMatch(mt) => val cs = mt.charsetOrUtf8 - WkHtmlPdf - .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)( - in, - handler - ) + cfg.htmlConverter match { + case HtmlConverter.Wkhtmltopdf => + WkHtmlPdf + .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)( + in, + handler + ) + + case HtmlConverter.Weasyprint => + Weasyprint + .toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)( + in, + handler + ) + } case MimeType.TextAllMatch(mt) => val cs = mt.charsetOrUtf8 diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala 
b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index a4f3c224..fe30c11c 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -6,10 +6,11 @@ package docspell.convert +import cats.data.NonEmptyList + import docspell.common.Password import docspell.convert.ConvertConfig.DecryptPdf -import docspell.convert.extern.OcrMyPdfConfig -import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} +import docspell.convert.extern._ import docspell.convert.flexmark.MarkdownConfig final case class ConvertConfig( @@ -18,6 +19,8 @@ final case class ConvertConfig( maxImageSize: Int, markdown: MarkdownConfig, wkhtmlpdf: WkHtmlPdfConfig, + weasyprint: WeasyprintConfig, + htmlConverter: ConvertConfig.HtmlConverter, tesseract: TesseractConfig, unoconv: UnoconvConfig, ocrmypdf: OcrMyPdfConfig, @@ -27,4 +30,25 @@ final case class ConvertConfig( object ConvertConfig { final case class DecryptPdf(enabled: Boolean, passwords: List[Password]) + + sealed trait HtmlConverter { + def name: String + } + + object HtmlConverter { + case object Wkhtmltopdf extends HtmlConverter { + val name = "wkhtmlpdf" + } + case object Weasyprint extends HtmlConverter { + val name = "weasyprint" + } + val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint) + + def fromString(str: String): Either[String, HtmlConverter] = + all + .find(_.name.equalsIgnoreCase(str)) + .toRight( + s"Invalid html-converter value: $str. Use one of: ${all.map(_.name).toList.mkString(", ")}" + ) + } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala new file mode 100644 index 00000000..ba1d6ce7 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.convert.extern + +import java.nio.charset.Charset + +import cats.effect._ +import cats.implicits._ +import fs2.io.file.Path +import fs2.{Chunk, Stream} + +import docspell.common._ +import docspell.convert.ConversionResult.Handler +import docspell.convert.{ConversionResult, SanitizeHtml} +import docspell.logging.Logger + +object Weasyprint { + + def toPDF[F[_]: Async, A]( + cfg: WeasyprintConfig, + chunkSize: Int, + charset: Charset, + sanitizeHtml: SanitizeHtml, + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](chunkSize, logger) + + val cmdCfg = cfg.command.replace(Map("{{encoding}}" -> charset.name())) + + // html sanitize should (among other) remove links to invalid + // protocols like cid: which is not supported by further + // processing. + // + // Since jsoup will load everything anyways, a stream-based + // conversion to java's inputstream doesn't make much sense. + val inSane = Stream.evalUnChunk( + Binary + .loadAllBytes(in) + .map(bv => sanitizeHtml(bv, charset.some)) + .map(bv => Chunk.byteVector(bv)) + ) + + ExternConv + .toPDF[F, A]("weasyprint", cmdCfg, cfg.workingDir, true, logger, reader)( + inSane, + handler + ) + } + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala new file mode 100644 index 00000000..2ce485cc --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala @@ -0,0 +1,13 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.convert.extern + +import fs2.io.file.Path + +import docspell.common.SystemCommand + +case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index 06ee81c1..35dfcfa6 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -17,8 +17,8 @@ import fs2.Stream import docspell.common._ import docspell.common.util.File import docspell.convert.ConversionResult.Handler -import docspell.convert.extern.OcrMyPdfConfig -import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} +import docspell.convert.ConvertConfig.HtmlConverter +import docspell.convert.extern._ import docspell.convert.flexmark.MarkdownConfig import docspell.files.ExampleFiles import docspell.logging.TestLoggingConfig @@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { ), target ), + WeasyprintConfig( + SystemCommand.Config( + "weasyprint", + Seq("--encoding", "UTF-8", "-", "{{outfile}}"), + Duration.seconds(20) + ), + target + ), + HtmlConverter.Wkhtmltopdf, TesseractConfig( SystemCommand.Config( "tesseract", diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 3421ffef..8efc2a9a 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -551,6 +551,10 @@ Docpell Update Check """ } + # Which HTML->PDF converter command to use. One of: wkhtmlpdf, + # weasyprint. + html-converter = "wkhtmlpdf" + # To convert HTML files into PDF files, the external tool # wkhtmltopdf is used. 
wkhtmlpdf { @@ -568,7 +572,22 @@ Docpell Update Check ] timeout = "2 minutes" } - working-dir = ${java.io.tmpdir}"/docspell-convert" + working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf" + } + + # An alternative to wkhtmltopdf is weasyprint. + weasyprint { + command = { + program = "weasyprint" + args = [ + "--optimize-size", "all", + "--encoding", "{{encoding}}", + "-", + "{{outfile}}" + ] + timeout = "2 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-weasyprint" } # To convert image files to PDF files, tesseract is used. This diff --git a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala index adcc928c..ac4c1b8a 100644 --- a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala +++ b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala @@ -10,6 +10,7 @@ import cats.effect.Async import docspell.config.Implicits._ import docspell.config.{ConfigFactory, FtsType, Validation} +import docspell.convert.ConvertConfig.HtmlConverter import docspell.scheduler.CountingScheme import docspell.store.Db @@ -38,6 +39,9 @@ object ConfigFile { implicit val mailAddressReader: ConfigReader[MailAddress] = ConfigReader[String].emap(reason(MailAddress.parse)) + + implicit val htmlConverterReader: ConfigReader[HtmlConverter] = + ConfigReader[String].emap(reason(HtmlConverter.fromString)) } def validate: Validation[Config] = diff --git a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala index dc8817ed..ca675f02 100644 --- a/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala +++ b/modules/joex/src/main/scala/docspell/joex/extract/JsoupSanitizer.scala @@ -13,9 +13,11 @@ import emil.jsoup._ import scodec.bits.ByteVector object JsoupSanitizer { + private val whitelist = + EmailWhitelist.default.addAttributes(":all", "class") private val change = - 
BodyClean.whitelistClean(EmailWhitelist.default) + BodyClean.whitelistClean(whitelist) def clean(html: String): String = BodyClean.modifyContent(change)(BodyContent(html)).asString diff --git a/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala new file mode 100644 index 00000000..c893ef8a --- /dev/null +++ b/modules/joex/src/test/scala/docspell/joex/extract/JsoupSanitizerTest.scala @@ -0,0 +1,133 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.joex.extract + +import java.nio.charset.StandardCharsets + +import munit.FunSuite +import org.jsoup.Jsoup + +class JsoupSanitizerTest extends FunSuite { + + test("keep interesting tags and attributes") { + val cleaned = JsoupSanitizer.clean(html) + val doc = Jsoup.parse(cleaned) + + assertEquals(doc.head().getElementsByTag("link").size(), 1) + assertEquals(doc.head().getElementsByTag("style").size(), 1) + assertEquals(doc.charset(), StandardCharsets.UTF_8) + assertEquals(doc.head().select("meta[charset]").attr("charset").toUpperCase, "UTF-8") + assert(doc.select("*[class]").size() > 0) + assert(doc.select("*[style]").size() > 0) + } + + def html = + """ + | + | + | + | + | + | A simple, clean, and responsive HTML invoice template + | + | + | + | + |

Some html template for an invoice

+ |

It is something simple.

+ |
+ | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + | + |
+ | + | + | + | + | + |
+ | Company logo + | + | Invoice #: 123
+ | Created: January 1, 2015
+ | Due: February 1, 2015 + |
+ |
+ | + | + | + | + | + |
+ | Company, Inc.
+ | 456 Rosewood Road
+ | Flowerville, MI 12345 + |
+ | Acme Corp.
+ | John Doe
+ | john@example.com + |
+ |
Payment MethodCheck #
Check1000
ItemPrice
Website design$300.00
Domain name (1 year)$10.00
Total: $385.00
+ |
+ | + | + |""".stripMargin +}