mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 02:49:32 +00:00
commit
dc061b10a6
@ -66,6 +66,7 @@ services:
|
||||
- DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname
|
||||
- DOCSPELL_JOEX_JDBC_USER=dbuser
|
||||
- DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial
|
||||
- DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint
|
||||
ports:
|
||||
- "7878:7878"
|
||||
depends_on:
|
||||
|
@ -1,14 +1,12 @@
|
||||
FROM alpine:3.14
|
||||
FROM alpine:3
|
||||
|
||||
ARG version=
|
||||
ARG joex_url=
|
||||
ARG UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
RUN JDKPKG="openjdk11-jre"; \
|
||||
if [[ $TARGETPLATFORM = linux/arm* ]]; then JDKPKG="openjdk8-jre"; fi; \
|
||||
apk update && \
|
||||
apk add --no-cache $JDKPKG \
|
||||
RUN apk update && \
|
||||
apk add --no-cache openjdk17-jre \
|
||||
tzdata \
|
||||
bash \
|
||||
curl \
|
||||
@ -35,7 +33,7 @@ RUN JDKPKG="openjdk11-jre"; \
|
||||
tesseract-ocr-data-pol \
|
||||
tesseract-ocr-data-est \
|
||||
unpaper \
|
||||
wkhtmltopdf \
|
||||
weasyprint \
|
||||
libreoffice \
|
||||
ttf-droid-nonlatin \
|
||||
ttf-droid \
|
||||
@ -60,7 +58,7 @@ RUN JDKPKG="openjdk11-jre"; \
|
||||
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
|
||||
&& chmod +x /usr/local/bin/unoconv \
|
||||
&& apk del libxml2-dev libxslt-dev zlib-dev g++ python3-dev py3-pip libffi-dev qpdf-dev openssl-dev \
|
||||
&& ln -s /usr/bin/python3 /usr/bin/python
|
||||
&& ln -nfs /usr/bin/python3 /usr/bin/python
|
||||
|
||||
WORKDIR /opt
|
||||
RUN wget ${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip} && \
|
||||
@ -77,7 +75,7 @@ RUN \
|
||||
|
||||
COPY joex-entrypoint.sh /opt/joex-entrypoint.sh
|
||||
|
||||
ENTRYPOINT ["/opt/joex-entrypoint.sh", "-J-XX:+UseG1GC"]
|
||||
ENTRYPOINT ["/opt/joex-entrypoint.sh"]
|
||||
EXPOSE 7878
|
||||
|
||||
HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \
|
||||
|
@ -1,4 +1,4 @@
|
||||
FROM alpine:3.16
|
||||
FROM alpine:3
|
||||
|
||||
ARG version=
|
||||
ARG restserver_url=
|
||||
@ -15,7 +15,7 @@ RUN curl -L -O ${restserver_url:-https://github.com/eikek/docspell/releases/down
|
||||
ln -snf docspell-restserver-* docspell-restserver && \
|
||||
rm docspell-restserver/conf/docspell-server.conf
|
||||
|
||||
ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver", "-J-XX:+UseG1GC"]
|
||||
ENTRYPOINT ["/opt/docspell-restserver/bin/docspell-restserver"]
|
||||
EXPOSE 7880
|
||||
|
||||
HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \
|
||||
|
@ -14,6 +14,7 @@ import fs2._
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.ConvertConfig.HtmlConverter
|
||||
import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.Markdown
|
||||
import docspell.files.{ImageSize, TikaMimetype}
|
||||
@ -57,11 +58,21 @@ object Conversion {
|
||||
|
||||
case MimeType.HtmlMatch(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
cfg.htmlConverter match {
|
||||
case HtmlConverter.Wkhtmltopdf =>
|
||||
WkHtmlPdf
|
||||
.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
|
||||
case HtmlConverter.Weasyprint =>
|
||||
Weasyprint
|
||||
.toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
case MimeType.TextAllMatch(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
|
@ -6,10 +6,11 @@
|
||||
|
||||
package docspell.convert
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
|
||||
import docspell.common.Password
|
||||
import docspell.convert.ConvertConfig.DecryptPdf
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
|
||||
final case class ConvertConfig(
|
||||
@ -18,6 +19,8 @@ final case class ConvertConfig(
|
||||
maxImageSize: Int,
|
||||
markdown: MarkdownConfig,
|
||||
wkhtmlpdf: WkHtmlPdfConfig,
|
||||
weasyprint: WeasyprintConfig,
|
||||
htmlConverter: ConvertConfig.HtmlConverter,
|
||||
tesseract: TesseractConfig,
|
||||
unoconv: UnoconvConfig,
|
||||
ocrmypdf: OcrMyPdfConfig,
|
||||
@ -27,4 +30,25 @@ final case class ConvertConfig(
|
||||
object ConvertConfig {
|
||||
|
||||
final case class DecryptPdf(enabled: Boolean, passwords: List[Password])
|
||||
|
||||
/** Selects which external tool converts HTML files into PDF files. */
sealed trait HtmlConverter {

  /** The identifier this converter is selected by, see `HtmlConverter.fromString`. */
  def name: String
}

object HtmlConverter {
  case object Wkhtmltopdf extends HtmlConverter {
    val name = "wkhtmlpdf"
  }
  case object Weasyprint extends HtmlConverter {
    val name = "weasyprint"
  }

  /** Every converter that can be selected. */
  val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint)

  /** Finds the converter whose `name` equals `str`, ignoring case.
    *
    * @param str
    *   the converter name to look up
    * @return
    *   the matching converter, or a `Left` with an error message that lists the
    *   accepted names
    */
  def fromString(str: String): Either[String, HtmlConverter] =
    all
      .find(_.name.equalsIgnoreCase(str))
      .toRight(
        // List the `name` values and not the case objects themselves: their
        // `toString` ("Wkhtmltopdf") is not an accepted input for this lookup.
        s"Invalid html-converter value: $str. Use one of: ${all.map(_.name).toList.mkString(", ")}"
      )
}
|
||||
}
|
||||
|
55
modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
vendored
Normal file
55
modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright 2020 Eike K. & Contributors
|
||||
*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package docspell.convert.extern
|
||||
|
||||
import java.nio.charset.Charset
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.io.file.Path
|
||||
import fs2.{Chunk, Stream}
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.{ConversionResult, SanitizeHtml}
|
||||
import docspell.logging.Logger
|
||||
|
||||
object Weasyprint {

  /** Converts HTML into PDF by running the configured weasyprint command.
    *
    * The input is loaded completely, passed through `sanitizeHtml` and then
    * handed to `ExternConv.toPDF`.
    *
    * @param cfg
    *   the command to run and the working directory given to `ExternConv.toPDF`
    * @param chunkSize
    *   passed to `ExternConv.readResult` for reading the conversion result
    * @param charset
    *   the charset of the input; replaces `{{encoding}}` in the command and is
    *   given to `sanitizeHtml`
    * @param sanitizeHtml
    *   the function applied to the html bytes before conversion
    * @param logger
    *   the logger passed on to `ExternConv`
    * @param in
    *   the html input
    * @param handler
    *   the handler receiving the conversion result
    */
  def toPDF[F[_]: Async, A](
      cfg: WeasyprintConfig,
      chunkSize: Int,
      charset: Charset,
      sanitizeHtml: SanitizeHtml,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
    // Fill the encoding placeholder of the configured command.
    val command = cfg.command.replace(Map("{{encoding}}" -> charset.name()))

    val readOutput: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](chunkSize, logger)

    // Sanitizing the html should (among other things) remove links using
    // protocols such as cid:, which further processing does not support.
    //
    // jsoup loads the whole document anyway, so converting the stream
    // into a java InputStream would not gain much; all bytes are read
    // into memory here.
    val sanitized = Stream.evalUnChunk(
      Binary
        .loadAllBytes(in)
        .map(raw => Chunk.byteVector(sanitizeHtml(raw, charset.some)))
    )

    ExternConv.toPDF[F, A](
      "weasyprint",
      command,
      cfg.workingDir,
      true,
      logger,
      readOutput
    )(sanitized, handler)
  }

}
|
13
modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
vendored
Normal file
13
modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
vendored
Normal file
@ -0,0 +1,13 @@
|
||||
/*
|
||||
* Copyright 2020 Eike K. & Contributors
|
||||
*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package docspell.convert.extern
|
||||
|
||||
import fs2.io.file.Path
|
||||
|
||||
import docspell.common.SystemCommand
|
||||
|
||||
/** Settings for converting HTML to PDF with the external weasyprint program.
  *
  * @param command
  *   the system command to run; `Weasyprint.toPDF` replaces the `{{encoding}}`
  *   placeholder in it with the charset name of the input
  * @param workingDir
  *   the directory that `Weasyprint.toPDF` passes to `ExternConv.toPDF`
  */
final case class WeasyprintConfig(command: SystemCommand.Config, workingDir: Path)
|
@ -17,8 +17,8 @@ import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.common.util.File
|
||||
import docspell.convert.ConversionResult.Handler
|
||||
import docspell.convert.extern.OcrMyPdfConfig
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.ConvertConfig.HtmlConverter
|
||||
import docspell.convert.extern._
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
import docspell.files.ExampleFiles
|
||||
import docspell.logging.TestLoggingConfig
|
||||
@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig {
|
||||
),
|
||||
target
|
||||
),
|
||||
WeasyprintConfig(
|
||||
SystemCommand.Config(
|
||||
"weasyprint",
|
||||
Seq("--encoding", "UTF-8", "-", "{{outfile}}"),
|
||||
Duration.seconds(20)
|
||||
),
|
||||
target
|
||||
),
|
||||
HtmlConverter.Wkhtmltopdf,
|
||||
TesseractConfig(
|
||||
SystemCommand.Config(
|
||||
"tesseract",
|
||||
|
@ -551,6 +551,10 @@ Docpell Update Check
|
||||
"""
|
||||
}
|
||||
|
||||
# Which HTML->PDF converter command to use. One of: wkhtmlpdf,
|
||||
# weasyprint.
|
||||
html-converter = "wkhtmlpdf"
|
||||
|
||||
# To convert HTML files into PDF files, the external tool
|
||||
# wkhtmltopdf is used.
|
||||
wkhtmlpdf {
|
||||
@ -568,7 +572,22 @@ Docpell Update Check
|
||||
]
|
||||
timeout = "2 minutes"
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||
working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf"
|
||||
}
|
||||
|
||||
# An alternative to wkhtmltopdf is weasyprint. It is used when
# html-converter is set to "weasyprint".
|
||||
weasyprint {
|
||||
command = {
|
||||
program = "weasyprint"
|
||||
args = [
|
||||
"--optimize-size", "all",
|
||||
"--encoding", "{{encoding}}",
|
||||
"-",
|
||||
"{{outfile}}"
|
||||
]
|
||||
timeout = "2 minutes"
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-weasyprint"
|
||||
}
|
||||
|
||||
# To convert image files to PDF files, tesseract is used. This
|
||||
|
@ -10,6 +10,7 @@ import cats.effect.Async
|
||||
|
||||
import docspell.config.Implicits._
|
||||
import docspell.config.{ConfigFactory, FtsType, Validation}
|
||||
import docspell.convert.ConvertConfig.HtmlConverter
|
||||
import docspell.scheduler.CountingScheme
|
||||
import docspell.store.Db
|
||||
|
||||
@ -38,6 +39,9 @@ object ConfigFile {
|
||||
|
||||
implicit val mailAddressReader: ConfigReader[MailAddress] =
|
||||
ConfigReader[String].emap(reason(MailAddress.parse))
|
||||
|
||||
implicit val htmlConverterReader: ConfigReader[HtmlConverter] =
|
||||
ConfigReader[String].emap(reason(HtmlConverter.fromString))
|
||||
}
|
||||
|
||||
def validate: Validation[Config] =
|
||||
|
@ -13,9 +13,11 @@ import emil.jsoup._
|
||||
import scodec.bits.ByteVector
|
||||
|
||||
object JsoupSanitizer {
|
||||
private val whitelist =
|
||||
EmailWhitelist.default.addAttributes(":all", "class")
|
||||
|
||||
private val change =
|
||||
BodyClean.whitelistClean(EmailWhitelist.default)
|
||||
BodyClean.whitelistClean(whitelist)
|
||||
|
||||
def clean(html: String): String =
|
||||
BodyClean.modifyContent(change)(BodyContent(html)).asString
|
||||
|
@ -0,0 +1,133 @@
|
||||
/*
|
||||
* Copyright 2020 Eike K. & Contributors
|
||||
*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package docspell.joex.extract
|
||||
|
||||
import java.nio.charset.StandardCharsets
|
||||
|
||||
import munit.FunSuite
|
||||
import org.jsoup.Jsoup
|
||||
|
||||
class JsoupSanitizerTest extends FunSuite {
|
||||
|
||||
test("keep interesting tags and attributes") {
  // Run the sample document through the sanitizer and parse the result again
  // so that the remaining structure can be inspected.
  val sanitized = JsoupSanitizer.clean(html)
  val parsed = Jsoup.parse(sanitized)
  val head = parsed.head()

  // The <link> and <style> elements in the head must survive.
  assertEquals(head.getElementsByTag("link").size(), 1)
  assertEquals(head.getElementsByTag("style").size(), 1)

  // The charset declaration must be kept.
  assertEquals(parsed.charset(), StandardCharsets.UTF_8)
  assertEquals(head.select("meta[charset]").attr("charset").toUpperCase, "UTF-8")

  // class and style attributes must still be present on some elements.
  assert(parsed.select("*[class]").size() > 0)
  assert(parsed.select("*[style]").size() > 0)
}
|
||||
|
||||
def html =
|
||||
"""
|
||||
|<!DOCTYPE html>
|
||||
|<html lang="en">
|
||||
| <head>
|
||||
| <meta charset="utf-8" />
|
||||
| <meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
| <title>A simple, clean, and responsive HTML invoice template</title>
|
||||
| <link rel="icon" href="./images/favicon.png" type="image/x-icon" />
|
||||
| <style>
|
||||
| body {
|
||||
| font-family: 'Helvetica Neue', 'Helvetica', Helvetica, Arial, sans-serif;
|
||||
| }
|
||||
| body h1 {
|
||||
| font-weight: 300;
|
||||
| }
|
||||
| body h3 {
|
||||
| font-weight: 300;
|
||||
| margin-top: 10px;
|
||||
| color: #555;
|
||||
| }
|
||||
| body a {
|
||||
| color: #06f;
|
||||
| }
|
||||
| .invoice-box {
|
||||
| max-width: 800px;
|
||||
| margin: auto;
|
||||
| padding: 30px;
|
||||
| border: 1px solid #eee;
|
||||
| box-shadow: 0 0 10px rgba(0, 0, 0, 0.15);
|
||||
| font-size: 16px;
|
||||
| line-height: 24px;
|
||||
| font-family: 'Helvetica Neue', 'Helvetica', Helvetica, Arial, sans-serif;
|
||||
| color: #555;
|
||||
| }
|
||||
| </style>
|
||||
| </head>
|
||||
| <body>
|
||||
| <h1>Some html template for an invoice</h1>
|
||||
| <h3>It is something simple.</h3>
|
||||
| <div class="invoice-box">
|
||||
| <table>
|
||||
| <tr class="top">
|
||||
| <td colspan="2">
|
||||
| <table>
|
||||
| <tr>
|
||||
| <td class="title">
|
||||
| <img src="./images/logo.png" alt="Company logo" style="width: 100%; max-width: 300px" />
|
||||
| </td>
|
||||
| <td>
|
||||
| Invoice #: 123<br />
|
||||
| Created: January 1, 2015<br />
|
||||
| Due: February 1, 2015
|
||||
| </td>
|
||||
| </tr>
|
||||
| </table>
|
||||
| </td>
|
||||
| </tr>
|
||||
| <tr class="information">
|
||||
| <td colspan="2">
|
||||
| <table style="color: black;">
|
||||
| <tr>
|
||||
| <td>
|
||||
| Company, Inc.<br />
|
||||
| 456 Rosewood Road<br />
|
||||
| Flowerville, MI 12345
|
||||
| </td>
|
||||
| <td>
|
||||
| Acme Corp.<br />
|
||||
| John Doe<br />
|
||||
| john@example.com
|
||||
| </td>
|
||||
| </tr>
|
||||
| </table>
|
||||
| </td>
|
||||
| </tr>
|
||||
| <tr class="heading">
|
||||
| <td>Payment Method</td>
|
||||
| <td>Check #</td>
|
||||
| </tr>
|
||||
| <tr class="details">
|
||||
| <td>Check</td>
|
||||
| <td>1000</td>
|
||||
| </tr>
|
||||
| <tr class="heading">
|
||||
| <td>Item</td>
|
||||
| <td>Price</td>
|
||||
| </tr>
|
||||
| <tr class="item">
|
||||
| <td>Website design</td>
|
||||
| <td>$300.00</td>
|
||||
| </tr>
|
||||
| <tr class="item last">
|
||||
| <td>Domain name (1 year)</td>
|
||||
| <td>$10.00</td>
|
||||
| </tr>
|
||||
| <tr class="total">
|
||||
| <td></td>
|
||||
| <td>Total: $385.00</td>
|
||||
| </tr>
|
||||
| </table>
|
||||
| </div>
|
||||
| </body>
|
||||
|</html>
|
||||
|""".stripMargin
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user