mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 09:30:12 +00:00 
			
		
		
		
	Allow to convert html->pdf via weasyprint
This commit is contained in:
		| @@ -66,6 +66,7 @@ services: | ||||
|       - DOCSPELL_JOEX_JDBC_URL=jdbc:postgresql://db:5432/dbname | ||||
|       - DOCSPELL_JOEX_JDBC_USER=dbuser | ||||
|       - DOCSPELL_JOEX_ADDONS_EXECUTOR__CONFIG_RUNNER=docker,trivial | ||||
|       - DOCSPELL_JOEX_CONVERT_HTML__CONVERTER=weasyprint | ||||
|     ports: | ||||
|       - "7878:7878" | ||||
|     depends_on: | ||||
|   | ||||
| @@ -14,6 +14,7 @@ import fs2._ | ||||
|  | ||||
| import docspell.common._ | ||||
| import docspell.convert.ConversionResult.Handler | ||||
| import docspell.convert.ConvertConfig.HtmlConverter | ||||
| import docspell.convert.extern._ | ||||
| import docspell.convert.flexmark.Markdown | ||||
| import docspell.files.{ImageSize, TikaMimetype} | ||||
| @@ -57,11 +58,21 @@ object Conversion { | ||||
|  | ||||
|           case MimeType.HtmlMatch(mt) => | ||||
|             val cs = mt.charsetOrUtf8 | ||||
|             WkHtmlPdf | ||||
|               .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)( | ||||
|                 in, | ||||
|                 handler | ||||
|               ) | ||||
|             cfg.htmlConverter match { | ||||
|               case HtmlConverter.Wkhtmltopdf => | ||||
|                 WkHtmlPdf | ||||
|                   .toPDF(cfg.wkhtmlpdf, cfg.chunkSize, cs, sanitizeHtml, logger)( | ||||
|                     in, | ||||
|                     handler | ||||
|                   ) | ||||
|  | ||||
|               case HtmlConverter.Weasyprint => | ||||
|                 Weasyprint | ||||
|                   .toPDF(cfg.weasyprint, cfg.chunkSize, cs, sanitizeHtml, logger)( | ||||
|                     in, | ||||
|                     handler | ||||
|                   ) | ||||
|             } | ||||
|  | ||||
|           case MimeType.TextAllMatch(mt) => | ||||
|             val cs = mt.charsetOrUtf8 | ||||
|   | ||||
| @@ -6,10 +6,11 @@ | ||||
|  | ||||
| package docspell.convert | ||||
|  | ||||
| import cats.data.NonEmptyList | ||||
|  | ||||
| import docspell.common.Password | ||||
| import docspell.convert.ConvertConfig.DecryptPdf | ||||
| import docspell.convert.extern.OcrMyPdfConfig | ||||
| import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} | ||||
| import docspell.convert.extern._ | ||||
| import docspell.convert.flexmark.MarkdownConfig | ||||
|  | ||||
| final case class ConvertConfig( | ||||
| @@ -18,6 +19,8 @@ final case class ConvertConfig( | ||||
|     maxImageSize: Int, | ||||
|     markdown: MarkdownConfig, | ||||
|     wkhtmlpdf: WkHtmlPdfConfig, | ||||
|     weasyprint: WeasyprintConfig, | ||||
|     htmlConverter: ConvertConfig.HtmlConverter, | ||||
|     tesseract: TesseractConfig, | ||||
|     unoconv: UnoconvConfig, | ||||
|     ocrmypdf: OcrMyPdfConfig, | ||||
| @@ -27,4 +30,25 @@ final case class ConvertConfig( | ||||
| object ConvertConfig { | ||||
|  | ||||
|   final case class DecryptPdf(enabled: Boolean, passwords: List[Password]) | ||||
|  | ||||
|   /** Selects which external tool is used to convert HTML into PDF. */ | ||||
|   sealed trait HtmlConverter { | ||||
|  | ||||
|     /** The identifier of this converter as it is written in the config file. */ | ||||
|     def name: String | ||||
|   } | ||||
|  | ||||
|   object HtmlConverter { | ||||
|     case object Wkhtmltopdf extends HtmlConverter { | ||||
|       val name = "wkhtmlpdf" | ||||
|     } | ||||
|     case object Weasyprint extends HtmlConverter { | ||||
|       val name = "weasyprint" | ||||
|     } | ||||
|  | ||||
|     /** All available converters. */ | ||||
|     val all: NonEmptyList[HtmlConverter] = NonEmptyList.of(Wkhtmltopdf, Weasyprint) | ||||
|  | ||||
|     /** Finds the converter whose `name` equals `str`, ignoring case. | ||||
|       * | ||||
|       * @param str | ||||
|       *   the configured converter name | ||||
|       * @return | ||||
|       *   the matching converter, or a `Left` with a message that lists the accepted | ||||
|       *   names | ||||
|       */ | ||||
|     def fromString(str: String): Either[String, HtmlConverter] = { | ||||
|       // List the `name` values, not the case objects themselves: their `toString` | ||||
|       // (e.g. "Wkhtmltopdf") differs from the accepted name ("wkhtmlpdf") and would | ||||
|       // be rejected again if a user copied it from the error message. | ||||
|       def validNames: String = all.map(_.name).toList.mkString(", ") | ||||
|  | ||||
|       all | ||||
|         .find(_.name.equalsIgnoreCase(str)) | ||||
|         .toRight(s"Invalid html-converter value: $str. Use one of: $validNames") | ||||
|     } | ||||
|   } | ||||
| } | ||||
|   | ||||
							
								
								
									
										55
									
								
								modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								modules/convert/src/main/scala/docspell/convert/extern/Weasyprint.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,55 @@ | ||||
| /* | ||||
|  * Copyright 2020 Eike K. & Contributors | ||||
|  * | ||||
|  * SPDX-License-Identifier: AGPL-3.0-or-later | ||||
|  */ | ||||
|  | ||||
| package docspell.convert.extern | ||||
|  | ||||
| import java.nio.charset.Charset | ||||
|  | ||||
| import cats.effect._ | ||||
| import cats.implicits._ | ||||
| import fs2.io.file.Path | ||||
| import fs2.{Chunk, Stream} | ||||
|  | ||||
| import docspell.common._ | ||||
| import docspell.convert.ConversionResult.Handler | ||||
| import docspell.convert.{ConversionResult, SanitizeHtml} | ||||
| import docspell.logging.Logger | ||||
|  | ||||
| object Weasyprint { | ||||
|  | ||||
|   /** Converts HTML to PDF by delegating to `ExternConv.toPDF` with the configured | ||||
|     * weasyprint command. | ||||
|     * | ||||
|     * The input is read completely and passed through `sanitizeHtml` before it is | ||||
|     * handed to the external conversion. | ||||
|     * | ||||
|     * @param cfg | ||||
|     *   the command and working directory to use | ||||
|     * @param chunkSize | ||||
|     *   passed to `ExternConv.readResult` for reading the result | ||||
|     * @param charset | ||||
|     *   charset of the input; replaces `{{encoding}}` in the command and is given to | ||||
|     *   the sanitizer | ||||
|     * @param sanitizeHtml | ||||
|     *   cleans the html bytes before conversion | ||||
|     * @param logger | ||||
|     *   logger passed on to `ExternConv` | ||||
|     * @param in | ||||
|     *   the html input | ||||
|     * @param handler | ||||
|     *   consumes the conversion result | ||||
|     */ | ||||
|   def toPDF[F[_]: Async, A]( | ||||
|       cfg: WeasyprintConfig, | ||||
|       chunkSize: Int, | ||||
|       charset: Charset, | ||||
|       sanitizeHtml: SanitizeHtml, | ||||
|       logger: Logger[F] | ||||
|   )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { | ||||
|     // Fill the charset name into the command template. | ||||
|     val command = cfg.command.replace(Map("{{encoding}}" -> charset.name())) | ||||
|  | ||||
|     val readResult: (Path, SystemCommand.Result) => F[ConversionResult[F]] = | ||||
|       ExternConv.readResult[F](chunkSize, logger) | ||||
|  | ||||
|     // Sanitizing is expected to drop, among other things, links that use | ||||
|     // unsupported protocols such as cid:, which later processing cannot handle. | ||||
|     // | ||||
|     // jsoup loads the complete document anyway, so all bytes are collected up | ||||
|     // front rather than bridging the stream to a java InputStream. | ||||
|     val sanitized = Stream.evalUnChunk( | ||||
|       Binary | ||||
|         .loadAllBytes(in) | ||||
|         .map(raw => Chunk.byteVector(sanitizeHtml(raw, charset.some))) | ||||
|     ) | ||||
|  | ||||
|     // NOTE(review): the `true` flag presumably makes ExternConv feed the input via | ||||
|     // stdin (the default args use "-" as input) - confirm against ExternConv.toPDF. | ||||
|     ExternConv | ||||
|       .toPDF[F, A]("weasyprint", command, cfg.workingDir, true, logger, readResult)( | ||||
|         sanitized, | ||||
|         handler | ||||
|       ) | ||||
|   } | ||||
|  | ||||
| } | ||||
							
								
								
									
										13
									
								
								modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								modules/convert/src/main/scala/docspell/convert/extern/WeasyprintConfig.scala
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| /* | ||||
|  * Copyright 2020 Eike K. & Contributors | ||||
|  * | ||||
|  * SPDX-License-Identifier: AGPL-3.0-or-later | ||||
|  */ | ||||
|  | ||||
| package docspell.convert.extern | ||||
|  | ||||
| import fs2.io.file.Path | ||||
|  | ||||
| import docspell.common.SystemCommand | ||||
|  | ||||
| /** Settings for running the external weasyprint program. | ||||
|   * | ||||
|   * @param command | ||||
|   *   the system command (program, arguments, timeout) to execute | ||||
|   * @param workingDir | ||||
|   *   the working directory given to the external conversion | ||||
|   */ | ||||
| case class WeasyprintConfig( | ||||
|     command: SystemCommand.Config, | ||||
|     workingDir: Path | ||||
| ) | ||||
| @@ -17,8 +17,8 @@ import fs2.Stream | ||||
| import docspell.common._ | ||||
| import docspell.common.util.File | ||||
| import docspell.convert.ConversionResult.Handler | ||||
| import docspell.convert.extern.OcrMyPdfConfig | ||||
| import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} | ||||
| import docspell.convert.ConvertConfig.HtmlConverter | ||||
| import docspell.convert.extern._ | ||||
| import docspell.convert.flexmark.MarkdownConfig | ||||
| import docspell.files.ExampleFiles | ||||
| import docspell.logging.TestLoggingConfig | ||||
| @@ -43,6 +43,15 @@ class ConversionTest extends FunSuite with FileChecks with TestLoggingConfig { | ||||
|       ), | ||||
|       target | ||||
|     ), | ||||
|     WeasyprintConfig( | ||||
|       SystemCommand.Config( | ||||
|         "weasyprint", | ||||
|         Seq("--encoding", "UTF-8", "-", "{{outfile}}"), | ||||
|         Duration.seconds(20) | ||||
|       ), | ||||
|       target | ||||
|     ), | ||||
|     HtmlConverter.Wkhtmltopdf, | ||||
|     TesseractConfig( | ||||
|       SystemCommand.Config( | ||||
|         "tesseract", | ||||
|   | ||||
| @@ -551,6 +551,10 @@ Docpell Update Check | ||||
|       """ | ||||
|     } | ||||
|  | ||||
|     # Which HTML->PDF converter command to use. One of: wkhtmlpdf, | ||||
|     # weasyprint. | ||||
|     html-converter = "wkhtmlpdf" | ||||
|  | ||||
|     # To convert HTML files into PDF files, the external tool | ||||
|     # wkhtmltopdf is used. | ||||
|     wkhtmlpdf { | ||||
| @@ -568,7 +572,22 @@ Docpell Update Check | ||||
|         ] | ||||
|         timeout = "2 minutes" | ||||
|       } | ||||
|       working-dir = ${java.io.tmpdir}"/docspell-convert" | ||||
|       working-dir = ${java.io.tmpdir}"/docspell-wkhtmltopdf" | ||||
|     } | ||||
|  | ||||
|     # An alternative to wkhtmltopdf is weasyprint. | ||||
|     weasyprint { | ||||
|       command = { | ||||
|         program = "weasyprint" | ||||
|         args = [ | ||||
|           "--optimize-size", "all", | ||||
|           "--encoding", "{{encoding}}", | ||||
|           "-", | ||||
|           "{{outfile}}" | ||||
|         ] | ||||
|         timeout = "2 minutes" | ||||
|       } | ||||
|       working-dir = ${java.io.tmpdir}"/docspell-weasyprint" | ||||
|     } | ||||
|  | ||||
|     # To convert image files to PDF files, tesseract is used. This | ||||
|   | ||||
| @@ -10,6 +10,7 @@ import cats.effect.Async | ||||
|  | ||||
| import docspell.config.Implicits._ | ||||
| import docspell.config.{ConfigFactory, FtsType, Validation} | ||||
| import docspell.convert.ConvertConfig.HtmlConverter | ||||
| import docspell.scheduler.CountingScheme | ||||
| import docspell.store.Db | ||||
|  | ||||
| @@ -38,6 +39,9 @@ object ConfigFile { | ||||
|  | ||||
|     implicit val mailAddressReader: ConfigReader[MailAddress] = | ||||
|       ConfigReader[String].emap(reason(MailAddress.parse)) | ||||
|  | ||||
|     implicit val htmlConverterReader: ConfigReader[HtmlConverter] = | ||||
|       ConfigReader[String].emap(reason(HtmlConverter.fromString)) | ||||
|   } | ||||
|  | ||||
|   def validate: Validation[Config] = | ||||
|   | ||||
		Reference in New Issue
	
	Block a user