diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 35e48dc5..12f55c9f 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -147,16 +147,31 @@ docspell.joex { # below. They must be in the PATH environment or specify the full # path below via the `program` key. convert { + + # The chunk size used when storing files. This should be the same + # as used with the rest server. chunk-size = 524288 - max-image-size = 12000000 + # When reading images, this is the maximum size. Images that are + # larger are not processed. + max-image-size = ${docspell.joex.extraction.ocr.max-image-size} + # Settings when processing markdown files (and other text files) + # to HTML. + # + # In order to support text formats, text files are first converted + # to HTML using a markdown processor. The resulting HTML is then + # converted to a PDF file. markdown { + + # The CSS that is used to style the resulting HTML. internal-css = """ body { padding: 2em 5em; } """ } + # To convert HTML files into PDF files, the external tool + # wkhtmltopdf is used. wkhtmlpdf { cmd = { program = "wkhtmltopdf" @@ -168,11 +183,13 @@ docspell.joex { "-", "{{outfile}}" ] - timeout = "20 seconds" + timeout = "2 minutes" } working-dir = ${java.io.tmpdir}"/docspell-convert" } + # To convert image files to PDF files, tesseract is used. This + # also extracts the text in one go. tesseract = { cmd = { program = "tesseract" @@ -184,11 +201,19 @@ docspell.joex { "pdf", "txt" ] - timeout = "120 seconds" + timeout = "5 minutes" } working-dir = ${java.io.tmpdir}"/docspell-convert" } + # To convert "office" files to PDF files, the external tool + # unoconv is used. Unoconv uses libreoffice/openoffice for + # converting. So it supports all formats that are possible to read + # with libreoffice/openoffice. 
+ # + # Note: to greatly improve performance, it is recommended to start + # a libreoffice listener by running `unoconv -l` in a separate + # process. unoconv = { cmd = { program = "unoconv" @@ -199,7 +224,7 @@ docspell.joex { "{{outfile}}", "{{infile}}" ] - timeout = "20 seconds" + timeout = "2 minutes" } working-dir = ${java.io.tmpdir}"/docspell-convert" }