Update config file doc

This commit is contained in:
Eike Kettner 2020-02-20 21:10:00 +01:00
parent fbe0c1aec5
commit 3f316ab4d0

View File

@ -147,16 +147,31 @@ docspell.joex {
# below. They must be in the PATH environment or specify the full # below. They must be in the PATH environment or specify the full
# path below via the `program` key. # path below via the `program` key.
convert { convert {
# The chunk size used when storing files. This should be the same
# as the value used by the rest server.
chunk-size = 524288 chunk-size = 524288
max-image-size = 12000000 # When reading images, this is the maximum size. Images that are
# larger are not processed.
max-image-size = ${docspell.joex.extraction.ocr.max-image-size}
# Settings when processing markdown files (and other text files)
# to HTML.
#
# In order to support text formats, text files are first converted
# to HTML using a markdown processor. The resulting HTML is then
# converted to a PDF file.
markdown { markdown {
# The CSS that is used to style the resulting HTML.
internal-css = """ internal-css = """
body { padding: 2em 5em; } body { padding: 2em 5em; }
""" """
} }
# To convert HTML files into PDF files, the external tool
# wkhtmltopdf is used.
wkhtmlpdf { wkhtmlpdf {
cmd = { cmd = {
program = "wkhtmltopdf" program = "wkhtmltopdf"
@ -168,11 +183,13 @@ docspell.joex {
"-", "-",
"{{outfile}}" "{{outfile}}"
] ]
timeout = "20 seconds" timeout = "2 minutes"
} }
working-dir = ${java.io.tmpdir}"/docspell-convert" working-dir = ${java.io.tmpdir}"/docspell-convert"
} }
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = { tesseract = {
cmd = { cmd = {
program = "tesseract" program = "tesseract"
@ -184,11 +201,19 @@ docspell.joex {
"pdf", "pdf",
"txt" "txt"
] ]
timeout = "120 seconds" timeout = "5 minutes"
} }
working-dir = ${java.io.tmpdir}"/docspell-convert" working-dir = ${java.io.tmpdir}"/docspell-convert"
} }
# To convert "office" files to PDF files, the external tool
# unoconv is used. Unoconv uses libreoffice/openoffice for
# converting. So it supports all formats that are possible to read
# with libreoffice/openoffice.
#
# Note: to greatly improve performance, it is recommended to start
# a libreoffice listener by running `unoconv -l` in a separate
# process.
unoconv = { unoconv = {
cmd = { cmd = {
program = "unoconv" program = "unoconv"
@ -199,7 +224,7 @@ docspell.joex {
"{{outfile}}", "{{outfile}}",
"{{infile}}" "{{infile}}"
] ]
timeout = "20 seconds" timeout = "2 minutes"
} }
working-dir = ${java.io.tmpdir}"/docspell-convert" working-dir = ${java.io.tmpdir}"/docspell-convert"
} }