Update config file doc

2025-08-09 03:04:52 +00:00 · 2020-02-20 21:10:00 +01:00
parent fbe0c1aec5
commit 3f316ab4d0
1 changed files with 29 additions and 4 deletions
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -147,16 +147,31 @@ docspell.joex {
  # below. They must be in the PATH environment or specify the full
  # path below via the `program` key.
  convert {
+
+    # The chunk size used when storing files. This should be the same
+    # value as the one used by the rest server.
    chunk-size = 524288

-    max-image-size = 12000000
+    # When reading images, this is the maximum size. Images that are
+    # larger are not processed.
+    max-image-size = ${docspell.joex.extraction.ocr.max-image-size}

+    # Settings when processing markdown files (and other text files)
+    # to HTML.
+    #
+    # In order to support text formats, text files are first converted
+    # to HTML using a markdown processor. The resulting HTML is then
+    # converted to a PDF file.
    markdown {
+
+      # The CSS that is used to style the resulting HTML.
      internal-css = """
        body { padding: 2em 5em; }
      """
    }

+    # To convert HTML files into PDF files, the external tool
+    # wkhtmltopdf is used.
    wkhtmlpdf {
      cmd = {
        program = "wkhtmltopdf"
@@ -168,11 +183,13 @@ docspell.joex {
          "-",
          "{{outfile}}"
        ]
-        timeout = "20 seconds"
+        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

+    # To convert image files to PDF files, tesseract is used. This
+    # also extracts the text in one go.
    tesseract = {
      cmd = {
        program = "tesseract"
@@ -184,11 +201,19 @@ docspell.joex {
          "pdf",
          "txt"
        ]
-        timeout = "120 seconds"
+        timeout = "5 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

+    # To convert "office" files to PDF files, the external tool
+    # unoconv is used. Unoconv uses libreoffice/openoffice for
+    # converting, so it supports all formats that can be read
+    # with libreoffice/openoffice.
+    #
+    # Note: to greatly improve performance, it is recommended to start
+    # a libreoffice listener by running `unoconv -l` in a separate
+    # process.
    unoconv = {
      cmd = {
        program = "unoconv"
@@ -199,7 +224,7 @@ docspell.joex {
          "{{outfile}}",
          "{{infile}}"
        ]
-        timeout = "20 seconds"
+        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }