docspell/modules/joex/src/main/resources/reference.conf

docspell.joex {

  # This is the id of this node. If you run more than one server, you
  # have to make sure to provide unique ids per node.
  app-id = "joex1"


  # This is the base URL this application is deployed to. This is used
  # to register this joex instance such that docspell rest servers can
  # reach them
  base-url = "http://localhost:7878"

  # Where the REST server binds to.
  #
  # JOEX provides a very simple REST interface to inspect its state.
  bind {
    address = "localhost"
    port = 7878
  }

  # The database connection.
  #
  # By default a H2 file-based database is configured. You can provide
  # a postgresql or mariadb connection here. When using H2 use the
  # PostgreSQL compatibility mode and AUTO_SERVER feature.
  #
  # It must be the same connection as the rest server is using.
  jdbc {
    url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
    user = "sa"
    password = ""
  }

  # Configuration for the job scheduler.
  scheduler {

    # Each scheduler needs a unique name. This defaults to the node
    # name, which must be unique, too.
    name = ${docspell.joex.app-id}

    # Number of processing allowed in parallel.
    pool-size = 2

    # A counting scheme determines the ratio of how high- and low-prio
    # jobs are run. For example: 4,1 means run 4 high prio jobs, then
    # 1 low prio and then start over.
    counting-scheme = "4,1"

    # How often a failed job should be retried until it enters failed
    # state. If a job fails, it becomes "stuck" and will be retried
    # after a delay.
    retries = 5

    # The delay until the next try is performed for a failed job. This
    # delay is increased exponentially with the number of retries.
    retry-delay = "1 minute"

    # The queue size of log statements from a job.
    log-buffer-size = 500

    # If no job is left in the queue, the scheduler will wait until a
    # notify is requested (using the REST interface). To also retry
    # stuck jobs, it will notify itself periodically.
    wakeup-period = "30 minutes"
  }

  periodic-scheduler {

    # Each scheduler needs a unique name. This defaults to the node
    # name, which must be unique, too.
    name = ${docspell.joex.app-id}

    # A fallback to start looking for due periodic tasks regularily.
    # Usually joex instances should be notified via REST calls if
    # external processes change tasks. But these requests may get
    # lost.
    wakeup-period = "10 minutes"
  }

  # Configuration of text extraction
  extraction {
    # For PDF files it is first tried to read the text parts of the
    # PDF. But PDFs can be complex documents and they may contain text
    # and images. If the returned text is shorter than the value
    # below, OCR is run afterwards. Then both extracted texts are
    # compared and the longer will be used.
    pdf {
      min-text-len = 10
    }

    # Extracting text using OCR works for image and pdf files. It will
    # first run ghostscript to create a gray image from a pdf. Then
    # unpaper is run to optimize the image for the upcoming ocr, which
    # will be done by tesseract. All these programs must be available
    # in your PATH or the absolute path can be specified below.
    ocr {

      # Images greater than this size are skipped. Note that every
      # image is loaded completely into memory for doing OCR.
      max-image-size = 14000000

      # Defines what pages to process. If a PDF with 600 pages is
      # submitted, it is probably not necessary to scan through all of
      # them. This would take a long time and occupy resources for no
      # value. The first few pages should suffice. The default is first
      # 10 pages.
      #
      # If you want all pages being processed, set this number to -1.
      #
      # Note: if you change the ghostscript command below, be aware that
      # this setting (if not -1) will add another parameter to the
      # beginning of the command.
      page-range {
        begin = 10
      }

      # The ghostscript command.
      ghostscript {
        command {
          program = "gs"
          args = [ "-dNOPAUSE"
                 , "-dBATCH"
                 , "-dSAFER"
                 , "-sDEVICE=tiffscaled8"
                 , "-sOutputFile={{outfile}}"
                 , "{{infile}}"
                 ]
          timeout = "5 minutes"
        }
        working-dir = ${java.io.tmpdir}"/docspell-extraction"
      }

      # The unpaper command.
      unpaper {
        command {
          program = "unpaper"
          args = [ "{{infile}}", "{{outfile}}" ]
          timeout = "5 minutes"
        }
      }

      # The tesseract command.
      tesseract {
        command {
          program = "tesseract"
          args = ["{{file}}"
                 , "stdout"
                 , "-l"
                 , "{{lang}}"
                 ]
          timeout = "5 minutes"
        }
      }
    }
  }

  # Configuration for converting files into PDFs.
  #
  # Most of it is delegated to external tools, which can be configured
  # below. They must be in the PATH environment or specify the full
  # path below via the `program` key.
  convert {

    # The chunk size used when storing files. This should be the same
    # as used with the rest server.
    chunk-size = 524288

    # When reading images, this is the maximum size. Images that are
    # larger are not processed.
    max-image-size = ${docspell.joex.extraction.ocr.max-image-size}

    # Settings when processing markdown files (and other text files)
    # to HTML.
    #
    # In order to support text formats, text files are first converted
    # to HTML using a markdown processor. The resulting HTML is then
    # converted to a PDF file.
    markdown {

      # The CSS that is used to style the resulting HTML.
      internal-css = """
        body { padding: 2em 5em; }
      """
    }

    # To convert HTML files into PDF files, the external tool
    # wkhtmltopdf is used.
    wkhtmlpdf {
      command = {
        program = "wkhtmltopdf"
        args = [
          "-s",
          "A4",
          "--encoding",
          "UTF-8",
          "-",
          "{{outfile}}"
        ]
        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

    # To convert image files to PDF files, tesseract is used. This
    # also extracts the text in one go.
    tesseract = {
      command = {
        program = "tesseract"
        args = [
          "{{infile}}",
          "out",
          "-l",
          "{{lang}}",
          "pdf",
          "txt"
        ]
        timeout = "5 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

    # To convert "office" files to PDF files, the external tool
    # unoconv is used. Unoconv uses libreoffice/openoffice for
    # converting. So it supports all formats that are possible to read
    # with libreoffice/openoffic.
    #
    # Note: to greatly improve performance, it is recommended to start
    # a libreoffice listener by running `unoconv -l` in a separate
    # process.
    unoconv = {
      command = {
        program = "unoconv"
        args = [
          "-f",
          "pdf",
          "-o",
          "{{outfile}}",
          "{{infile}}"
        ]
        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }
  }
}