Eike Kettner 1e598bd902 Sketch a scheduler for running periodic tasks
Periodic tasks are special in that they are usually kept around and
started based on a schedule. A new component checks periodic tasks and
submits them in the queue once they are due.

In order to avoid duplicate periodic jobs, the tracker of a job is
used to store the periodic job id. Each time a periodic task is due,
it is first checked if there is a job running (or queued) for this
task.
2020-03-08 12:55:03 +01:00

245 lines
7.3 KiB
Plaintext

docspell.joex {
# This is the id of this node. If you run more than one server, you
# have to make sure to provide unique ids per node.
app-id = "joex1"
# This is the base URL this application is deployed to. This is used
# to register this joex instance such that docspell rest servers can
# reach them
base-url = "http://localhost:7878"
# Where the REST server binds to.
#
# JOEX provides a very simple REST interface to inspect its state.
bind {
address = "localhost"
port = 7878
}
# The database connection.
#
# By default a H2 file-based database is configured. You can provide
# a postgresql or mariadb connection here. When using H2 use the
# PostgreSQL compatibility mode and AUTO_SERVER feature.
#
# It must be the same connection as the rest server is using.
jdbc {
url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
user = "sa"
password = ""
}
# Configuration for the job scheduler.
scheduler {
# Each scheduler needs a unique name. This defaults to the node
# name, which must be unique, too.
name = ${docspell.joex.app-id}
# Number of processing allowed in parallel.
pool-size = 2
# A counting scheme determines the ratio of how high- and low-prio
# jobs are run. For example: 4,1 means run 4 high prio jobs, then
# 1 low prio and then start over.
counting-scheme = "4,1"
# How often a failed job should be retried until it enters failed
# state. If a job fails, it becomes "stuck" and will be retried
# after a delay.
retries = 5
# The delay until the next try is performed for a failed job. This
# delay is increased exponentially with the number of retries.
retry-delay = "1 minute"
# The queue size of log statements from a job.
log-buffer-size = 500
# If no job is left in the queue, the scheduler will wait until a
# notify is requested (using the REST interface). To also retry
# stuck jobs, it will notify itself periodically.
wakeup-period = "30 minutes"
}
periodic-scheduler {
# Each scheduler needs a unique name. This defaults to the node
# name, which must be unique, too.
name = ${docspell.joex.app-id}
# A fallback to start looking for due periodic tasks regularily.
# Usually joex instances should be notified via REST calls if
# external processes change tasks. But these requests may get
# lost.
wakeup-period = "10 minutes"
}
# Configuration of text extraction
extraction {
# For PDF files it is first tried to read the text parts of the
# PDF. But PDFs can be complex documents and they may contain text
# and images. If the returned text is shorter than the value
# below, OCR is run afterwards. Then both extracted texts are
# compared and the longer will be used.
pdf {
min-text-len = 10
}
# Extracting text using OCR works for image and pdf files. It will
# first run ghostscript to create a gray image from a pdf. Then
# unpaper is run to optimize the image for the upcoming ocr, which
# will be done by tesseract. All these programs must be available
# in your PATH or the absolute path can be specified below.
ocr {
# Images greater than this size are skipped. Note that every
# image is loaded completely into memory for doing OCR.
max-image-size = 14000000
# Defines what pages to process. If a PDF with 600 pages is
# submitted, it is probably not necessary to scan through all of
# them. This would take a long time and occupy resources for no
# value. The first few pages should suffice. The default is first
# 10 pages.
#
# If you want all pages being processed, set this number to -1.
#
# Note: if you change the ghostscript command below, be aware that
# this setting (if not -1) will add another parameter to the
# beginning of the command.
page-range {
begin = 10
}
# The ghostscript command.
ghostscript {
command {
program = "gs"
args = [ "-dNOPAUSE"
, "-dBATCH"
, "-dSAFER"
, "-sDEVICE=tiffscaled8"
, "-sOutputFile={{outfile}}"
, "{{infile}}"
]
timeout = "5 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-extraction"
}
# The unpaper command.
unpaper {
command {
program = "unpaper"
args = [ "{{infile}}", "{{outfile}}" ]
timeout = "5 minutes"
}
}
# The tesseract command.
tesseract {
command {
program = "tesseract"
args = ["{{file}}"
, "stdout"
, "-l"
, "{{lang}}"
]
timeout = "5 minutes"
}
}
}
}
# Configuration for converting files into PDFs.
#
# Most of it is delegated to external tools, which can be configured
# below. They must be in the PATH environment or specify the full
# path below via the `program` key.
convert {
# The chunk size used when storing files. This should be the same
# as used with the rest server.
chunk-size = 524288
# When reading images, this is the maximum size. Images that are
# larger are not processed.
max-image-size = ${docspell.joex.extraction.ocr.max-image-size}
# Settings when processing markdown files (and other text files)
# to HTML.
#
# In order to support text formats, text files are first converted
# to HTML using a markdown processor. The resulting HTML is then
# converted to a PDF file.
markdown {
# The CSS that is used to style the resulting HTML.
internal-css = """
body { padding: 2em 5em; }
"""
}
# To convert HTML files into PDF files, the external tool
# wkhtmltopdf is used.
wkhtmlpdf {
command = {
program = "wkhtmltopdf"
args = [
"-s",
"A4",
"--encoding",
"UTF-8",
"-",
"{{outfile}}"
]
timeout = "2 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = {
command = {
program = "tesseract"
args = [
"{{infile}}",
"out",
"-l",
"{{lang}}",
"pdf",
"txt"
]
timeout = "5 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
# To convert "office" files to PDF files, the external tool
# unoconv is used. Unoconv uses libreoffice/openoffice for
# converting. So it supports all formats that are possible to read
# with libreoffice/openoffic.
#
# Note: to greatly improve performance, it is recommended to start
# a libreoffice listener by running `unoconv -l` in a separate
# process.
unoconv = {
command = {
program = "unoconv"
args = [
"-f",
"pdf",
"-o",
"{{outfile}}",
"{{infile}}"
]
timeout = "2 minutes"
}
working-dir = ${java.io.tmpdir}"/docspell-convert"
}
}
}