mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Periodic tasks are special in that they are usually kept around and started based on a schedule. A new component checks periodic tasks and submits them to the queue once they are due. In order to avoid duplicate periodic jobs, the tracker of a job is used to store the periodic job id. Each time a periodic task is due, it is first checked whether there is a job running (or queued) for this task.
245 lines
7.3 KiB
Plaintext
245 lines
7.3 KiB
Plaintext
docspell.joex {

  # This is the id of this node. If you run more than one server, you
  # have to make sure to provide unique ids per node.
  app-id = "joex1"

  # This is the base URL this application is deployed to. This is used
  # to register this joex instance such that docspell rest servers can
  # reach it.
  base-url = "http://localhost:7878"

  # Where the REST server binds to.
  #
  # JOEX provides a very simple REST interface to inspect its state.
  bind {
    address = "localhost"
    port = 7878
  }

  # The database connection.
  #
  # By default a H2 file-based database is configured. You can provide
  # a postgresql or mariadb connection here. When using H2 use the
  # PostgreSQL compatibility mode and AUTO_SERVER feature.
  #
  # It must be the same connection as the rest server is using.
  jdbc {
    url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
    user = "sa"
    password = ""
  }

  # Configuration for the job scheduler.
  scheduler {

    # Each scheduler needs a unique name. This defaults to the node
    # name, which must be unique, too.
    name = ${docspell.joex.app-id}

    # Number of jobs allowed to be processed in parallel.
    pool-size = 2

    # A counting scheme determines the ratio of how high- and low-prio
    # jobs are run. For example: 4,1 means run 4 high prio jobs, then
    # 1 low prio and then start over.
    counting-scheme = "4,1"

    # How often a failed job should be retried until it enters failed
    # state. If a job fails, it becomes "stuck" and will be retried
    # after a delay.
    retries = 5

    # The delay until the next try is performed for a failed job. This
    # delay is increased exponentially with the number of retries.
    retry-delay = "1 minute"

    # The queue size of log statements from a job.
    log-buffer-size = 500

    # If no job is left in the queue, the scheduler will wait until a
    # notify is requested (using the REST interface). To also retry
    # stuck jobs, it will notify itself periodically.
    wakeup-period = "30 minutes"
  }

  # Configuration for the periodic scheduler, which looks for due
  # periodic tasks.
  periodic-scheduler {

    # Each scheduler needs a unique name. This defaults to the node
    # name, which must be unique, too.
    name = ${docspell.joex.app-id}

    # A fallback to start looking for due periodic tasks regularly.
    # Usually joex instances should be notified via REST calls if
    # external processes change tasks. But these requests may get
    # lost.
    wakeup-period = "10 minutes"
  }

  # Configuration of text extraction
  extraction {
    # For PDF files it is first tried to read the text parts of the
    # PDF. But PDFs can be complex documents and they may contain text
    # and images. If the returned text is shorter than the value
    # below, OCR is run afterwards. Then both extracted texts are
    # compared and the longer will be used.
    pdf {
      min-text-len = 10
    }

    # Extracting text using OCR works for image and pdf files. It will
    # first run ghostscript to create a gray image from a pdf. Then
    # unpaper is run to optimize the image for the upcoming ocr, which
    # will be done by tesseract. All these programs must be available
    # in your PATH or the absolute path can be specified below.
    ocr {

      # Images greater than this size are skipped. Note that every
      # image is loaded completely into memory for doing OCR.
      max-image-size = 14000000

      # Defines what pages to process. If a PDF with 600 pages is
      # submitted, it is probably not necessary to scan through all of
      # them. This would take a long time and occupy resources for no
      # value. The first few pages should suffice. The default is first
      # 10 pages.
      #
      # If you want all pages to be processed, set this number to -1.
      #
      # Note: if you change the ghostscript command below, be aware that
      # this setting (if not -1) will add another parameter to the
      # beginning of the command.
      page-range {
        begin = 10
      }

      # The ghostscript command.
      ghostscript {
        command {
          program = "gs"
          args = [ "-dNOPAUSE"
                 , "-dBATCH"
                 , "-dSAFER"
                 , "-sDEVICE=tiffscaled8"
                 , "-sOutputFile={{outfile}}"
                 , "{{infile}}"
                 ]
          timeout = "5 minutes"
        }
        working-dir = ${java.io.tmpdir}"/docspell-extraction"
      }

      # The unpaper command.
      unpaper {
        command {
          program = "unpaper"
          args = [ "{{infile}}", "{{outfile}}" ]
          timeout = "5 minutes"
        }
      }

      # The tesseract command.
      tesseract {
        command {
          program = "tesseract"
          args = ["{{file}}"
                 , "stdout"
                 , "-l"
                 , "{{lang}}"
                 ]
          timeout = "5 minutes"
        }
      }
    }
  }

  # Configuration for converting files into PDFs.
  #
  # Most of it is delegated to external tools, which can be configured
  # below. They must be in the PATH environment or specify the full
  # path below via the `program` key.
  convert {

    # The chunk size used when storing files. This should be the same
    # as used with the rest server.
    chunk-size = 524288

    # When reading images, this is the maximum size. Images that are
    # larger are not processed.
    max-image-size = ${docspell.joex.extraction.ocr.max-image-size}

    # Settings when processing markdown files (and other text files)
    # to HTML.
    #
    # In order to support text formats, text files are first converted
    # to HTML using a markdown processor. The resulting HTML is then
    # converted to a PDF file.
    markdown {

      # The CSS that is used to style the resulting HTML.
      internal-css = """
        body { padding: 2em 5em; }
      """
    }

    # To convert HTML files into PDF files, the external tool
    # wkhtmltopdf is used.
    wkhtmlpdf {
      command = {
        program = "wkhtmltopdf"
        args = [
          "-s",
          "A4",
          "--encoding",
          "UTF-8",
          "-",
          "{{outfile}}"
        ]
        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

    # To convert image files to PDF files, tesseract is used. This
    # also extracts the text in one go.
    tesseract = {
      command = {
        program = "tesseract"
        args = [
          "{{infile}}",
          "out",
          "-l",
          "{{lang}}",
          "pdf",
          "txt"
        ]
        timeout = "5 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }

    # To convert "office" files to PDF files, the external tool
    # unoconv is used. Unoconv uses libreoffice/openoffice for
    # converting. So it supports all formats that are possible to read
    # with libreoffice/openoffice.
    #
    # Note: to greatly improve performance, it is recommended to start
    # a libreoffice listener by running `unoconv -l` in a separate
    # process.
    unoconv = {
      command = {
        program = "unoconv"
        args = [
          "-f",
          "pdf",
          "-o",
          "{{outfile}}",
          "{{infile}}"
        ]
        timeout = "2 minutes"
      }
      working-dir = ${java.io.tmpdir}"/docspell-convert"
    }
  }
}