mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-08-24 23:24:52 +00:00
Initial version.
Features: - Upload PDF files let them analyze - Manage meta data and items - See processing in webapp
This commit is contained in:
14
modules/joex/src/main/resources/logback.xml
Normal file
14
modules/joex/src/main/resources/logback.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<configuration>
|
||||
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<withJansi>true</withJansi>
|
||||
|
||||
<encoder>
|
||||
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<logger name="docspell" level="debug" />
|
||||
<root level="INFO">
|
||||
<appender-ref ref="STDOUT" />
|
||||
</root>
|
||||
</configuration>
|
@ -1,3 +1,131 @@
|
||||
docspell.joex {
|
||||
|
||||
# This is the id of this node. If you run more than one server, you
|
||||
# have to make sure to provide unique ids per node.
|
||||
app-id = "joex1"
|
||||
|
||||
|
||||
# This is the base URL this application is deployed to. This is used
|
||||
# to register this joex instance such that docspell rest servers can
|
||||
# reach them
|
||||
base-url = "http://localhost:7878"
|
||||
|
||||
# Where the REST server binds to.
|
||||
#
|
||||
# JOEX provides a very simple REST interface to inspect its state.
|
||||
bind {
|
||||
address = "localhost"
|
||||
port = 7878
|
||||
}
|
||||
|
||||
# The database connection.
|
||||
#
|
||||
# By default a H2 file-based database is configured. You can provide
|
||||
# a postgresql or mariadb connection here. When using H2 use the
|
||||
# PostgreSQL compatibility mode and AUTO_SERVER feature.
|
||||
#
|
||||
# It must be the same connection as the rest server is using.
|
||||
jdbc {
|
||||
url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
|
||||
user = "sa"
|
||||
password = ""
|
||||
}
|
||||
|
||||
# Configuration for the job scheduler.
|
||||
scheduler {
|
||||
|
||||
# Each scheduler needs a unique name. This defaults to the node
|
||||
# name, which must be unique, too.
|
||||
name = ${docspell.joex.app-id}
|
||||
|
||||
# Number of processing allowed in parallel.
|
||||
pool-size = 2
|
||||
|
||||
# A counting scheme determines the ratio of how high- and low-prio
|
||||
# jobs are run. For example: 4,1 means run 4 high prio jobs, then
|
||||
# 1 low prio and then start over.
|
||||
counting-scheme = "4,1"
|
||||
|
||||
# How often a failed job should be retried until it enters failed
|
||||
# state. If a job fails, it becomes "stuck" and will be retried
|
||||
# after a delay.
|
||||
retries = 5
|
||||
|
||||
# The delay until the next try is performed for a failed job. This
|
||||
# delay is increased exponentially with the number of retries.
|
||||
retry-delay = "1 minute"
|
||||
|
||||
# The queue size of log statements from a job.
|
||||
log-buffer-size = 500
|
||||
|
||||
# If no job is left in the queue, the scheduler will wait until a
|
||||
# notify is requested (using the REST interface). To also retry
|
||||
# stuck jobs, it will notify itself periodically.
|
||||
wakeup-period = "30 minutes"
|
||||
}
|
||||
|
||||
# Configuration of text extraction
|
||||
#
|
||||
# Extracting text currently only work for image and pdf files. It
|
||||
# will first runs ghostscript to create a gray image from a
|
||||
# pdf. Then unpaper is run to optimize the image for the upcoming
|
||||
# ocr, which will be done by tesseract. All these programs must be
|
||||
# available in your PATH or the absolute path can be specified
|
||||
# below.
|
||||
extraction {
|
||||
allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
|
||||
|
||||
# Defines what pages to process. If a PDF with 600 pages is
|
||||
# submitted, it is probably not necessary to scan through all of
|
||||
# them. This would take a long time and occupy resources for no
|
||||
# value. The first few pages should suffice. The default is first
|
||||
# 10 pages.
|
||||
#
|
||||
# If you want all pages being processed, set this number to -1.
|
||||
#
|
||||
# Note: if you change the ghostscript command below, be aware that
|
||||
# this setting (if not -1) will add another parameter to the
|
||||
# beginning of the command.
|
||||
page-range {
|
||||
begin = 10
|
||||
}
|
||||
|
||||
# The ghostscript command.
|
||||
ghostscript {
|
||||
command {
|
||||
program = "gs"
|
||||
args = [ "-dNOPAUSE"
|
||||
, "-dBATCH"
|
||||
, "-dSAFER"
|
||||
, "-sDEVICE=tiffscaled8"
|
||||
, "-sOutputFile={{outfile}}"
|
||||
, "{{infile}}"
|
||||
]
|
||||
timeout = "5 minutes"
|
||||
}
|
||||
working-dir = ${java.io.tmpdir}"/docspell-extraction"
|
||||
}
|
||||
|
||||
# The unpaper command.
|
||||
unpaper {
|
||||
command {
|
||||
program = "unpaper"
|
||||
args = [ "{{infile}}", "{{outfile}}" ]
|
||||
timeout = "5 minutes"
|
||||
}
|
||||
}
|
||||
|
||||
# The tesseract command.
|
||||
tesseract {
|
||||
command {
|
||||
program = "tesseract"
|
||||
args = ["{{file}}"
|
||||
, "stdout"
|
||||
, "-l"
|
||||
, "{{lang}}"
|
||||
]
|
||||
timeout = "5 minutes"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user