Initial version.

Features: - Upload PDF files let them analyze - Manage meta data and items - See processing in webapp
2025-10-20 12:20:12 +00:00 · 2019-07-23 00:53:30 +02:00
parent 6154e6a387
commit 831cd8b655
341 changed files with 23634 additions and 484 deletions
--- a/modules/joex/src/main/resources/logback.xml
+++ b/modules/joex/src/main/resources/logback.xml
@@ -0,0 +1,14 @@
+<configuration>
+  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+    <withJansi>true</withJansi>
+
+    <encoder>
+      <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
+    </encoder>
+  </appender>
+
+  <logger name="docspell" level="debug" />
+  <root level="INFO">
+    <appender-ref ref="STDOUT" />
+  </root>
+</configuration>
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -1,3 +1,131 @@
 docspell.joex {

+  # This is the id of this node. If you run more than one server, you
+  # have to make sure to provide unique ids per node.
+  app-id = "joex1"
+
+
+  # This is the base URL this application is deployed to. This is used
+  # to register this joex instance such that docspell rest servers can
+  # reach them
+  base-url = "http://localhost:7878"
+
+  # Where the REST server binds to.
+  #
+  # JOEX provides a very simple REST interface to inspect its state.
+  bind {
+    address = "localhost"
+    port = 7878
+  }
+
+  # The database connection.
+  #
+  # By default a H2 file-based database is configured. You can provide
+  # a postgresql or mariadb connection here. When using H2 use the
+  # PostgreSQL compatibility mode and AUTO_SERVER feature.
+  #
+  # It must be the same connection as the rest server is using.
+  jdbc {
+    url = "jdbc:h2://"${java.io.tmpdir}"/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
+    user = "sa"
+    password = ""
+  }
+
+  # Configuration for the job scheduler.
+  scheduler {
+
+    # Each scheduler needs a unique name. This defaults to the node
+    # name, which must be unique, too.
+    name = ${docspell.joex.app-id}
+
+    # Number of processing allowed in parallel.
+    pool-size = 2
+
+    # A counting scheme determines the ratio of how high- and low-prio
+    # jobs are run. For example: 4,1 means run 4 high prio jobs, then
+    # 1 low prio and then start over.
+    counting-scheme = "4,1"
+
+    # How often a failed job should be retried until it enters failed
+    # state. If a job fails, it becomes "stuck" and will be retried
+    # after a delay.
+    retries = 5
+
+    # The delay until the next try is performed for a failed job. This
+    # delay is increased exponentially with the number of retries.
+    retry-delay = "1 minute"
+
+    # The queue size of log statements from a job.
+    log-buffer-size = 500
+
+    # If no job is left in the queue, the scheduler will wait until a
+    # notify is requested (using the REST interface). To also retry
+    # stuck jobs, it will notify itself periodically.
+    wakeup-period = "30 minutes"
+  }
+
+  # Configuration of text extraction
+  #
+  # Extracting text currently only work for image and pdf files. It
+  # will first runs ghostscript to create a gray image from a
+  # pdf. Then unpaper is run to optimize the image for the upcoming
+  # ocr, which will be done by tesseract. All these programs must be
+  # available in your PATH or the absolute path can be specified
+  # below.
+  extraction {
+    allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
+
+    # Defines what pages to process. If a PDF with 600 pages is
+    # submitted, it is probably not necessary to scan through all of
+    # them. This would take a long time and occupy resources for no
+    # value. The first few pages should suffice. The default is first
+    # 10 pages.
+    #
+    # If you want all pages being processed, set this number to -1.
+    #
+    # Note: if you change the ghostscript command below, be aware that
+    # this setting (if not -1) will add another parameter to the
+    # beginning of the command.
+    page-range {
+      begin = 10
+    }
+
+    # The ghostscript command.
+    ghostscript {
+      command {
+        program = "gs"
+        args = [ "-dNOPAUSE"
+               , "-dBATCH"
+               , "-dSAFER"
+               , "-sDEVICE=tiffscaled8"
+               , "-sOutputFile={{outfile}}"
+               , "{{infile}}"
+               ]
+        timeout = "5 minutes"
+      }
+      working-dir = ${java.io.tmpdir}"/docspell-extraction"
+    }
+
+    # The unpaper command.
+    unpaper {
+      command {
+        program = "unpaper"
+        args = [ "{{infile}}", "{{outfile}}" ]
+        timeout = "5 minutes"
+      }
+    }
+
+    # The tesseract command.
+    tesseract {
+      command {
+        program = "tesseract"
+        args = ["{{file}}"
+               , "stdout"
+               , "-l"
+               , "{{lang}}"
+               ]
+        timeout = "5 minutes"
+      }
+    }
+  }
 }