diff --git a/.gitignore b/.gitignore index 64e525cd..6123871b 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,11 @@ _site/ /website/site/static/syntax-*.css /website/site/static/webfonts/ /website/site/static/files/*.woff* +/website/site/static/examples/ +/website/site/templates/shortcodes/addon-output +/website/site/templates/shortcodes/item-data +/website/site/templates/shortcodes/item-args +/website/site/templates/shortcodes/file-meta /website/site/templates/shortcodes/server.conf /website/site/templates/shortcodes/sample-exim.conf /website/site/templates/shortcodes/joex.conf diff --git a/build.sbt b/build.sbt index 8156fc49..64234fb8 100644 --- a/build.sbt +++ b/build.sbt @@ -966,8 +966,28 @@ val website = project ) IO.append(target, IO.readBytes(changelog)) Seq(target) - }.taskValue + }.taskValue, + zolaPrepare := { + val log = streams.value.log + log.info("Generating examples…") + val templateOut = baseDirectory.value / "site" / "templates" / "shortcodes" + IO.createDirectory(templateOut) + + // sbt crashes when interpolating values into the string in `toTask` + // this is the reason for the followingy construct… + (Compile / run).toTask(s" addon-output /tmp/addon-output.json").value + (Compile / run).toTask(s" item-data /tmp/item-data.json").value + (Compile / run).toTask(s" item-args /tmp/item-args.json").value + (Compile / run).toTask(s" file-meta /tmp/file-meta.json").value + + val inputs = List("addon-output", "item-data", "item-args", "file-meta") + + inputs.foreach { name => + IO.move(file(s"/tmp/$name.json"), templateOut / name) + } + } ) + .dependsOn(addonlib, joex) val root = project .in(file(".")) diff --git a/project/ZolaPlugin.scala b/project/ZolaPlugin.scala index afa3f8bb..1311fadc 100644 --- a/project/ZolaPlugin.scala +++ b/project/ZolaPlugin.scala @@ -23,6 +23,7 @@ object ZolaPlugin extends AutoPlugin { "'python -m SimpleHTTPServer 1234' for example." 
) val zolaCheck = taskKey[Unit]("Runs zola check to check links") + val zolaPrepare = taskKey[Unit]("Some task to run before generating docs") } import autoImport._ @@ -33,10 +34,12 @@ object ZolaPlugin extends AutoPlugin { zolaOutputDir := target.value / "zola-site", zolaCommand := "zola", zolaTestBaseUrl := "http://localhost:1234", + zolaPrepare := {}, zolaBuild := { val logger = streams.value.log logger.info("Building web site using zola ...") (Compile / resources).value + zolaPrepare.value buildSite(zolaCommand.value, zolaRootDir.value, zolaOutputDir.value, None, logger) logger.info("Website built") }, @@ -45,6 +48,7 @@ object ZolaPlugin extends AutoPlugin { val baseurl = zolaTestBaseUrl.value logger.info("Building web site (test) using zola ...") (Compile / resources).value + zolaPrepare.value buildSite( zolaCommand.value, zolaRootDir.value, diff --git a/website/scripts/screenshot2.sh b/website/scripts/screenshot2.sh index 00e368b7..f8f45374 100755 --- a/website/scripts/screenshot2.sh +++ b/website/scripts/screenshot2.sh @@ -19,7 +19,7 @@ out_base="$1" work_dir=$(mktemp -dt screenshot2-script.XXXXXX) export HOME=$work_dir export RATIO="16:9" -export WAIT_SEC=4 +export WAIT_SEC=${WAIT_SEC:-4} #export TOP_CUT=400 dsc write-default-config diff --git a/website/site/content/blog/2022-05-16_audio_file_addon.md b/website/site/content/blog/2022-05-16_audio_file_addon.md new file mode 100644 index 00000000..7055ddd5 --- /dev/null +++ b/website/site/content/blog/2022-05-16_audio_file_addon.md @@ -0,0 +1,581 @@ ++++ +title = "Addon for audio file support" +[extra] +author = "eikek" ++++ + +# 1st Addon: Audio file support + +Since version 0.36.0 Docspell can be extended by +[addons](@/docs/addons/basics.md) - external programs that are +executed at some defined point in Docspell. This is a walk through the +first addon that was created, mainly as an example: providing support +for audio files. 
+ + + +I think it is interesting to provide support for audio files for a +DMS, although admittedly I don't have much of a use :). But this is +the kind of use-case that addons are for. + +# The idea + +The idea is very simple: the real work is done by external programs, +most notably [coqui's stt](https://github.com/coqui-ai/STT) a deep +learning toolkit originally created at Mozilla. It provides a command +line tool that accepts a WAV file and spits out text. Perfect! + +With this text, a PDF file can be created and a preview image which is +already enough for basic support. You can see the pdf in the web-ui +and search for the text via SOLR or PostgreSQL. + +Because a WAV file is not the most popular format today, `ffmpeg` can +be used to transform any other audio to WAV. + +The only thing now is to create a program that checks the uploaded +files, filters out all audio files and runs them through the mentioned +programs. So let's do this. + +# Preparation + +Addons are external programs and can be written in whatever language…. +For me this is a good opportunity to refresh my rusty scheme know-how +a bit. So this addon is written in Scheme, in particular +[guile](https://www.gnu.org/software/guile/). Programming in scheme is +fun and guile provides good integration into the (posix) OS and also +has a nice JSON module. I had the [reference +docs](https://www.gnu.org/software/guile/docs/docs-2.2/guile-ref/index.html) +open all the time - look at them for further details on the used +functions. + +It's usually good to play around with the tools at first. For stt, we +first need to download a *model*. This will be used to "detect" the +text in the audio data. They have a [page](https://coqui.ai/models) +where we can download model files for any supported language. For the +addon, we will implement English and German. + +When creating a PDF with wkhtmltopdf, we prettify it a little by +embedding the plain text into some html template. 
This will also take +care to specify UTF-8 as default encoding directly in the HTML +template. + +FFMpeg just works as usual. It figures out the input format +automatically and knows from the extension of the output file what to +do. + +You can find the full code +[here](https://github.com/docspell/audio-files-addon/blob/master/src/addon.scm). +The following shows excerpts from it with some explanation. + +# The script + +## Helpers + +After the preamble, there are two helper functions. + +```lisp +(define* (errln formatstr . args) + (apply format (current-error-port) formatstr args) + (newline)) + +;; Macro for executing system commands and making this program exit in +;; case of failure. +(define-syntax sysexec + (syntax-rules () + ((sysexec exp ...) + (let ((rc (apply system* (list exp ...)))) + (unless (eqv? rc EXIT_SUCCESS) + (format (current-error-port) "> '~a …' failed with: ~#*~:*~d~%" exp ... rc) + (exit 1)) + #t)))) +``` + +As this addon wants to pass data back to Docspell via stdout, we use +the stderr for logging and printing general information. The function +`errln` (short for "error line" :)) allows to conveniently print to +stderr and the second wraps the `system*` procedure such that the +script fails whenever the external program fails. It is somewhat +similar to `set -e` in bash. + +## Dependencies + +Next is the declaration of external dependencies. At first all +external programs are listed. This is important for later, when the +script is packaged via nix. Nix will substitute these commands with +absolute paths. Then it's good to not have them scattered around. + +It also reads in the expected environment variables (only those we +need) that are provided by Docspell. Since this addon only makes sense +to work on an item, it quits early should some env vars be missing. 
+ +```lisp +(define *curl* "curl") +(define *ffmpeg* "ffmpeg") +(define *stt* "stt") +(define *wkhtmltopdf* "wkhtmltopdf") + +;; Getting some environment variables +(define *output-dir* (getenv "OUTPUT_DIR")) +(define *tmp-dir* (getenv "TMP_DIR")) +(define *cache-dir* (getenv "CACHE_DIR")) + +(define *item-data-json* (getenv "ITEM_DATA_JSON")) +(define *original-files-json* (getenv "ITEM_ORIGINAL_JSON")) +(define *original-files-dir* (getenv "ITEM_ORIGINAL_DIR")) + +;; fail early if not in the right context +(when (not *item-data-json*) + (errln "No item data json file found.") + (exit 1)) +``` + +## Input/Output + +The input and output schemas can be defined now. This uses the +[guile-json](https://github.com/aconchillo/guile-json) module. It +provides very convenient features for reading and writing json. + +It is possible to define a record via `define-json-type` that +generates readers and writers to/from JSON. For example, the record +`<itemdata>` is defined to be an object with only one field `id`. The +function `json->scm` reads in json into scheme datastructures and then +the generated function `scm->itemdata` creates the record from it. For +every record, accessor functions exist. For example: `(itemdata-id +data)` would look up the field `id` in the given itemdata record +`data`. + +Here we need it to get the item-id and the list of file properties +belonging to the original uploaded files. + +Another interesting definition is the `<output>` record. This captures +(a subset of) the schema of what Docspell receives from this addon as +a result. A full example of this data is +[here](@/docs/addons/writing.md#output). We don't need `commands` or +`newItems`, so this schema only cares about the `files` attribute. 
+ + +```lisp +(define-json-type <itemdata> + (id)) + +;; The array of original files +(define-json-type <original-file> + (id) + (name) + (position) + (language) + (mimetype) + (length) + (checksum)) + +;; The output record, what is returned to docspell +(define-json-type <itemfiles> + (itemId) + (textFiles) + (pdfFiles)) +(define-json-type <output> + (files "files" #(<itemfiles>))) + +;; Parses the JSON containing the item information +(define *itemdata-json* + (scm->itemdata (call-with-input-file *item-data-json* json->scm))) + +;; The JSON file containing meta data for all source files as vector. +(define *original-meta-json* + (let ((props (vector->list (call-with-input-file *original-files-json* json->scm)))) + (map scm->original-file props))) +``` + + +## Finding the audio file + +The previously parsed json array `*original-meta-json*` can now be +used to find any audio files within the original uploaded files, as +done in `find-audio-files`. It simply goes through the list and keeps +those files whose mimetype starts with `audio/`. The mimetype is +provided by Docspell in the file properties in `ITEM_ORIGINAL_JSON`. + +Before converting to wav with ffmpeg, it is quickly checked if it's +not a wav already. + + +```lisp +(define (is-wav? mime) + "Test whether the mimetype MIME is denoting a wav file." + (or (string-suffix? "/wav" mime) + (string-suffix? "/x-wav" mime) + (string-suffix? "/vnd.wav" mime))) + +(define (find-audio-files) + "Find all source files that are audio files." + (filter! (lambda (el) + (string-prefix? + "audio/" + (original-file-mimetype el))) + *original-meta-json*)) + +(define (convert-wav id mime) + "Run ffmpeg to convert to wav." + (let ((src-file (format #f "~a/~a" *original-files-dir* id)) + (out-file (format #f "~a/in.wav" *tmp-dir*))) + (if (is-wav? 
mime) + src-file + (begin + (errln "Running ffmpeg to convert wav file...") + (sysexec *ffmpeg* "-loglevel" "error" "-y" "-i" src-file out-file) + out-file)))) +``` + +## Speech to text + +Once we have a wav file, we can run speech-to-text recognition on it. +As said above, we need to download a model first, which depends +on the language. Luckily, Docspell provides the language of the file. +This is the language either given directly by the user when uploading +or it's the collective's default language. + +In the following snippet, we get the language as an argument. We will +get it later from the file properties. + +As seen below, the model file is stored to the `CACHE_DIR`. This is +provided by Docspell and will survive the execution of this script. +All other directories involved will be deleted eventually. The +`CACHE_DIR` is the place to store intermediate results you don't want +to lose between addon runs. But as any cache, it may not exist the +next time the addon is run. Docspell doesn't clear it automatically, +though. + +The last function simply executes the `stt` external command and puts +stdout into a file. + +```lisp +(define (get-model language) + (let* ((lang (or language "eng")) + (file (format #f "~a/model_~a.pbmm" *cache-dir* lang))) + (unless (file-exists? file) + (download-model lang file)) + file)) + +(define (download-model lang file) + "Download model files per language. Nix has currently stt 0.9.3 packaged." + (let ((url (cond + ((string= lang "eng") "https://coqui.gateway.scarf.sh/english/coqui/v0.9.3/model.pbmm") + ((string= lang "deu") "https://coqui.gateway.scarf.sh/german/AASHISHAG/v0.9.0/model.pbmm") + (else (error "Unsupported language: " lang))))) + (errln "Downloading model file for language: ~a" lang) + (sysexec *curl* "-SsL" "-o" file url) + file)) + +(define (extract-text model input out) + "Runs stt for speech-to-text and writes the text into the file OUT." 
+ (errln "Extracting text from audio…") + (with-output-to-file out + (lambda () + (sysexec *stt* "--model" model "--audio" input)))) +``` + + +## Create PDF + +Creating the PDF is straightforward. The extracted text is embedded +into an HTML file which is then passed to `wkhtmltopdf`. Since we don't +need this file for anything else, it is stored to the `TMP_DIR`. + +```lisp +(define (create-pdf txt-file out) + (define (line str) + (format #t "~a\n" str)) + (errln "Creating pdf file…") + (let ((tmphtml (format #f "~a/text.html" *tmp-dir*))) + (with-output-to-file tmphtml + (lambda () + (line "") + (line "") + (line " ") + (line " ") + (line "
") + (line " Extracted from audio using stt on ") + (display (strftime "%c" (localtime (current-time)))) + (line "
") + (line "

") + (display (call-with-input-file txt-file read-string)) + (line "

") + (line ""))) + (sysexec *wkhtmltopdf* tmphtml out))) +``` + + +## Putting it together + +The main function now puts everything together. The `process-file` +function is called for every file that is returned from +`(find-audio-files)`. It will extract the necessary information (like +the language) from the json document via record accessors (e.g. +`(original-file-language file)`) and then calls the functions defined +above. At last it creates a `<itemfiles>` record with `make-itemfiles`. + +An `<itemfiles>` record now contains the important information for +Docspell. It requires the item-id and a mapping from attachment-ids to +files in `OUTPUT_DIR`. For each attachment identified by its ID, +Docspell replaces the extracted text with the contents of the given +file and replaces the converted PDF file, respectively. In the code +below, two lists of such mappings are defined - the first for the text +files, the second for the converted pdf. The files must be specified +relative to `OUTPUT_DIR`. + +That means `process-all` returns a list of `<itemfiles>` records which +is then used to create the `<output>` record. And finally, an +`output->json` function will turn the record into proper JSON which is +sent to stdout. + +```lisp +(define (process-file itemid file) + "Processing a single audio file." + (let* ((id (original-file-id file)) + (mime (original-file-mimetype file)) + (lang (original-file-language file)) + (txt-file (format #f "~a/~a.txt" *output-dir* id)) + (pdf-file (format #f "~a/~a.pdf" *output-dir* id)) + (wav (convert-wav id mime)) + (model (get-model lang))) + (extract-text model wav txt-file) + (create-pdf txt-file pdf-file) + (make-itemfiles itemid + `((,id . ,(format #f "~a.txt" id))) + `((,id . 
,(format #f "~a.pdf" id)))))) + +(define (process-all) + (let ((item-id (itemdata-id *itemdata-json*))) + (map (lambda (file) + (process-file item-id file)) + (find-audio-files)))) + +(define (main args) + (let ((out (make-output (process-all)))) + (format #t "~a" (output->json out)))) +``` + +Example output: + +```json +{ + "files": [ + { + "itemId":"qZDnyGIAJsXr", + "textFiles": { "HPFvIDib6eA": "HPFvIDib6eA.txt" }, + "pdfFiles": { "HPFvIDib6eA": "HPFvIDib6eA.pdf"} + } + ] +} +``` + +# Packaging + +Now with that script some additional plumbing is needed to make it an +"Addon" for Docspell. + +The external tools - stt, ffmpeg, curl and wkhtmltopdf are required as +well as guile to compile and interpret the script. Also the guile-json +module must be installed. + +This can turn into a quite tedious task. Luckily, there is +[nix](https://nixos.org) that has an answer to this. A user who wants +to use this script only needs to install nix. This package manager +then takes care of providing the exact dependencies we need (down to +the correct version and including guile as the language and runtime). + +## A flake + +Everything is defined in the `flake.nix` in the source root. It looks +like this: + +```nix +{ + description = "A docspell addon for basic audio file support"; + + inputs = { + utils.url = "github:numtide/flake-utils"; + + # Nixpkgs / NixOS version to use. + nixpkgs.url = "nixpkgs/nixos-21.11"; + }; + + outputs = { self, nixpkgs, utils }: + utils.lib.eachSystem ["x86_64-linux"] (system: + let + pkgs = import nixpkgs { + inherit system; + overlays = [ + + ]; + }; + name = "audio-files-addon"; + in rec { + packages.${name} = pkgs.callPackage ./nix/addon.nix { + inherit name; + }; + + defaultPackage = packages.${name}; + + apps.${name} = utils.lib.mkApp { + inherit name; + drv = packages.${name}; + }; + defaultApp = apps.${name}; + + ## … omitted for brevity + } + ); +} +``` + +First sad thing is, that only `x86_64` systems are supported. 
This is +due to `stt` not being available on other platforms currently (as +provided by nixpkgs). + +The rest is a bit magic: A package and "defaultPackage" is defined +with a reference to `nix/addon.nix`. The important part is the line + +```nix + inputs = { + # Nixpkgs / NixOS version to use. + nixpkgs.url = "nixpkgs/nixos-21.11"; + }; +``` + +It says that as input for "building" the script, we take all of +[nixpkgs](https://github.com/NixOS/nixpkgs) which is a package +collection defined for (and in) nix - including thousands of software +packages. We can pick and choose from these. No surprise, all external +tools we need are included! + +A flake defines the inputs and outputs of a package. With all of +nixpkgs as inputs, we can create a definition to elevate this script +into a *package*. + +## Package definition + +The definition for "building" the script is in `nix/addon.nix`: + +```nix +{ stdenv, bash, cacert, curl, stt, wkhtmltopdf, ffmpeg, guile, guile-json, lib, name }: + +stdenv.mkDerivation { + inherit name; + src = lib.sources.cleanSource ../.; + + buildInputs = [ guile guile-json ]; + + patchPhase = '' + TARGET=src/addon.scm + sed -i 's,\*curl\* "curl",\*curl\* "${curl}/bin/curl",g' $TARGET + sed -i 's,\*ffmpeg\* "ffmpeg",\*ffmpeg\* "${ffmpeg}/bin/ffmpeg",g' $TARGET + sed -i 's,\*stt\* "stt",\*stt\* "${stt}/bin/stt",g' $TARGET + sed -i 's,\*wkhtmltopdf\* "wkhtmltopdf",\*wkhtmltopdf\* "${wkhtmltopdf}/bin/wkhtmltopdf",g' $TARGET + ''; + + buildPhase = '' + guild compile -o ${name}.go src/addon.scm + ''; + + # module name must be same as .go + installPhase = '' + mkdir -p $out/{bin,lib} + cp ${name}.go $out/lib/ + + cat > $out/bin/${name} <<-EOF + #!${bash}/bin/bash + export SSL_CERT_FILE="${cacert}/etc/ssl/certs/ca-bundle.crt" + exec -a "${name}" ${guile}/bin/guile -C ${guile-json}/share/guile/ccache -C $out/lib -e '(${name}) main' -c "" \$@ + EOF + chmod +x $out/bin/${name} + ''; +} +``` + +With a bit of handwaving - this is a bash script that 
modifies +slightly the scheme script and runs a compile on it. We simply declare +all packages we need in the first line of `{ … }` - these are +arguments that are automatically filled by nix by searching the +corresponding package in nixpkgs. + +First the `patchPhase` is executed. It will replace the variables +containing the external tools with an absolute path to the version +that we currently get from nixpkgs. With this step nix takes care that +all these packages are available *at runtime* when executing the +script. All versions are finally fixed in `flake.lock` and can be +upgraded manually. + +The `buildPhase` runs the guile compiler that produces some +intermediate code that will be loaded instead of compiling the script +on-the-fly. + +At last, `installPhase` creates a wrapper script that runs guile with +the correct load-path pointing to `guile-json` and to our pre-compiled +script. Additionally, trusted root certificates are exported to make +the curl commands work. This script will be created in `$out` +directory that is provided by nix. + +If you now run `nix build` in the source root, it will execute all +these phases and produce a symlink pointing to the result. You can +then `cat` the resulting file if you are curious. + +This way the script is completely isolated from the system it runs +on - as long as the nix package manager is available. It includes all +the external tools, as well as the underlying runtime (guile)! The +result is a tiny wrapper bash script that can be run "everywhere" +(modulo all the restrictions, like non-x86_64 platforms, of course +:)). + + +## Addon Descriptor + +At last, a small yaml file is needed to tell Docspell a little about +the addon. + +```yaml +meta: + name: "audio-files-addon" + version: "0.1.0" + description: | + This addon adds support for audio files. Audio files are processed + by a speech-to-text engine and a pdf is generated. + + It doesn't expect any user arguments at the moment. 
It requires + internet access to download model files. + +triggers: + - final-process-item + - final-reprocess-item + - existing-item + +runner: + nix: + enable: true + + docker: + enable: false + + trivial: + enable: true + exec: src/addon.scm + +options: + networking: true + collectOutput: true +``` + +This tells Docspell via `triggers` when this addon may be run. This +one only makes sense for an item. Thus it can be hooked up to run with +every file-processing job or a user can manually trigger it on an +item. + +It also tells via `runner:` that it can be built and run via nix, but +not via docker (I gave up after an hour of trying to create a Dockerfile…). It +could also be run "as-is" but the user then needs to install all these +tools and guile manually. + +# Done + +That's it. You can install this addon in Docspell and create a run +configuration to let it execute when you want. diff --git a/website/site/content/docs/addons/_index.md b/website/site/content/docs/addons/_index.md new file mode 100644 index 00000000..46a36a68 --- /dev/null +++ b/website/site/content/docs/addons/_index.md @@ -0,0 +1,11 @@ ++++ +title = "Addons" +insert_anchor_links = "right" +description = "Describes how addons work." +weight = 55 +template = "pages.html" +sort_by = "weight" +redirect_to = "docs/addons/basics" ++++ + +No content here. 
diff --git a/website/site/content/docs/addons/addon-install-01.png b/website/site/content/docs/addons/addon-install-01.png new file mode 100644 index 00000000..3624d6ab Binary files /dev/null and b/website/site/content/docs/addons/addon-install-01.png differ diff --git a/website/site/content/docs/addons/addon-install-01_dark.png b/website/site/content/docs/addons/addon-install-01_dark.png new file mode 100644 index 00000000..5c6a0988 Binary files /dev/null and b/website/site/content/docs/addons/addon-install-01_dark.png differ diff --git a/website/site/content/docs/addons/addon-install-02.png b/website/site/content/docs/addons/addon-install-02.png new file mode 100644 index 00000000..0fb495ca Binary files /dev/null and b/website/site/content/docs/addons/addon-install-02.png differ diff --git a/website/site/content/docs/addons/addon-install-02_dark.png b/website/site/content/docs/addons/addon-install-02_dark.png new file mode 100644 index 00000000..68a15368 Binary files /dev/null and b/website/site/content/docs/addons/addon-install-02_dark.png differ diff --git a/website/site/content/docs/addons/addon-install-03.png b/website/site/content/docs/addons/addon-install-03.png new file mode 100644 index 00000000..8a118c79 Binary files /dev/null and b/website/site/content/docs/addons/addon-install-03.png differ diff --git a/website/site/content/docs/addons/addon-install-03_dark.png b/website/site/content/docs/addons/addon-install-03_dark.png new file mode 100644 index 00000000..8bdf5d9c Binary files /dev/null and b/website/site/content/docs/addons/addon-install-03_dark.png differ diff --git a/website/site/content/docs/addons/addon-install-04.png b/website/site/content/docs/addons/addon-install-04.png new file mode 100644 index 00000000..35beb57c Binary files /dev/null and b/website/site/content/docs/addons/addon-install-04.png differ diff --git a/website/site/content/docs/addons/addon-install-04_dark.png b/website/site/content/docs/addons/addon-install-04_dark.png new file 
mode 100644 index 00000000..06c2d08d Binary files /dev/null and b/website/site/content/docs/addons/addon-install-04_dark.png differ diff --git a/website/site/content/docs/addons/basics.md b/website/site/content/docs/addons/basics.md new file mode 100644 index 00000000..744ebe91 --- /dev/null +++ b/website/site/content/docs/addons/basics.md @@ -0,0 +1,149 @@ ++++ +title = "Basics" +insert_anchor_links = "right" +description = "Docspell Addons." +weight = 10 +template = "docs.html" ++++ + +# Addons + +Addons allow to execute custom software within a defined context in +Docspell. The idea is to be able to support new features and amend +existing ones. + +{% warningbubble(title="Experimental") %} Addons are considered +experimental. The interaction between addons and Docspell is still +subject to change. + +The intended audience for addons are developers (to create addons) and +technically inclined users to install, configure and use them. +{% end %} + +Despite the warning above, addons are a nice way to amend your +docspell server with new things, you are encouraged to try it out and +give feedback ;-). + +{% infobubble(title="Enable addons manually") %} +Addons are disabled by default. They must be enabled in the config +file of the restserver! +{% end %} + + +## What is an Addon? + +An addon is a zip file that contains a `docspell-addon.yml` (or .yaml +or .json) file in its root. The `docspell-addon.yml` is the *addon +descriptor* telling how to run and optionally build the addon. In the +ZIP file, an addon provides a program that expects one argument which +is a file containing the user input for the addon. Addons can +communicate back to docspell via their stdout and/or via directly +calling the docspell server as part of their program. + + +## What can Addons do? + +Addons can accept user input and are arbitrary external programs that +can do whatever they want. 
However, Docspell can embed running addons +in restricted environments, where they don't have network for example. +Addons can safely communicate to Docspell via their stdout output +returning instructions that Docspell will realise. + +Running addons is managed by docspell. Currently they can be executed: + +- as the final step when processing or re-processing an item. They then + have access to all the item data that has been collected during + processing (id, extracted text, converted pdfs, etc) and can work + with that. They may, for example, set more tags or custom fields. +- triggered manually on some existing item +- periodically defined by a schedule. This executes the addons only + with the configured user input. +- … (maybe more to come) + +Since an addon may not make sense to run in all these situations, it +must define a sensible subset via the `triggers` option in its +descriptor. + + +## How are they run + +Addons are always executed by the joex component as an external +process, therefore they can be written in any programming or scripting +language. + +That means the machine running joex possibly needs to match the +requirements of each addon. To ease this, addons can provide a [nix +description](https://nixos.wiki/wiki/Flakes) or a `Dockerfile`. Then +you need to prepare the machine only with two things (nix and docker) +to have the prerequisites for running many addons. + + +# More … + +Addons are a flexible way to extend Docspell and require some +technical affinity. However, only "using" addons should not be that +hard, but it will always depend on the documentation of the addon and +its own complexity. + +As the user, you may have different views: preparing the server to be +able to run addons, writing your own addons and finally using them. + +The following sections are divided into these perspectives: + +## Using Addons + +Addons must be installed and then configured before they can +be used. 
[Using Addons](@/docs/addons/using.md) describes this +perspective. + +{{ buttonright(href="/docs/addons/using", text="More…") }} + +## Control how addons are run + +As the owner of your server, you want to [control how addons are +run](@/docs/addons/control.md). Since addons are arbitrary programs, +potentially downloaded from the internet, they can be run in a +restricted environment. + +{{ buttonright(href="/docs/addons/control", text="More…") }} + + +## Write custom addons + +Finally, [writing addons](@/docs/addons/writing.md) requires (among +other things) to know how to interact with Docspell and what package +format is expected. + +{{ buttonright(href="/docs/addons/writing", text="More…") }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/site/content/docs/addons/control.md b/website/site/content/docs/addons/control.md new file mode 100644 index 00000000..bf57b07c --- /dev/null +++ b/website/site/content/docs/addons/control.md @@ -0,0 +1,238 @@ ++++ +title = "Control Runtime" +insert_anchor_links = "right" +description = "Control how addons are run" +weight = 30 +template = "docs.html" ++++ + +# Control runtime of addons + +Addons are run by the joex component as background tasks in an +external process. Depending on the machine it is running on, the addon +can be run + +- inside a docker container +- inside a systemd-nspawn container +- directly on the machine + +Addons can be provided as source packages, where the final program may +need to be built. They also can depend on other software. In order to +not prepare for each addon, it is recommended to install +[nix](https://nixos.org) with [flakes](https://nixos.wiki/wiki/Flakes) +and docker on the machine running joex. + +Please also look at addon section in the [default +configuration](@/docs/configure/main.md#joex) for joex. + +You need to explicitly enable addons in the restserver config file. + +Docspell uses "runners" to execute an addon. 
This includes building it +if necessary. The following runners exist: + +- `docker`: uses docker to build and run the addon +- `nix-flake`: builds via `nix build` and runs the executable in + `$out/bin` +- `trivial`: simply executes a file inside the addon (as specified in + the descriptor) + +In the joex configuration you can specify which runners your system +supports. + +## Prepare for *running* addons + +Depending on how you want addons to be run, you need to install +docker and/or systemd-nspawn on the machine running joex. +Additionally, the user running joex must be able to use these tools. +For docker it usually means to add the user to some group. For +systemd-nspawn you most likely want to configure `sudo` to run +the `systemd-nspawn` command without a password. + +Without this, an addon can only be run "directly" on the machine that +hosts joex (which might be perfectly fine). The addon then "sees" all +files on the machine and could potentially do harm. + +It is recommended to install `nix` and `docker`, if possible. Addons +may only run with docker or only without, so supporting both leaves +more options. + + +## Prepare for *building* addons + +Addons can be packaged as source or binary packages. For the former, +joex will build the addon first. There are two supported ways to do +so: + +- via `docker build` when the addon provides a `Dockerfile` (use + runner `docker`) +- via `nix build` when the addon provides a `flake.nix` file (use + runner `nix-flake`) + +Both build strategies will cache the resulting artifact, so subsequent +builds will be (almost) no-ops. + +{% infobubble(title="Note") %} +*Building* addons requires an internet connection! Running +them may not require a network connection. +{% end %} + +If the addon is packaged as a binary, then usually the `trivial` +runner (possibly in combination with `systemd-nspawn`) can be used. + +# Runtime + +## Cache directory + +Addons can use a "cache directory" to store data between runs. 
This +directory is not cleaned by docspell. If you have concerns about +space, use a cron job or systemd-timer to periodically clean this +directory. + +## "Pure" vs "Impure" + +Addons can talk back to Docspell in these ways: they can use the http +api, for example with [dsc](@/docs/tools/cli.md), or they can return +data to instruct Docspell to apply changes. + +The former requires the addon to be connected to the network to reach +the Docspell *restserver*. This allows the addon to do arbitrary +changes at any time - this is the "impure" variant. + +The second approach can be run without network connectivity. When +using docker or systemd-nspawn, Docspell will run these addons without +any network. Thus they can't do anything really, except return data +back to Docspell. + +The pure way is much preferred! It allows for more consistent +behaviour, because Docspell is in charge for applying any changes. +Docspell can apply changes *only if* the addon returned successfully. +Addons can also be retried on error, because no changes happened yet. + +It's the decision of the addon author, how the addon will work. It +should document whether it is pure or impure. You can also look into +the descriptor and check for a `networking: false` setting. As the +server administrator, you can configure Docspell to only accept pure +addons. + + +## Runners + +### nix flake runner + +For addons providing a `flake.nix` this runner can build it and find +the file to execute. With this `flake.nix` file addons can declare how +they should be build and what dependencies are required to run them. + +The resulting executable can be executed via `systemd-nspawn` in a +restricted environment or directly on the machine. + +{% infobubble(title="Requires") %} +You need to install [nix](https://nixos.org) and enable +[flakes](https://nixos.wiki/wiki/Flakes) to use this runner. +{% end %} + +### docker + +Addons can provide a Dockerfile or an image. 
If no image is given, +`docker build` will be run to build an image from the `Dockerfile`. +Then `docker run` is used to run the addon. + +{% infobubble(title="Requires") %} +You need to install `docker` to use this runner. +{% end %} + +### trivial + +Addons can simply declare a file to execute. Docspell can use +`systemd-nspawn` to run it in a restricted environment, or it can be +run directly on the machine. This variant is only useful for very +simple addons, that don't require any special dependencies. + +{% infobubble(title="Requires") %} +You need to check each addon for its requirements and prepare the +machine accordingly. +{% end %} + +### Choosing runners + +The config `addons.executor-config.runners` accepts a list of runners. +It specifies the preferred runner first. If an addon can be executed +via docker and nix, Docspell will choose the runner first in the list. + +If you don't have nix installed, remove the `nix-flake` runner from +this list and same for docker, of course. + + +### systemd-nspawn + +The `systemd-nspawn` can be used to run programs in a lightweight +ad-hoc container. It is available on most linux distributions (it is +part of systemd…). It doesn't require an image to exist first; this +makes it very convenient for running addons in a restricted +environment. + +If you enable it in the config file, then all addons are either run +via `systemd-nspawn` or docker - and thus always in a restricted +environment, where they can only access their own files and the files +provided by Docspell. + +The downside is that `systemd-nspawn` needs to be run as root (as far +as I know). Therefore, configure `sudo` to allow the user that is +running joex to execute `systemd-nspawn` non-interactively. + +{% infobubble(title="Requires") %} +Install `systemd-nspawn` and enable the user running joex to use it +password-less via sudo. +{% end %} + +# Within Docker + +If joex itself is run as a docker container, things get a bit +complicated. 
The default image for joex does not contain `nix`, so the +`nix-flake` runner cannot be used out of the box. + +In order to use the `docker` runner, the container must be configured +to access the host's docker daemon. On most systems this can be +achieved by bind-mounting the unix socket (usually at +`/var/run/docker.sock`) into the container. Here is a snippet from the +provided `docker-compose` file: + +```yaml + joex: + image: docspell/joex:latest + # ... left out for brevity + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - /tmp:/tmp +``` + +Additionally to `/var/run/docker.sock`, it also bind mounts the `/tmp` +directory. This is necessary, because docker will be invoked with bind +mounts from inside the container - but these must be available on the +host, because the docker client in the container actually runs the +command on the host. + +The addon executor uses the system's temp-directory (which is usually +`/tmp`) as a base for creating a working and cache directory. Should +you change this in joex config file (or your system uses a different +default temp-dir), then the bind mount must be adapted as well. + +Another variant is to extend the default joex image and add more +programs as needed by addons and then use the `trivial` runner. + +# Summary / tl;dr + +When joex is not inside a container: + +- (optional) Install `systemd-nspawn` - it is provided on many + GNU/Linux distributions +- Configure `sudo` to allow the user running the joex component to + execute `systemd-nspawn` non-interactively (without requiring a + password) +- Install docker +- Install [nix](https://nixos.org) and enable + [flakes](https://nixos.wiki/wiki/Flakes) +- Allow the user who runs the joex component to use docker and nix. If + you install nix as multi-user, then this is already done. 
+- Check the section on addons in the [default + configuration](@/docs/configure/main.md#joex) for joex diff --git a/website/site/content/docs/addons/using.md b/website/site/content/docs/addons/using.md new file mode 100644 index 00000000..7c19f9e4 --- /dev/null +++ b/website/site/content/docs/addons/using.md @@ -0,0 +1,103 @@ ++++ +title = "Usage" +insert_anchor_links = "right" +description = "How to use addons" +weight = 20 +template = "docs.html" ++++ + +# Using Addons + +This shows with an example, how to install and use an addon. If the ui +doesn't show these forms, addons are probably disabled. Addons need to +be enabled in the config file of the rest server. + +## Discovering + +Addons can be installed from any URL to a zip file. One way is to use +URLs generated by forges like github or gitlab. They provide zip files +containing the repository contents. Alternatively an addon may provide +specific files in their release section. + +For example, this is the url to the first release of the rotate-pdf +addon: + +- + +This url points to a fixed version. It is also possible to use urls +that are "moving targets": + +- + +The contents behind the above url will very likely change over time. + +For better discoverability, repositories for addons on public forges +can be tagged with *docspell-addon*. + +## Install + +With an URL like above, you can go to *Manage Data -> Addons -> New* +and insert the url: + +{{ figure2(light="addon-install-01.png", dark="addon-install-01_dark.png") }} + +It might take a while for Docspell to download, extract and verify the +addon. The addon will be downloaded into the database. Once installed, +the given URL is not used anymore, unless a manual update is issued. + +After this finishes, you cannot change the URL anymore: + +{{ figure2(light="addon-install-02.png", dark="addon-install-02_dark.png") }} + +When using URLs pointing to "moving targets", you could click the +*Update Addon* button to re-download the contents at the url. 
This +doesn't make much sense for URLs to fixed versions (in *theory* these +could change as well, of course) and it is not without risk. It can be +useful for own addons to have them quickly updated. + +Now the addon is installed. It can now be used by creating a *run configuration*. + +## Run Configuration + +A run configuration is comprised of one or more addons, their inputs +and some settings regarding their runtime environment. + +The name is used for displaying in the webapp. You can disable/enable +a run configuration. + +It is possible that addons use [dsc](@/docs/tools/cli.md) or call the +rest-server otherwise. Usually a valid session is required (to set +tags or do searches). When selecting to run *on behalf of a user*, a +valid authenticator for that user is injected into the environment of +the addon run. + +The *Trigger Run* setting specifies when this run configuration should +be executed. You can choose from options that all addons in the list +must support. In this example, only `existing-item` is used. This +means the run configuration can be selected to run on any item. + +Other options include: +- `final-process-item`: executes automatically as the last step when + processing uploaded files +- `final-reprocess-item`: like `final-process-item` but applies when + an existing item is reprocessed. +- `scheduled`: runs periodically based on a schedule (and independent + from any item) + +Each addon may require arguments. Click on *Configure* to enable the +*Arguments* section and add arguments for the corresponding addon. +What to insert here is completely specific to the addon. In this case, +it expects a JSON object with only one field `"degree"` that indicates +how to rotate. In this example, it should be rotated by 90° +counter-clockwise. You need to click *Update* to set it into the addon +and then *Submit* to save everything. 
+ +{{ figure2(light="addon-install-03.png", dark="addon-install-03_dark.png") }} + + +With this run configuration in place, you can try it out on some item: + +{{ figure2(light="addon-install-04.png", dark="addon-install-04_dark.png") }} + +This example configured the *rotate-pdf-addon* to rotate left by 90°. +Create a similar run configuration to rotate to the right. diff --git a/website/site/content/docs/addons/writing.md b/website/site/content/docs/addons/writing.md new file mode 100644 index 00000000..9edfbb21 --- /dev/null +++ b/website/site/content/docs/addons/writing.md @@ -0,0 +1,376 @@ ++++ +title = "Writing" +insert_anchor_links = "right" +description = "How to write addons" +weight = 20 +template = "docs.html" ++++ + +# Writing Addons + +Writing an addon can be divided into two things: + +- create the program +- define how to package and run it + +The next sections describe both parts. For a quick start, check out +the example addons. + +As previously written, you can choose a language. The interaction with +docspell happens by exchanging JSON data. So, whatever you choose, it +should be possible to read and produce JSON with some convenience. + + +# Writing the program + +## Interface to Docspell + +The interface to Docspell is JSON data. The addon receives all inputs +as JSON and may return a JSON object as output (via stdout). + +An addon can be executed in different contexts. Depending on this, the +available inputs differ. The addon always receives one argument, which +is a file containing the user supplied data (it may be empty). A user +is able to provide data to every addon from the web-ui. + +All other things are provided as environment variables. There are +environment variables that are always provided and some are only +available for specific contexts. 
+ +For example, an addon that is executed in the context of an item +(maybe after processing or when a user selects an addon to run "on an +item"), Docspell prepares all data for the corresponding item and +makes it available to the addon. In contrast, an addon executed +periodically by a schedule, won't have this data available. + + +## Basic Environment + +The following environment variables are always provided by Docspell: + +- `ADDON_DIR` points to the directory containing the extracted addon + zip file +- `TMPDIR` / `TMP_DIR` a directory for storing temporary data +- `OUTPUT_DIR` a directory for storing files that should be processed + by docspell +- `CACHE_DIR` a directory for storing data that should stay between + addon runs + +It is very much recommended to always use these environment variables +when reading and writing data. This keeps Docspell in control about +the exact location. + +The working directory will be set to a directory that is also +temporary, but please don't rely on that. Use the environment +variables. + +## Item data + +When executed in the context of an item. Meaning for triggers: +`final-process-item`, `final-reprocess-item`, `existing-item`. + +### `ITEM_DATA_JSON` + +This environment variable points to a JSON file containing information +about the current item. If it is run at processing time, it includes +all information gathered so far by Docspell. + +**Example** +{{ incl_json(path="templates/shortcodes/item-data") }} + + +### `ITEM_ARGS_JSON` + +This environment variable points to a JSON file that contains the user +supplied information with an upload request. That is, a user may +specify tags or a language when uploading files. This would be in this +file. + +*This is only available for uploads. Trigger `final-process-item`.* + +**Example** +{{ incl_json(path="templates/shortcodes/item-args") }} + + +### `ITEM_ORIGINAL_JSON` and `ITEM_PDF_JSON` + +These JSON files contains a list of objects. 
Each object provides +properties about a file - either the original file or the converted +pdf. The structure is the same. + +**Example** +{{ incl_json(path="templates/shortcodes/file-meta") }} + + + +### Directories + +These environment variables point to directories that contain the attachment files. + +- `ITEM_PDF_DIR` contains all converted pdf files, the attachment id is the filename +- `ITEM_ORIGINAL_DIR` contains all original files, the attachment id is the filename + +For example, to obtain a converted pdf file, lookup the id in +`ITEM_PDF_JSON` and then construct the file name via +`ITEM_PDF_DIR/{id}`. + + +## Session for dsc + +An addon may use [dsc](@/docs/tools/cli.md) which requires for many +commands a valid session identifier. Usually this is obtained by +logging in (i.e. using `dsc login`). This is not really feasible from +inside an addon, of course. Therefore you can configure an addon to +run on behalf of some user when creating the run configuration. +Docspell then generates a valid session identifier and puts it into +the environment. The [dsc](@/docs/tools/cli.md) tool will pick them up +automatically. + +It will also setup the URL to connect to some restserver. (If you have +multiple rest-servers running, it will pick one randomly). + +- `DSC_SESSION` env variable containing a session identifier. Its + validity is coupled to the configured timeout. +- `DSC_DOCSPELL_URL` the base url to some rest server + +That means when using an addon in this way, you can simply use `dsc` +without worrying about authentication or the correct URL to connect +to. + + +## Output + +Docspell doesn't interpret the returncode of an addon, except checking +for being equal to `0` which indicates a successful run. + +In order to change data in Docspell, the addon program can run +`dsc` (for example) to change some state - like setting tags etc. But +the preferred approach would be to return instructions for Docspell. 
+Docspell will execute the instructions when the addon terminates +successfully - that is with return code `0`. + +These instructions are in a JSON object which needs to go to stdout. +You can use stderr in an addon for logging/debugging purposes. But if +you specify `collectOutput: true` in the descriptor, then stdout must +only return this specific JSON (or nothing, empty output is ignored). + +You find the complete structure below. It consists of these parts: + +- `commands`: lets you declare actions to do for an item or attachment +- `files`: defines files relative to `OUTPUT_DIR` that should be + processed +- `newItems`: declares files relative to `OUTPUT_DIR` that should be + processed as new uploads + +The `commands` allows to set tags, fields and other things. All parts +are optional, you don't need to return the complete structure. Just +returning `commands` or only `files` is ok. + +**Example** +{{ incl_json(path="templates/shortcodes/addon-output") }} + + +# Descriptor + +An addon must provide an *addon descriptor*, which is a yaml or json +file looking like this: + +```yaml +# The meta section is required. Name and version must not contain +# whitespace +meta: + name: "name-of-addon" + version: "2.21" + description: | + Describe the purpose and how it must be used here + +# Defining when this addon is run. This is used to guide the user +# interface in selecting an addon. At least one is required to specify. +# +# Possible values: +# - scheduled: requires to enter a timer to run this addon periodically +# - final-process-item: the final step when processing an item +# - final-reprocess-item: the final step when reprocessing an item +# - existing-item: A user selects the addon to run on an item +triggers: + - final-process-item + - final-reprocess-item + - existing-item + +# How to build and run this addon (optional). 
If missing, auto +# detection will enable a nix runner if a `flake.nix` is found in the +# source root and docker if a `Dockerfile` is found. +# +# Both runners are compared to what is enabled at the server. +runner: + # Building the program using nix flakes. This requires a flake.nix + # file in the source root with a default package and a flake-enabled + # nix on the joex machine. + # + # The program is build via `nix build`. If the joex machine has + # systemd-nspawn installed, it is used to run the addon inside a + # container. Otherwise the addon is run directly on the machine. + nix: + enable: true + + # Docker based runner can define a custom image to use. If a `build` + # key exists pointing to a Dockerfile, the image is build before. If + # the docker image is complex, you can build it independently and + # provide the pre-build image. + # + # The program is run via `docker run` passing the arguments to the + # addon. Thus it expects the entrypoint to be correctly configured + # to the executable. You may use `args` in order to prepend + # additional arguments, like the path to an executable if the image + # requires that. The joex machine must have docker installed and the + # user running joex must be allowed to use docker. You must either + # define an image with an appropriate entry point or a dockerfile. + docker: + enable: false + #image: myorg/myimage:latest + build: Dockerfile + + # Trivial runner that simply executes the file specified with + # `exec`. Nothing is build before. This runner usually requires that + # the joex machine contains all dependencies needed to run the + # addon. You may need to install additional software on the machine + # running joex. + trivial: + enable: false + exec: src/addon.sh + +# Optional arguments/options given to the program. The program +# receives at least one argument, which is a file to the user input as +# supplied in the application. The arguments here are prepended. 
+args: + + +options: + # If false, the program is run inside a private network, blocking + # traffic to the host and networks reachable from there. This only + # applies if the addon can be run inside a container. + # + # If the addon runs side effects (such as using dsc to set tags), + # this must be set to `true`. + # + # Default is false. + networking: true + + # If true, the stdout of the program is parsed into a JSON structure + # that is interpreted as actions executed by the task that runs the + # addon. If the addon runs side effects only, set this to `false` + # and the output is ignored. + # + # It is recommended to use this approach, if possible. It allows + # docspell itself to apply any changes and the addon can run + # completely isolated. + # + # Default is false. + collectOutput: true +``` + + +# Packaging + +Docspell can use different ways to build and run the addon: +`nix-flake`, `docker` and `trivial`. The first two allow to package +the addon in a defined way (with a single dependency, either nix or +docker) and then execute it independently from the underlying system. +This makes it possible to execute the addon on a variety of systems. +This is especially useful for addons that are meant to be public and +reusable by different people. + +The "trivial" runner is only executing some program specified in +`docspell-addon.yaml`, directly on the joex machine (or via +`systemd-nspawn`). The machine running joex must then provide all +necessary dependencies and it must be compatible to run the addon. It +may be useful especially for personal addons. + + +## nix flake + +Using [nix](https://nixos.org) with +[flakes](https://nixos.wiki/wiki/Flakes) enabled, is the recommended +approach. It is very flexible and reproducible while sharing most +dependencies (in contrast to docker where each image contains the same +packages again and again). + +Docspell runs `nix build` to build the addon and then executes the +file produced to `$out/bin`. 
+ + +## docker + +For docker it is recommended to provide pre-build images. Docspell can +build images from provided `Dockerfile`, but for larger images it +might be better to do this a priori. + +Docspell will run the addon using `docker run …` passing it only the +user-input file as argument. Thus the image must define an appropriate +`ENTRYPOINT`. + +# Examples +## Minimal Addon + +The steps below create a minimal addon: + +1. Create a bash script `addon.sh` with this content: + + ```bash + #!/usr/bin/env bash + + echo "Hello world!" + ``` +2. Make it executable: + ```bash + chmod +x addon.sh + ``` +3. Create a yaml file `docspell-addon.yaml` with this content: + + ```yaml + meta: + name: "minimal-addon" + version: "0.1.0" + triggers: + - existing-item + - scheduled + runner: + trivial: + enable: true + exec: addon.sh + ``` +4. Create a zip file containing these two files: + ```bash + zip addon.zip docspell-addon.yaml addon.sh + ``` + +The addon is now ready. Make it available via an url (use some file +sharing tool, upload it somewhere etc) and then it can be installed +and run. + +## Non-Minimal Addon + +The minimal example above is good to see what is required, but it is +not very useful…. Please see this post about the [audio file +addon](@/blog/2022-05-16_audio_file_addon.md) that walks through a +more useful addon. + +# Misc + +## Advantages of "pure" addons + +Although the output structure is not set in stone, it is recommended +to use this in contrast to directly changing state via `dsc`. + +- outputs of all addons are collected and only applied if all were + successful; in contrast side effects are always applied even if the + addon fails shortly after +- since addons are executed as joex tasks, their result can be sent as + events to another http server for further processing. +- addons can run in an isolated environment without network (no data + can go out) + +## Use addons in other addons? + +This can be achieved very conveniently by using `nix`. 
If addons are +defined as a nix flake, they can be easily consumed by each other. diff --git a/website/site/content/docs/features/_index.md b/website/site/content/docs/features/_index.md index e56f8fab..e285301d 100644 --- a/website/site/content/docs/features/_index.md +++ b/website/site/content/docs/features/_index.md @@ -82,6 +82,7 @@ template = "docs.html" - zip - [eml](https://en.wikipedia.org/wiki/Email#Filename_extensions) (e-mail files in plain text MIME) +- Extend Docspell via [addons](@/docs/addons/basics.md) - Tooling: - [Command Line Interface](@/docs/tools/cli.md) allowing to upload files, watch folders and many more! diff --git a/website/site/templates/shortcodes/incl_conf.md b/website/site/templates/shortcodes/incl_conf.md index 387d5317..d08d7390 100644 --- a/website/site/templates/shortcodes/incl_conf.md +++ b/website/site/templates/shortcodes/incl_conf.md @@ -1,4 +1,4 @@ -``` bash {% set data = load_data(path=path) %} +``` bash {{ data | safe }} ``` diff --git a/website/site/templates/shortcodes/incl_json.md b/website/site/templates/shortcodes/incl_json.md new file mode 100644 index 00000000..2973a2a3 --- /dev/null +++ b/website/site/templates/shortcodes/incl_json.md @@ -0,0 +1,4 @@ +{% set data = load_data(path=path) %} +``` json +{{ data | safe }} +``` diff --git a/website/src/main/scala/docspell/website/AddonOutputExample.scala b/website/src/main/scala/docspell/website/AddonOutputExample.scala new file mode 100644 index 00000000..c7059ea4 --- /dev/null +++ b/website/src/main/scala/docspell/website/AddonOutputExample.scala @@ -0,0 +1,86 @@ +package docspell.website + +import cats.syntax.all._ +import docspell.addons.out.{AddonOutput, ItemFile, NewFile, NewItem} +import docspell.addons.out.NewFile.{Meta => FileMeta} +import docspell.addons.out.NewItem.{Meta => ItemMeta} +import docspell.common._ +import docspell.common.bc.{AttachmentAction, BackendCommand, ItemAction} +import io.circe.syntax._ + +object AddonOutputExample extends Helper { + + val 
example = AddonOutput( + commands = List( + BackendCommand.ItemUpdate( + itemId = id("XabZ-item-id"), + actions = List( + ItemAction.AddTags(Set("tag1", "tag2")), + ItemAction.ReplaceTags(Set("tagX", "tagY")), + ItemAction.RemoveTags(Set("tag0", "tag9")), + ItemAction.RemoveTagsCategory(Set("doc-type")), + ItemAction.SetFolder("folder-name".some), + ItemAction.SetCorrOrg(id("OaIy-org-ID").some), + ItemAction.SetCorrPerson(id("OaIy-person-ID").some), + ItemAction.SetConcPerson(id("AEiae-person-ID").some), + ItemAction.SetConcEquipment(id("AEiae-equipment-ID").some), + ItemAction.SetField(id("eur"), "12.99"), + ItemAction.SetName("new item name"), + ItemAction.SetNotes("replace notes with this".some), + ItemAction.AddNotes("More notes appended", Some("-----")) + ) + ), + BackendCommand + .AttachmentUpdate( + itemId = id("XabZ-item-id"), + attachId = id("Atca-attach-id"), + actions = List( + AttachmentAction.SetExtractedText("replace extracted text with this".some) + ) + ) + ), + files = List( + ItemFile( + id("iZtb-item-id"), + textFiles = Map("attach-id" -> "newtext.txt"), + pdfFiles = Map("attach-id" -> "better.pdf"), + previewImages = Map("attach-id" -> "better-preview.png"), + newFiles = List( + NewFile( + metadata = FileMeta( + language = Some(Language.English), + skipDuplicate = Some(true), + attachmentsOnly = Some(false) + ), + file = "new-file1.docx" + ), + NewFile( + metadata = FileMeta( + language = Some(Language.German), + skipDuplicate = Some(true), + attachmentsOnly = Some(false) + ), + file = "new-file2.pdf" + ) + ) + ) + ), + newItems = List( + NewItem( + metadata = ItemMeta( + language = Some(Language.English), + direction = Direction.Incoming.some, + folderId = id("my-folder").some, + source = "the-addon-x".some, + skipDuplicate = true.some, + tags = List("tag1", "tag2").some, + attachmentsOnly = None + ).some, + files = List("a-file.pdf", "another.jpg") + ) + ) + ) + + def exampleJson = + example.asJson.spaces2 +} diff --git 
a/website/src/main/scala/docspell/website/AddonOutputMiniExample.scala b/website/src/main/scala/docspell/website/AddonOutputMiniExample.scala new file mode 100644 index 00000000..166f1e91 --- /dev/null +++ b/website/src/main/scala/docspell/website/AddonOutputMiniExample.scala @@ -0,0 +1,23 @@ +package docspell.website + +import docspell.addons.out._ +import docspell.common.bc._ +import io.circe.syntax._ + +object AddonOutputMiniExample extends Helper { + + val example = AddonOutput( + commands = List( + BackendCommand.ItemUpdate( + itemId = id("XabZ-item-id"), + actions = List( + ItemAction.AddTags(Set("tag1", "tag2")) + ) + ) + ) + ) + + def exampleJson = + example.asJson.spaces2 + +} diff --git a/website/src/main/scala/docspell/website/FileMetaExample.scala b/website/src/main/scala/docspell/website/FileMetaExample.scala new file mode 100644 index 00000000..97afa865 --- /dev/null +++ b/website/src/main/scala/docspell/website/FileMetaExample.scala @@ -0,0 +1,37 @@ +package docspell.website + +import cats.syntax.option._ +import docspell.common.{ByteSize, Language, MimeType} +import docspell.store.queries.AttachedFile +import io.circe.syntax._ +import scodec.bits.ByteVector + +object FileMetaExample extends Helper { + + val example1 = AttachedFile( + id = randomId, + name = "the filename.png".some, + position = 0, + language = Language.English.some, + mimetype = MimeType.png, + length = ByteSize(454654L), + checksum = ByteVector.fromValidHex("caffe0caffe").digest("sha256") + ) + + val example2 = AttachedFile( + id = randomId, + name = "other filename.png".some, + position = 1, + language = Language.English.some, + mimetype = MimeType.pdf, + length = ByteSize(1232214L), + checksum = ByteVector.fromValidHex("eff0eff0eff").digest("sha256") + ) + + val example = List( + example1, + example2 + ) + + val exampleJson = example.asJson.spaces2 +} diff --git a/website/src/main/scala/docspell/website/Helper.scala b/website/src/main/scala/docspell/website/Helper.scala new file 
mode 100644 index 00000000..82314d72 --- /dev/null +++ b/website/src/main/scala/docspell/website/Helper.scala @@ -0,0 +1,29 @@ +package docspell.website + +import docspell.common.{IdRef, Ident, Timestamp} +import scodec.bits.ByteVector + +import java.time.LocalDate +import scala.util.Random + +trait Helper { + + def id(str: String): Ident = Ident.unsafe(str) + + val date20220514 = Timestamp.atUtc(LocalDate.of(2022, 5, 14).atTime(11, 22, 12)) + + val cid = id("collective") + + implicit final class StringExt(self: String) { + def id: Ident = Ident.unsafe(self) + } + + def idRef(name: String): IdRef = IdRef(randomId, name) + + def randomId = { + val buffer = Array.ofDim[Byte](6) + new Random().nextBytes(buffer) + id(ByteVector.view(buffer).toBase58) + } + +} diff --git a/website/src/main/scala/docspell/website/ItemArgsExample.scala b/website/src/main/scala/docspell/website/ItemArgsExample.scala new file mode 100644 index 00000000..5ca5580b --- /dev/null +++ b/website/src/main/scala/docspell/website/ItemArgsExample.scala @@ -0,0 +1,25 @@ +package docspell.website + +import cats.syntax.option._ +import docspell.common.{Language, ProcessItemArgs} +import io.circe.syntax._ + +object ItemArgsExample extends Helper { + + val example = ProcessItemArgs.ProcessMeta( + collective = cid, + itemId = None, + language = Language.English, + direction = None, + sourceAbbrev = "scanner", + folderId = None, + validFileTypes = Seq.empty, + skipDuplicate = true, + fileFilter = None, + tags = List("given-tag-1").some, + reprocess = false, + attachmentsOnly = None + ) + + val exampleJson = example.asJson.spaces2 +} diff --git a/website/src/main/scala/docspell/website/ItemDataExample.scala b/website/src/main/scala/docspell/website/ItemDataExample.scala new file mode 100644 index 00000000..a1e1212c --- /dev/null +++ b/website/src/main/scala/docspell/website/ItemDataExample.scala @@ -0,0 +1,75 @@ +package docspell.website + +import cats.syntax.option._ +import 
docspell.common.MetaProposal.Candidate +import docspell.common._ +import docspell.joex.process.ItemData +import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} +import io.circe.syntax._ + +object ItemDataExample extends Helper { + + private val proposals: MetaProposalList = MetaProposalList( + List( + MetaProposal(MetaProposalType.CorrOrg, Candidate(idRef("Acme AG"), Set.empty)), + MetaProposal( + MetaProposalType.ConcPerson, + Candidate(idRef("Derek Jeter"), Set.empty) + ) + ) + ) + + private val givenProposals: MetaProposalList = MetaProposalList.empty + + val example = ItemData( + item = RItem( + id = id("UyZ-item-id"), + cid = cid, + name = "yearly report 2021", + itemDate = date20220514.some, + source = "webapp", + direction = Direction.Incoming, + state = ItemState.Processing, + corrOrg = None, + corrPerson = None, + concPerson = None, + concEquipment = None, + inReplyTo = None, + dueDate = None, + created = date20220514, + updated = date20220514, + notes = None, + folderId = None + ), + attachments = Vector( + RAttachment( + id = id("Apa-attach-id"), + itemId = id("UyZ-item-id"), + fileId = FileKey(cid, FileCategory.AttachmentConvert, id("abcxyz")), + position = 0, + created = date20220514, + name = "report_year_2021.pdf".some + ) + ), + metas = Vector( + RAttachmentMeta( + id = id("Apa-attach-id"), + content = "this is the extracted text …".some, + nerlabels = Nil, + proposals = proposals, + pages = 2.some, + language = Language.English.some + ) + ), + dateLabels = Vector.empty, + originFile = Map( + id("Apa-attach-id") -> FileKey(cid, FileCategory.AttachmentSource, "yanetar".id) + ), + givenMeta = givenProposals, + tags = List("tag-1"), + classifyProposals = MetaProposalList.empty, + classifyTags = List("invoice") + ) + + val exampleJson = example.asJson.spaces2 +} diff --git a/website/src/main/scala/docspell/website/Main.scala b/website/src/main/scala/docspell/website/Main.scala new file mode 100644 index 00000000..d9da86d0 --- /dev/null +++ 
b/website/src/main/scala/docspell/website/Main.scala @@ -0,0 +1,64 @@ +package docspell.website + +import cats.effect.{ExitCode, IO, IOApp} +import fs2.io.file.{Files, Path} +import fs2.Stream +import io.circe.Encoder +import io.circe.syntax._ + +object Main extends IOApp { + override def run(args: List[String]) = + args match { + case "addon-output" :: file :: Nil => + if (file.isEmpty) ok(stdout(AddonOutputExample.exampleJson)) + else ok(AddonOutputExample.example.writeFile(file)) + + case "addon-output-tags" :: file :: Nil => + if (file.isEmpty) ok(stdout(AddonOutputMiniExample.exampleJson)) + else ok(AddonOutputMiniExample.example.writeFile(file)) + + case "item-data" :: file :: Nil => + if (file.isEmpty) ok(stdout(ItemDataExample.exampleJson)) + else ok(ItemDataExample.example.writeFile(file)) + + case "item-args" :: file :: Nil => + if (file.isEmpty) ok(stdout(ItemArgsExample.exampleJson)) + else ok(ItemArgsExample.example.writeFile(file)) + + case "file-meta" :: file :: Nil => + if (file.isEmpty) ok(stdout(FileMetaExample.exampleJson)) + else ok(FileMetaExample.example.writeFile(file)) + + case v :: Nil => + err(stderr(s"Unknown example: $v")) + + case _ => + err(stderr("Specify what example to print")) + } + + def stdout(str: String, args: Any*): Unit = + Console.out.println(str.format(args: _*)) + + def stderr(str: String, args: Any*): Unit = + Console.err.println(str.format(args: _*)) + + def ok(f: IO[Unit]): IO[ExitCode] = + f.as(ExitCode.Success) + + def ok(p: => Unit): IO[ExitCode] = + ok(IO(p)) + + def err(p: => Unit): IO[ExitCode] = + IO(p).as(ExitCode.Error) + + implicit class WriteOps[A: Encoder](self: A) { + def writeFile(file: String): IO[Unit] = + Stream + .emit(self.asJson.spaces2) + .covary[IO] + .through(fs2.text.utf8.encode) + .through(Files[IO].writeAll(Path(file))) + .compile + .drain + } +}