diff --git a/.gitignore b/.gitignore
index 233e2fa2..05afc7d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,15 @@
#artwork/*.png
target/
+node_modules/
dev.conf
elm-stuff
result
_site/
-*.qcow2
\ No newline at end of file
+*.qcow2
+/website/site/content/docs/changelog/
+/website/site/public/
+/website/site/static/openapi/
+/website/site/static/js/bundle.js
+/website/site/templates/shortcodes/server.conf
+/website/site/templates/shortcodes/sample-exim.conf
+/website/site/templates/shortcodes/joex.conf
diff --git a/build.sbt b/build.sbt
index 62a8fe5e..1472d6d8 100644
--- a/build.sbt
+++ b/build.sbt
@@ -300,6 +300,7 @@ val restapi = project.in(file("modules/restapi")).
openapiTargetLanguage := Language.Scala,
openapiPackage := Pkg("docspell.restapi.model"),
openapiSpec := (Compile/resourceDirectory).value/"docspell-openapi.yml",
+ openapiStaticArgs := Seq("-l", "html2")
).dependsOn(common)
val joexapi = project.in(file("modules/joexapi")).
@@ -422,78 +423,53 @@ val restserver = project.in(file("modules/restserver")).
-// --- Microsite Documentation
+// --- Website Documentation
-val microsite = project.in(file("modules/microsite")).
+val website = project.in(file("website")).
disablePlugins(RevolverPlugin).
- enablePlugins(MicrositesPlugin).
disablePlugins(ReleasePlugin).
settings(sharedSettings).
settings(
- name := "docspell-microsite",
+ name := "docspell-website",
publishArtifact := false,
skip in publish := true,
- micrositeFooterText := Some(
- """
- |© 2020 Docspell, v{{site.version}}
- |""".stripMargin
- ),
- micrositeName := "Docspell",
- micrositeDescription := "Auto-tagging Document Organizer",
- micrositeDocumentationUrl := "doc",
- micrositeDocumentationLabelDescription := "Documentation",
- micrositeFavicons := Seq(microsites.MicrositeFavicon("favicon.png", "96x96")),
- micrositeAuthor := "eikek",
- micrositeGithubOwner := "eikek",
- micrositeGithubRepo := "docspell",
- micrositeGitterChannel := false,
- micrositeShareOnSocial := false,
- micrositeHighlightLanguages ++= Seq("json", "javascript"),
- micrositeEditButton := Some(microsites.MicrositeEditButton("Improve this page", "/edit/master/modules/microsite/docs/{{ page.path }}")),
- fork in run := true,
- micrositeCompilingDocsTool := WithMdoc,
- mdocVariables := Map(
- "VERSION" -> version.value,
- "PVERSION" -> version.value.replace('.', '_')
- ),
- micrositeExtraMdFiles := Map(
- file("Changelog.md") -> ExtraMdFileConfig(
- "changelog.md",
- "docs",
- Map("title" -> "Changelog", "permalink" -> "changelog")
- )
- ),
Compile/resourceGenerators += Def.task {
- val jekyllOut = resourceManaged.value/"main"/"jekyll"
+ val templateOut = baseDirectory.value/"site"/"templates"/"shortcodes"
+ val staticOut = baseDirectory.value/"site"/"static"/"openapi"
+ IO.createDirectories(Seq(templateOut, staticOut))
val logger = streams.value.log
- val templates = Seq(
- (resourceDirectory in (restserver, Compile)).value / "reference.conf" -> jekyllOut /"_includes"/"server.conf",
- (resourceDirectory in (joex, Compile)).value / "reference.conf" -> jekyllOut/"_includes"/"joex.conf",
- (LocalRootProject / baseDirectory).value / "tools" / "exim" / "exim.conf" -> jekyllOut/ "_includes"/"sample-exim.conf"
- )
- val res1 = templates.map { case (s, t) =>
- logger.info(s"Copying $s -> $t")
- IO.write(t, "{% raw %}\n")
- IO.append(t, IO.readBytes(s))
- IO.write(t, "\n{% endraw %}", append = true)
- t
- }
-
val files = Seq(
- (resourceDirectory in (restapi, Compile)).value/"docspell-openapi.yml" -> jekyllOut/"openapi"/"docspell-openapi.yml"
+ (resourceDirectory in (restserver, Compile)).value / "reference.conf" -> templateOut /"server.conf",
+ (resourceDirectory in (joex, Compile)).value / "reference.conf" -> templateOut/"joex.conf",
+ (LocalRootProject / baseDirectory).value / "tools" / "exim" / "exim.conf" -> templateOut/"sample-exim.conf",
+ (resourceDirectory in (restapi, Compile)).value/"docspell-openapi.yml" -> staticOut/"docspell-openapi.yml",
+ (restapi/Compile/openapiStaticDoc).value -> staticOut/"docspell-openapi.html"
)
IO.copy(files)
- res1 ++ files.map(_._2)
+ files.map(_._2)
}.taskValue,
Compile/resourceGenerators += Def.task {
- val staticDoc = (restapi/Compile/openapiStaticDoc).value
- val target = resourceManaged.value/"main"/"jekyll"/"openapi"/"docspell-openapi.html"
- IO.copy(Seq(staticDoc -> target))
+ val changelog = (LocalRootProject / baseDirectory).value / "Changelog.md"
+ val targetDir = baseDirectory.value/"site"/"content"/"docs"/"changelog"
+ IO.createDirectory(targetDir)
+ val target = targetDir/"_index.md"
+
+ IO.write(target, """|+++
+ |title = "Changelog"
+ |description = "See what changed between releases."
+ |weight = 10
+ |insert_anchor_links = "right"
+ |[extra]
+ |mktoc = false
+ |+++
+ |""".stripMargin)
+ IO.append(target, IO.readBytes(changelog))
Seq(target)
- }.taskValue
+ }.taskValue
)
+
val root = project.in(file(".")).
settings(sharedSettings).
settings(noPublish).
diff --git a/modules/microsite/docs/dev.md b/modules/microsite/docs/dev.md
index 96a0954b..3211d3e7 100644
--- a/modules/microsite/docs/dev.md
+++ b/modules/microsite/docs/dev.md
@@ -97,7 +97,7 @@ the nix package manager and to integrate it into NixOS.
The modules can be build by building the `configuration-test.nix` file
together with some nixpkgs version. For example:
-``` shell
+``` bash
nixos-rebuild build-vm -I nixos-config=./configuration-test.nix \
-I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/nixos-19.09.tar.gz
```
@@ -108,21 +108,21 @@ the system configuration can be found behind the `./result/system`
symlink. So it is possible to look at the generated systemd config for
example:
-``` shell
+``` bash
cat result/system/etc/systemd/system/docspell-joex.service
```
And with some more commands (there probably is an easier way…) the
config file can be checked:
-``` shell
+``` bash
cat result/system/etc/systemd/system/docspell-joex.service | grep ExecStart | cut -d'=' -f2 | xargs cat | tail -n1 | awk '{print $NF}'| sed 's/.$//' | xargs cat | jq
```
To see the module in action, the vm can be started (the first line
sets more memory for the vm):
-``` shell
+``` bash
export QEMU_OPTS="-m 2048"
./result/bin/run-docspelltest-vm
```
diff --git a/modules/microsite/docs/doc/configure.md b/modules/microsite/docs/doc/configure.md
index 4a6ad60a..c0bf89aa 100644
--- a/modules/microsite/docs/doc/configure.md
+++ b/modules/microsite/docs/doc/configure.md
@@ -117,7 +117,7 @@ full-text-search {
This key is required if you want docspell to drop and re-create the
entire index. This is possible via a REST call:
-``` shell
+``` bash
$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
```
diff --git a/modules/microsite/docs/doc/emailsettings.md b/modules/microsite/docs/doc/emailsettings.md
index 050f90ac..ca3a7c1f 100644
--- a/modules/microsite/docs/doc/emailsettings.md
+++ b/modules/microsite/docs/doc/emailsettings.md
@@ -132,7 +132,7 @@ Download the `oauth2.py` script from
[here](https://github.com/google/gmail-oauth2-tools) and first create
an *oauth2-token*:
-``` shell
+``` bash
./oauth2.py --user=your.name@gmail.com \
--client_id=106701....d8c.apps.googleusercontent.com \
--client_secret=5Z1...Kir_t \
diff --git a/modules/microsite/docs/doc/tools/smtpgateway.md b/modules/microsite/docs/doc/tools/smtpgateway.md
index 9a73fdcf..86090b43 100644
--- a/modules/microsite/docs/doc/tools/smtpgateway.md
+++ b/modules/microsite/docs/doc/tools/smtpgateway.md
@@ -90,7 +90,7 @@ notes about the used options (see `man curl`):
Go into the `tools/exim` directory and build the docker image:
-``` shell
+``` bash
docker build -t ds-exim:latest -f exim.dockerfile .
```
@@ -114,7 +114,7 @@ variables as needed.
Finally start the container:
-``` shell
+``` bash
docker-compose up
```
@@ -169,7 +169,7 @@ The mail is processed and results in an item:
However, if a mail is to an unknown collective or not to the
configured local domain, the server rejects it immediately:
-``` shell
+``` bash
fish ~> telnet localhost 25
Trying ::1...
Connected to localhost.
diff --git a/project/build.nix b/project/build.nix
index 522d61e9..fb584dde 100644
--- a/project/build.nix
+++ b/project/build.nix
@@ -9,6 +9,8 @@ buildFHSUserEnv {
name = "docspell-sbt";
targetPkgs = pkgs: with pkgs; [
netcat jdk8 wget which zsh dpkg sbt git elmPackages.elm ncurses fakeroot mc jekyll
+ zola yarn
+
# haskells http client needs this (to download elm packages)
iana-etc
];
diff --git a/website/README.md b/website/README.md
new file mode 100644
index 00000000..2dbebd8d
--- /dev/null
+++ b/website/README.md
@@ -0,0 +1,30 @@
+# Website
+
+This is the docspell website and documentation.
+
+## Building
+
+The website is created using the [zola](https://github.com/getzola/zola)
+static site generator. The (very minimal) dynamic parts are written in
+Elm.
+
+The `build.sh` script builds the site.
+
+
+## Development
+
+Install the dependencies by running `yarn install`.
+
+Open two terminals. In the first, run:
+
+``` shell
+nix-shell --run ./run-elm.sh
+```
+
+and in the second
+
+``` shell
+nix-shell --run "cd site && zola serve"
+```
+
+Open browser at `localhost:1111`.
diff --git a/website/build.sh b/website/build.sh
new file mode 100755
index 00000000..c9d9e671
--- /dev/null
+++ b/website/build.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -e
+
+yarn install
+elm make --output site/static/js/bundle.js --optimize elm/Main.elm
+cd site
+zola build
+cd ..
+
+echo "Site is in site/public."
diff --git a/website/elm-analyse.json b/website/elm-analyse.json
new file mode 100644
index 00000000..837c4ada
--- /dev/null
+++ b/website/elm-analyse.json
@@ -0,0 +1,9 @@
+{
+ "excludedPaths": [
+ "modules/webapp/target/elm-src/"
+ ],
+ "checks" : {
+ "ImportAll": false,
+ "SingleFieldRecord": false
+ }
+}
diff --git a/website/elm.json b/website/elm.json
new file mode 100644
index 00000000..daaa0710
--- /dev/null
+++ b/website/elm.json
@@ -0,0 +1,28 @@
+{
+ "type": "application",
+ "source-directories": [
+ "elm"
+ ],
+ "elm-version": "0.19.1",
+ "dependencies": {
+ "direct": {
+ "elm/browser": "1.0.2",
+ "elm/core": "1.0.5",
+ "elm/html": "1.0.0",
+ "elm/random": "1.0.0",
+ "elm-community/random-extra": "3.1.0",
+ "elm-explorations/markdown": "1.0.0"
+ },
+ "indirect": {
+ "elm/json": "1.1.3",
+ "elm/time": "1.0.0",
+ "elm/url": "1.0.0",
+ "elm/virtual-dom": "1.0.2",
+ "owanturist/elm-union-find": "1.0.0"
+ }
+ },
+ "test-dependencies": {
+ "direct": {},
+ "indirect": {}
+ }
+}
diff --git a/website/elm/ExtraAttr.elm b/website/elm/ExtraAttr.elm
new file mode 100644
index 00000000..7d7fb64c
--- /dev/null
+++ b/website/elm/ExtraAttr.elm
@@ -0,0 +1,36 @@
+module ExtraAttr exposing (..)
+
+import Html exposing (..)
+import Html.Attributes exposing (..)
+
+
+ariaExpanded : Bool -> Attribute msg
+ariaExpanded flag =
+ attribute "aria-expanded"
+ (if flag then
+ "true"
+
+ else
+ "false"
+ )
+
+
+ariaHidden : Bool -> Attribute msg
+ariaHidden flag =
+ attribute "aria-hidden"
+ (if flag then
+ "true"
+
+ else
+ "false"
+ )
+
+
+ariaLabel : String -> Attribute msg
+ariaLabel name =
+ attribute "aria-label" name
+
+
+role : String -> Attribute msg
+role name =
+ attribute "role" name
diff --git a/website/elm/Feature.elm b/website/elm/Feature.elm
new file mode 100644
index 00000000..8edece1d
--- /dev/null
+++ b/website/elm/Feature.elm
@@ -0,0 +1,110 @@
+module Feature exposing (..)
+
+import Html exposing (..)
+import Html.Attributes exposing (..)
+import Markdown
+
+
+type alias Feature =
+ { image : String
+ , header : String
+ , description : String
+ }
+
+
+featureBox : Int -> Feature -> Html msg
+featureBox index f =
+ case isOdd index of
+ False ->
+ div [ class "columns is-vcentered box mb-5" ]
+ [ div [ class "column is-three-quarter" ]
+ [ figure [ class "image is-2by1 feature-image" ]
+ [ img [ src f.image ] []
+ ]
+ ]
+ , div [ class "column" ]
+ [ h2 [ class "title" ]
+ [ text f.header
+ ]
+ , Markdown.toHtml []
+ f.description
+ ]
+ ]
+
+ True ->
+ div [ class "columns is-vcentered box mb-5" ]
+ [ div [ class "column is-three-quarter" ]
+ [ h2 [ class "title" ]
+ [ text f.header
+ ]
+ , Markdown.toHtml []
+ f.description
+ ]
+ , div [ class "column" ]
+ [ figure [ class "image is-2by1 feature-image" ]
+ [ img [ src f.image ] []
+ ]
+ ]
+ ]
+
+
+features : List Feature
+features =
+ [ { image = "img/user-feature.png"
+ , header = "Multi-User per Account"
+ , description = """
+Each account (a *collective*) can have multiple users that share the
+same files. For example, everyone in your family can work with your
+files while using their own account with their own settings.
+"""
+ }
+ , { image = "img/ocr-feature.png"
+ , header = "Text Extraction with OCR"
+ , description = """
+Text is extracted from all files. For scanned documents/images, OCR is used by utilising tesseract. The text is analysed and is available for full-text search.
+"""
+ }
+ , { image = "img/filetype-feature.svg"
+ , header = "Support for many files"
+ , description = """
+Docspell can read many file types. ZIP and EML (e-mail file format) files are extracted and their contents imported.
+"""
+ }
+ , { image = "img/convertpdf-feature.svg"
+ , header = "Conversion to PDF"
+ , description = """
+All files are converted to PDF. Don't worry about the originals. Original files are stored, too, and can be downloaded untouched. When creating PDFs from image data (often returned from scanners), the resulting PDF contains the extracted text and is searchable.
+"""
+ }
+ , { image = "img/fts-feature.png"
+ , header = "Full-Text Search"
+ , description = """
+The extracted text of all files and some properties, like names and notes, are available for full-text search. Full-text search can also be used to further constrain the results of the search-menu where you can search by tags, correspondent, etc.
+"""
+ }
+ , { image = "img/sendmail-feature.png"
+ , header = "Send via E-Mail"
+ , description = """
+
+Users can define SMTP settings in the app and are then able to send items out via E-Mail. This is often useful to share with other people. There is e-mail-address completion from your address book, of course.
+
+"""
+ }
+ , { image = "img/scanmailbox-feature.png"
+ , header = "Import Mailboxes"
+ , description = """
+Users can define IMAP settings so that docspell can import their e-mails. This can be done periodically based on a schedule. Imported mails can be moved away into another folder or deleted.
+"""
+ }
+ , { image = "img/notify-feature.png"
+ , header = "Notifications"
+ , description = """
+Users can be notified by e-mail about documents whose due date is approaching.
+"""
+ }
+ ]
+
+
+isOdd : Int -> Bool
+isOdd num =
+ modBy 2 num == 1
diff --git a/website/elm/GetStarted.elm b/website/elm/GetStarted.elm
new file mode 100644
index 00000000..9e6df77c
--- /dev/null
+++ b/website/elm/GetStarted.elm
@@ -0,0 +1,91 @@
+module GetStarted exposing (..)
+
+import Html exposing (..)
+import Html.Attributes exposing (..)
+import Icons
+import Markdown
+
+
+getStarted : String -> List (Html msg)
+getStarted version =
+ [ div [ class "content container is-size-5" ]
+ [ Markdown.toHtml []
+ """Docspell consists of several components. The easiest way to get started is probably to use docker and
+[docker-compose](https://docs.docker.com/compose/)."""
+ , Markdown.toHtml []
+ ("""1. Clone the github repository
+ ```bash
+ $ git clone https://github.com/eikek/docspell
+ ```
+ Alternatively, [download](https://github.com/eikek/docspell/archive/v"""
+ ++ version
+ ++ """.zip) the sources and extract the zip file.
+2. Change into the `docker` directory:
+ ```bash
+ $ cd docspell/docker
+ ```
+3. Run `docker-compose up`:
+
+ ```bash
+ $ export DOCSPELL_HEADER_VALUE="my-secret-123"
+ $ docker-compose up
+ ```
+
+ The environment variable defines a secret that is shared between
+ some containers. You can define whatever you like. Please see the
+ [`consumedir.sh`](doc/tools/consumedir#docker) docs for additional
+ info.
+4. Go to <http://localhost:7880>, sign up and log in. When signing up,
+ you can choose the same name for collective and user. Then log in
+ with this name and the password.
+
+5. (Optional) Create a folder `./docs/<collective-name>` (the name you
+ chose for the collective at registration) and place files in there
+ for importing them.
+
+The directory contains a file `docspell.conf` that you can
+[modify](docs/configure) as needed.
+ """
+ )
+ ]
+ , div [ class "content container" ]
+ [ div [ class "notification is-info is-light" ]
+ [ text "If you don't use docker, there are other ways that are "
+ , text "described in the relevant "
+ , a [ href "/docs/install" ]
+ [ text "documentation page"
+ ]
+ ]
+ ]
+ , div [ class "content container" ]
+ [ div [ class "notification is-success is-light" ]
+ [ div [ class "content is-medium" ]
+ [ h3 [ class "title" ]
+ [ text "Where to go from here?"
+ ]
+ , ul []
+ [ li []
+ [ text "The "
+ , a [ href "/docs/intro" ]
+ [ text "introduction" ]
+ , text " writes about the goals and basic idea."
+ ]
+ , li []
+ [ text "There is a comprehensive "
+ , a [ href "/docs" ]
+ [ text "documentation"
+ ]
+ , text " available."
+ ]
+ , li []
+ [ text "The source code is hosted on "
+ , a [ href "https://github.com/eikek/docspell" ]
+ [ text "github"
+ ]
+ , text "."
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
diff --git a/website/elm/Icons.elm b/website/elm/Icons.elm
new file mode 100644
index 00000000..eea70115
--- /dev/null
+++ b/website/elm/Icons.elm
@@ -0,0 +1,54 @@
+module Icons exposing (..)
+
+import Html exposing (..)
+import Html.Attributes exposing (..)
+
+
+copyright : Html msg
+copyright =
+ img [ src "icons/copyright-40.svg" ] []
+
+
+infoSquared : Html msg
+infoSquared =
+ img [ src "icons/info-square-40.svg" ] []
+
+
+refresh : Html msg
+refresh =
+ img [ src "icons/refresh-40.svg" ] []
+
+
+logo : Html msg
+logo =
+ img [ src "icons/logo-only.svg" ] []
+
+
+logoMC : Html msg
+logoMC =
+ img [ src "icons/logo-only-mc.svg" ] []
+
+
+logoWidth : Int -> Html msg
+logoWidth w =
+ img [ src "icons/logo-only.svg", width w ] []
+
+
+home : Html msg
+home =
+ img [ src "icons/home-40.svg" ] []
+
+
+docs : Html msg
+docs =
+ img [ src "icons/notes-40.svg" ] []
+
+
+github : Html msg
+github =
+ img [ src "/icons/github-40.svg" ] []
+
+
+githubGreen : Html msg
+githubGreen =
+ img [ src "/icons/github-40-green.svg" ] []
diff --git a/website/elm/Main.elm b/website/elm/Main.elm
new file mode 100644
index 00000000..54b94bbd
--- /dev/null
+++ b/website/elm/Main.elm
@@ -0,0 +1,313 @@
+module Main exposing (..)
+
+import Browser exposing (Document)
+import Browser.Navigation exposing (Key)
+import ExtraAttr exposing (..)
+import Feature exposing (Feature)
+import GetStarted
+import Html exposing (..)
+import Html.Attributes exposing (..)
+import Html.Events exposing (..)
+import Icons
+import Random
+import Random.List
+
+
+
+-- MAIN
+
+
+main : Program Flags Model Msg
+main =
+ Browser.element
+ { init = init
+ , view = view
+ , update = update
+ , subscriptions = subscriptions
+ }
+
+
+
+--- Model
+
+
+type alias Flags =
+ { version : String
+ }
+
+
+type alias Model =
+ { navbarOpen : Bool
+ , features : List Feature
+ , flags : Flags
+ }
+
+
+type Msg
+ = ToggleNavbarMenu
+ | ShuffleFeatures
+ | ListShuffled (List Feature)
+
+
+
+--- Init
+
+
+viewFeatureCount : Int
+viewFeatureCount =
+ 10
+
+
+init : Flags -> ( Model, Cmd Msg )
+init flags =
+ ( { navbarOpen = False
+ , features = List.take viewFeatureCount Feature.features
+ , flags = flags
+ }
+ , Cmd.none
+ )
+
+
+shuffleFeatures : Cmd Msg
+shuffleFeatures =
+ Random.List.shuffle Feature.features
+ |> Random.map (List.take viewFeatureCount)
+ |> Random.generate ListShuffled
+
+
+
+--- Update
+
+
+update : Msg -> Model -> ( Model, Cmd Msg )
+update msg model =
+ case msg of
+ ToggleNavbarMenu ->
+ ( { model | navbarOpen = not model.navbarOpen }
+ , Cmd.none
+ )
+
+ ShuffleFeatures ->
+ ( model, shuffleFeatures )
+
+ ListShuffled lf ->
+ ( { model | features = lf }
+ , Cmd.none
+ )
+
+
+subscriptions : Model -> Sub Msg
+subscriptions _ =
+ Sub.none
+
+
+
+--- View
+
+
+view : Model -> Html Msg
+view model =
+ node "body"
+ []
+ [ mainHero model
+ , featureHero model
+ , section [ class "section" ]
+ [ div [ class "container" ]
+ (List.indexedMap Feature.featureBox model.features
+ ++ [ div [ class "columns box" ]
+ [ div [ class "column is-full" ]
+ [ div [ class "content has-text-centered is-medium" ]
+ [ text "A more complete list can be found in "
+ , a [ href "/docs/features" ] [ text "here" ]
+ , text "."
+ ]
+ ]
+ ]
+ ]
+ )
+ ]
+ , getStartedHero model
+ , div [ class "section" ]
+ (GetStarted.getStarted model.flags.version)
+ , footHero model
+ ]
+
+
+footHero : Model -> Html Msg
+footHero model =
+ footer
+ [ id "footer"
+ , class "footer"
+ ]
+ [ div [ class "has-text-centered" ]
+ [ span []
+ [ text ("Docspell, " ++ model.flags.version)
+ ]
+ , span [ class "pr-1 pl-1" ]
+ [ text " • "
+ ]
+ , a
+ [ href "https://spdx.org/licenses/GPL-3.0-or-later.html"
+ , target "_blank"
+ ]
+ [ text "GPLv3+"
+ ]
+ , span [ class "pr-1 pl-1" ]
+ [ text " • "
+ ]
+ , a
+ [ href "https://github.com/eikek/docspell"
+ , target "_blank"
+ ]
+ [ text "Source Code"
+ ]
+ , span [ class "pr-1 pl-1" ]
+ [ text " • "
+ ]
+ , span []
+ [ text "© 2020 "
+ ]
+ , a
+ [ href "https://github.com/eikek"
+ , target "_blank"
+ ]
+ [ text "@eikek"
+ ]
+ ]
+ ]
+
+
+getStartedHero : Model -> Html Msg
+getStartedHero _ =
+ section
+ [ id "get-started"
+ , class "hero is-primary is-bold"
+ ]
+ [ div [ class "hero-body" ]
+ [ div [ class "container" ]
+ [ h2 [ class "title" ]
+ [ text "Get Started"
+ ]
+ ]
+ ]
+ ]
+
+
+featureHero : Model -> Html Msg
+featureHero model =
+ section
+ [ id "feature-selection"
+ , class "hero is-info is-bold"
+ ]
+ [ div
+ [ class "hero-body"
+ ]
+ [ div [ class "container" ]
+ [ h2 [ class "title" ]
+ [ text "Feature Selection"
+ ]
+ ]
+ ]
+ ]
+
+
+mainHero : Model -> Html Msg
+mainHero model =
+ section
+ [ id "hero-main"
+ , class "hero is-fullheight is-primary"
+ ]
+ [ div [ class "hero-head" ]
+ [ nav [ class "navbar" ]
+ [ div [ class "navbar-brand" ]
+ [ a
+ [ class "navbar-item"
+ , href "/"
+ ]
+ [ span [ class "icon is-large" ]
+ [ Icons.logo
+ ]
+ , text "Docspell"
+ ]
+ , a
+ [ role "button"
+ , onClick ToggleNavbarMenu
+ , classList
+ [ ( "navbar-burger", True )
+ , ( "is-active", model.navbarOpen )
+ ]
+ , ariaLabel "menu"
+ , ariaExpanded False
+ ]
+ [ span [ ariaHidden True ] []
+ , span [ ariaHidden True ] []
+ , span [ ariaHidden True ] []
+ ]
+ ]
+ , div
+ [ classList
+ [ ( "navbar-menu", True )
+ , ( "is-active", model.navbarOpen )
+ ]
+ ]
+ [ div [ class "navbar-start" ]
+ [ a
+ [ href "docs/"
+ , class "navbar-item"
+ ]
+ [ span [ class "icon" ]
+ [ Icons.docs
+ ]
+ , span []
+ [ text "Documentation"
+ ]
+ ]
+ , a
+ [ target "_blank"
+ , href "https://github.com/eikek/docspell"
+ , class "navbar-item"
+ ]
+ [ span [ class "icon" ]
+ [ Icons.github
+ ]
+ , span []
+ [ text "Github"
+ ]
+ ]
+ ]
+ ]
+ ]
+ ]
+ , div [ class "hero-body" ]
+ [ div
+ [ class "container has-text-centered"
+ ]
+ [ Icons.logoWidth 112
+ , h1 [ class "title main-title is-2" ]
+ [ text "Docspell"
+ ]
+ , h2 [ class "subtitle is-3" ]
+ [ text "Simple document organizer"
+ ]
+ , p [ class "content is-medium" ]
+ [ text "Docspell can assist in organizing your piles of "
+ , text "digital documents, resulting from scanners, e-mails "
+ , text "and other sources with miminal effort."
+ ]
+ , div [ class " buttons is-centered" ]
+ [ a
+ [ class "button is-primary is-medium"
+ , href "#get-started"
+ ]
+ [ text "Get Started"
+ ]
+ , a
+ [ class "button is-info is-medium"
+ , href "#feature-selection"
+ ]
+ [ text "Features"
+ ]
+ ]
+ ]
+ ]
+ ]
diff --git a/website/package.json b/website/package.json
new file mode 100644
index 00000000..1679a004
--- /dev/null
+++ b/website/package.json
@@ -0,0 +1,6 @@
+{
+ "license": "GPL-3.0-or-later",
+ "dependencies": {
+ "bulma": "^0.9.0"
+ }
+}
diff --git a/website/run-elm.sh b/website/run-elm.sh
new file mode 100755
index 00000000..785bc548
--- /dev/null
+++ b/website/run-elm.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CMD="elm make --output site/static/js/bundle.js --optimize elm/Main.elm"
+$CMD
+
+inotifywait -m -e close_write -r elm/ |
+ while read f; do
+ $CMD
+ done
diff --git a/website/shell.nix b/website/shell.nix
new file mode 100644
index 00000000..4fed5990
--- /dev/null
+++ b/website/shell.nix
@@ -0,0 +1,15 @@
+let
+ nixpkgsUnstable = builtins.fetchTarball {
+ url = "https://github.com/NixOS/nixpkgs-channels/archive/nixos-unstable.tar.gz";
+ };
+ pkgsUnstable = import nixpkgsUnstable { };
+in
+with pkgsUnstable;
+
+ mkShell {
+ buildInputs = [
+ zola
+ yarn
+ inotifyTools
+ ];
+ }
diff --git a/website/site/config.toml b/website/site/config.toml
new file mode 100644
index 00000000..f701f182
--- /dev/null
+++ b/website/site/config.toml
@@ -0,0 +1,29 @@
+# The URL the site will be built for
+base_url = "https://docspell.org"
+
+# Whether to automatically compile all Sass files in the sass directory
+compile_sass = true
+
+# Whether to do syntax highlighting
+# Theme can be customised by setting the `highlight_theme` variable to a theme supported by Zola
+highlight_code = true
+
+highlight_theme = "gruvbox-dark"
+
+# Whether to build a search index to be used later on by a JavaScript library
+build_search_index = true
+
+[link_checker]
+skip_prefixes = [
+ "http://localhost",
+ "/openapi",
+ "https://www.abisource.com" # has bad ssl config
+]
+skip_anchor_prefixes = [
+ "https://github.com",
+ "https://package.elm-lang.org"
+]
+
+[extra]
+# Put all your custom variables here
+version = "0.9.0-SNAPSHOT"
diff --git a/website/site/content/doc/_index.md b/website/site/content/doc/_index.md
new file mode 100644
index 00000000..502bf030
--- /dev/null
+++ b/website/site/content/doc/_index.md
@@ -0,0 +1,3 @@
++++
+redirect_to = "/docs"
++++
diff --git a/website/site/content/docs/_index.md b/website/site/content/docs/_index.md
new file mode 100644
index 00000000..630db7b9
--- /dev/null
+++ b/website/site/content/docs/_index.md
@@ -0,0 +1,9 @@
++++
+title = "Overview"
+template = "overview.html"
+insert_anchor_links = "right"
++++
+
+# Note
+
+This content is not rendered. Everything is in the template.
diff --git a/website/site/content/docs/api/_index.md b/website/site/content/docs/api/_index.md
new file mode 100644
index 00000000..fd0b8718
--- /dev/null
+++ b/website/site/content/docs/api/_index.md
@@ -0,0 +1,93 @@
++++
+title = "Api"
+description = "Contains documentation about the REST API."
+weight = 70
+insert_anchor_links = "right"
+[extra]
+mktoc = true
++++
+
+Docspell is designed as a REST server that uses JSON to exchange
+data. The REST api can be used to integrate docspell into your
+workflow.
+
+[Docspell REST Api Doc](/openapi/docspell-openapi.html)
+
+The "raw" `openapi.yml` specification file can be found
+[here](/openapi/docspell-openapi.yml).
+
+The routes can be divided into protected and unprotected routes. The
+unprotected, or open routes are at `/open/*` while the protected
+routes are at `/sec/*`. Open routes don't require authenticated access
+and can be used by any user. The protected routes require an
+authenticated user.
+
+## Authentication
+
+The unprotected route `/open/auth/login` can be used to log in with
+account name and password. The response contains a token that can be
+used for accessing protected routes. The token is only valid for a
+restricted time which can be configured (default is 5 minutes).
+
+New tokens can be generated using an existing valid token and the
+protected route `/sec/auth/session`. This will return the same
+response as above, giving a new token.
+
+This token can be added to requests in two ways: as a cookie header or
+a "normal" http header. If a cookie header is used, the cookie name
+must be `docspell_auth`; if a custom header is used, it must be named
+`X-Docspell-Auth`.
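+
+As an illustration, here is a minimal Scala sketch of both variants,
+using Java's built-in `HttpClient`. It assumes a server running at
+`http://localhost:7880` and a token obtained from `/open/auth/login`,
+passed in via an environment variable:
+
+``` scala
+import java.net.URI
+import java.net.http.{HttpClient, HttpRequest, HttpResponse}
+
+object AuthTokenExample {
+  def main(args: Array[String]): Unit = {
+    val token  = sys.env("DOCSPELL_TOKEN") // token from /open/auth/login
+    val client = HttpClient.newHttpClient()
+    val uri    = URI.create("http://localhost:7880/api/v1/sec/auth/session")
+
+    // variant 1: the custom http header
+    val viaHeader = HttpRequest
+      .newBuilder(uri)
+      .header("X-Docspell-Auth", token)
+      .POST(HttpRequest.BodyPublishers.noBody())
+      .build()
+
+    // variant 2: a cookie header carrying the `docspell_auth` cookie
+    val viaCookie = HttpRequest
+      .newBuilder(uri)
+      .header("Cookie", s"docspell_auth=$token")
+      .POST(HttpRequest.BodyPublishers.noBody())
+      .build()
+
+    println(client.send(viaHeader, HttpResponse.BodyHandlers.ofString()).body())
+    println(client.send(viaCookie, HttpResponse.BodyHandlers.ofString()).body())
+  }
+}
+```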
+
+## Live Api
+
+Besides the statically generated documentation at this site, the rest
+server provides a swagger-generated api documentation that allows
+playing around with the api. It requires a running docspell rest
+server. If it is deployed at `http://localhost:7880`, then check this
+url:
+
+```
+http://localhost:7880/api/doc
+```
+
+## Examples
+
+These examples use the great command line tool
+[curl](https://curl.haxx.se/).
+
+### Login
+
+``` bash
+$ curl -X POST -d '{"account": "smith", "password": "test"}' http://localhost:7880/api/v1/open/auth/login
+{"collective":"smith"
+,"user":"smith"
+,"success":true
+,"message":"Login successful"
+,"token":"1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI="
+,"validMs":300000
+}
+```
+
+### Get new token
+
+``` bash
+$ curl -XPOST -H 'X-Docspell-Auth: 1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI=' http://localhost:7880/api/v1/sec/auth/session
+{"collective":"smith"
+,"user":"smith"
+,"success":true
+,"message":"Login successful"
+,"token":"1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc="
+,"validMs":300000
+}
+```
+
+### Get some insights
+
+``` bash
+$ curl -H 'X-Docspell-Auth: 1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc=' http://localhost:7880/api/v1/sec/collective/insights
+{"incomingCount":3
+,"outgoingCount":1
+,"itemSize":207310
+,"tagCloud":{"items":[]}
+}
+```
diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md
new file mode 100644
index 00000000..7606078c
--- /dev/null
+++ b/website/site/content/docs/configure/_index.md
@@ -0,0 +1,330 @@
++++
+title = "Configuration"
+insert_anchor_links = "right"
+description = "There are several tools distributed with docspell, like a program to watch a folder and import files to docspell."
+weight = 40
+[extra]
+mktoc = true
++++
+
+Docspell's executable can take one argument – a configuration file. If
+that is not given, the defaults are used. The config file overrides
+default values, so only values that differ from the defaults are
+necessary.
+
+This applies to the restserver and the joex as well.
+
+# Important Config Options
+
+The configuration of both components uses separate namespaces. The
+configuration for the REST server is below `docspell.server`, while
+the one for joex is below `docspell.joex`.
+
+## JDBC
+
+This configures the connection to the database. This has to be
+specified for the rest server and joex. By default, an H2 database in
+the `/tmp` directory is configured.
+
+The config looks like this (both components):
+
+``` conf
+docspell.joex.jdbc {
+ url = ...
+ user = ...
+ password = ...
+}
+
+docspell.server.backend.jdbc {
+ url = ...
+ user = ...
+ password = ...
+}
+```
+
+The `url` is the connection to the database. It must start with
+`jdbc`, followed by the name of the database. The rest is specific to the
+database used: it is either a path to a file for H2 or a host/database
+url for MariaDB and PostgreSQL.
+
+When using H2, the user and password can be chosen freely on first
+start, but must stay the same on subsequent starts. Usually, the user
+is `sa` and the password is left empty. Additionally, the url must
+include these options:
+
+```
+;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE
+```
+
+### Examples
+
+PostgreSQL:
+```
+url = "jdbc:postgresql://localhost:5432/docspelldb"
+```
+
+MariaDB:
+```
+url = "jdbc:mariadb://localhost:3306/docspelldb"
+```
+
+H2
+```
+url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
+```
+
+
+## Full-Text Search: SOLR
+
+[Apache SOLR](https://lucene.apache.org/solr) is used to provide the
+full-text search. Both docspell components must provide the same
+connection setup. This is defined in the `full-text-search.solr`
+subsection:
+
+``` conf
+...
+ full-text-search {
+ enabled = true
+ ...
+ solr = {
+ url = "http://localhost:8983/solr/docspell"
+ }
+ }
+```
+
+The default configuration at the end of this page contains more
+information about each setting.
+
+The `solr.url` is the mandatory setting that you need to change to
+point to your SOLR instance. Then you need to set the `enabled` flag
+to `true`.
+
+When installing docspell manually, just install solr and create a core
+as described in the [solr
+documentation](https://lucene.apache.org/solr/guide/8_4/installing-solr.html).
+That will provide you with the connection url (the last part is the
+core name).
+
+While the `full-text-search.solr` options are the same for joex and
+the restserver, there are some settings that differ. The restserver
+has this additional setting, that may be of interest:
+
+``` conf
+full-text-search {
+ recreate-key = "test123"
+}
+```
+
+This key is required if you want docspell to drop and re-create the
+entire index. This is possible via a REST call:
+
+``` bash
+$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
+```
+
+Here the `test123` is the key defined with `recreate-key`. If it is
+empty (the default), this REST call is disabled. Otherwise, the POST
+request will submit a system task that is executed by a joex instance
+eventually.
+
+Using this endpoint, the index will be re-created. This is sometimes
+necessary, for example if you upgrade SOLR or delete the core to
+provide a new one (see
+[here](https://lucene.apache.org/solr/guide/8_4/reindexing.html) for
+details). Note that a collective can also re-index their data using a
+similar endpoint; but this only deletes their data and doesn't do
+a full re-index.
+
+The solr index doesn't contain any new information; it can be
+regenerated any time using the above REST call. Thus it doesn't need
+to be backed up.
+
+## Bind
+
+The host and port the http server binds to. This applies to both
+components. The joex component also exposes a small REST api to
+inspect its state and notify the scheduler.
+
+``` conf
+docspell.server.bind {
+ address = localhost
+ port = 7880
+}
+docspell.joex.bind {
+ address = localhost
+ port = 7878
+}
+```
+
+By default, it binds to `localhost` and some predefined port. This
+must be changed, if components are on different machines.
+
+## Baseurl
+
+The base url is an important setting that defines the http URL where
+the corresponding component can be reached. It applies to both
+components. For a joex component, the url must be resolvable from a
+REST server component. The REST server also uses this url to create
+absolute urls and to configure the authentication cookie.
+
+By default it is built using the information from the `bind` setting.
+
+
+```
+docspell.server.baseurl = ...
+docspell.joex.baseurl = ...
+```
+
+### Examples
+
+```
+docspell.server.baseurl = "https://docspell.example.com"
+docspell.joex.baseurl = "http://192.168.101.10"
+```
+
+
+## App-id
+
+The `app-id` is the identifier of the corresponding instance. It *must
+be unique* for all instances. By default the REST server uses `rest1`
+and joex `joex1`. It is recommended to overwrite this setting to have
+an explicit and stable identifier.
+
+``` conf
+docspell.server.app-id = "rest1"
+docspell.joex.app-id = "joex1"
+```
+
+## Registration Options
+
+This defines if and how new users can create accounts. There are 3
+options:
+
+- *closed* no new user can sign up
+- *open* new users can sign up
+- *invite* new users can sign up but require an invitation key
+
+This applies only to the REST server component.
+
+``` conf
+docspell.server.signup {
+ mode = "open"
+
+ # If mode == 'invite', a password must be provided to generate
+ # invitation keys. It must not be empty.
+ new-invite-password = ""
+
+ # If mode == 'invite', this is the period an invitation token is
+ # considered valid.
+ invite-time = "3 days"
+}
+```
+
+The mode `invite` is intended to open the application only to some
+users. The admin can create these invitation keys and distribute them
+to the desired people. For this, the `new-invite-password` must be
+given. The idea is that only the person who installs docspell knows
+this. If it is not set, then invitation won't work. New invitation
+keys can be generated from within the web application or via REST
+calls (using `curl`, for example).
+
+``` bash
+curl -X POST -d '{"password":"blabla"}' "http://localhost:7880/api/v1/open/signup/newinvite"
+```
+
+## Authentication
+
+Authentication works in two ways:
+
+- with an account-name / password pair
+- with an authentication token
+
+The initial authentication must occur with an accountname/password
+pair. This will generate an authentication token which is valid for
+some time. Subsequent calls to secured routes can use this token. The
+token can be given as a normal http header or via a cookie header.
+
+These settings apply only to the REST server.
+
+``` conf
+docspell.server.auth {
+ server-secret = "hex:caffee" # or "b64:Y2FmZmVlCg=="
+ session-valid = "5 minutes"
+}
+```
+
+The `server-secret` is used to sign the token. If multiple REST
+servers are deployed, all must share the same server secret. Otherwise
+tokens from one instance are not valid on another instance. The secret
+can be given as a Base64 encoded string or in hex form. Use the prefix
+`hex:` and `b64:`, respectively. If no prefix is given, the UTF8 bytes
+of the string are used.
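+
+As a sketch (not docspell's actual parsing code, just the convention
+described above), interpreting such a value could look like this:
+
+``` scala
+import java.util.Base64
+
+// Illustration only: read a server-secret value with an optional
+// `hex:` or `b64:` prefix, falling back to the UTF8 bytes.
+def readSecret(s: String): Array[Byte] =
+  if (s.startsWith("hex:"))
+    s.drop(4).grouped(2).map(Integer.parseInt(_, 16).toByte).toArray
+  else if (s.startsWith("b64:"))
+    Base64.getDecoder.decode(s.drop(4))
+  else
+    s.getBytes("UTF-8")
+
+// readSecret("hex:caffee")       == bytes 0xca, 0xff, 0xee
+// readSecret("b64:Y2FmZmVlCg==") == the bytes of "caffee\n"
+```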
+
+The `session-valid` setting determines how long a token is valid. This
+can be just a few minutes, since the web application obtains new ones
+periodically. So a short time is recommended.
+
+
+# File Format
+
+The format of the configuration files can be
+[HOCON](https://github.com/lightbend/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation),
+JSON or whatever the used [config
+library](https://github.com/lightbend/config) understands. The default
+values below are in HOCON format, which is recommended, since it
+allows comments and has some [advanced
+features](https://github.com/lightbend/config#features-of-hocon).
+Please refer to their documentation for more on this.
+
+Here are the default configurations.
+
+
+# Default Config
+## Rest Server
+
+{{ incl_conf(path="templates/shortcodes/server.conf") }}
+
+
+## Joex
+
+
+{{ incl_conf(path="templates/shortcodes/joex.conf") }}
+
+
+# Logging
+
+By default, docspell logs to stdout. This works well when managed by
+systemd or other inits. Logging is done by
+[logback](https://logback.qos.ch/). Please refer to its documentation
+for how to configure logging.
+
+If you created your own logback config file, it can be added as an
+argument to the executable using this syntax:
+
+``` bash
+/path/to/docspell -Dlogback.configurationFile=/path/to/your/logging-config-file
+```
+
+To get started, the default config looks like this:
+
+``` xml
+<configuration>
+  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+    <withJansi>true</withJansi>
+
+    <encoder>
+      <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
+    </encoder>
+  </appender>
+
+  <logger name="docspell" level="debug" />
+
+  <root level="INFO">
+    <appender-ref ref="STDOUT" />
+  </root>
+</configuration>
+```
+
+The `<root level="INFO">` element means that only log statements with
+level "INFO" will be printed. But the `<logger name="docspell" level="debug">`
+above says that for loggers with name "docspell", statements with
+level "DEBUG" will be printed, too.
diff --git a/website/site/content/docs/dev/_index.md b/website/site/content/docs/dev/_index.md
new file mode 100644
index 00000000..da91edbc
--- /dev/null
+++ b/website/site/content/docs/dev/_index.md
@@ -0,0 +1,9 @@
++++
+title = "Development"
+description = "Contains build instructions and other internal notes."
+weight = 300
+sort_by = "weight"
+insert_anchor_links = "right"
+template = "pages.html"
+redirect_to = "/docs/dev/building"
++++
diff --git a/website/site/content/docs/dev/adr/0000_use_markdown_architectural_decision_records.md b/website/site/content/docs/dev/adr/0000_use_markdown_architectural_decision_records.md
new file mode 100644
index 00000000..0cf80e8d
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0000_use_markdown_architectural_decision_records.md
@@ -0,0 +1,36 @@
++++
+title = "Use Markdown Architectural Decision Records"
+weight = 10
++++
+
+# Context and Problem Statement
+
+We want to [record architectural decisions](https://adr.github.io/)
+made in this project. Which format and structure should these records
+follow?
+
+# Considered Options
+
+* [MADR](https://adr.github.io/madr/) 2.1.0 - The Markdown Architectural Decision Records
+* [Michael Nygard's template](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions) - The first incarnation of the term "ADR"
+* [Sustainable Architectural
+ Decisions](https://www.infoq.com/articles/sustainable-architectural-design-decisions) -
+ The Y-Statements
+* Other templates listed at
+  <https://github.com/joelparkerhenderson/architecture_decision_record>
+
+* Formless - No conventions for file format and structure
+
+# Decision Outcome
+
+Chosen option: "MADR 2.1.0", because
+
+* Implicit assumptions should be made explicit. Design documentation
+ is important to enable people to understand the decisions later on.
+ See also [A rational design process: How and why to fake
+ it](https://doi.org/10.1109/TSE.1986.6312940).
+* The MADR format is lean and fits our development style.
+* The MADR structure is comprehensible and facilitates usage &
+ maintenance.
+* The MADR project is vivid.
+* Version 2.1.0 is the latest one available when starting to document
+ ADRs.
diff --git a/website/site/content/docs/dev/adr/0001_components.md b/website/site/content/docs/dev/adr/0001_components.md
new file mode 100644
index 00000000..f31ebabb
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0001_components.md
@@ -0,0 +1,64 @@
++++
+title = "Components"
+weight = 20
++++
+
+# Context and Problem Statement
+
+How should the application be structured into its main components? The
+goal is to be able to have multiple rest servers/webapps and multiple
+document processor components working together.
+
+
+# Decision Outcome
+
+The following are the "main" modules. There may be more helper modules
+and libraries that support implementing a feature.
+
+## store
+
+The code related to database access. It also provides the job
+queue. It is designed as a library.
+
+## joex
+
+Joex stands for "job executor".
+
+An application that executes jobs from the queue and therefore depends
+on the `store` module. It provides the code for all tasks that can be
+submitted as jobs. If no jobs are in the queue, the joex "sleeps"
+and must be woken via an external request.
+
+It provides the document processing code.
+
+It provides an http rest server to get insight into the joex state
+and also to be notified of new jobs.
+
+## backend
+
+It provides all the logic, except document processing, as a set of
+"operations". An operation can be directly mapped to a rest
+endpoint.
+
+It is designed as a library.
+
+## rest api
+
+This module contains the specification for the rest server as an
+`openapi.yml` file. It is packaged as a scala library that also
+provides types and conversions to/from json.
+
+The idea is that the `rest server` module can depend on it as well as
+rest clients.
+
+## rest server
+
+This is the main application. It directly depends on the `backend`
+module, and each rest endpoint maps to a "backend operation". It is
+also responsible for converting the json data inside http requests
+to/from types recognized by the `backend` module.
+
+
+## webapp
+
+This module provides the user interface as a web application.
diff --git a/website/site/content/docs/dev/adr/0002_component_interaction.md b/website/site/content/docs/dev/adr/0002_component_interaction.md
new file mode 100644
index 00000000..888b233f
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0002_component_interaction.md
@@ -0,0 +1,63 @@
++++
+title = "Component Interaction"
+weight = 30
++++
+
+# Context and Problem Statement
+
+There are multiple web applications with their rest servers and there
+are multiple document processors. These processes must communicate:
+
+- once a new job is added to the queue the rest server must somehow
+ notify processors to wake up
+- once a processor takes a job, it must propagate the progress and
+ outcome to all rest servers so that the rest server can notify the
+ user that is currently logged in. Since it's not known which
+ rest-server the user is using right now, all must be notified.
+
+# Considered Options
+
+1. JMS (ActiveMQ or similar): Message Broker as another active
+ component
+2. Akka: using a cluster
+3. DB: Register with "call back urls"
+
+# Decision Outcome
+
+Choosing option 3: DB as central synchronisation point.
+
+The reason is that this is the simplest solution and doesn't require
+external libraries or more processes. The other options seem too big
+of a weapon for the task at hand. They are both large components
+themselves and require more knowledge to use them efficiently.
+
+It works roughly like this:
+
+- rest servers and processors register at the database on startup each
+ with a unique call-back url
+- and deregister on shutdown
+- each component has db access
+- rest servers can list all processors and vice versa
+
+## Positive Consequences
+
+- complexity of the whole application is not touched
+- since a lot of data must be transferred to the document processors,
+ this is solved by simply accessing the db. So the protocol for data
+ exchange is set. There is no need for other protocols that handle
+ large data (http chunking etc)
+- uses the already existing db as synchronisation point
+- no additional knowledge required
+- simple to understand and so not hard to debug
+
+## Negative Consequences
+
+- all components must have db access. This also is a security con,
+ because if one of those processes is hacked, db access is
+ possible. And it simply is another dependency that is not really
+ required for the joex component
+- the joex component cannot be in an untrusted environment (untrusted
+ from the db's point of view). For example, it is not possible to
+ create "personal joex" that only receive your own jobs…
+- in order to know if a component is really active, one must run a
+ ping against the call-back url
diff --git a/website/site/content/docs/dev/adr/0003_encryption.md b/website/site/content/docs/dev/adr/0003_encryption.md
new file mode 100644
index 00000000..bc7a134f
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0003_encryption.md
@@ -0,0 +1,93 @@
++++
+title = "Encryption"
+weight = 40
++++
+
+
+# Context and Problem Statement
+
+Since docspell may store important documents, it should be possible to
+encrypt them on the server. It should be (almost) transparent to the
+user; for example, a user must be able to log in and download a file in
+clear form. That is, the server must also decrypt them.
+
+Then all users of a collective should have access to the files. This
+requires sharing the key among users of a collective.
+
+But, even when files are encrypted, the associated meta data is not!
+So especially access to the database would allow one to see tags,
+associated persons and correspondents of documents.
+
+So in short, encryption means:
+
+- file contents (the blobs and extracted text) is encrypted
+- metadata is not
+- secret keys are stored at the server (protected by a passphrase),
+ such that files can be downloaded in clear form
+
+
+# Decision Drivers
+
+* major driver is to provide most possible privacy for users
+* even at the expense of less features; currently I think that the
+ associated meta data is enough for finding documents (i.e. full text
+ search is not needed)
+
+# Considered Options
+
+It is clear, that only blobs (file contents) can be encrypted, but not
+the associated metadata. And the extracted text must be encrypted,
+too, obviously.
+
+
+## Public Key Encryption (PKE)
+
+With PKE, the server can automatically encrypt files using
+publicly available key data. It wouldn't require a user to provide a
+passphrase for encryption, only for decryption.
+
+This would allow for first processing files (extracting text, doing
+text analysis) and encrypting them (and the text) afterwards.
+
+The public and secret keys are stored at the database. The secret key
+must be protected. This can be done by encrypting the passphrase to
+the secret key using each user's login password. If a user logs in, he
+or she must provide the correct password. Using this password, the
+private key can be unlocked. This requires storing the private key
+passphrase encrypted with every user's password in the database. So the
+whole security then depends on users' password quality.
+
+There are plenty of other difficulties with this approach (how about
+password change, new secret keys, adding users etc).
+
+Using this kind of encryption would protect the data against offline
+attacks and also for accidental leakage (for example, if a bug in the
+software would access a file of another user).
+
+
+## No Encryption
+
+If only blobs are encrypted, against which type of attack would it
+provide protection?
+
+The users must still trust the server. First, in order to provide the
+wanted features (document processing), the server must see the file
+contents. Then, it will receive and serve files in clear form, so it
+has access to them anyways.
+
+With that in mind, the "only" feature is to protect against "stolen
+database" attacks. If the database is somehow leaked, the attackers
+would only see the metadata, but not real documents. It also protects
+against leakage, maybe caused by a programming error.
+
+But the downside is, that it increases complexity *a lot*. And since
+this is a personal tool for personal use, is it worth the effort?
+
+
+# Decision Outcome
+
+No encryption, because of its complexity.
+
+For now, this tool is only meant for "self deployment" and personal
+use. If this changes or there is enough time, this decision should be
+reconsidered.
diff --git a/website/site/content/docs/dev/adr/0004_iso8601vsEpoch.md b/website/site/content/docs/dev/adr/0004_iso8601vsEpoch.md
new file mode 100644
index 00000000..4bda7210
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0004_iso8601vsEpoch.md
@@ -0,0 +1,40 @@
++++
+title = "ISO8601 vs Millis as Date-Time transfer"
+weight = 50
++++
+
+# Context and Problem Statement
+
+The question is whether the REST Api should return an ISO8601
+formatted string in UTC timezone, or the unix time (number of
+milliseconds since 1970-01-01).
+
+There is quite some controversy about it.
+
+In my opinion, the ISO8601 format (always UTC) is better. The reason
+is its better readability. But the Elm folks are on the other side.
+
+One can convert from an ISO8601 date-time string in UTC time into the
+epoch millis and vice versa. So it is the same to me. There is no less
+information in an ISO8601 string than in the epoch millis.
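+
+For example, on the JVM both directions are one-liners (a quick
+sketch using `java.time`, with the millis taken from the login
+example's token timestamp):
+
+``` scala
+import java.time.Instant
+
+object DateTimeCodec extends App {
+  val millis = 1568142350115L
+  val iso    = Instant.ofEpochMilli(millis).toString
+  println(iso)                             // 2019-09-10T19:05:50.115Z
+  println(Instant.parse(iso).toEpochMilli) // 1568142350115
+}
+```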
+
+To avoid confusion, all date/time values should use the same encoding.
+
+# Decision Outcome
+
+I go with the epoch time. Every timestamp/date-time value is
+transferred as a Unix timestamp.
+
+Reasons:
+
+- the Elm application needs to frequently calculate with these values
+ to render the current waiting time etc. This is easier if they are
+ plain numbers that don't require parsing dates first
+- Since the UI is written with Elm, it's probably good to adopt their
+ style
diff --git a/website/site/content/docs/dev/adr/0005_job-executor.md b/website/site/content/docs/dev/adr/0005_job-executor.md
new file mode 100644
index 00000000..01744e71
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0005_job-executor.md
@@ -0,0 +1,134 @@
++++
+title = "Joex - Job Executor"
+weight = 60
++++
+
+# Context and Problem Statement
+
+Docspell is a multi-user application. When processing users'
+documents, there must be some thought on how to distribute all the
+processing jobs on a much more restricted set of resources. There
+may be 100 users but only 4 cores that can process documents at a
+time. Simply doing FIFO is not enough, since it provides an unfair
+distribution. The first user who submits 20 documents will then occupy
+all cores for quite some time and all other users would need to wait.
+
+This tries to find a more fair distribution among the users (strictly
+meaning collectives here) of docspell.
+
+The job executor is a separate component that will run in its own
+process. It takes the next job from the "queue" and executes the
+associated task. This is used to run the document processing jobs
+(text extraction, text analysis etc).
+
+1. The task execution should survive restarts. State and task code
+ must be recreated from some persisted state.
+
+2. The processing should be fair with respect to collectives.
+
+3. It must be possible to run many job executors, possibly on
+ different machines. This can be used to quickly enable more
+ processing power and removing it once the peak is over.
+
+4. Task execution can fail and it should be able to retry those
+ tasks. Reasons are that errors may be temporary (for example, when
+ talking to a third party service), and to enable repairing without
+ stopping the job executor. Some errors might be easily repaired (a
+ program was not installed or whatever). In such a case it is good
+ to know that the task will be retried later.
+
+# Considered Options
+
+In contrast to other ADRs this is just some sketching of thoughts for
+the current implementation.
+
+1. Job descriptions are serialized and written to the database into a
+ table. This becomes the queue. Tasks are identified by names and a
+ job executor implementation must have a map of names to code to
+ look up the task to perform. The task's arguments are serialized into
+ a string and written to the database. Tasks must decode the
+ string. This can be conveniently done using JSON and the provided
+ circe decoders (see the sketch after this list).
+
+2. To provide fair execution, jobs are organized into groups. When a
+ new job is requested from the queue, first a group is selected
+ using a round-robin strategy. This should ensure good enough
+ fairness among groups. A group maps to a collective. Within a
+ group, a job is selected based on priority, submitted time (fifo)
+ and job state (see notes about stuck jobs).
+
+3. Allowing multiple job executors means that getting the next job can
+ fail due to simultaneously running transactions. It is retried until
+ it succeeds. Taking a job puts it into _scheduled_ state. Each job
+ executor has a unique (manually supplied) id and jobs are marked
+ with that id once they are handed to the executor.
+
+4. When a task fails, its state is updated to state _stuck_. Stuck
+ jobs are retried in the future. The queue prefers to return stuck
+ jobs that are due at the specific point in time, ignoring the
+ priority hint.
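+
+The following Scala sketch illustrates option (1). The task name, the
+argument type and the behavior are invented for illustration; this is
+not docspell's real API, only the general shape of a name-to-code map
+with circe decoders:
+
+``` scala
+import io.circe.{Decoder, Error, parser}
+
+final case class Job(task: String, args: String)
+
+final case class RegisteredTask[A](decoder: Decoder[A], run: A => Unit) {
+  // decode the serialized arguments, then run the task code
+  def execute(rawArgs: String): Either[Error, Unit] =
+    parser.decode(rawArgs)(decoder).map(run)
+}
+
+object Tasks {
+  case class ProcessItemArgs(itemId: String)
+
+  val registry: Map[String, RegisteredTask[_]] = Map(
+    "process-item" -> RegisteredTask(
+      Decoder.forProduct1("itemId")(ProcessItemArgs.apply),
+      (a: ProcessItemArgs) => println(s"processing item ${a.itemId}")
+    )
+  )
+
+  def runJob(job: Job): Either[Error, Unit] =
+    registry(job.task).execute(job.args)
+}
+```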
+
+## More Details
+
+A job has these properties
+
+- id (something random)
+- group
+- taskname (to choose task to run)
+- submitted-date
+- worker (the id of the job executor)
+- state, one of: waiting, scheduled, running, stuck, cancelled,
+ failed, success
+ - waiting: job has been inserted into the queue
+ - scheduled: job has been handed over to some executor and is
+ marked with the job executor id
+ - running: a task is currently executing
+ - stuck: a task has failed and is being retried eventually
+ - cancelled: task has finished and there was a cancel request
+ - failed: task has failed and exceeded the retries
+ - success: task has completed successfully
+
+The queue has a `take` or `nextJob` operation that takes the worker-id
+and a priority hint and goes roughly like this:
+
+- select the next group using round-robin strategy
+- select all jobs with that group, where
+ - state is stuck and waiting time has elapsed
+ - state is waiting, with the given priority if possible
+- jobs are ordered by submitted time, but stuck jobs whose waiting
+ time elapsed are preferred
+
+There are two priorities within a group: high and low. A configured
+counting scheme determines when to select which priority. For
+example, a counting scheme of `(2,1)` would select two high priority
+jobs and then one low priority job. The `take` operation tries to prefer
+this priority but falls back to the other if no job with this priority
+is available.
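+
+A tiny sketch of such a counting scheme (illustrative only, not the
+actual implementation):
+
+``` scala
+// With CountingScheme(2, 1), the preferred priority of consecutive
+// `take` calls repeats as: high, high, low, high, high, low, …
+final case class CountingScheme(high: Int, low: Int) {
+  def preferred(n: Int): String =
+    if (n % (high + low) < high) "high" else "low"
+}
+
+object CountingDemo extends App {
+  val scheme = CountingScheme(2, 1)
+  println((0 until 6).map(scheme.preferred).mkString(", "))
+  // prints: high, high, low, high, high, low
+}
+```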
+
+A group corresponds to a collective. Then all collectives get
+(roughly) equal treatment.
+
+Once there are no jobs in the queue, the executor goes to sleep and
+must be woken to run again. If a job is submitted, the executors are
+notified.
+
+## Stuck Jobs
+
+A job is going into _stuck_ state, if the task has failed. In this
+state, the task is rerun after a while until a maximum retry count is
+reached.
+
+The problem is how to notify all executors when the waiting time has
+elapsed. If one executor puts a job into stuck state, it means that
+all others should start looking into the queue again after `x`
+minutes. It would be possible to tell all existing executors to
+schedule themselves to wake up in the future, but this would miss all
+executors that show up later.
+
+The waiting time is increased exponentially after each retry (`2 ^
+retry`) and it is meant as the minimum waiting time. So it is ok if
+all executors wake up periodically and check for new work. Most of the
+time this should not be necessary and is just a fallback if only stuck
+jobs are in the queue and nothing is submitted for a long time. If the
+system is used, jobs get submitted once in a while and would awake all
+executors.
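+
+A sketch of this backoff, assuming some base delay (the document only
+fixes the `2 ^ retry` growth, not the base):
+
+``` scala
+import scala.concurrent.duration._
+
+// minimum time to wait before retrying a job that failed `retry` times
+def retryDelay(retry: Int, base: FiniteDuration = 1.minute): FiniteDuration =
+  base * math.pow(2, retry.toDouble).toLong
+
+retryDelay(1) // 2 minutes
+retryDelay(3) // 8 minutes
+```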
diff --git a/website/site/content/docs/dev/adr/0006_more-file-types.md b/website/site/content/docs/dev/adr/0006_more-file-types.md
new file mode 100644
index 00000000..3c27490e
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0006_more-file-types.md
@@ -0,0 +1,150 @@
++++
+title = "More File Types"
+weight = 70
++++
+
+# Context and Problem Statement
+
+Docspell currently only supports PDF files. This has simplified early
+development and design a lot and so helped with starting the project.
+Handling PDF files is usually easy (viewing, extracting text,
+printing, etc.).
+
+The PDF format has been chosen because PDF files are very common and
+can be viewed with many tools on many systems (i.e. non-proprietary
+tools). Docspell is also a document archive, and from this
+perspective it is important that documents can still be viewed in 10
+years and more. The hope is that the PDF format is best suited for
+this. Therefore all documents in Docspell must be accessible as PDF.
+The trivial solution to this requirement is to allow only PDF files.
+
+Support for more document types must then take care of the following:
+
+- extracting text
+- converting into pdf
+- access original file
+
+Text should be extracted from the source file, in case the conversion is
+not lossless. Since Docspell can already extract text from PDF files
+using OCR, text can also be extracted from the converted file as a
+fallback.
+
+The original file must always be accessible. The main reason is that
+all uploaded data should be accessible without any modification. And
+since the conversion may not always produce the best results, the
+original file should be kept.
+
+
+# Decision Drivers
+
+People expect that software like Docspell supports the most common
+document types, like all the “office documents” (`docx`, `rtf`,
+`odt`, `xlsx`, …) and images. For many people it is more common to
+create those files than PDFs. Some (older) scanners may not be able
+to scan into PDF files, but only into image files.
+
+
+# Considered Options
+
+This ADR does not evaluate different options. It rather documents why
+this feature is realized and the thoughts that led to how it is
+implemented.
+
+# Realization
+
+## Data Model
+
+The `attachment` table holds one file. There will be another table
+`attachment_source` that holds the original file. It looks like this:
+
+``` sql
+CREATE TABLE "attachment_source" (
+ "id" varchar(254) not null primary key,
+ "file_id" varchar(254) not null,
+ "filename" varchar(254),
+ "created" timestamp not null,
+ foreign key ("file_id") references "filemeta"("id"),
+ foreign key ("id") references "attachment"("attachid")
+);
+```
+
+The `id` is the primary key and is the same as that of the associated
+`attachment`, creating a `1-1` relationship (more correctly, a
+`0..1-1` relationship) between `attachment` and `attachment_source`.
+
+There will always be an `attachment_source` record for every
+`attachment` record. If the original file is a PDF already, then both
+tables' `file_id` columns point to the same file. But now the user
+can change the filename of an `attachment` while the original
+filename is preserved in `attachment_source`. It must not be possible
+for the user to change anything in `attachment_source`.
+
+The `attachment` table is not touched in order to keep current code
+mostly unchanged and to have a simpler data migration. The downside
+is that the data model allows an `attachment` record without an
+`attachment_source` record. OTOH, a foreign key inside `attachment`
+pointing to an `attachment_source` is also not correct, because it
+would allow the same `attachment_source` record to be associated with
+many `attachment` records. This would do even more harm, in my
+opinion.
+
+## Migration
+
+Creating a new table and not altering existing ones should simplify
+data migration.
+
+Since only PDF files were allowed and the user could not change
+anything in the `attachment` table, the existing data can simply be
+copied into the new table. This is the trivial case where attachment
+and source are the same.
+
+
+## Processing
+
+The first step in processing is now converting the file into a
+PDF. If it already is a PDF, nothing is done. This step runs before
+text extraction, so extracting text can first be tried on the source
+file; only if that fails (or is not supported) is the text extracted
+from the converted PDF file. All remaining steps are untouched.
+
+If conversion is not supported for the input file, it is skipped. If
+conversion fails, the error is propagated to let the retry mechanism
+take care of it.
+
+### What types?
+
+Which file types should be supported? As a first step, all major
+office documents, common images, plain text (e.g. markdown) and HTML
+should be supported. In terms of file extensions: `doc`, `docx`,
+`xls`, `xlsx`, `odt`, `md`, `html`, `txt`, `jpg`, `png`, `tif`.
+
+There is a general preference for JVM-internal libraries, in order to
+be more platform independent and to reduce external dependencies. But
+this is not always possible (e.g. for OCR).
+
+{{ figure(file="process-files.png") }}
+
+### Conversion
+
+- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
+ unoconv (see [ADR 9](@/docs/dev/adr/0009_convert_office_docs.md))
+- HTML (`html`): wkhtmltopdf (see [ADR 7](@/docs/dev/adr/0007_convert_html_files.md))
+- Text/Markdown (`txt`, `md`): Java-Lib flexmark + wkhtmltopdf
+- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
+ 10](@/docs/dev/adr/0010_convert_image_files.md))
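+
+Put together, the mapping above amounts to a dispatch roughly like
+this (a hypothetical sketch; the names are made up for illustration):
+
+``` scala
+// pick a conversion route based on the mime type
+def conversionFor(mime: String): Option[String] =
+  mime match {
+    case "application/pdf"              => None // nothing to convert
+    case "text/html"                    => Some("wkhtmltopdf")
+    case "text/plain" | "text/markdown" => Some("flexmark + wkhtmltopdf")
+    case m if m.startsWith("image/")    => Some("tesseract")
+    case m if m.contains("opendocument") || m.contains("officedocument") =>
+      Some("unoconv")
+    case _ => None // unsupported, conversion is skipped
+  }
+```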
+
+### Text Extraction
+
+- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache Poi
+- Office documents (`odt`, `ods`): Apache Tika (including the sources)
+- HTML: not supported, extract text from converted PDF
+- Images (`jpg`, `png`, `tif`): Tesseract
+- Text/Markdown: n.a.
+- PDF: Apache PDFBox or Tesseract
+
+# Links
+
+* [Convert HTML Files](@/docs/dev/adr/0007_convert_html_files.md)
+* [Convert Plain Text](@/docs/dev/adr/0008_convert_plain_text.md)
+* [Convert Office Documents](@/docs/dev/adr/0009_convert_office_docs.md)
+* [Convert Image Files](@/docs/dev/adr/0010_convert_image_files.md)
+* [Extract Text from Files](@/docs/dev/adr/0011_extract_text.md)
diff --git a/website/site/content/docs/dev/adr/0007_convert_html_files.md b/website/site/content/docs/dev/adr/0007_convert_html_files.md
new file mode 100644
index 00000000..164505d3
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0007_convert_html_files.md
@@ -0,0 +1,59 @@
++++
+title = "Convert HTML Files"
+weight = 80
++++
+
+# Context and Problem Statement
+
+How can HTML documents be converted into a PDF file that looks as much
+as possible like the original?
+
+It would be nice to have a java-only solution. But if an external tool
+has a better outcome, then an external tool is fine, too.
+
+Since Docspell is free software, the tools must also be free.
+
+
+# Considered Options
+
+* [pandoc](https://pandoc.org/) external command
+* [wkhtmltopdf](https://wkhtmltopdf.org/) external command
+* [Unoconv](https://github.com/unoconv/unoconv) external command
+
+Native (Firefox) view:
+
+{{ figure(file="example-html-native.jpg") }}
+
+Note: the example html is from
+[here](https://www.sparksuite.com/open-source/invoice.html).
+
+I downloaded the HTML file to disk together with its resources (using
+*Save as...* in the browser).
+
+
+## Pandoc
+
+{{ figure(file="example-html-pandoc-latex.jpg") }}
+
+{{ figure(file="example-html-pandoc-html.jpg") }}
+
+The version using the `context` pdf-engine is not shown, since it
+looked very similar to the latex variant.
+
+
+## wkhtmltopdf
+
+{{ figure(file="example-html-wkhtmltopdf.jpg") }}
+
+
+## Unoconv
+
+
+{{ figure(file="example-html-unoconv.jpg") }}
+
+
+# Decision Outcome
+
+wkhtmltopdf.
+
+It shows the best results.
diff --git a/website/site/content/docs/dev/adr/0008_convert_plain_text.md b/website/site/content/docs/dev/adr/0008_convert_plain_text.md
new file mode 100644
index 00000000..743f53cc
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0008_convert_plain_text.md
@@ -0,0 +1,177 @@
++++
+title = "Convert Text Files"
+weight = 90
++++
+
+# Context and Problem Statement
+
+How can plain text and markdown documents be converted into PDF
+files?
+
+Rendering images is not important here, since the files must be
+self-contained when uploaded to Docspell.
+
+The test file is the current documentation page of Docspell, found in
+`microsite/docs/doc.md`.
+
+```
+---
+layout: docs
+position: 4
+title: Documentation
+---
+
+# {{ page.title }}
+
+
+Docspell assists in organizing large amounts of PDF files that are
+...
+
+## How it works
+
+Documents have two ...
+
+1. You maintain a kind of address book. It should list all possible
+ correspondents and the concerning people/things. This grows
+ incrementally with each new unknown document.
+2. When docspell analyzes a document, it tries to find matches within
+ your address ...
+3. You can inspect ...
+
+The set of meta data that docspell uses to draw suggestions from, must
+be maintained ...
+
+
+## Terms
+
+In order to better understand these pages, some terms should be
+explained first.
+
+### Item
+
+An **Item** is roughly your (pdf) document, only that an item may span
+multiple files, which are called **attachments**. And an item has
+**meta data** associated:
+
+- a **correspondent**: the other side of the communication. It can be
+ an organization or a person.
+- a **concerning person** or **equipment**: a person or thing that
+ this item is about. Maybe it is an insurance contract about your
+ car.
+- ...
+
+### Collective
+
+The users of the application are part of a **collective**. A
+**collective** is a group of users that share access to the same
+items. The account name is therefore comprised of a *collective name*
+and a *user name*.
+
+All users of a collective are equal; they have same permissions to
+access all...
+```
+
+Then a plain text file is tried, too (without any markup).
+
+```
+Maecenas mauris lectus, lobortis et purus mattis
+
+Duis vehicula mi vel mi pretium
+
+In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu.
+
+Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut.
+Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros
+efficitur tincidunt. Cras justo mi, porttitor quis mattis vel,
+ultricies ut purus. Ut facilisis et lacus eu cursus.
+
+In eleifend velit vitae libero sollicitudin euismod:
+
+- Fusce vitae vestibulum velit,
+- Pellentesque vulputate lectus quis pellentesque commodo
+
+the end.
+```
+
+
+# Considered Options
+
+* [flexmark](https://github.com/vsch/flexmark-java) for markdown to
+ HTML, then use existing machinery described in [adr
+ 7](@/docs/dev/adr/0007_convert_html_files.md)
+* [pandoc](https://pandoc.org/) external command
+
+
+## flexmark markdown library for java
+
+Process files with [flexmark](https://github.com/vsch/flexmark-java)
+and then create a PDF from the resulting html.
+
+Using the following snippet:
+
+``` scala
+// imports for flexmark-java (package layout of recent versions)
+import java.util
+import java.nio.charset.StandardCharsets
+import java.nio.file.{Files, Paths}
+
+import cats.effect.ExitCode
+import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension
+import com.vladsch.flexmark.ext.tables.TablesExtension
+import com.vladsch.flexmark.html.HtmlRenderer
+import com.vladsch.flexmark.parser.Parser
+import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
+
+def renderMarkdown(): ExitCode = {
+  // enable the table and strikethrough extensions
+  val opts = new MutableDataSet()
+  opts.set(
+    Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
+    util.Arrays.asList(TablesExtension.create(), StrikethroughExtension.create())
+  )
+
+  val parser   = Parser.builder(opts).build()
+  val renderer = HtmlRenderer.builder(opts).build()
+  val reader   = Files.newBufferedReader(Paths.get("in.txt|md"))
+  val doc      = parser.parseReader(reader)
+  val html     = renderer.render(doc)
+  // wrap the rendered fragment into a minimal html document
+  val body     = "<html><body>" + html + "</body></html>"
+  Files.write(Paths.get("test.html"), body.getBytes(StandardCharsets.UTF_8))
+
+  ExitCode.Success
+}
+```
+
+Then run the result through `wkhtmltopdf`.
+
+Markdown file:
+{{ figure(file="example-md-java.jpg") }}
+
+TXT file:
+{{ figure(file="example-txt-java.jpg") }}
+
+
+## pandoc
+
+Command:
+
+```
+pandoc -f markdown -t html -o test.pdf microsite/docs/doc.md
+```
+
+Markdown/Latex:
+{{ figure(file="example-md-pandoc-latex.jpg") }}
+
+Markdown/Html:
+{{ figure(file="example-md-pandoc-html.jpg") }}
+
+Text/Latex:
+{{ figure(file="example-txt-pandoc-latex.jpg") }}
+
+Text/Html:
+{{ figure(file="example-txt-pandoc-html.jpg") }}
+
+
+# Decision Outcome
+
+Java library "flexmark".
+
+I think all results are great. It depends on the type of document and
+what one expects to see. I guess that most people expect something
+like what pandoc-html produces for the kind of files docspell is
+meant for (it is not for newspaper articles, where pandoc-latex would
+be the best fit).
+
+But choosing pandoc means yet another external command to depend on.
+And the results from flexmark are really good, too. One can fiddle
+with options and css to make it look better.
+
+To not introduce another external command, the decision is to use
+flexmark and then the already existing html->pdf conversion.
diff --git a/website/site/content/docs/dev/adr/0009_convert_office_docs.md b/website/site/content/docs/dev/adr/0009_convert_office_docs.md
new file mode 100644
index 00000000..40f74c11
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0009_convert_office_docs.md
@@ -0,0 +1,205 @@
++++
+title = "Convert Office Documents"
+weight = 100
++++
+
+# Context and Problem Statement
+
+How can office documents, like `docx` or `odt`, be converted into a
+PDF file that looks as much as possible like the original?
+
+It would be nice to have a java-only solution. But if an external tool
+has a better outcome, then an external tool is fine, too.
+
+Since Docspell is free software, the tools must also be free.
+
+# Considered Options
+
+* [Apache POI](https://poi.apache.org) together with
+ [this](https://search.maven.org/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.pdf/1.0.6/jar)
+ library
+* [pandoc](https://pandoc.org/) external command
+* [abiword](https://www.abisource.com/) external command
+* [Unoconv](https://github.com/unoconv/unoconv) external command
+
+To choose an option, some documents are converted to PDF and
+compared. Only the formats `docx` and `odt` are considered here, as
+these are the most used formats. They have to look good; if an `xlsx`
+or `pptx` doesn't look so great, that is ok.
+
+Here is the native view to compare with:
+
+ODT:
+
+{{ figure(file="example-odt-native.jpg") }}
+
+## `XWPFConverter`
+
+I couldn't get any example to work. There were exceptions:
+
+```
+java.lang.IllegalArgumentException: Value for parameter 'id' was out of bounds
+ at org.apache.poi.util.IdentifierManager.reserve(IdentifierManager.java:80)
+ at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:101)
+ at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:146)
+ at org.apache.poi.xwpf.usermodel.XWPFParagraph.buildRunsInOrderFromXml(XWPFParagraph.java:135)
+ at org.apache.poi.xwpf.usermodel.XWPFParagraph.<init>(XWPFParagraph.java:88)
+ at org.apache.poi.xwpf.usermodel.XWPFDocument.onDocumentRead(XWPFDocument.java:147)
+ at org.apache.poi.POIXMLDocument.load(POIXMLDocument.java:159)
+ at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:124)
+ at docspell.convert.Testing$.withPoi(Testing.scala:17)
+ at docspell.convert.Testing$.$anonfun$run$1(Testing.scala:12)
+ at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:87)
+ at cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:355)
+ at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:376)
+ at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:316)
+ at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
+ at cats.effect.internals.PoolUtils$$anon$2$$anon$3.run(PoolUtils.scala:51)
+ at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+ at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+ at java.lang.Thread.run(Thread.java:748)
+```
+
+The converter project (not Apache POI, the other library) seems
+unmaintained. I could not find any website, and the artifact in Maven
+Central is from 2016.
+
+
+## Pandoc
+
+I know pandoc as a great tool for converting between markup
+documents. So it is tried here with office documents. It supports
+`docx` and `odt`, according to its `--list-input-formats`.
+
+From the pandoc manual:
+
+> By default, pandoc will use LaTeX to create the PDF, which requires
+> that a LaTeX engine be installed (see --pdf-engine below).
+> Alternatively, pandoc can use ConTeXt, roff ms, or HTML as an
+> intermediate format. To do this, specify an output file with a .pdf
+> extension, as before, but add the --pdf-engine option or -t context,
+> -t html, or -t ms to the command line. The tool used to generate the
+> PDF from the intermediate format may be specified using --pdf-engine.
+
+Trying with latex engine:
+
+```
+pandoc -f odt -o test.pdf example.odt
+```
+
+Results ODT:
+
+{{ figure(file="example-odt-pandoc-latex.jpg") }}
+
+
+```
+pandoc -f docx -o test.pdf example.docx
+```
+
+Results DOCX:
+
+{{ figure(file="example-docx-pandoc-latex.jpg") }}
+
+
+----
+
+Trying with context engine:
+
+```
+pandoc -f odt -t context -o test.pdf example.odt
+```
+
+Results ODT:
+
+{{ figure(file="example-odt-pandoc-context.jpg") }}
+
+
+Results DOCX:
+
+{{ figure(file="example-docx-pandoc-context.jpg") }}
+
+
+----
+
+Trying with ms engine:
+
+```
+pandoc -f odt -t ms -o test.pdf example.odt
+```
+
+Results ODT:
+
+{{ figure(file="example-odt-pandoc-ms.jpg") }}
+
+Results DOCX:
+
+{{ figure(file="example-docx-pandoc-ms.jpg") }}
+
+
+---
+
+Trying with html engine (this requires `wkhtmltopdf` to be present):
+
+```
+$ pandoc --extract-media . -f odt -t html -o test.pdf example.odt
+```
+
+Results ODT:
+
+{{ figure(file="example-odt-pandoc-html.jpg") }}
+
+Results DOCX:
+
+{{ figure(file="example-docx-pandoc-html.jpg") }}
+
+
+## Abiword
+
+Trying with:
+
+```
+abiword --to=pdf example.odt
+```
+
+Results:
+
+{{ figure(file="example-odt-abiword.jpg") }}
+
+
+Trying with a `docx` file failed. It worked with a `doc` file.
+
+
+## Unoconv
+
+Unoconv relies on libreoffice/openoffice, so installing it will result
+in installing parts of libreoffice, which is a very large dependency.
+
+Trying with:
+
+```
+unoconv -f pdf example.odt
+```
+
+Results ODT:
+
+{{ figure(file="example-odt-unoconv.jpg") }}
+
+Results DOCX:
+
+{{ figure(file="example-docx-unoconv.jpg") }}
+
+# Decision Outcome
+
+Unoconv.
+
+The results from `unoconv` are really good.
+
+Abiword is also not that bad; it didn't convert the chart, but all
+font markup is there. It would be great to not depend on something as
+big as libreoffice, but the unoconv results are so much better.
+
+Pandoc also deals very well with DOCX files (using the `context`
+engine). The only thing that was not rendered was the embedded chart
+(like abiword). But all images and font styling were present.
+
+It will be a configurable external command anyway, so users can
+replace it with a different one at any time.
diff --git a/website/site/content/docs/dev/adr/0010_convert_image_files.md b/website/site/content/docs/dev/adr/0010_convert_image_files.md
new file mode 100644
index 00000000..77542df7
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0010_convert_image_files.md
@@ -0,0 +1,190 @@
++++
+title = "Convert Image Files"
+weight = 110
++++
+
+# Context and Problem Statement
+
+How can image files be converted properly to PDF?
+
+Since there are thousands of different image formats, there will never
+be support for all. The most common containers should be supported,
+though:
+
+- jpeg (jfif, exif)
+- png
+- tiff (baseline, single page)
+
+The focus is on document images, maybe from digital cameras or
+scanners.
+
+# Considered Options
+
+* [pdfbox](https://pdfbox.apache.org/) library
+* [imagemagick](https://www.imagemagick.org/) external command
+* [img2pdf](https://github.com/josch/img2pdf) external command
+* [tesseract](https://github.com/tesseract-ocr/tesseract) external command
+
+There are no screenshots here, because they would all look the same
+on the screen. Instead, we look at the files' properties.
+
+**Input File**
+
+The input files are:
+
+```
+$ identify input/*
+input/jfif.jpg JPEG 2480x3514 2480x3514+0+0 8-bit sRGB 240229B 0.000u 0:00.000
+input/letter-en.jpg JPEG 1695x2378 1695x2378+0+0 8-bit Gray 256c 467341B 0.000u 0:00.000
+input/letter-en.png PNG 1695x2378 1695x2378+0+0 8-bit Gray 256c 191571B 0.000u 0:00.000
+input/letter-en.tiff TIFF 1695x2378 1695x2378+0+0 8-bit Grayscale Gray 4030880B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 240k
+- letter-en.jpg 467k
+- letter-en.png 191k
+- letter-en.tiff 4.0M
+
+## pdfbox
+
+Using a java library is preferred, if the quality is good enough.
+There is an
+[example](https://github.com/apache/pdfbox/blob/2cea31cc63623fd6ece149c60d5f0cc05a696ea7/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ImageToPDF.java)
+for this exact use case.
+
+This is the sample code:
+
+``` scala
+import java.nio.file.{Files, Paths}
+import javax.imageio.ImageIO
+
+import cats.effect.ExitCode
+import org.apache.pdfbox.pdmodel.{PDDocument, PDPage, PDPageContentStream}
+import org.apache.pdfbox.pdmodel.common.PDRectangle
+import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory
+
+def imgtopdf(file: String): ExitCode = {
+  val jpg = Paths.get(file).toAbsolutePath
+  if (!Files.exists(jpg)) {
+    sys.error(s"file doesn't exist: $jpg")
+  }
+  // create a document with a single A4 page
+  val pd   = new PDDocument()
+  val page = new PDPage(PDRectangle.A4)
+  pd.addPage(page)
+
+  // read the image; twelvemonkeys extends the formats ImageIO can read
+  val bimg = ImageIO.read(jpg.toFile)
+  val img  = LosslessFactory.createFromImage(pd, bimg)
+
+  // draw the image scaled to the full page
+  val stream = new PDPageContentStream(pd, page)
+  stream.drawImage(img, 0, 0, PDRectangle.A4.getWidth, PDRectangle.A4.getHeight)
+  stream.close()
+
+  pd.save("test.pdf")
+  pd.close()
+
+  ExitCode.Success
+}
+```
+
+Using pdfbox 2.0.18 and twelvemonkeys 3.5. Running time: `1384ms`
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129660B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 1.1M
+- letter-en.jpg 142k
+- letter-en.png 142k
+- letter-en.tiff 142k
+
+## img2pdf
+
+This is a python tool that adds the image to the pdf without
+re-encoding it.
+
+Using version 0.3.1. Running time: `323ms`.
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129708B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 241k
+- letter-en.jpg 468k
+- letter-en.png 191k
+- letter-en.tiff 192k
+
+## ImageMagick
+
+The well-known imagemagick tool can convert images to pdfs, too.
+
+Using version 6.9.10-71. Running time: `881ms`.
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 134873B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 360100B 0.000u 0:00.000
+letter-en.png.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 300k
+- letter-en.jpg 390k
+- letter-en.png 180k
+- letter-en.tiff 5.1M
+
+
+## Tesseract
+
+Docspell already relies on tesseract for doing OCR. And in contrast
+to all other candidates, it can create PDFs that are searchable. Of
+course, this results in a much longer running time, which cannot be
+compared to the times of the other options.
+
+```
+tesseract doc3.jpg out -l deu pdf
+```
+
+It can also create both outputs in one go:
+
+```
+tesseract doc3.jpg out -l deu pdf txt
+```
+
+Using tesseract 4. Running time: `6661ms`
+
+```
+$ identify *.pdf
+tesseract/jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 130535B 0.000u 0:00.000
+tesseract/letter-en.jpg.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+tesseract/letter-en.png.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+tesseract/letter-en.tiff.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 246k
+- letter-en.jpg 473k
+- letter-en.png 183k
+- letter-en.tiff 183k
+
+
+# Decision
+
+Tesseract.
+
+To avoid using more external tools, imagemagick and img2pdf are not
+chosen, even though img2pdf shows the best results and is the
+fastest.
+
+The pdfbox library would be the favorite, because the results are
+good and, with the
+[twelvemonkeys](https://github.com/haraldk/TwelveMonkeys) library,
+many image formats are supported. The priority is to avoid more
+external commands if possible.
+
+But since there already is a dependency on tesseract and it can
+create searchable PDFs, the decision is to use tesseract for
+this. Then image-only PDFs can be converted to searchable PDFs with
+images. And text extraction is required anyway.
diff --git a/website/site/content/docs/dev/adr/0011_extract_text.md b/website/site/content/docs/dev/adr/0011_extract_text.md
new file mode 100644
index 00000000..d27ab83d
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0011_extract_text.md
@@ -0,0 +1,76 @@
++++
+title = "Extract Text from Files"
+weight = 120
++++
+
+
+# Context and Problem Statement
+
+With support for more file types, there must be a way to extract text
+from all of them. It is better to extract text from the source files,
+rather than from the converted PDF file.
+
+There are multiple options and multiple file types. Again, the
+priority is to use a java/scala library to reduce external
+dependencies.
+
+# Considered Options
+
+## MS Office Documents
+
+There is only one library I know: [Apache
+POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
+However, it doesn't support the OpenDocument format (odt and ods).
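+
+A minimal sketch of extracting text from a `docx` file with POI (not
+the actual implementation):
+
+``` scala
+import java.io.FileInputStream
+
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor
+import org.apache.poi.xwpf.usermodel.XWPFDocument
+
+def extractDocx(path: String): String = {
+  val doc = new XWPFDocument(new FileInputStream(path))
+  try new XWPFWordExtractor(doc).getText
+  finally doc.close()
+}
+```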
+
+## OpenDocument Format
+
+There are two libraries:
+
+- [Apache Tika Parser](https://tika.apache.org/)
+- [ODFToolkit](https://github.com/tdf/odftoolkit)
+
+*Tika:* The tika-parsers package contains an opendocument parser for
+extracting text. But it has a huge dependency tree, since it is a
+super-package containing a parser for almost every common file type.
+
+*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
+and also pulls in quite a few dependencies (though not as many as
+tika-parsers). That is not too bad, since it is a library for
+manipulating opendocument files, but all that is needed here is text
+extraction. I created tests that extracted text from my odt/ods
+files. It worked at first sight, but running the tests in a loop
+resulted in strange NullPointerExceptions (it only worked on the
+first run).
+
+## Richtext
+
+Richtext is supported by the JDK (using the `RTFEditorKit` from
+Swing).
+
+## PDF
+
+For "image" pdf files, tesseract is used. For "text" PDF files, the
+library [Apache PDFBox](https://pdfbox.apache.org) can be used.
+
+There also is [iText](https://github.com/itext/itext7) with a AGPL
+license.
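+
+Text extraction from a "text" PDF with PDFBox is small enough to
+sketch here (PDFBox 2.x API):
+
+``` scala
+import java.io.File
+
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.text.PDFTextStripper
+
+def extractPdf(path: String): String = {
+  val doc = PDDocument.load(new File(path))
+  try new PDFTextStripper().getText(doc)
+  finally doc.close()
+}
+```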
+
+## Images
+
+For images and "image" PDF files, there is already tesseract in place.
+
+## HTML
+
+HTML must be converted into a PDF file before text can be extracted.
+
+## Text/Markdown
+
+These files can be used as-is, obviously.
+
+
+# Decision Outcome
+
+- MS Office files: POI library
+- Open Document files: Tika, but integrating the few source files that
+ make up the open document parser. Due to its huge dependency tree,
+ the library is not added.
+- PDF: Apache PDFBox. I know this library better than iText.
diff --git a/website/site/content/docs/dev/adr/0012_periodic_tasks.md b/website/site/content/docs/dev/adr/0012_periodic_tasks.md
new file mode 100644
index 00000000..e87d64e1
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0012_periodic_tasks.md
@@ -0,0 +1,103 @@
++++
+title = "Periodic Tasks"
+weight = 130
++++
+
+# Context and Problem Statement
+
+Currently there is a `Scheduler` that consumes tasks off a queue in
+the database. This allows multiple job executors to run in parallel,
+racing for the next job to execute. This is for executing tasks
+immediately – as long as there are enough resources.
+
+What is missing is a component that maintains periodic tasks. The
+reason for this is to have housekeeping tasks that run regularly and
+clean up stale or unused data. Later, users should be able to create
+periodic tasks, for example to read e-mails from an inbox or to be
+notified of due items.
+
+The problem, again, is that it must work with multiple job executor
+instances running at the same time. This is the same pattern as with
+the `Scheduler`: it must be ensured that a task runs only once at a
+time. Multiple job executors must not schedule a periodic task more
+than once. If a periodic task takes longer than the time between
+runs, it must wait for the next interval.
+
+
+# Considered Options
+
+1. Adding a `timer` and `nextrun` field to the current `job` table
+2. Creating a separate table for periodic tasks
+
+# Decision Outcome
+
+The second option.
+
+For internal housekeeping tasks, it may suffice to reuse the existing
+`job` queue by adding more fields such that a job may be considered
+periodic. But this conflates what the `Scheduler` is doing now
+(executing tasks as soon as possible while being bound to some
+resource limits) with a completely different concern.
+
+There will be a new `PeriodicScheduler` that works on a new table in
+the database that represents periodic tasks. This table will share
+fields with the `job` table, to be able to create `RJob` records.
+This new component only takes care of periodically submitting jobs to
+the job queue, such that the `Scheduler` will eventually pick them up
+and run them. If a task cannot run (for example due to resource
+limitations), the periodic scheduler can do nothing but wait and try
+again next time.
+
+```sql
+CREATE TABLE "periodic_task" (
+ "id" varchar(254) not null primary key,
+ "enabled" boolean not null,
+ "task" varchar(254) not null,
+ "group_" varchar(254) not null,
+ "args" text not null,
+ "subject" varchar(254) not null,
+ "submitter" varchar(254) not null,
+ "priority" int not null,
+ "worker" varchar(254),
+ "marked" timestamp,
+ "timer" varchar(254) not null,
+ "nextrun" timestamp not null,
+ "created" timestamp not null
+);
+```
+
+Preparing for later features, periodic tasks will at some point be
+created by users, so it should be possible to disable/enable them.
+The next six properties are needed to insert jobs into the `job`
+table. The `worker` field (and `marked`) are used to mark a periodic
+job as "being worked on by a job executor".
+
+The `timer` is the schedule, which is a
+[systemd-like](https://man.cx/systemd.time#heading7) calendar event
+string. It is parsed by [this
+library](https://github.com/eikek/calev). The `nextrun` field stores
+the timestamp of the next time the task needs to be executed. This is
+needed to query the table for the next due task.
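+
+A rough sketch of how the `timer`/`nextrun` pair can be handled with
+the calev library (method names as in its readme; treat this as an
+illustration):
+
+``` scala
+import java.time.ZonedDateTime
+
+import com.github.eikek.calev.CalEvent
+
+// systemd-like calendar event: every day at 2am
+val timer = CalEvent.unsafe("*-*-* 02:00:00")
+
+// the timestamp to store in the `nextrun` column
+val nextRun: Option[ZonedDateTime] = timer.nextElapse(ZonedDateTime.now)
+```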
+
+The `PeriodicScheduler` works roughly like this:
+
+On startup:
+- Remove stale worker values. If the process has been killed, there
+ may be marked tasks which must be cleared now.
+
+Main-Loop:
+0. Cancel current scheduled notify (see 4. below)
+1. get next (= earliest & enabled) periodic job
+2. if none: stop
+3. if triggered (= `nextrun <= 'now'`):
+ - Mark periodic task. On fail: goto 1.
+ - Submit new job into the jobqueue:
+ - Update `nextrun` field
+ - Check for non-final jobs of that name. This is required to not
+ run the same periodic task multiple times concurrently.
+ - if exist: goto 4.
+ - if not exist: submit job
+ - Unmark periodic task
+4. if the next run is in the future
+   - schedule a notify: notify self to run again when the task's
+     schedule next triggers
diff --git a/website/site/content/docs/dev/adr/0013_archive_files.md b/website/site/content/docs/dev/adr/0013_archive_files.md
new file mode 100644
index 00000000..2f74745f
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0013_archive_files.md
@@ -0,0 +1,42 @@
++++
+title = "Archive Files"
+weight = 140
++++
+
+
+# Context and Problem Statement
+
+Docspell should have support for files that contain the actual files
+that matter, like zip files and other such containers. It should
+extract their contents automatically.
+
+Since docspell should never drop or modify user data, the archive file
+must be present in the database. And it must be possible to download
+the file unmodified.
+
+On the other hand, the files inside need to be text-analysed and
+converted to PDF files.
+
+# Decision Outcome
+
+There is currently a table `attachment_source` which holds references
+to "original" files. These are the files as uploaded by the user,
+before they are converted to PDF. Archive files add a subtlety to
+this: in the case of an archive, an `attachment_source` is the
+original (non-archive) file inside the archive.
+
+The archive file itself will be stored in a separate table `attachment_archive`.
+
+Example: uploading a `files.zip` ZIP file containing `report.jpg`:
+
+- `attachment_source`: report.jpg
+- `attachment`: report.pdf
+- `attachment_archive`: files.zip
+
+Archives may contain other archives; the inner archives will not be
+saved. The archive file is extracted recursively, until no known
+archive files are found.
+
+# Initial Support
+
+Initial support is implemented for ZIP files and EML files (e-mails).
diff --git a/website/site/content/docs/dev/adr/0014_fulltext_search_engine.md b/website/site/content/docs/dev/adr/0014_fulltext_search_engine.md
new file mode 100644
index 00000000..e979eeae
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0014_fulltext_search_engine.md
@@ -0,0 +1,47 @@
++++
+title = "Fulltext Search Engine"
+weight = 150
++++
+
+It should be possible to search the contents of all documents.
+
+# Context and Problem Statement
+
+To allow searching the documents' contents efficiently, a separate
+index is necessary. The "de-facto standard" for fulltext search on
+the JVM is something backed by [Lucene](https://lucene.apache.org).
+Another option is to use an RDBMS that supports fulltext search.
+
+This adds another component to the mix, which increases the
+complexity of the setup and the software. Since docspell works great
+without this feature, it shouldn't have a huge impact on the
+application, i.e. if the fulltext search component is down or broken,
+docspell should still work (only the fulltext search is then not
+working).
+
+# Considered Options
+
+* [Apache SOLR](https://lucene.apache.org/solr)
+* [ElasticSearch](https://www.elastic.co/elasticsearch/)
+* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html)
+* All of them or a subset
+
+# Decision Outcome
+
+If docspell is running on PostgreSQL, it would be nice to also use it
+for fulltext search to save the cost of running another component. But
+I don't want to lock the database to PostgreSQL *only* because of the
+fulltext search feature.
+
+ElasticSearch and Apache SOLR are quite similar in features. SOLR is
+part of Lucene and therefore lives in the Apache ecosystem. I would
+choose SOLR over ElasticSearch, because I have used it before.
+
+The last option (supporting all of them) is interesting, since it
+would make it possible to use PostgreSQL for fulltext search for
+those who use PostgreSQL as the database for docspell.
+
+As a first step, identify what docspell needs from a fulltext search
+component and create this interface, plus an implementation for
+Apache SOLR. This enables all users to use the fulltext search
+feature. As a later step, an implementation based on PostgreSQL
+and/or ElasticSearch could be provided, too.
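+
+Such an interface could look roughly like this (a hypothetical
+sketch; all names here are made up):
+
+``` scala
+final case class TextData(itemId: String, collective: String, text: String)
+final case class FtsQuery(collective: String, query: String, limit: Int)
+final case class FtsResult(itemIds: List[String])
+
+// one implementation per backend: SOLR first, others possibly later
+trait FtsClient[F[_]] {
+  def indexData(data: List[TextData]): F[Unit]
+  def search(q: FtsQuery): F[FtsResult]
+  def clearAll: F[Unit]
+}
+```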
diff --git a/website/site/content/docs/dev/adr/0015_convert_pdf_files.md b/website/site/content/docs/dev/adr/0015_convert_pdf_files.md
new file mode 100644
index 00000000..445bbebd
--- /dev/null
+++ b/website/site/content/docs/dev/adr/0015_convert_pdf_files.md
@@ -0,0 +1,64 @@
++++
+title = "Convert PDF Files"
+weight = 160
++++
+
+# Context and Problem Statement
+
+Some PDFs contain only images (when coming from a scanner), and
+therefore one is not able to click into the PDF and select text for
+copy&paste. It is also not searchable in a PDF viewer. These are real
+shortcomings that can be fixed, especially since there is already OCR
+built in.
+
+For images, this already works, as tesseract is used to create the
+PDF files. Tesseract creates the files with an additional text layer
+containing the OCRed text.
+
+# Considered Options
+
+* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
+ OCR text layer to scanned PDF files, allowing them to be searched
+
+
+## ocrmypdf
+
+This is a very nice python tool that uses tesseract to do OCR on each
+page and adds the extracted text as a PDF text layer to the page.
+Additionally, it creates PDF/A-type PDFs, which are great for
+archiving. This fixes exactly the shortcomings stated above.
+
+### Integration
+
+Docspell already has this built in for images. When converting images
+to a PDF (which is done early in processing), the process creates a
+text file and a PDF file. Docspell then sets the text in this step,
+and the text extraction step skips its work if there is already text
+available.
+
+It would be possible to use ocrmypdf's `--sidecar` option to also
+create a text file of the extracted text in one run (exactly like it
+works for tesseract). But for "text" PDFs, ocrmypdf writes an info
+message into this text file:
+
+```
+[OCR skipped on page 1][OCR skipped on page 2]
+```
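+
+For reference, such a run would look roughly like this (using
+ocrmypdf's documented `--sidecar` option):
+
+```
+ocrmypdf --sidecar out.txt input.pdf output.pdf
+```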
+
+Docspell cannot reliably tell whether this is extracted text or not;
+it would be required to load the PDF and check its contents. This is
+a bit unfortunate, because otherwise everything would just work. So
+it requires a (small) change in the text-extraction step: by default,
+text extraction happens on the source file, but for PDFs it should
+now run on the converted file, to avoid running OCR twice.
+
+The converted PDF file is either a text PDF in the first place, where
+ocrmypdf would only convert it to a PDF/A file, or it is a converted
+file containing the OCRed text as a PDF layer. If ocrmypdf is
+disabled, the converted file and the source file are the same for
+PDFs.
+
+# Decision Outcome
+
+Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
+distributed under the GPL-3 license.
diff --git a/website/site/content/docs/dev/adr/_index.md b/website/site/content/docs/dev/adr/_index.md
new file mode 100644
index 00000000..83de24d2
--- /dev/null
+++ b/website/site/content/docs/dev/adr/_index.md
@@ -0,0 +1,14 @@
++++
+title = "ADRs"
+description = "Contains some ADRs, which are internal notes on decisions made."
+weight = 300
+sort_by = "weight"
+insert_anchor_links = "right"
+template = "pages.html"
+[extra]
+mktoc = true
++++
+
+This contains a list of ADRs, most of them from very early on. They
+often just contain notes that could go nowhere else, but should still
+be captured.
diff --git a/website/site/content/docs/dev/adr/example-docx-pandoc-context.jpg b/website/site/content/docs/dev/adr/example-docx-pandoc-context.jpg
new file mode 100644
index 00000000..fdbbeed3
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-docx-pandoc-context.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-docx-pandoc-html.jpg b/website/site/content/docs/dev/adr/example-docx-pandoc-html.jpg
new file mode 100644
index 00000000..3e22ecee
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-docx-pandoc-html.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-docx-pandoc-latex.jpg b/website/site/content/docs/dev/adr/example-docx-pandoc-latex.jpg
new file mode 100644
index 00000000..fe42eedf
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-docx-pandoc-latex.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-docx-pandoc-ms.jpg b/website/site/content/docs/dev/adr/example-docx-pandoc-ms.jpg
new file mode 100644
index 00000000..50766cf7
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-docx-pandoc-ms.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-docx-unoconv.jpg b/website/site/content/docs/dev/adr/example-docx-unoconv.jpg
new file mode 100644
index 00000000..7acf7c4d
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-docx-unoconv.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-html-native.jpg b/website/site/content/docs/dev/adr/example-html-native.jpg
new file mode 100644
index 00000000..91ba500f
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-html-native.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-html-pandoc-html.jpg b/website/site/content/docs/dev/adr/example-html-pandoc-html.jpg
new file mode 100644
index 00000000..79235243
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-html-pandoc-html.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-html-pandoc-latex.jpg b/website/site/content/docs/dev/adr/example-html-pandoc-latex.jpg
new file mode 100644
index 00000000..0c6cc22f
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-html-pandoc-latex.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-html-unoconv.jpg b/website/site/content/docs/dev/adr/example-html-unoconv.jpg
new file mode 100644
index 00000000..3d4d0f4e
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-html-unoconv.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-html-wkhtmltopdf.jpg b/website/site/content/docs/dev/adr/example-html-wkhtmltopdf.jpg
new file mode 100644
index 00000000..e7e6fe56
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-html-wkhtmltopdf.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-md-java.jpg b/website/site/content/docs/dev/adr/example-md-java.jpg
new file mode 100644
index 00000000..f65e3538
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-md-java.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-md-pandoc-html.jpg b/website/site/content/docs/dev/adr/example-md-pandoc-html.jpg
new file mode 100644
index 00000000..28429746
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-md-pandoc-html.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-md-pandoc-latex.jpg b/website/site/content/docs/dev/adr/example-md-pandoc-latex.jpg
new file mode 100644
index 00000000..6e7be587
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-md-pandoc-latex.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-abiword.jpg b/website/site/content/docs/dev/adr/example-odt-abiword.jpg
new file mode 100644
index 00000000..94fa1f69
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-abiword.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-native.jpg b/website/site/content/docs/dev/adr/example-odt-native.jpg
new file mode 100644
index 00000000..18a0a416
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-native.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-pandoc-context.jpg b/website/site/content/docs/dev/adr/example-odt-pandoc-context.jpg
new file mode 100644
index 00000000..609868fa
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-pandoc-context.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-pandoc-html.jpg b/website/site/content/docs/dev/adr/example-odt-pandoc-html.jpg
new file mode 100644
index 00000000..780683c6
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-pandoc-html.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-pandoc-latex.jpg b/website/site/content/docs/dev/adr/example-odt-pandoc-latex.jpg
new file mode 100644
index 00000000..d2f43957
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-pandoc-latex.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-pandoc-ms.jpg b/website/site/content/docs/dev/adr/example-odt-pandoc-ms.jpg
new file mode 100644
index 00000000..fedf8d2f
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-pandoc-ms.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-odt-unoconv.jpg b/website/site/content/docs/dev/adr/example-odt-unoconv.jpg
new file mode 100644
index 00000000..e1a1ea22
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-odt-unoconv.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-txt-java.jpg b/website/site/content/docs/dev/adr/example-txt-java.jpg
new file mode 100644
index 00000000..3434ea18
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-txt-java.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-txt-pandoc-html.jpg b/website/site/content/docs/dev/adr/example-txt-pandoc-html.jpg
new file mode 100644
index 00000000..c46e5ebf
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-txt-pandoc-html.jpg differ
diff --git a/website/site/content/docs/dev/adr/example-txt-pandoc-latex.jpg b/website/site/content/docs/dev/adr/example-txt-pandoc-latex.jpg
new file mode 100644
index 00000000..fa25a7d4
Binary files /dev/null and b/website/site/content/docs/dev/adr/example-txt-pandoc-latex.jpg differ
diff --git a/website/site/content/docs/dev/adr/process-files.png b/website/site/content/docs/dev/adr/process-files.png
new file mode 100644
index 00000000..455b1a13
Binary files /dev/null and b/website/site/content/docs/dev/adr/process-files.png differ
diff --git a/website/site/content/docs/dev/adr/process-files.puml b/website/site/content/docs/dev/adr/process-files.puml
new file mode 100644
index 00000000..2c5330cd
--- /dev/null
+++ b/website/site/content/docs/dev/adr/process-files.puml
@@ -0,0 +1,43 @@
+@startuml
+scale 1200 width
+title: Processing Files
+skinparam monochrome true
+skinparam backgroundColor white
+skinparam rectangle {
+ roundCorner<> 25
+ roundCorner<