Initial website

Eike Kettner 2020-07-27 22:13:22 +02:00
parent dbd0f3ff97
commit f8c6f79b10
160 changed files with 8854 additions and 64 deletions

.gitignore vendored

@ -1,7 +1,15 @@
#artwork/*.png
target/
node_modules/
dev.conf
elm-stuff
result
_site/
*.qcow2
/website/site/content/docs/changelog/
/website/site/public/
/website/site/static/openapi/
/website/site/static/js/bundle.js
/website/site/templates/shortcodes/server.conf
/website/site/templates/shortcodes/sample-exim.conf
/website/site/templates/shortcodes/joex.conf

build.sbt

@ -300,6 +300,7 @@ val restapi = project.in(file("modules/restapi")).
openapiTargetLanguage := Language.Scala,
openapiPackage := Pkg("docspell.restapi.model"),
openapiSpec := (Compile/resourceDirectory).value/"docspell-openapi.yml",
openapiStaticArgs := Seq("-l", "html2")
).dependsOn(common)
val joexapi = project.in(file("modules/joexapi")).
@ -422,78 +423,53 @@ val restserver = project.in(file("modules/restserver")).
-// --- Microsite Documentation
+// --- Website Documentation
-val microsite = project.in(file("modules/microsite")).
+val website = project.in(file("website")).
disablePlugins(RevolverPlugin).
enablePlugins(MicrositesPlugin).
disablePlugins(ReleasePlugin).
settings(sharedSettings).
settings(
name := "docspell-microsite",
name := "docspell-website",
publishArtifact := false,
skip in publish := true,
micrositeFooterText := Some(
"""
|<p>&copy; 2020 <a href="https://github.com/eikek/docspell">Docspell, v{{site.version}}</a></p>
|""".stripMargin
),
micrositeName := "Docspell",
micrositeDescription := "Auto-tagging Document Organizer",
micrositeDocumentationUrl := "doc",
micrositeDocumentationLabelDescription := "Documentation",
micrositeFavicons := Seq(microsites.MicrositeFavicon("favicon.png", "96x96")),
micrositeAuthor := "eikek",
micrositeGithubOwner := "eikek",
micrositeGithubRepo := "docspell",
micrositeGitterChannel := false,
micrositeShareOnSocial := false,
micrositeHighlightLanguages ++= Seq("json", "javascript"),
micrositeEditButton := Some(microsites.MicrositeEditButton("Improve this page", "/edit/master/modules/microsite/docs/{{ page.path }}")),
fork in run := true,
micrositeCompilingDocsTool := WithMdoc,
mdocVariables := Map(
"VERSION" -> version.value,
"PVERSION" -> version.value.replace('.', '_')
),
micrositeExtraMdFiles := Map(
file("Changelog.md") -> ExtraMdFileConfig(
"changelog.md",
"docs",
Map("title" -> "Changelog", "permalink" -> "changelog")
)
),
Compile/resourceGenerators += Def.task {
val jekyllOut = resourceManaged.value/"main"/"jekyll"
val templateOut = baseDirectory.value/"site"/"templates"/"shortcodes"
val staticOut = baseDirectory.value/"site"/"static"/"openapi"
IO.createDirectories(Seq(templateOut, staticOut))
val logger = streams.value.log
val templates = Seq(
(resourceDirectory in (restserver, Compile)).value / "reference.conf" -> jekyllOut /"_includes"/"server.conf",
(resourceDirectory in (joex, Compile)).value / "reference.conf" -> jekyllOut/"_includes"/"joex.conf",
(LocalRootProject / baseDirectory).value / "tools" / "exim" / "exim.conf" -> jekyllOut/ "_includes"/"sample-exim.conf"
)
val res1 = templates.map { case (s, t) =>
logger.info(s"Copying $s -> $t")
IO.write(t, "{% raw %}\n")
IO.append(t, IO.readBytes(s))
IO.write(t, "\n{% endraw %}", append = true)
t
}
val files = Seq(
(resourceDirectory in (restapi, Compile)).value/"docspell-openapi.yml" -> jekyllOut/"openapi"/"docspell-openapi.yml"
(resourceDirectory in (restserver, Compile)).value / "reference.conf" -> templateOut /"server.conf",
(resourceDirectory in (joex, Compile)).value / "reference.conf" -> templateOut/"joex.conf",
(LocalRootProject / baseDirectory).value / "tools" / "exim" / "exim.conf" -> templateOut/"sample-exim.conf",
(resourceDirectory in (restapi, Compile)).value/"docspell-openapi.yml" -> staticOut/"docspell-openapi.yml",
(restapi/Compile/openapiStaticDoc).value -> staticOut/"docspell-openapi.html"
)
IO.copy(files)
res1 ++ files.map(_._2)
files.map(_._2)
}.taskValue,
Compile/resourceGenerators += Def.task {
val staticDoc = (restapi/Compile/openapiStaticDoc).value
val target = resourceManaged.value/"main"/"jekyll"/"openapi"/"docspell-openapi.html"
IO.copy(Seq(staticDoc -> target))
val changelog = (LocalRootProject / baseDirectory).value / "Changelog.md"
val targetDir = baseDirectory.value/"site"/"content"/"docs"/"changelog"
IO.createDirectory(targetDir)
val target = targetDir/"_index.md"
IO.write(target, """|+++
|title = "Changelog"
|description = "See what changed between releases."
|weight = 10
|insert_anchor_links = "right"
|[extra]
|maketoc = false
|+++
|""".stripMargin)
IO.append(target, IO.readBytes(changelog))
Seq(target)
}.taskValue
}.taskValue
)
val root = project.in(file(".")).
settings(sharedSettings).
settings(noPublish).


@ -97,7 +97,7 @@ the nix package manager and to integrate it into NixOS.
The modules can be built by building the `configuration-test.nix` file
together with some nixpkgs version. For example:
-``` shell
+``` bash
nixos-rebuild build-vm -I nixos-config=./configuration-test.nix \
-I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/nixos-19.09.tar.gz
```
@ -108,21 +108,21 @@ the system configuration can be found behind the `./result/system`
symlink. So it is possible to look at the generated systemd config for
example:
-``` shell
+``` bash
cat result/system/etc/systemd/system/docspell-joex.service
```
And with some more commands (there probably is an easier way…) the
config file can be checked:
-``` shell
+``` bash
cat result/system/etc/systemd/system/docspell-joex.service | grep ExecStart | cut -d'=' -f2 | xargs cat | tail -n1 | awk '{print $NF}'| sed 's/.$//' | xargs cat | jq
```
To see the module in action, the vm can be started (the first line
sets more memory for the vm):
-``` shell
+``` bash
export QEMU_OPTS="-m 2048"
./result/bin/run-docspelltest-vm
```


@ -117,7 +117,7 @@ full-text-search {
This key is required if you want docspell to drop and re-create the
entire index. This is possible via a REST call:
-``` shell
+``` bash
$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
```


@ -132,7 +132,7 @@ Download the `oauth2.py` script from
[here](https://github.com/google/gmail-oauth2-tools) and first create
an *oauth2-token*:
-``` shell
+``` bash
./oauth2.py --user=your.name@gmail.com \
--client_id=106701....d8c.apps.googleusercontent.com \
--client_secret=5Z1...Kir_t \


@ -90,7 +90,7 @@ notes about the used options (see `man curl`):
Go into the `tools/exim` directory and build the docker image:
-``` shell
+``` bash
docker build -t ds-exim:latest -f exim.dockerfile .
```
@ -114,7 +114,7 @@ variables as needed.
Finally start the container:
-``` shell
+``` bash
docker-compose up
```
@ -169,7 +169,7 @@ The mail is processed and results in an item:
However, if a mail is to an unknown collective or not to the
configured local domain, the server rejects it immediately:
-``` shell
+``` bash
fish ~> telnet localhost 25
Trying ::1...
Connected to localhost.


@ -9,6 +9,8 @@ buildFHSUserEnv {
name = "docspell-sbt";
targetPkgs = pkgs: with pkgs; [
netcat jdk8 wget which zsh dpkg sbt git elmPackages.elm ncurses fakeroot mc jekyll
zola yarn
# haskells http client needs this (to download elm packages)
iana-etc
];

website/README.md Normal file

@ -0,0 +1,30 @@
# Website
This is the docspell website and documentation.
## Building
The website is created with the [zola](https://github.com/getzola/zola)
static site generator. The (very minimal) dynamic parts are written in
Elm.
The `build.sh` script builds the site.
## Development
Install things by running `yarn install`.
Open two terminals. In the first, run:
``` shell
nix-shell --run ./run-elm.sh
```
and in the second:
``` shell
nix-shell --run "cd site && zola serve"
```
Open a browser at `localhost:1111`.

website/build.sh Executable file

@ -0,0 +1,11 @@
#!/usr/bin/env bash
set -e
yarn install
elm make --output site/static/js/bundle.js --optimize elm/Main.elm
cd site
zola build
cd ..
echo "Site is in site/public."

website/elm-analyse.json Normal file

@ -0,0 +1,9 @@
{
"excludedPaths": [
"modules/webapp/target/elm-src/"
],
"checks" : {
"ImportAll": false,
"SingleFieldRecord": false
}
}

website/elm.json Normal file

@ -0,0 +1,28 @@
{
"type": "application",
"source-directories": [
"elm"
],
"elm-version": "0.19.1",
"dependencies": {
"direct": {
"elm/browser": "1.0.2",
"elm/core": "1.0.5",
"elm/html": "1.0.0",
"elm/random": "1.0.0",
"elm-community/random-extra": "3.1.0",
"elm-explorations/markdown": "1.0.0"
},
"indirect": {
"elm/json": "1.1.3",
"elm/time": "1.0.0",
"elm/url": "1.0.0",
"elm/virtual-dom": "1.0.2",
"owanturist/elm-union-find": "1.0.0"
}
},
"test-dependencies": {
"direct": {},
"indirect": {}
}
}

website/elm/ExtraAttr.elm Normal file

@ -0,0 +1,36 @@
module ExtraAttr exposing (..)
import Html exposing (..)
import Html.Attributes exposing (..)
ariaExpanded : Bool -> Attribute msg
ariaExpanded flag =
attribute "aria-expanded"
(if flag then
"true"
else
"false"
)
ariaHidden : Bool -> Attribute msg
ariaHidden flag =
attribute "aria-hidden"
(if flag then
"true"
else
"false"
)
ariaLabel : String -> Attribute msg
ariaLabel name =
attribute "aria-label" name
role : String -> Attribute msg
role name =
attribute "role" name

website/elm/Feature.elm Normal file

@ -0,0 +1,110 @@
module Feature exposing (..)
import Html exposing (..)
import Html.Attributes exposing (..)
import Markdown
type alias Feature =
{ image : String
, header : String
, description : String
}
featureBox : Int -> Feature -> Html msg
featureBox index f =
case isOdd index of
False ->
div [ class "columns is-vcentered box mb-5" ]
[ div [ class "column is-three-quarter" ]
[ figure [ class "image is-2by1 feature-image" ]
[ img [ src f.image ] []
]
]
, div [ class "column" ]
[ h2 [ class "title" ]
[ text f.header
]
, Markdown.toHtml []
f.description
]
]
True ->
div [ class "columns is-vcentered box mb-5" ]
[ div [ class "column is-three-quarter" ]
[ h2 [ class "title" ]
[ text f.header
]
, Markdown.toHtml []
f.description
]
, div [ class "column" ]
[ figure [ class "image is-2by1 feature-image" ]
[ img [ src f.image ] []
]
]
]
features : List Feature
features =
[ { image = "img/user-feature.png"
, header = "Multi-User per Account"
, description = """
Each account (a *collective*) can have multiple users that share the
same files. For example, everyone in your family can work with your
files while using their own account with their own settings.
"""
}
, { image = "img/ocr-feature.png"
, header = "Text Extraction with OCR"
, description = """
Text is extracted from all files. For scanned documents/images, OCR is used by utilising tesseract. The text is analysed and is available for full-text search.
"""
}
, { image = "img/filetype-feature.svg"
, header = "Support for many files"
, description = """
Docspell can read many file types. ZIP and EML (e-mail file format) files are extracted and their contents imported.
"""
}
, { image = "img/convertpdf-feature.svg"
, header = "Conversion to PDF"
, description = """
All files are converted to PDF. Don't worry about the originals. Original files are stored, too, and can be downloaded untouched. When creating PDFs from image data (often returned from scanners), the resulting PDF contains the extracted text and is searchable.
"""
}
, { image = "img/fts-feature.png"
, header = "Full-Text Search"
, description = """
The extracted text of all files and some properties, like names and notes, are available for full-text search. Full-text search can also be used to further constrain the results of the search-menu where you can search by tags, correspondent, etc.
"""
}
, { image = "img/sendmail-feature.png"
, header = "Send via E-Mail"
, description = """
Users can define SMTP settings in the app and are then able to send items out via E-Mail. This is often useful to share with other people. There is e-mail-address completion from your address book, of course.
"""
}
, { image = "img/scanmailbox-feature.png"
, header = "Import Mailboxes"
, description = """
Users can define IMAP settings so that docspell can import their e-mails. This can be done periodically based on a schedule. Imported mails can be moved away into another folder or deleted.
"""
}
, { image = "img/notify-feature.png"
, header = "Notifications"
, description = """
Users can be notified by e-mail about documents whose due date is near.
"""
}
]
isOdd : Int -> Bool
isOdd num =
modBy 2 num == 1

website/elm/GetStarted.elm Normal file

@ -0,0 +1,91 @@
module GetStarted exposing (..)
import Html exposing (..)
import Html.Attributes exposing (..)
import Icons
import Markdown
getStarted : String -> List (Html msg)
getStarted version =
[ div [ class "content container is-size-5" ]
[ Markdown.toHtml []
"""Docspell consists of several components. The easiest way to get started is probably to use docker and
[docker-compose](https://docs.docker.com/compose/)."""
, Markdown.toHtml []
("""1. Clone the github repository
```bash
$ git clone https://github.com/eikek/docspell
```
Alternatively, [download](https://github.com/eikek/docspell/archive/v"""
++ version
++ """.zip) the sources and extract the zip file.
2. Change into the `docker` directory:
```bash
$ cd docspell/docker
```
3. Run `docker-compose up`:
```bash
$ export DOCSPELL_HEADER_VALUE="my-secret-123"
$ docker-compose up
```
The environment variable defines a secret that is shared between
some containers. You can define whatever you like. Please see the
[`consumedir.sh`](doc/tools/consumedir#docker) docs for additional
info.
4. Go to <http://localhost:7880>, sign up and log in. When signing up,
you can choose the same name for collective and user. Then log in
with this name and the password.
5. (Optional) Create a folder `./docs/<collective-name>` (the name you
chose for the collective at registration) and place files in there
for importing them.
The directory contains a file `docspell.conf` that you can
[modify](docs/configure) as needed.
"""
)
]
, div [ class "content container" ]
[ div [ class "notification is-info is-light" ]
[ text "If you don't use docker, there are other ways that are "
, text "described in the relevant "
, a [ href "/docs/install" ]
[ text "documentation page"
]
]
]
, div [ class "content container" ]
[ div [ class "notification is-success is-light" ]
[ div [ class "content is-medium" ]
[ h3 [ class "title" ]
[ text "Where to go from here?"
]
, ul []
[ li []
[ text "The "
, a [ href "/docs/intro" ]
[ text "introduction" ]
, text " writes about the goals and basic idea."
]
, li []
[ text "There is a comprehensive "
, a [ href "/docs" ]
[ text "documentation"
]
, text " available."
]
, li []
[ text "The source code is hosted on "
, a [ href "https://github.com/eikek/docspell" ]
[ text "github"
]
, text "."
]
]
]
]
]
]

website/elm/Icons.elm Normal file

@ -0,0 +1,54 @@
module Icons exposing (..)
import Html exposing (..)
import Html.Attributes exposing (..)
copyright : Html msg
copyright =
img [ src "icons/copyright-40.svg" ] []
infoSquared : Html msg
infoSquared =
img [ src "icons/info-square-40.svg" ] []
refresh : Html msg
refresh =
img [ src "icons/refresh-40.svg" ] []
logo : Html msg
logo =
img [ src "icons/logo-only.svg" ] []
logoMC : Html msg
logoMC =
img [ src "icons/logo-only-mc.svg" ] []
logoWidth : Int -> Html msg
logoWidth w =
img [ src "icons/logo-only.svg", width w ] []
home : Html msg
home =
img [ src "icons/home-40.svg" ] []
docs : Html msg
docs =
img [ src "icons/notes-40.svg" ] []
github : Html msg
github =
img [ src "/icons/github-40.svg" ] []
githubGreen : Html msg
githubGreen =
img [ src "/icons/github-40-green.svg" ] []

website/elm/Main.elm Normal file

@ -0,0 +1,313 @@
module Main exposing (..)
import Browser exposing (Document)
import Browser.Navigation exposing (Key)
import ExtraAttr exposing (..)
import Feature exposing (Feature)
import GetStarted
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (..)
import Icons
import Random
import Random.List
-- MAIN
main : Program Flags Model Msg
main =
Browser.element
{ init = init
, view = view
, update = update
, subscriptions = subscriptions
}
--- Model
type alias Flags =
{ version : String
}
type alias Model =
{ navbarOpen : Bool
, features : List Feature
, flags : Flags
}
type Msg
= ToggleNavbarMenu
| ShuffleFeatures
| ListShuffled (List Feature)
--- Init
viewFeatureCount : Int
viewFeatureCount =
10
init : Flags -> ( Model, Cmd Msg )
init flags =
( { navbarOpen = False
, features = List.take viewFeatureCount Feature.features
, flags = flags
}
, Cmd.none
)
shuffleFeatures : Cmd Msg
shuffleFeatures =
Random.List.shuffle Feature.features
|> Random.map (List.take viewFeatureCount)
|> Random.generate ListShuffled
--- Update
update : Msg -> Model -> ( Model, Cmd Msg )
update msg model =
case msg of
ToggleNavbarMenu ->
( { model | navbarOpen = not model.navbarOpen }
, Cmd.none
)
ShuffleFeatures ->
( model, shuffleFeatures )
ListShuffled lf ->
( { model | features = lf }
, Cmd.none
)
subscriptions : Model -> Sub Msg
subscriptions _ =
Sub.none
--- View
view : Model -> Html Msg
view model =
node "body"
[]
[ mainHero model
, featureHero model
, section [ class "section" ]
[ div [ class "container" ]
(List.indexedMap Feature.featureBox model.features
++ [ div [ class "columns box" ]
[ div [ class "column is-full" ]
[ div [ class "content has-text-centered is-medium" ]
[ text "A more complete list can be found in "
, a [ href "/docs/features" ] [ text "here" ]
, text "."
]
]
]
]
)
]
, getStartedHero model
, div [ class "section" ]
(GetStarted.getStarted model.flags.version)
, footHero model
]
footHero : Model -> Html Msg
footHero model =
footer
[ id "footer"
, class "footer"
]
[ div [ class "has-text-centered" ]
[ span []
[ text ("Docspell, " ++ model.flags.version)
]
, span [ class "pr-1 pl-1" ]
[ text " "
]
, a
[ href "https://spdx.org/licenses/GPL-3.0-or-later.html"
, target "_blank"
]
[ text "GPLv3+"
]
, span [ class "pr-1 pl-1" ]
[ text " "
]
, a
[ href "https://github.com/eikek/docspell"
, target "_blank"
]
[ text "Source Code"
]
, span [ class "pr-1 pl-1" ]
[ text " "
]
, span []
[ text "© 2020 "
]
, a
[ href "https://github.com/eikek"
, target "_blank"
]
[ text "@eikek"
]
]
]
getStartedHero : Model -> Html Msg
getStartedHero _ =
section
[ id "get-started"
, class "hero is-primary is-bold"
]
[ div [ class "hero-body" ]
[ div [ class "container" ]
[ h2 [ class "title" ]
[ text "Get Started"
]
]
]
]
featureHero : Model -> Html Msg
featureHero model =
section
[ id "feature-selection"
, class "hero is-info is-bold"
]
[ div
[ class "hero-body"
]
[ div [ class "container" ]
[ h2 [ class "title" ]
[ text "Feature Selection"
]
]
]
]
mainHero : Model -> Html Msg
mainHero model =
section
[ id "hero-main"
, class "hero is-fullheight is-primary"
]
[ div [ class "hero-head" ]
[ nav [ class "navbar" ]
[ div [ class "navbar-brand" ]
[ a
[ class "navbar-item"
, href "/"
]
[ span [ class "icon is-large" ]
[ Icons.logo
]
, text "Docspell"
]
, a
[ role "button"
, onClick ToggleNavbarMenu
, classList
[ ( "navbar-burger", True )
, ( "is-active", model.navbarOpen )
]
, ariaLabel "menu"
, ariaExpanded False
]
[ span [ ariaHidden True ] []
, span [ ariaHidden True ] []
, span [ ariaHidden True ] []
]
]
, div
[ classList
[ ( "navbar-menu", True )
, ( "is-active", model.navbarOpen )
]
]
[ div [ class "navbar-start" ]
[ a
[ href "docs/"
, class "navbar-item"
]
[ span [ class "icon" ]
[ Icons.docs
]
, span []
[ text "Documentation"
]
]
, a
[ target "_blank"
, href "https://github.com/eikek/docspell"
, class "navbar-item"
]
[ span [ class "icon" ]
[ Icons.github
]
, span []
[ text "Github"
]
]
]
]
]
]
, div [ class "hero-body" ]
[ div
[ class "container has-text-centered"
]
[ Icons.logoWidth 112
, h1 [ class "title main-title is-2" ]
[ text "Docspell"
]
, h2 [ class "subtitle is-3" ]
[ text "Simple document organizer"
]
, p [ class "content is-medium" ]
[ text "Docspell can assist in organizing your piles of "
, text "digital documents, resulting from scanners, e-mails "
, text "and other sources with miminal effort."
]
, div [ class " buttons is-centered" ]
[ a
[ class "button is-primary is-medium"
, href "#get-started"
]
[ text "Get Started"
]
, a
[ class "button is-info is-medium"
, href "#feature-selection"
]
[ text "Features"
]
]
]
]
]

website/package.json Normal file

@ -0,0 +1,6 @@
{
"license": "GPL-3.0-or-later",
"dependencies": {
"bulma": "^0.9.0"
}
}

website/run-elm.sh Executable file

@ -0,0 +1,9 @@
#!/usr/bin/env bash
CMD="elm make --output site/static/js/bundle.js --optimize elm/Main.elm"
$CMD
# rebuild whenever a file below elm/ is written
inotifywait -m -e close_write -r elm/ |
while read -r f; do
$CMD
done

website/shell.nix Normal file

@ -0,0 +1,15 @@
let
nixpkgsUnstable = builtins.fetchTarball {
url = "https://github.com/NixOS/nixpkgs-channels/archive/nixos-unstable.tar.gz";
};
pkgsUnstable = import nixpkgsUnstable { };
in
with pkgsUnstable;
mkShell {
buildInputs = [
zola
yarn
inotifyTools
];
}

website/site/config.toml Normal file

@ -0,0 +1,29 @@
# The URL the site will be built for
base_url = "https://docspell.org"
# Whether to automatically compile all Sass files in the sass directory
compile_sass = true
# Whether to do syntax highlighting
# Theme can be customised by setting the `highlight_theme` variable to a theme supported by Zola
highlight_code = true
highlight_theme = "gruvbox-dark"
# Whether to build a search index to be used later on by a JavaScript library
build_search_index = true
[link_checker]
skip_prefixes = [
"http://localhost",
"/openapi",
"https://www.abisource.com" # has bad ssl config
]
skip_anchor_prefixes = [
"https://github.com",
"https://package.elm-lang.org"
]
[extra]
# Put all your custom variables here
version = "0.9.0-SNAPSHOT"


@ -0,0 +1,3 @@
+++
redirect_to = "/docs"
+++


@ -0,0 +1,9 @@
+++
title = "Overview"
template = "overview.html"
insert_anchor_links = "right"
+++
# Note
This content is not rendered. Everything is in the template.


@ -0,0 +1,93 @@
+++
title = "Api"
description = "Contains documentation about the REST API."
weight = 70
insert_anchor_links = "right"
[extra]
mktoc = true
+++
Docspell is designed as a REST server that uses JSON to exchange
data. The REST api can be used to integrate docspell into your
workflow.
[Docspell REST Api Doc](/openapi/docspell-openapi.html)
The "raw" `openapi.yml` specification file can be found
[here](/openapi/docspell-openapi.yml).
The routes can be divided into protected and unprotected routes. The
unprotected, or open routes are at `/open/*` while the protected
routes are at `/sec/*`. Open routes don't require authenticated access
and can be used by any user. The protected routes require an
authenticated user.
## Authentication
The unprotected route `/open/auth/login` can be used to log in with
account name and password. The response contains a token that can be
used for accessing protected routes. The token is only valid for a
restricted time, which can be configured (default is 5 minutes).
New tokens can be generated using an existing valid token and the
protected route `/sec/auth/session`. This will return the same
response as above, giving a new token.
This token can be added to requests in two ways: as a cookie or a
"normal" http header. If a cookie is used, its name must be
`docspell_auth`; if a header is used, it must be named
`X-Docspell-Auth`.
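For illustration, here is a sketch in Scala using only the JDK's
`java.net.http` client (this is not docspell code; the account values
and the crude token extraction are placeholders):

``` scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}

object AuthExample {
  def main(args: Array[String]): Unit = {
    val client = HttpClient.newHttpClient()
    // log in via the unprotected route
    val login = HttpRequest.newBuilder(URI.create("http://localhost:7880/api/v1/open/auth/login"))
      .header("Content-Type", "application/json")
      .POST(HttpRequest.BodyPublishers.ofString("{\"account\": \"smith\", \"password\": \"test\"}"))
      .build()
    val body = client.send(login, HttpResponse.BodyHandlers.ofString()).body()
    // pull the token out of the JSON response (use a real JSON library in practice)
    val token = "\"token\":\"([^\"]+)\"".r
      .findFirstMatchIn(body).map(_.group(1))
      .getOrElse(sys.error("no token in response"))
    // call a protected route, passing the token in the X-Docspell-Auth header
    val insights = HttpRequest.newBuilder(URI.create("http://localhost:7880/api/v1/sec/collective/insights"))
      .header("X-Docspell-Auth", token)
      .GET()
      .build()
    println(client.send(insights, HttpResponse.BodyHandlers.ofString()).body())
  }
}
```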
## Live Api
Besides the statically generated documentation on this site, the REST
server provides Swagger-generated API documentation that allows
playing around with the api. It requires a running docspell rest
server. If it is deployed at `http://localhost:7880`, then check this
url:
```
http://localhost:7880/api/doc
```
## Examples
These examples use the great command line tool
[curl](https://curl.haxx.se/).
### Login
``` bash
$ curl -X POST -d '{"account": "smith", "password": "test"}' http://localhost:7880/api/v1/open/auth/login
{"collective":"smith"
,"user":"smith"
,"success":true
,"message":"Login successful"
,"token":"1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI="
,"validMs":300000
}
```
### Get new token
``` bash
$ curl -XPOST -H 'X-Docspell-Auth: 1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI=' http://localhost:7880/api/v1/sec/auth/session
{"collective":"smith"
,"user":"smith"
,"success":true
,"message":"Login successful"
,"token":"1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc="
,"validMs":300000
}
```
### Get some insights
``` bash
$ curl -H 'X-Docspell-Auth: 1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc=' http://localhost:7880/api/v1/sec/collective/insights
{"incomingCount":3
,"outgoingCount":1
,"itemSize":207310
,"tagCloud":{"items":[]}
}
```


@ -0,0 +1,330 @@
+++
title = "Configuration"
insert_anchor_links = "right"
description = "There are several tools distributed with docspell, like a program to watch a folder and import files to docspell."
weight = 40
[extra]
mktoc = true
+++
Docspell's executable can take one argument: a configuration file. If
that is not given, the defaults are used. The config file overrides
default values, so only values that differ from the defaults are
necessary.
This applies to both the REST server and joex.
# Important Config Options
The configuration of both components uses separate namespaces. The
configuration for the REST server is below `docspell.server`, while
the one for joex is below `docspell.joex`.
## JDBC
This configures the connection to the database. It has to be
specified for the REST server and joex. By default, an H2 database in
the `/tmp` directory is configured.
The config looks like this (both components):
``` conf
docspell.joex.jdbc {
url = ...
user = ...
password = ...
}
docspell.server.backend.jdbc {
url = ...
user = ...
password = ...
}
```
The `url` is the connection to the database. It must start with
`jdbc`, followed by the name of the database. The rest is specific to
the database used: it is either a path to a file for H2 or a
host/database url for MariaDB and PostgreSQL.
When using H2, the user and password can be chosen freely on first
start, but must stay the same on subsequent starts. Usually, the user
is `sa` and the password is left empty. Additionally, the url must
include these options:
```
;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE
```
### Examples
PostgreSQL:
```
url = "jdbc:postgresql://localhost:5432/docspelldb"
```
MariaDB:
```
url = "jdbc:mariadb://localhost:3306/docspelldb"
```
H2:
```
url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
```
## Full-Text Search: SOLR
[Apache SOLR](https://lucene.apache.org/solr) is used to provide the
full-text search. Both docspell components must provide the same
connection setup. This is defined in the `full-text-search.solr`
subsection:
``` conf
...
full-text-search {
enabled = true
...
solr = {
url = "http://localhost:8983/solr/docspell"
}
}
```
The default configuration at the end of this page contains more
information about each setting.
The `solr.url` is the mandatory setting that you need to change to
point to your SOLR instance. Then you need to set the `enabled` flag
to `true`.
When installing docspell manually, just install solr and create a core
as described in the [solr
documentation](https://lucene.apache.org/solr/guide/8_4/installing-solr.html).
That will provide you with the connection url (the last part is the
core name).
While the `full-text-search.solr` options are the same for joex and
the restserver, there are some settings that differ. The restserver
has this additional setting, that may be of interest:
``` conf
full-text-search {
recreate-key = "test123"
}
```
This key is required if you want docspell to drop and re-create the
entire index. This is possible via a REST call:
``` bash
$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
```
Here, `test123` is the key defined with `recreate-key`. If it is
empty (the default), this REST call is disabled. Otherwise, the POST
request will submit a system task that is eventually executed by a
joex instance.
Using this endpoint, the index will be re-created. This is sometimes
necessary, for example if you upgrade SOLR or delete the core to
provide a new one (see
[here](https://lucene.apache.org/solr/guide/8_4/reindexing.html) for
details). Note that a collective can also re-index their data using a
similar endpoint; but that only deletes their data and doesn't do
a full re-index.
The solr index doesn't contain any new information; it can be
regenerated at any time using the above REST call. Thus it doesn't
need to be backed up.
## Bind
The host and port the http server binds to. This applies to both
components. The joex component also exposes a small REST api to
inspect its state and notify the scheduler.
``` conf
docspell.server.bind {
address = localhost
port = 7880
}
docspell.joex.bind {
address = localhost
port = 7878
}
```
By default, it binds to `localhost` and some predefined port. This
must be changed if the components run on different machines.
## Baseurl
The base url is an important setting that defines the http URL where
the corresponding component can be reached. It applies to both
components. For a joex component, the url must be resolvable from a
REST server component. The REST server also uses this url to create
absolute urls and to configure the authentication cookie.
By default, it is built using the information from the `bind` setting.
```
docspell.server.baseurl = ...
docspell.joex.baseurl = ...
```
### Examples
```
docspell.server.baseurl = "https://docspell.example.com"
docspell.joex.baseurl = "http://192.168.101.10"
```
## App-id
The `app-id` is the identifier of the corresponding instance. It *must
be unique* for all instances. By default the REST server uses `rest1`
and joex `joex1`. It is recommended to overwrite this setting to have
an explicit and stable identifier.
``` conf
docspell.server.app-id = "rest1"
docspell.joex.app-id = "joex1"
```
## Registration Options
This defines if and how new users can create accounts. There are 3
options:
- *closed*: no new user can sign up
- *open*: new users can sign up
- *invite*: new users can sign up but require an invitation key
This applies only to the REST server component.
``` conf
docspell.server.signup {
mode = "open"
# If mode == 'invite', a password must be provided to generate
# invitation keys. It must not be empty.
new-invite-password = ""
# If mode == 'invite', this is the period an invitation token is
# considered valid.
invite-time = "3 days"
}
```
The mode `invite` is intended to open the application only to some
users. The admin can create these invitation keys and distribute them
to the desired people. For this, the `new-invite-password` must be
given. The idea is that only the person who installs docspell knows
this. If it is not set, then invitations won't work. New invitation
keys can be generated from within the web application or via REST
calls (using `curl`, for example).
``` bash
curl -X POST -d '{"password":"blabla"}' "http://localhost:7880/api/v1/open/signup/newinvite"
```
## Authentication
Authentication works in two ways:
- with an account-name / password pair
- with an authentication token
The initial authentication must occur with an account-name/password
pair. This will generate an authentication token that is valid for
some time. Subsequent calls to secured routes can use this token. The
token can be given as a normal http header or via a cookie header.
These settings apply only to the REST server.
``` conf
docspell.server.auth {
server-secret = "hex:caffee" # or "b64:Y2FmZmVlCg=="
session-valid = "5 minutes"
}
```
The `server-secret` is used to sign the token. If multiple REST
servers are deployed, all must share the same server secret. Otherwise
tokens from one instance are not valid on another instance. The secret
can be given as a Base64 encoded string or in hex form. Use the prefix
`b64:` or `hex:`, respectively. If no prefix is given, the UTF8 bytes
of the string are used.
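As an illustration of these rules (a sketch, not docspell's actual
code), reading such a secret could look like this in Scala:

``` scala
import java.util.Base64
import java.nio.charset.StandardCharsets

// hypothetical helper mirroring the prefix rules described above
def readSecret(s: String): Array[Byte] =
  if (s.startsWith("hex:"))
    s.drop(4).grouped(2).map(Integer.parseInt(_, 16).toByte).toArray
  else if (s.startsWith("b64:"))
    Base64.getDecoder.decode(s.drop(4))
  else
    s.getBytes(StandardCharsets.UTF_8)

readSecret("hex:caffee")       // the three bytes 0xca 0xff 0xee
readSecret("b64:Y2FmZmVlCg==") // the decoded base64 bytes
```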
The `session-valid` setting determines how long a token is valid. This
can be just a few minutes; the web application obtains new ones
periodically. So a short time is recommended.
# File Format
The format of the configuration files can be
[HOCON](https://github.com/lightbend/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation),
JSON or whatever the used [config
library](https://github.com/lightbend/config) understands. The default
values below are in HOCON format, which is recommended, since it
allows comments and has some [advanced
features](https://github.com/lightbend/config#features-of-hocon).
Please refer to their documentation for more on this.
Here are the default configurations.
# Default Config
## Rest Server
{{ incl_conf(path="templates/shortcodes/server.conf") }}
## Joex
{{ incl_conf(path="templates/shortcodes/joex.conf") }}
# Logging
By default, docspell logs to stdout. This works well, when managed by
systemd or other inits. Logging is done by
[logback](https://logback.qos.ch/). Please refer to its documentation
for how to configure logging.
If you create your own logback config file, it can be added as an
argument to the executable using this syntax:
``` bash
/path/to/docspell -Dlogback.configurationFile=/path/to/your/logging-config-file
```
To get started, the default config looks like this:
``` xml
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<logger name="docspell" level="debug" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>
```
The `<root level="INFO">` means, that only log statements with level
"INFO" will be printed. But the `<logger name="docspell"
level="debug">` above says, that for loggers with name "docspell"
statements with level "DEBUG" will be printed, too.


@ -0,0 +1,9 @@
+++
title = "Development"
description = "Contains build instructions and other internal notes."
weight = 300
sort_by = "weight"
insert_anchor_links = "right"
template = "pages.html"
redirect_to = "/docs/dev/building"
+++


@ -0,0 +1,36 @@
+++
title = "Use Markdown Architectural Decision Records"
weight = 10
+++
# Context and Problem Statement
We want to [record architectural decisions](https://adr.github.io/)
made in this project. Which format and structure should these records
follow?
# Considered Options
* [MADR](https://adr.github.io/madr/) 2.1.0 - The Markdown Architectural Decision Records
* [Michael Nygard's template](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions) - The first incarnation of the term "ADR"
* [Sustainable Architectural
Decisions](https://www.infoq.com/articles/sustainable-architectural-design-decisions) -
The Y-Statements
* Other templates listed at
<https://github.com/joelparkerhenderson/architecture_decision_record>
* Formless - No conventions for file format and structure
# Decision Outcome
Chosen option: "MADR 2.1.0", because
* Implicit assumptions should be made explicit. Design documentation
is important to enable people understanding the decisions later on.
See also [A rational design process: How and why to fake
it](https://doi.org/10.1109/TSE.1986.6312940).
* The MADR format is lean and fits our development style.
* The MADR structure is comprehensible and facilitates usage &
maintenance.
* The MADR project is vivid.
* Version 2.1.0 is the latest one available when starting to document
ADRs.

View File

@ -0,0 +1,64 @@
+++
title = "Components"
weight = 20
+++
# Context and Problem Statement
How should the application be structured into its main components? The
goal is to be able to have multiple rest servers/webapps and multiple
document processor components working together.
# Decision Outcome
The following are the "main" modules. There may be more helper modules
and libraries that support implementing a feature.
## store
The code related to database access. It also provides the job
queue. It is designed as a library.
## joex
Joex stands for "job executor".
An application that executes jobs from the queue and therefore depends
on the `store` module. It provides the code for all tasks that can be
submitted as jobs. If no jobs are in the queue, the joex "sleeps"
and must be woken via an external request.
It provides the document processing code.
It provides a http rest server to get insight into the joex state
and also to be notified for new jobs.
## backend
It provides all the logic, except document processing, as a set of
"operations". An operation can be directly mapped to a rest
endpoint.
It is designed as a library.
## rest api
This module contains the specification for the rest server as an
`openapi.yml` file. It is packaged as a scala library that also
provides types and conversions to/from json.
The idea is that the `rest server` module can depend on it as well as
rest clients.
## rest server
This is the main application. It directly depends on the `backend`
module, and each rest endpoint maps to a "backend operation". It is
also responsible for converting the json data inside http requests
to/from types recognized by the `backend` module.
## webapp
This module provides the user interface as a web application.


@ -0,0 +1,63 @@
+++
title = "Component Interaction"
weight = 30
+++
# Context and Problem Statement
There are multiple web applications with their rest servers and there
are multiple document processors. These processes must communicate:
- once a new job is added to the queue the rest server must somehow
notify processors to wake up
- once a processor takes a job, it must propagate the progress and
outcome to all rest servers, so that the rest server can notify the
user who is currently logged in. Since it's not known which
rest-server the user is using right now, all must be notified.
# Considered Options
1. JMS (ActiveMQ or similar): Message Broker as another active
component
2. Akka: using a cluster
3. DB: Register with "call back urls"
# Decision Outcome
Choosing option 3: DB as central synchronisation point.
The reason is that this is the simplest solution and doesn't require
external libraries or more processes. The other options seem too big
a weapon for the task at hand. They are both large components
themselves and require more knowledge to use them efficiently.
It works roughly like this:
- rest servers and processors register at the database on startup each
with a unique call-back url
- and deregister on shutdown
- each component has db access
- rest servers can list all processors and vice versa
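A rough sketch of such a registration record (illustrative names, not
the actual schema):

``` scala
import java.time.Instant

// one row per running component, keyed by its unique app-id
final case class NodeRecord(
    appId: String,       // unique id from the component's config
    callbackUrl: String, // base url where this component can be reached
    created: Instant
)

// on startup a component INSERTs its row, on shutdown it DELETEs it;
// to notify processors, a rest server SELECTs all joex rows and sends
// a small http request to each callbackUrl
```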
## Positive Consequences
- complexity of the whole application is not touched
- since a lot of data must be transferred to the document processors,
this is solved by simply accessing the db. So the protocol for data
exchange is set. There is no need for other protocols that handle
large data (http chunking etc)
- uses the already existing db as synchronisation point
- no additional knowledge required
- simple to understand and so not hard to debug
## Negative Consequences
- all components must have db access. This is also a security concern:
if one of those processes is hacked, db access is possible. And it
simply is another dependency that is not really required for the joex
component
- the joex component cannot be in an untrusted environment (untrusted
from the db's point of view). For example, it is not possible to
create "personal joex" that only receive your own jobs…
- in order to know if a component is really active, one must run a
ping against the call-back url


@ -0,0 +1,93 @@
+++
title = "Encryption"
weight = 40
+++
# Context and Problem Statement
Since docspell may store important documents, it should be possible to
encrypt them on the server. It should be (almost) transparent to the
user; for example, a user must be able to log in and download a file in
clear form. That is, the server must also decrypt them.
Then all users of a collective should have access to the files. This
requires sharing the key among the users of a collective.
But, even when files are encrypted, the associated meta data is not!
So especially access to the database would allow seeing tags,
associated persons and correspondents of documents.
So in short, encryption means:
- file contents (the blobs and extracted text) is encrypted
- metadata is not
- secret keys are stored at the server (protected by a passphrase),
such that files can be downloaded in clear form
# Decision Drivers
* major driver is to provide most possible privacy for users
* even at the expense of fewer features; currently I think that the
associated meta data is enough for finding documents (i.e. full text
search is not needed)
# Considered Options
It is clear that only blobs (file contents) can be encrypted, but not
the associated metadata. And the extracted text must be encrypted,
too, obviously.
## Public Key Encryption (PKE)
With PKE, the server can automatically encrypt files using
publicly available key data. It wouldn't require a user to provide a
passphrase for encryption, only for decryption.
This would allow for first processing files (extracting text, doing
text analysis) and encrypting them (and the text) afterwards.
The public and secret keys are stored in the database. The secret key
must be protected. This can be done by encrypting the passphrase to
the secret key using each user's login password. If a user logs in, he
or she must provide the correct password. Using this password, the
private key can be unlocked. This requires storing the private key
passphrase, encrypted with every user's password, in the database. So
the whole security then depends on the users' password quality.
There are plenty of other difficulties with this approach (how about
password change, new secret keys, adding users etc).
Using this kind of encryption would protect the data against offline
attacks and also for accidental leakage (for example, if a bug in the
software would access a file of another user).
## No Encryption
If only blobs are encrypted, against which type of attack would it
provide protection?
The users must still trust the server. First, in order to provide the
wanted features (document processing), the server must see the file
contents. Then, it will receive and serve files in clear form, so it
has access to them anyways.
With that in mind, the "only" feature is to protect against "stolen
database" attacks. If the database is somehow leaked, the attackers
would only see the metadata, but not real documents. It also protects
against leakage, maybe caused by a programming error.
But the downside is that it increases complexity *a lot*. And since
this is a personal tool for personal use, is it worth the effort?
# Decision Outcome
No encryption, because of its complexity.
For now, this tool is only meant for "self deployment" and personal
use. If this changes or there is enough time, this decision should be
reconsidered.


@ -0,0 +1,40 @@
+++
title = "ISO8601 vs Millis as Date-Time transfer"
weight = 50
+++
# Context and Problem Statement
The question is whether the REST Api should return an ISO8601
formatted string in UTC timezone, or the unix time (number of
milliseconds since 1970-01-01).
There is quite some controversy about it.
- <https://stackoverflow.com/questions/47426786/epoch-or-iso8601-date-format>
- <https://nbsoftsolutions.com/blog/designing-a-rest-api-unix-time-vs-iso-8601>
In my opinion, the ISO8601 format (always UTC) is better. The reason
is better readability. But elm folks are on the other side:
- <https://package.elm-lang.org/packages/elm/time/1.0.0#iso-8601>
- <https://package.elm-lang.org/packages/rtfeldman/elm-iso8601-date-strings/latest/>
One can convert from an ISO8601 date-time string in UTC time into the
epoch millis and vice versa. So it is the same to me. There is no less
information in an ISO8601 string than in the epoch millis.
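The round trip is easy to see with `java.time` (a sketch, not docspell
code):

``` scala
import java.time.Instant

val millis = 1568142350115L
val iso    = Instant.ofEpochMilli(millis).toString // "2019-09-10T19:05:50.115Z"
val back   = Instant.parse(iso).toEpochMilli       // 1568142350115

assert(millis == back) // both encodings carry the same information
```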
To avoid confusion, all date/time values should use the same encoding.
# Decision Outcome
I go with the epoch time. Every timestamp/date-time value is
transferred as a Unix timestamp.
Reasons:
- the Elm application needs to frequently calculate with these values
to render the current waiting time etc. This is easier with plain
numbers, without having to parse dates first
- Since the UI is written with Elm, it's probably good to adopt their
style


@ -0,0 +1,134 @@
+++
title = "Joex - Job Executor"
weight = 60
+++
# Context and Problem Statement
Docspell is a multi-user application. When processing users'
documents, there must be some thought on how to distribute all the
processing jobs on a much more restricted set of resources. There
may be 100 users but only 4 cores that can process documents at a
time. Simple FIFO is not enough, since it provides an unfair
distribution. The first user who submits 20 documents would occupy
all cores for quite some time and all other users would need to wait.
This tries to find a more fair distribution among the users (strictly
meaning collectives here) of docspell.
The job executor is a separate component that will run in its own
process. It takes the next job from the "queue" and executes the
associated task. This is used to run the document processing jobs
(text extraction, text analysis etc).
1. The task execution should survive restarts. State and task code
must be recreated from some persisted state.
2. The processing should be fair with respect to collectives.
3. It must be possible to run many job executors, possibly on
different machines. This can be used to quickly enable more
processing power and removing it once the peak is over.
4. Task execution can fail and it should be possible to retry those
tasks. Reasons are that errors may be temporary (for example when
talking to a third party service), and to enable repairing without
stopping the job executor. Some errors might be easily repaired (a
program was not installed or whatever). In such a case it is good
to know that the task will be retried later.
# Considered Options
In contrast to other ADRs this is just some sketching of thoughts for
the current implementation.
1. Job descriptions are serialized and written to the database into a
table. This becomes the queue. Tasks are identified by names and a
job executor implementation must have a map of names to code to
look up the task to perform. The task's arguments are serialized into
a string and written to the database. Tasks must decode the
string. This can be conveniently done using JSON and the provided
circe decoders (see the registry sketch after this list).
2. To provide fair execution, jobs are organized into groups. When a
new job is requested from the queue, first a group is selected
using a round-robin strategy. This should ensure good enough
fairness among groups. A group maps to a collective. Within a
group, a job is selected based on priority, submitted time (fifo)
and job state (see notes about stuck jobs).
3. Allowing multiple job executors means that getting the next job can
fail due to simultaneously running transactions. It is retried until
it succeeds. Taking a job puts it into _scheduled_ state. Each job
executor has a unique (manually supplied) id and jobs are marked
with that id once it is handed to the executor.
4. When a task fails, its state is updated to state _stuck_. Stuck
jobs are retried in the future. The queue prefers to return stuck
jobs that are due at the specific point in time, ignoring the
priority hint.
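A sketch of the task lookup described in point 1 (hypothetical names;
real tasks would decode their JSON arguments with circe):

``` scala
// a task receives its serialized arguments and may fail
final case class Task(run: String => Either[String, Unit])

// the executor's map of task names to code
val registry: Map[String, Task] = Map(
  "process-item" -> Task(args => Right(println(s"processing: $args")))
)

def execute(taskName: String, args: String): Either[String, Unit] =
  registry.get(taskName)
    .toRight(s"unknown task: $taskName")
    .flatMap(_.run(args))
```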
## More Details
A job has these properties:
- id (something random)
- group
- taskname (to choose task to run)
- submitted-date
- worker (the id of the job executor)
- state, one of: waiting, scheduled, running, stuck, cancelled,
failed, success
- waiting: job has been inserted into the queue
- scheduled: job has been handed over to some executor and is
marked with the job executor id
- running: a task is currently executing
- stuck: a task has failed and is being retried eventually
- cancelled: task has finished and there was a cancel request
- failed: task has failed, exceeded the retries
- success: task has completed successfully
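These states could be modeled as a simple ADT; a Scala sketch
(illustrative, not the actual code):

``` scala
sealed trait JobState
object JobState {
  case object Waiting   extends JobState // inserted into the queue
  case object Scheduled extends JobState // handed to an executor, marked with its id
  case object Running   extends JobState // task is currently executing
  case object Stuck     extends JobState // failed, will be retried eventually
  case object Cancelled extends JobState // finished after a cancel request
  case object Failed    extends JobState // failed and exceeded the retries
  case object Success   extends JobState // completed successfully
}
```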
The queue has a `take` or `nextJob` operation that takes the worker-id
and a priority hint and goes roughly like this:
- select the next group using round-robin strategy
- select all jobs with that group, where
- state is stuck and waiting time has elapsed
- state is waiting and have the given priority if possible
- jobs are ordered by submitted time, but stuck jobs whose waiting
time elapsed are preferred
There are two priorities within a group: high and low. A configured
counting scheme determines when to select certain priority. For
example, counting scheme of `(2,1)` would select two high priority
jobs and then 1 low priority job. The `take` operation tries to prefer
this priority but falls back to the other if no job with this priority
is available.
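For illustration, a counting scheme can be seen as an infinite sequence
of priorities to try first (a sketch with hypothetical names):

``` scala
sealed trait Priority
case object High extends Priority
case object Low  extends Priority

// a (2,1) scheme: two high-priority picks, then one low-priority pick
def prioritySeq(high: Int, low: Int): Iterator[Priority] =
  Iterator.continually(List.fill(high)(High) ++ List.fill(low)(Low)).flatten

prioritySeq(2, 1).take(6).toList
// List(High, High, Low, High, High, Low)
```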
A group corresponds to a collective. Then all collectives get
(roughly) equal treatment.
Once there are no jobs in the queue, the executor goes to sleep and
must be woken to run again. If a job is submitted, the executors are
notified.
## Stuck Jobs
A job goes into _stuck_ state if the task has failed. In this
state, the task is rerun after a while until a maximum retry count is
reached.
The problem is how to notify all executors when the waiting time has
elapsed. If one executor puts a job into stuck state, it means that
all others should start looking into the queue again after `x`
minutes. It would be possible to tell all existing executors to
schedule themselves to wake up in the future, but this would miss all
executors that show up later.
The waiting time is increased exponentially after each retry (`2 ^
retry`) and it is meant as the minimum waiting time. So it is ok if
all executors wake up periodically and check for new work. Most of the
time this should not be necessary and is just a fallback if only stuck
jobs are in the queue and nothing is submitted for a long time. If the
system is used, jobs get submitted once in a while and would wake all
executors.
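The backoff rule itself is tiny; a sketch, assuming minutes as the unit
(the ADR doesn't fix one):

``` scala
import scala.concurrent.duration._

// minimum wait before retrying a stuck job: 2 ^ retryCount
def retryDelay(retryCount: Int): FiniteDuration =
  math.pow(2, retryCount.toDouble).toLong.minutes

(1 to 4).map(retryDelay) // Vector(2 minutes, 4 minutes, 8 minutes, 16 minutes)
```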


@ -0,0 +1,150 @@
+++
title = "More File Types"
weight = 70
+++
# Context and Problem Statement
Docspell currently only supports PDF files. This has simplified early
development and design a lot and so helped with starting the project.
Handling pdf files is usually easy (to view, to extract text, print
etc).
The pdf format has been chosen, because PDF files are very common and
can be viewed with many tools on many systems (i.e. non-proprietary
tools). Docspell also is a document archive and from this perspective,
it is important that documents can be viewed in 10 years and more. The
hope is that the PDF format is best suited for this. Therefore all
documents in Docspell must be accessible as PDF. The trivial solution
to this requirement is to only allow PDF files.
Support for more document types must then take care of the following:
- extracting text
- converting into pdf
- access original file
Text should be extracted from the source file, in case conversion is
not lossless. Since Docspell can already extract text from PDF files
using OCR, text can also be extracted from the converted file as a
fallback.
The original file must always be accessible. The main reason is that
all uploaded data should be accessible without any modification. And
since the conversion may not always create best results, the original
file should be kept.
# Decision Drivers
People expect that software like Docspell supports the most common
document types, like all the “office documents” (`docx`, `rtf`, `odt`,
`xlsx`, …) and images. For many people it is more common to create
those files instead of PDF. Some (older) scanners may not be able to
scan into PDF files but only to image files.
# Considered Options
This ADR does not evaluate different options. It rather documents why
this feature is realized and the thoughts that lead to how it is
implemented.
# Realization
## Data Model
The `attachment` table holds one file. There will be another table
`attachment_source` that holds the original file. It looks like this:
``` sql
CREATE TABLE "attachment_source" (
"id" varchar(254) not null primary key,
"file_id" varchar(254) not null,
"filename" varchar(254),
"created" timestamp not null,
foreign key ("file_id") references "filemeta"("id"),
foreign key ("id") references "attachment"("attachid")
);
```
The `id` is the primary key and is the same as the associated
`attachment`, creating a `1-1` relationship (well, more correct is
`0..1-1`) between `attachment` and `attachment_source`.
There will always be an `attachment_source` record for every
`attachment` record. If the original file is a PDF already, then both
tables' `file_id` columns point to the same file. But now the user can
change the filename of an `attachment` while the original filename is
preserved in `attachment_source`. It must not be possible for the user
to change anything in `attachment_source`.
The `attachment` table is not touched in order to keep current code
mostly unchanged and to have a simpler data migration. The downside
is that the data model allows an `attachment` record without
an `attachment_source` record. OTOH, a foreign key inside `attachment`
pointing to an `attachment_source` is also not correct, because it
allows the same `attachment_source` record to be associated with many
`attachment` records. This would do even more harm, in my opinion.
## Migration
Creating a new table and not altering existing ones, should simplify
data migration.
Since only PDF files were allowed and the user could not change
anything in the `attachment` table, the existing data can simply be
inserted into the new table. This presents the trivial case where the
attachment and source are the same.
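A sketch of that trivial migration via JDBC; the column names of
`attachment` (other than `attachid`, which appears in the schema above)
are assumptions and would need to match the real table:

``` scala
import java.sql.DriverManager

val conn = DriverManager.getConnection("jdbc:h2:/tmp/docspell.db", "sa", "")
try {
  // copy every attachment row into the new table; "filemetaid" and
  // "name" are hypothetical column names on "attachment"
  val n = conn.createStatement().executeUpdate(
    """INSERT INTO attachment_source (id, file_id, filename, created)
      |SELECT attachid, filemetaid, name, created FROM attachment""".stripMargin
  )
  println(s"copied $n rows")
} finally conn.close()
```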
## Processing
The first step in processing is now converting the file into a pdf. If
it already is a pdf, nothing is done. This step happens before text
extraction, so text extraction can first be tried on the source file,
and only if that fails (or is not supported) is text extracted from
the converted pdf file. All remaining steps are untouched.
If conversion is not supported for the input file, it is skipped. If
conversion fails, the error is propagated to let the retry mechanism
take care.
### What types?
Which file types should be supported? As a first step, all major
office documents, common images, plain text (i.e. markdown) and html
should be supported. In terms of file extensions: `doc`, `docx`,
`xls`, `xlsx`, `odt`, `md`, `html`, `txt`, `jpg`, `png`, `tif`.
There is always the preference to use jvm internal libraries in order
to be more platform independent and to reduce external dependencies.
But this is not always possible (like doing OCR).
{{ figure(file="process-files.png") }}
### Conversion
- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
unoconv (see [ADR 9](@/docs/dev/adr/0009_convert_office_docs.md))
- HTML (`html`): wkhtmltopdf (see [ADR 7](@/docs/dev/adr/0007_convert_html_files.md))
- Text/Markdown (`txt`, `md`): Java-Lib flexmark + wkhtmltopdf
- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
10](@/docs/dev/adr/0010_convert_image_files.md))
### Text Extraction
- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache Poi
- Office documents (`odt`, `ods`): Apache Tika (including the sources)
- HTML: not supported, extract text from converted PDF
- Images (`jpg`, `png`, `tif`): Tesseract
- Text/Markdown: n.a.
- PDF: Apache PDFBox or Tesseract
# Links
* [Convert HTML Files](@/docs/dev/adr/0007_convert_html_files.md)
* [Convert Plain Text](@/docs/dev/adr/0008_convert_plain_text.md)
* [Convert Office Documents](@/docs/dev/adr/0009_convert_office_docs.md)
* [Convert Image Files](@/docs/dev/adr/0010_convert_image_files.md)
* [Extract Text from Files](@/docs/dev/adr/0011_extract_text.md)


@ -0,0 +1,59 @@
+++
title = "Convert HTML Files"
weight = 80
+++
# Context and Problem Statement
How can HTML documents be converted into a PDF file that looks as much
as possible like the original?
It would be nice to have a java-only solution. But if an external tool
has a better outcome, then an external tool is fine, too.
Since Docspell is free software, the tools must also be free.
# Considered Options
* [pandoc](https://pandoc.org/) external command
* [wkhtmltopdf](https://wkhtmltopdf.org/) external command
* [Unoconv](https://github.com/unoconv/unoconv) external command
Native (firefox) view:
{{ figure(file="example-html-native.jpg") }}
Note: the example html is from
[here](https://www.sparksuite.com/open-source/invoice.html).
I downloaded the HTML file to disk together with its resources (using
*Save as...* in the browser).
## Pandoc
{{ figure(file="example-html-pandoc-latex.jpg") }}
{{ figure(file="example-html-pandoc-html.jpg") }}
Not showing the version using the `context` pdf-engine, since it
looked very similar to the latex variant.
## wkhtmltopdf
{{ figure(file="example-html-wkhtmltopdf.jpg") }}
## Unoconv
{{ figure(file="example-html-unoconv.jpg") }}
# Decision Outcome
wkhtmltopdf.
It shows the best results.

+++
title = "Convert Text Files"
weight = 90
+++
# Context and Problem Statement
How can plain text and markdown documents be converted into PDF
files?
Rendering images is not important here, since the files must be
self-contained when uploaded to Docspell.
The test file is the current documentation page of Docspell, found in
`microsite/docs/doc.md`.
```
---
layout: docs
position: 4
title: Documentation
---
# {page .title}
Docspell assists in organizing large amounts of PDF files that are
...
## How it works
Documents have two ...
1. You maintain a kind of address book. It should list all possible
correspondents and the concerning people/things. This grows
incrementally with each new unknown document.
2. When docspell analyzes a document, it tries to find matches within
your address ...
3. You can inspect ...
The set of meta data that docspell uses to draw suggestions from, must
be maintained ...
## Terms
In order to better understand these pages, some terms should be
explained first.
### Item
An **Item** is roughly your (pdf) document, only that an item may span
multiple files, which are called **attachments**. And an item has
**meta data** associated:
- a **correspondent**: the other side of the communication. It can be
an organization or a person.
- a **concerning person** or **equipment**: a person or thing that
this item is about. Maybe it is an insurance contract about your
car.
- ...
### Collective
The users of the application are part of a **collective**. A
**collective** is a group of users that share access to the same
items. The account name is therefore comprised of a *collective name*
and a *user name*.
All users of a collective are equal; they have same permissions to
access all...
```
Then a plain text file is tried, too (without any markup).
```
Maecenas mauris lectus, lobortis et purus mattis
Duis vehicula mi vel mi pretium
In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu.
Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut.
Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros
efficitur tincidunt. Cras justo mi, porttitor quis mattis vel,
ultricies ut purus. Ut facilisis et lacus eu cursus.
In eleifend velit vitae libero sollicitudin euismod:
- Fusce vitae vestibulum velit,
- Pellentesque vulputate lectus quis pellentesque commodo
the end.
```
# Considered Options
* [flexmark](https://github.com/vsch/flexmark-java) for markdown to
HTML, then use existing machinery described in [adr
7](@/docs/dev/adr/0007_convert_html_files.md)
* [pandoc](https://pandoc.org/) external command
## flexmark markdown library for java
Process files with [flexmark](https://github.com/vsch/flexmark-java)
and then create a PDF from the resulting html.
Using the following snippet:
``` scala
// import paths are for flexmark 0.6x; they may differ in other versions
import java.util
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import cats.effect.ExitCode
import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension
import com.vladsch.flexmark.ext.tables.TablesExtension
import com.vladsch.flexmark.html.HtmlRenderer
import com.vladsch.flexmark.parser.Parser
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}

def renderMarkdown(): ExitCode = {
  // enable table and strikethrough support
  val opts = new MutableDataSet()
  opts.set(
    Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
    util.Arrays.asList(TablesExtension.create(), StrikethroughExtension.create())
  )

  val parser   = Parser.builder(opts).build()
  val renderer = HtmlRenderer.builder(opts).build()

  // parse the markdown (or plain text) input and render it to html
  val reader = Files.newBufferedReader(Paths.get("in.txt|md"))
  val doc    = parser.parseReader(reader)
  val html   = renderer.render(doc)
  val body   = "<html><head></head><body style=\"padding: 0 5em;\">" + html + "</body></html>"

  Files.write(Paths.get("test.html"), body.getBytes(StandardCharsets.UTF_8))
  ExitCode.Success
}
```
Then run the result through `wkhtmltopdf`.
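For example, a basic invocation (using the `test.html` file produced
by the snippet above):
```
wkhtmltopdf test.html test.pdf
```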
Markdown file:
{{ figure(file="example-md-java.jpg") }}
TXT file:
{{ figure(file="example-txt-java.jpg") }}
## pandoc
Command:
```
pandoc -f markdown -t html -o test.pdf microsite/docs/doc.md
```
Markdown/Latex:
{{ figure(file="example-md-pandoc-latex.jpg") }}
Markdown/Html:
{{ figure(file="example-md-pandoc-html.jpg") }}
Text/Latex:
{{ figure(file="example-txt-pandoc-latex.jpg") }}
Text/Html:
{{ figure(file="example-txt-pandoc-html.jpg") }}
# Decision Outcome
Java library "flexmark".
I think all results are great. It depends on the type of document and
what one expects to see. I guess that most people expect something
like what pandoc-html produces for the kind of files docspell is made
for (it is not for newspaper articles, where pandoc-latex would be the
best fit).
But choosing pandoc means yet another external command to depend on.
And the results from flexmark are really good, too. One can fiddle
with options and CSS to make it look better.
To not introduce another external command, the decision is to use
flexmark and then the already existing html->pdf conversion.

+++
title = "Convert Office Documents"
weight = 100
+++
# Context and Problem Statement
How can office documents, like `docx` or `odt` be converted into a PDF
file that looks as much as possible like the original?
It would be nice to have a java-only solution. But if an external tool
has a better outcome, then an external tool is fine, too.
Since Docspell is free software, the tools must also be free.
# Considered Options
* [Apache POI](https://poi.apache.org) together with
[this](https://search.maven.org/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.pdf/1.0.6/jar)
library
* [pandoc](https://pandoc.org/) external command
* [abiword](https://www.abisource.com/) external command
* [Unoconv](https://github.com/unoconv/unoconv) external command
To choose an option, some documents are converted to PDF and compared.
Only the formats `docx` and `odt` are considered here, as these are
the most used formats. They have to look good; if an `xlsx` or `pptx`
doesn't look so great, that is ok.
Here is the native view to compare with:
ODT:
{{ figure(file="example-odt-native.jpg") }}
## `XWPFConverter`
I couldn't get any example to work. There were exceptions:
```
java.lang.IllegalArgumentException: Value for parameter 'id' was out of bounds
at org.apache.poi.util.IdentifierManager.reserve(IdentifierManager.java:80)
at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:101)
at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:146)
at org.apache.poi.xwpf.usermodel.XWPFParagraph.buildRunsInOrderFromXml(XWPFParagraph.java:135)
at org.apache.poi.xwpf.usermodel.XWPFParagraph.<init>(XWPFParagraph.java:88)
at org.apache.poi.xwpf.usermodel.XWPFDocument.onDocumentRead(XWPFDocument.java:147)
at org.apache.poi.POIXMLDocument.load(POIXMLDocument.java:159)
at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:124)
at docspell.convert.Testing$.withPoi(Testing.scala:17)
at docspell.convert.Testing$.$anonfun$run$1(Testing.scala:12)
at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:87)
at cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:355)
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:376)
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:316)
at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
at cats.effect.internals.PoolUtils$$anon$2$$anon$3.run(PoolUtils.scala:51)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
```
The project (not Apache POI, the other library) seems unmaintained. I
could not find any website, and the artifact in maven central is from
2016.
## Pandoc
I know pandoc as a great tool for converting between markup
documents. So it is tried with office documents here. It supports
`docx` and `odt` according to its `--list-input-formats`.
From the pandoc manual:
> By default, pandoc will use LaTeX to create the PDF, which requires
> that a LaTeX engine be installed (see --pdf-engine below).
> Alternatively, pandoc can use ConTeXt, roff ms, or HTML as an
> intermediate format. To do this, specify an output file with a .pdf
> extension, as before, but add the --pdf-engine option or -t context,
> -t html, or -t ms to the command line. The tool used to generate the
> PDF from the intermediate format may be specified using --pdf-engine.
Trying with latex engine:
```
pandoc -f odt -o test.pdf example.odt
```
Results ODT:
{{ figure(file="example-odt-pandoc-latex.jpg") }}
```
pandoc -f odt -o test.pdf example.docx
```
Results DOCX:
{{ figure(file="example-docx-pandoc-latex.jpg") }}
----
Trying with context engine:
```
pandoc -f odt -t context -o test.pdf example.odt
```
Results ODT:
{{ figure(file="example-odt-pandoc-context.jpg") }}
Results DOCX:
{{ figure(file="example-docx-pandoc-context.jpg") }}
----
Trying with ms engine:
```
pandoc -f odt -t ms -o test.pdf example.odt
```
Results ODT:
{{ figure(file="example-odt-pandoc-ms.jpg") }}
Results DOCX:
{{ figure(file="example-docx-pandoc-ms.jpg") }}
---
Trying with html engine (this requires `wkhtmltopdf` to be present):
```
$ pandoc --extract-media . -f odt -t html -o test.pdf example.odt
```
Results ODT:
{{ figure(file="example-odt-pandoc-html.jpg") }}
Results DOCX:
{{ figure(file="example-docx-pandoc-html.jpg") }}
## Abiword
Trying with:
```
abiword --to=pdf example.odt
```
Results:
{{ figure(file="example-odt-abiword.jpg") }}
Trying with a `docx` file failed. It worked with a `doc` file.
## Unoconv
Unoconv relies on libreoffice/openoffice, so installing it will result
in installing parts of libreoffice, which is a very large dependency.
Trying with:
```
unoconv -f pdf example.odt
```
Results ODT:
{{ figure(file="example-odt-unoconv.jpg") }}
Results DOCX:
{{ figure(file="example-docx-unoconv.jpg") }}
# Decision Outcome
Unoconv.
The results from `unoconv` are really good.
Abiword is also not that bad; it didn't convert the chart, but all
font markup is there. It would be great to not depend on something as
big as libreoffice, but the results are so much better.
Pandoc also deals very well with DOCX files (using the `context`
engine). The only thing that was not rendered was the embedded chart
(like abiword). But all images and font styling were present.
It will be a configurable external command anyway, so users can
exchange it at any time with a different one.
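Purely as an illustration, such a configuration could look roughly
like this (all key names and placeholders are made up; the real
structure is defined in the default configuration file):
``` conf
docspell.joex.convert.unoconv = {
  # hypothetical: the external program and its arguments;
  # {{infile}} and {{outfile}} would be filled in by docspell
  command = {
    program = "unoconv"
    args = [ "-f", "pdf", "-o", "{{outfile}}", "{{infile}}" ]
    timeout = "2 minutes"
  }
}
```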

+++
title = "Convert Image Files"
weight = 110
+++
# Context and Problem Statement
How to convert image files properly to pdf?
Since there are thousands of different image formats, there will never
be support for all. The most common containers should be supported,
though:
- jpeg (jfif, exif)
- png
- tiff (baseline, single page)
The focus is on document images, maybe from digital cameras or
scanners.
# Considered Options
* [pdfbox](https://pdfbox.apache.org/) library
* [imagemagick](https://www.imagemagick.org/) external command
* [img2pdf](https://github.com/josch/img2pdf) external command
* [tesseract](https://github.com/tesseract-ocr/tesseract) external command
There are no screenshots here, because the results all look the same
on the screen. Instead, we look at the properties of the resulting
files.
**Input File**
The input files are:
```
$ identify input/*
input/jfif.jpg JPEG 2480x3514 2480x3514+0+0 8-bit sRGB 240229B 0.000u 0:00.000
input/letter-en.jpg JPEG 1695x2378 1695x2378+0+0 8-bit Gray 256c 467341B 0.000u 0:00.000
input/letter-en.png PNG 1695x2378 1695x2378+0+0 8-bit Gray 256c 191571B 0.000u 0:00.000
input/letter-en.tiff TIFF 1695x2378 1695x2378+0+0 8-bit Grayscale Gray 4030880B 0.000u 0:00.000
```
Size:
- jfif.jpg 240k
- letter-en.jpg 467k
- letter-en.png 191k
- letter-en.tiff 4.0M
## pdfbox
Using a java library is preferred, if the quality is good enough.
There is an
[example](https://github.com/apache/pdfbox/blob/2cea31cc63623fd6ece149c60d5f0cc05a696ea7/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ImageToPDF.java)
for this exact use case.
This is the sample code:
``` scala
import java.nio.file.{Files, Paths}
import javax.imageio.ImageIO

import cats.effect.ExitCode
import org.apache.pdfbox.pdmodel.{PDDocument, PDPage, PDPageContentStream}
import org.apache.pdfbox.pdmodel.common.PDRectangle
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory

def imgtopdf(file: String): ExitCode = {
  val jpg = Paths.get(file).toAbsolutePath
  if (!Files.exists(jpg)) {
    sys.error(s"file doesn't exist: $jpg")
  }
  val pd   = new PDDocument()
  val page = new PDPage(PDRectangle.A4)
  pd.addPage(page)

  // read the image (twelvemonkeys adds more ImageIO formats, e.g. tiff)
  val bimg = ImageIO.read(jpg.toFile)
  val img  = LosslessFactory.createFromImage(pd, bimg)

  // draw the image onto the page, scaled to A4 (aspect ratio is ignored)
  val stream = new PDPageContentStream(pd, page)
  stream.drawImage(img, 0, 0, PDRectangle.A4.getWidth, PDRectangle.A4.getHeight)
  stream.close()

  pd.save("test.pdf")
  pd.close()
  ExitCode.Success
}
```
Using pdfbox 2.0.18 and twelvemonkeys 3.5. Running time: `1384ms`
```
$ identify *.pdf
jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129660B 0.000u 0:00.000
letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
```
Size:
- jfif.jpg 1.1M
- letter-en.jpg 142k
- letter-en.png 142k
- letter-en.tiff 142k
## img2pdf
This is a python tool that adds the image into the PDF without
re-encoding it.
Using version 0.3.1. Running time: `323ms`.
```
$ identify *.pdf
jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129708B 0.000u 0:00.000
letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
```
Size:
- jfif.jpg 241k
- letter-en.jpg 468k
- letter-en.png 191k
- letter-en.tiff 192k
## ImageMagick
The well-known imagemagick tool can convert images to PDFs, too.
Using version 6.9.10-71. Running time: `881ms`.
```
$ identify *.pdf
jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 134873B 0.000u 0:00.000
letter-en.jpg.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 360100B 0.000u 0:00.000
letter-en.png.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
letter-en.tiff.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
```
Size:
- jfif.jpg 300k
- letter-en.jpg 390k
- letter-en.png 180k
- letter-en.tiff 5.1M
## Tesseract
Docspell already relies on tesseract for doing OCR. And in contrast to
all other candidates, it can create PDFs that are searchable. Of
course, this results in a much longer running time, which cannot be
compared to the times of the other options.
```
tesseract doc3.jpg out -l deu pdf
```
It can also create both outputs in one go:
```
tesseract doc3.jpg out -l deu pdf txt
```
Using tesseract 4. Running time: `6661ms`
```
$ identify *.pdf
tesseract/jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 130535B 0.000u 0:00.000
tesseract/letter-en.jpg.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
tesseract/letter-en.png.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
tesseract/letter-en.tiff.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
```
Size:
- jfif.jpg 246k
- letter-en.jpg 473k
- letter-en.png 183k
- letter-en.tiff 183k
# Decision Outcome
Tesseract.
To not use more external tools, imagemagick and img2pdf are not
chosen, even though img2pdf shows the best results and is the fastest.
The pdfbox library would be the favorite, because its results are good
and with the [twelvemonkeys](https://github.com/haraldk/TwelveMonkeys)
library there is support for many image formats. The priority is to
avoid more external commands if possible.
But since there already is a dependency on tesseract and it can create
searchable PDFs, the decision is to use tesseract for this. Then PDFs
with images can be converted to searchable PDFs with images. And text
extraction is required anyway.

+++
title = "Extract Text from Files"
weight = 120
+++
# Context and Problem Statement
With support for more file types, there must be a way to extract text
from all of them. It is better to extract text from the source files,
as opposed to extracting it from the converted PDF file.
There are multiple options and multiple file types. Again, the
priority is to use a java/scala library, to reduce external
dependencies.
# Considered Options
## MS Office Documents
There is only one library I know of: [Apache
POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
However, it doesn't support the open-document formats (`odt` and
`ods`).
## OpenDocument Format
There are two libraries:
- [Apache Tika Parser](https://tika.apache.org/)
- [ODFToolkit](https://github.com/tdf/odftoolkit)
*Tika:* The tika-parsers package contains an opendocument parser for
extracting text. But it has a huge dependency tree, since it is a
super-package containing a parser for almost every common file type.
*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
and also pulls in quite a few dependencies (though not as many as
tika-parser). That is not too bad, since it is a library for
manipulating opendocument files. But all that is needed here is to
extract text. I created tests that extracted text from my odt/ods
files. It worked at first sight, but running the tests in a loop
resulted in strange NullPointerExceptions (it only worked on the
first run).
## Richtext
Richtext is supported by the JDK (using the `RTFEditorKit` from
Swing).
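A minimal sketch of extracting text from RTF via the JDK (the file
name is just an example):
``` scala
import java.io.FileInputStream
import javax.swing.text.rtf.RTFEditorKit

// read the rtf file into a swing Document and get its plain text
val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument()
val in  = new FileInputStream("in.rtf")
try kit.read(in, doc, 0)
finally in.close()
val text = doc.getText(0, doc.getLength)
```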
## PDF
For "image" pdf files, tesseract is used. For "text" PDF files, the
library [Apache PDFBox](https://pdfbox.apache.org) can be used.
There also is [iText](https://github.com/itext/itext7) with a AGPL
license.
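A minimal sketch using PDFBox (2.x API; the file name is just an
example):
``` scala
import java.io.File
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper

// load the pdf and extract the text of all pages
val doc = PDDocument.load(new File("in.pdf"))
try println(new PDFTextStripper().getText(doc))
finally doc.close()
```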
## Images
For images and "image" PDF files, there is already tesseract in place.
## HTML
HTML must be converted into a PDF file before text can be extracted.
## Text/Markdown
These files can be used as-is, obviously.
# Decision Outcome
- MS Office files: POI library
- Open Document files: Tika, but integrating the few source files that
make up the open document parser. Due to its huge dependency tree,
the library is not added.
- PDF: Apache PDFBox. I know this library better than iText.

+++
title = "Periodic Tasks"
weight = 130
+++
# Context and Problem Statement
Currently there is a `Scheduler` that consumes tasks off a queue in
the database. This allows multiple job executors to run in parallel,
racing for the next job to execute. It is for executing tasks
immediately, as long as there are enough resources.
What is missing is a component that maintains periodic tasks. The
reason for this is to have house-keeping tasks that run regularly and
clean up stale or unused data. Later, users should be able to create
periodic tasks, for example to read e-mails from an inbox or to be
notified of due items.
The problem is, again, that it must work with multiple job executor
instances running at the same time. This is the same pattern as with
the `Scheduler`: it must be ensured that a task is only used by one
executor at a time. Multiple job executors must not schedule a
periodic task more than once. If a periodic task takes longer than
the time between runs, it must wait for the next interval.
# Considered Options
1. Adding a `timer` and `nextrun` field to the current `job` table
2. Creating a separate table for periodic tasks
# Decision Outcome
Option 2.
For internal house-keeping tasks, it may suffice to reuse the existing
`job` queue by adding more fields such that a job may be considered
periodic. But this would conflate what the `Scheduler` is doing now
(executing tasks as soon as possible while being bound to some
resource limits) with a completely different concern.
There will be a new `PeriodicScheduler` that works on a new table in
the database that is representing periodic tasks. This table will
share fields with the `job` table to be able to create `RJob` records.
This new component only takes care of periodically submitting jobs to
the job queue, such that the `Scheduler` will eventually pick them up
and run them. If a task cannot run (for example due to resource
limitations), the periodic scheduler can do nothing but wait and try
again next time.
```sql
CREATE TABLE "periodic_task" (
"id" varchar(254) not null primary key,
"enabled" boolean not null,
"task" varchar(254) not null,
"group_" varchar(254) not null,
"args" text not null,
"subject" varchar(254) not null,
"submitter" varchar(254) not null,
"priority" int not null,
"worker" varchar(254),
"marked" timestamp,
"timer" varchar(254) not null,
"nextrun" timestamp not null,
"created" timestamp not null
);
```
Preparing for other features: at some point, periodic tasks will be
created by users, so it should be possible to disable/enable them.
The next six properties are needed to insert jobs into the `job`
table. The `worker` field (and `marked`) is used to mark a periodic
job as "being worked on by a job executor".
The `timer` is the schedule, which is a
[systemd-like](https://man.cx/systemd.time#heading7) calendar event
string. This is parsed by [this
library](https://github.com/eikek/calev). The `nextrun` field will
store the timestamp of the next time the task would need to be
executed. This is needed to query this table for the next task due.
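A sketch of computing `nextrun` with the calev library (the exact API
may differ between versions):
``` scala
import java.time.ZonedDateTime
import com.github.eikek.calev.CalEvent

// a systemd-like calendar event, e.g. "every day at 6am"
val timer: CalEvent = CalEvent.unsafe("*-*-* 06:00:00")

// the timestamp to store in the `nextrun` column
val nextRun: Option[ZonedDateTime] = timer.nextElapse(ZonedDateTime.now)
```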
The `PeriodicScheduler` works roughly like this:
On startup:
- Remove stale worker values. If the process has been killed, there
may be marked tasks which must be cleared now.
Main-Loop:
0. Cancel current scheduled notify (see 4. below)
1. get next (= earliest & enabled) periodic job
2. if none: stop
3. if triggered (= `nextrun <= 'now'`):
- Mark periodic task. On fail: goto 1.
- Submit new job into the jobqueue:
- Update `nextrun` field
- Check for non-final jobs of that name. This is required to not
run the same periodic task multiple times concurrently.
- if exist: goto 4.
- if not exist: submit job
- Unmark periodic task
4. if the next run is in the future
   - schedule a notify: notify self to run again next time the task
     schedule triggers

+++
title = "Archive Files"
weight = 140
+++
# Context and Problem Statement
Docspell should have support for files that contain the actual files
that matter, like ZIP files and other such things. It should extract
their contents automatically.
Since docspell should never drop or modify user data, the archive file
must be present in the database. And it must be possible to download
the file unmodified.
On the other hand, the files inside need to be text-analyzed and
converted to PDF files.
# Decision Outcome
There is currently a table `attachment_source` which holds references
to "original" files. These are the files as uploaded by the user,
before converted to pdf. Archive files add a subtlety to this: in case
of an archive, an `attachment_source` is the original (non-archive)
file inside an archive.
The archive file itself will be stored in a separate table `attachment_archive`.
Example: uploading a `files.zip` ZIP file containing `report.jpg`:
- `attachment_source`: report.jpg
- `attachment`: report.pdf
- `attachment_archive`: files.zip
Archives may contain other archives; the inner archives will not be
saved. The archive file is extracted recursively, until no known
archive files are found.
# Initial Support
Initial support is implemented for ZIP and EML (e-mail files) files.

+++
title = "Fulltext Search Engine"
weight = 150
+++
It should be possible to search the contents of all documents.
# Context and Problem Statement
To allow searching the documents' contents efficiently, a separate
index is necessary. The "de-facto standard" for fulltext search on
the JVM is something backed by [Lucene](https://lucene.apache.org).
Another option is to use an RDBMS that supports fulltext search.
This adds another component to the mix, which increases the
complexity of the setup and the software. Since docspell works great
without this feature, it shouldn't have a huge impact on the
application: if the fulltext search component is down or broken,
docspell should still work (just without fulltext search).
# Considered Options
* [Apache SOLR](https://lucene.apache.org/solr)
* [ElasticSearch](https://www.elastic.co/elasticsearch/)
* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html)
* All of them or a subset
# Decision Outcome
If docspell is running on PostgreSQL, it would be nice to also use it
for fulltext search to save the cost of running another component. But
I don't want to lock the database to PostgreSQL *only* because of the
fulltext search feature.
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
part of the Lucene project and therefore lives in the Apache
ecosystem. I would choose SOLR over ElasticSearch, because I have
used it before.
The last option (supporting all of them) is interesting, since it
would enable using PostgreSQL for fulltext search for those who use
PostgreSQL as the database for docspell.
As a first step, identify what docspell needs from a fulltext search
component and create this interface together with an implementation
for Apache SOLR. This enables all users to use the fulltext search
feature. In a later step, an implementation based on PostgreSQL
and/or ElasticSearch could be provided, too.
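A hypothetical sketch of such an interface (all names are made up for
illustration):
``` scala
import cats.effect.IO

// roughly what docspell needs from a fulltext search component
final case class FtsQuery(query: String, collective: String, limit: Int)
final case class FtsResult(itemId: String, score: Double)

trait FtsClient {
  // add or update a document in the index
  def index(itemId: String, text: String): IO[Unit]
  // run a query, returning matching item ids ranked by score
  def search(q: FtsQuery): IO[Vector[FtsResult]]
}
```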

+++
title = "Convert PDF Files"
weight = 160
+++
# Context and Problem Statement
Some PDFs contain only images (for example, when coming from a
scanner) and therefore one is not able to click into the PDF and
select text for copy&paste. The document is also not searchable in a
PDF viewer. These are real shortcomings that can be fixed, especially
since OCR is already built in.
For images, this works already as tesseract is used to create the PDF
files. Tesseract creates the files with an additional text layer
containing the OCRed text.
# Considered Options
* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
OCR text layer to scanned PDF files, allowing them to be searched
## ocrmypdf
This is a very nice python tool that uses tesseract to do OCR on each
page and adds the extracted text as a PDF text layer to the page.
Additionally, it creates PDF/A type PDFs, which are great for
archiving. This fixes exactly the shortcomings stated above.
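For example, a basic invocation (the language option works like
tesseract's; file names chosen for illustration):
```
ocrmypdf -l deu input.pdf output.pdf
```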
### Integration
Docspell already has this built in for images. When converting images
to a PDF (which is done early in processing), the process creates a
text and a PDF file. Docspell then sets the text in this step and the
text extraction step skips doing its work, if there is already text
available.
It would be possible to use the `--sidecar` option with ocrmypdf to
create a text file of the extracted text in the same run, too
(exactly like it works for tesseract). But for "text" PDFs, ocrmypdf
writes some info-message into this text file:
```
[OCR skipped on page 1] [OCR skipped on page 2]
```
Docspell cannot reliably tell whether this is extracted text or not.
It would be required to load the PDF and check its contents. This is
a bit unfortunate, because otherwise everything would just work. So
it requires a (small) change in the text-extraction step: by default,
text extraction happens on the source file; for PDFs, text extraction
should now run on the converted file, to avoid running OCR twice.
The converted PDF file is either a text PDF in the first place, where
ocrmypdf would only convert it to a PDF/A file; or it may be a
converted file containing the OCR-ed text as a PDF layer. If ocrmypdf
is disabled, the converted file and the source file are the same for
PDFs.
# Decision Outcome
Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
distributed under the GPL-3 license.

+++
title = "ADRs"
description = "Contains some ADRs, which are internal notes on decisions made."
weight = 300
sort_by = "weight"
insert_anchor_links = "right"
template = "pages.html"
[extra]
mktoc = true
+++
This contains a list of ADRs, most of them from very early on. They
often just contain notes that could go nowhere else, but should still
be captured.

@startuml
scale 1200 width
title: Processing Files
skinparam monochrome true
skinparam backgroundColor white
skinparam rectangle {
roundCorner<<Input>> 25
roundCorner<<Output>> 5
}
rectangle Input <<Input>> {
file "html"
file "plaintext"
file "image"
file "msoffice"
file "rtf"
file "odf"
file "pdf"
}
node toBoth [
PDF + TXT
]
node toPdf [
PDF
]
node toTxt [
TXT
]
image --> toBoth:<tesseract>
html --> toPdf:<wkhtmltopdf>
toPdf --> toTxt:[pdfbox]
plaintext --> html:[flexmark]
msoffice --> toPdf:<unoconv>
msoffice --> toTxt:[poi]
rtf --> toTxt:[jdk]
rtf --> toPdf:<unoconv>
odf --> toTxt:[tika]
odf --> toPdf:<unoconv>
pdf --> toTxt:<tesseract>
pdf --> toTxt:[pdfbox]
plaintext -> toTxt:[identity]
@enduml

+++
title = "Short Title"
draft = true
+++
# [short title of solved problem and solution]
* Status: [proposed | rejected | accepted | deprecated | … | superseded by [ADR-0005](0005-example.md)] <!-- optional -->
* Deciders: [list everyone involved in the decision] <!-- optional -->
* Date: [YYYY-MM-DD when the decision was last updated] <!-- optional -->
Technical Story: [description | ticket/issue URL] <!-- optional -->
## Context and Problem Statement
[Describe the context and problem statement, e.g., in free form using two to three sentences. You may want to articulate the problem in form of a question.]
## Decision Drivers <!-- optional -->
* [driver 1, e.g., a force, facing concern, …]
* [driver 2, e.g., a force, facing concern, …]
* … <!-- numbers of drivers can vary -->
## Considered Options
* [option 1]
* [option 2]
* [option 3]
* … <!-- numbers of options can vary -->
## Decision Outcome
Chosen option: "[option 1]", because [justification. e.g., only option, which meets k.o. criterion decision driver | which resolves force force | … | comes out best (see below)].
### Positive Consequences <!-- optional -->
* [e.g., improvement of quality attribute satisfaction, follow-up decisions required, …]
* …
### Negative Consequences <!-- optional -->
* [e.g., compromising quality attribute, follow-up decisions required, …]
* …
## Pros and Cons of the Options <!-- optional -->
### [option 1]
[example | description | pointer to more information | …] <!-- optional -->
* Good, because [argument a]
* Good, because [argument b]
* Bad, because [argument c]
* … <!-- numbers of pros and cons can vary -->
### [option 2]
[example | description | pointer to more information | …] <!-- optional -->
* Good, because [argument a]
* Good, because [argument b]
* Bad, because [argument c]
* … <!-- numbers of pros and cons can vary -->
### [option 3]
[example | description | pointer to more information | …] <!-- optional -->
* Good, because [argument a]
* Good, because [argument b]
* Bad, because [argument c]
* … <!-- numbers of pros and cons can vary -->
## Links <!-- optional -->
* [Link type] [Link to ADR] <!-- example: Refined by [ADR-0005](0005-example.md) -->
* … <!-- numbers of links can vary -->

+++
title = "Building Docspell"
weight = 0
+++
You must install [sbt](https://scala-sbt.org) and [Elm](https://elm-lang.org).
Clone the sources and run:
- `make` to compile all sources (Elm + Scala)
- `make-zip` to create zip packages
- `make-deb` to create debian packages
- `make-tools` to create a zip containing the script in `tools/`
- `make-pkg` for a clean compile + building all packages (zip + deb)
The `zip` and `deb` files can be found afterwards in:
```
modules/restserver/target/universal
modules/joex/target/universal
```

+++
title = "Tips & Setup"
weight = 20
+++
# Starting Servers with `reStart`
When developing, it's very convenient to use the [revolver sbt
plugin](https://github.com/spray/sbt-revolver). Start the sbt console
and then run:
```
sbt:docspell-root> restserver/reStart
```
This starts a REST server. Once this started up, type:
```
sbt:docspell-root> joex/reStart
```
if also a joex component is required. Prefixing the commands with `~`
results in recompile+restart whenever a source file is modified.
It is possible to start both in the root project:
```
sbt:docspell-root> reStart
```
# Custom config file
The sbt build is set up such that a file `dev.conf` in the directory
`local` (at the root of the source tree) is picked up as the config
file, if it exists. So you can create a custom config file for
development. For example, a custom database for development may be
set up this way:
```
#jdbcurl = "jdbc:h2:///home/dev/workspace/projects/docspell/local/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
jdbcurl = "jdbc:postgresql://localhost:5432/docspelldev"
#jdbcurl = "jdbc:mariadb://localhost:3306/docspelldev"
docspell.server {
backend {
jdbc {
url = ${jdbcurl}
user = "dev"
password = "dev"
}
}
}
docspell.joex {
jdbc {
url = ${jdbcurl}
user = "dev"
password = "dev"
}
scheduler {
pool-size = 1
}
}
```
# Nix Expressions
The directory `/nix` contains nix expressions to install docspell via
the nix package manager and to integrate it into NixOS.
## Testing NixOS Modules
The modules can be built by building the `configuration-test.nix`
file together with some nixpkgs version. For example:
``` bash
nixos-rebuild build-vm -I nixos-config=./configuration-test.nix \
-I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/nixos-19.09.tar.gz
```
This will build all modules imported in `configuration-test.nix` and
create a virtual machine containing the system. After that completes,
the system configuration can be found behind the `./result/system`
symlink. So it is possible to look at the generated systemd config for
example:
``` bash
cat result/system/etc/systemd/system/docspell-joex.service
```
And with some more commands (there probably is an easier way…) the
config file can be checked:
``` bash
cat result/system/etc/systemd/system/docspell-joex.service | grep ExecStart | cut -d'=' -f2 | xargs cat | tail -n1 | awk '{print $NF}'| sed 's/.$//' | xargs cat | jq
```
To see the module in action, the vm can be started (the first line
sets more memory for the vm):
``` bash
export QEMU_OPTS="-m 2048"
./result/bin/run-docspelltest-vm
```
# Background Info
There is a list of [ADRs](@/docs/dev/adr/_index.md) containing
internal/background info for various topics.

+++
title = "Features and Limitations"
weight = 10
insert_anchor_links = "right"
description = "A list of features and limitations."
+++
# Features
- Multi-account application
- Multiple users per account (multiple users can access the same
account)
- Handle multiple documents as one unit
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
- [Full-Text Search](@/docs/webapp/finding.md#full-text-search) based
on [Apache SOLR](https://lucene.apache.org/solr)
- Conversion to PDF: all files are converted into a PDF file. PDFs
with only images (as often returned from scanners) are converted
into searchable PDF/A pdfs.
- Non-destructive: all your uploaded files are never modified and can
always be downloaded untouched
- Text is analysed to find and attach meta data automatically
- [Manage document processing](@/docs/webapp/processing.md): cancel
jobs, set priorities
- Everything available via a [documented](https://www.openapis.org/)
  [REST Api](@/docs/api/_index.md); allows [generating
  clients](https://openapi-generator.tech/docs/generators) for
  (almost) any language
- mobile-friendly Web-UI
- [Create “share-urls”](@/docs/webapp/uploading.md#anonymous-upload)
to upload files anonymously
- [Send documents via e-mail](@/docs/webapp/mailitem.md)
- [E-Mail notification](@/docs/webapp/notifydueitems.md) for documents
with due dates
- [Read your mailboxes](@/docs/webapp/scanmailbox.md) via IMAP to
import mails into docspell
- REST server and document processing are separate applications which
can be scaled-out independently
- Everything stored in a SQL database: PostgreSQL, MariaDB or H2
  - H2 is embedded, a "one-file-only" database, which avoids
    installing db servers
- Files supported:
- Documents:
- PDF
- common MS Office (doc, docx, xls, xlsx)
- OpenDocument (odt, ods)
- RichText (rtf)
- Images (jpg, png, tiff)
- HTML
- text/* (treated as Markdown)
- Archives (extracted automatically, can be nested)
- zip
- [eml](https://en.wikipedia.org/wiki/Email#Filename_extensions)
(e-mail files in plain text MIME)
- Tooling:
- [Watch a folder](@/docs/tools/consumedir.md): watch folders for
changes and send files to docspell
- [Simple CLI for uploading files](@/docs/tools/ds.md)
- [Firefox plugin](@/docs/tools/browserext.md): right click on a
link and send the file to docspell
- [SMTP Gateway](@/docs/tools/smtpgateway.md): Setup a SMTP server
that delivers mails directly to docspell.
- License: GPLv3
# Limitations
These are current known limitations that may be of interest for
considering docspell at the moment.
- Documents cannot be modified.
- You can remove and add documents but there is no versioning.

+++
title = "Installation and Deployment"
description = "There are multiple ways to install Docspell. This section contains detailed instructions."
weight = 30
sort_by = "weight"
insert_anchor_links = "right"
template = "pages.html"
redirect_to = "/docs/install/quickstart"
+++

+++
title = "Installing"
weight = 20
+++
# Docker
There is a [docker-compose](https://docs.docker.com/compose/) setup
available in the `/docker` folder. This setup also takes care of all
the necessary [prerequisites](@/docs/install/prereq.md) and creates a
container that watches a directory for incoming files. It only takes
a few steps:
1. Clone the github repository
```bash
$ git clone https://github.com/eikek/docspell
```
2. Change into the `docker` directory:
```bash
$ cd docspell/docker
```
3. Run `docker-compose up`:
```bash
$ export DOCSPELL_HEADER_VALUE="my-secret-123"
$ docker-compose up
```
The environment variable defines a secret that is shared between
some containers. You can define whatever you like. Please see the
[consumedir.sh](@/docs/tools/consumedir.md#docker) docs for
additional info.
4. Go to `http://localhost:7880`, sign up and log in. When signing
   up, you can choose the same name for collective and user. Then log
   in with this name and the password.
5. (Optional) Create a folder `./docs/<collective-name>` (the name you
chose for the collective at registration) and place files in there
for importing them.
The directory contains a file `docspell.conf` that you can
[modify](@/docs/configure/_index.md) as needed.
# Download, Unpack, Run
You can install via zip or deb archives. Please see the
[prerequisites](@/docs/install/prereq.md) first.
## Using zip files
You need to download the two files:
- [docspell-restserver-{{version()}}.zip](https://github.com/eikek/docspell/releases/download/v{{version()}}/docspell-restserver-{{version()}}.zip)
- [docspell-joex-{{version()}}.zip](https://github.com/eikek/docspell/releases/download/v{{version()}}/docspell-joex-{{version()}}.zip)
1. Unzip both files:
``` bash
$ unzip docspell-*.zip
```
2. Open two terminal windows and navigate to the directory
   containing the zip files.
3. Start both components executing:
``` bash
$ ./docspell-restserver*/bin/docspell-restserver
```
in one terminal and
``` bash
$ ./docspell-joex*/bin/docspell-joex
```
in the other.
4. Point your browser to: <http://localhost:7880/app>
5. Register a new account, sign in and try it.
Note that this setup doesn't include watching a directory. You can
use the [consumedir.sh](@/docs/tools/consumedir.md) tool for this or
use the docker variant above.
## Using deb files
The DEB packages can be installed on Debian, or Debian-based distros:
``` bash
$ sudo dpkg -i docspell*.deb
```
Then the start scripts are in your `$PATH`. Run `docspell-restserver`
or `docspell-joex` from a terminal window.
The packages come with a systemd unit file that will be installed to
autostart the services.
# Nix
## Install via Nix
Docspell can be installed via the [nix](https://nixos.org/nix) package
manager, which is available for Linux and OSX. Docspell is currently not
part of the [nixpkgs collection](https://nixos.org/nixpkgs/), but you
can use the derivation from this repository. This is sometimes
referred to as [import from
derivation](https://nixos.wiki/wiki/Import_From_Derivation).
For example, the `builtins.fetchTarball` function can be used to
retrieve the files; then import the `release.nix` file:
``` nix
let
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
in
import "${docspellsrc}/nix/release.nix";
```
This creates a set containing a function for creating a derivation for
docspell. This then needs to be called like other custom packages. For
example, in your `~/.nixpkgs/config.nix` you could write this:
``` nix
let
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
docspell = import "${docspellsrc}/nix/release.nix";
in
{ packageOverrides = pkgs:
let
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
custom = {
docspell = callPackage docspell.currentPkg {};
};
in custom;
}
```
The `docspell` custom package is again a set that contains derivations
for all 3 installable docspell programs: the restserver, joex and the
tools.
Then you can install docspell via `nix-shell` or `nix-env`, for example:
``` bash
$ nix-env -iA nixpkgs.docspell.server nixpkgs.docspell.joex nixpkgs.docspell.tools
```
You may need to replace `nixpkgs` with `nixos` when you're on NixOS.
The expression `docspell.currentPkg` refers to the most current
release of Docspell. So even if you use the tarball of the current
master branch, the `release.nix` file only contains derivations for
releases. The expression `docspell.currentPkg` is a shortcut for
selecting the most current release. For example it translates to
`docspell.pkg docspell.cfg.v{{ pversion() }}` if the current version
is `{{version()}}`.
## Docspell on NixOS {#nixos}
If you are running [NixOS](https://nixos.org), there is a module
definition for installing Docspell as a service using systemd.
There are the following modules provided:
- restserver
- joex
- consumedir
The `consumedir` module defines a systemd unit that starts the
`consumedir.sh` script to watch one or more directories for new files.
You need to import the `release.nix` file as described above in your
`configuration.nix` and then append the docspell module to your list of
modules. Here is an example:
```nix
{ config, pkgs, ... }:
let
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
docspell = import "${docspellsrc}/nix/release.nix";
in
{
imports = [ mymodule1 mymodule2 ] ++ docspell.modules;
nixpkgs = {
config = {
packageOverrides = pkgs:
let
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
custom = {
docspell = callPackage docspell.currentPkg {};
};
in custom;
};
};
services.docspell-restserver = {
enable = true;
base-url = "http://docspelltest:7880";
# ... more settings here
};
services.docspell-joex = {
enable = true;
base-url = "http://docspelltexst:7878";
# ... more settings here
};
services.docspell-consumedir = {
enable = true;
watchDirs = ["/tmp/test"];
urls = ["http://localhost:7880/api/v1/open/upload/item/the-source-id"];
};
...
}
```
Please see the `nix/module-server.nix` and `nix/module-joex.nix` files
for the set of options. The nixos options are modelled after the
default configuration file.
The module files are only applicable to the newest version of
Docspell. If you really need an older version, check out the
appropriate commit.
## NixOS Example
This is an example system configuration that installs docspell with a
postgres database. This snippet can be used to create a vm (using
`nixos-rebuild build-vm` as shown above) or a container, for example.
``` nix
{ config, pkgs, ... }:
let
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
docspell = import "${docspellsrc}/nix/release.nix";
in
{
imports = docspell.modules;
nixpkgs = {
config = {
packageOverrides = pkgs:
let
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
custom = {
docspell = callPackage docspell.currentPkg {};
};
in custom;
};
};
##### just for the example…
users.users.root = {
password = "root";
};
#####
# install docspell-joex and enable the systemd service
services.docspell-joex = {
enable = true;
base-url = "http://localhost:7878";
bind = {
address = "0.0.0.0";
port = 7878;
};
scheduler = {
pool-size = 1;
};
jdbc = {
url = "jdbc:postgresql://localhost:5432/docspell";
user = "docspell";
password = "docspell";
};
};
# install docspell-restserver and enable the systemd service
services.docspell-restserver = {
enable = true;
base-url = "http://localhost:7880";
bind = {
address = "0.0.0.0";
port = 7880;
};
auth = {
server-secret = "b64:EirgaudMyNvWg4TvxVGxTu-fgtrto4ETz--Hk9Pv2o4=";
};
backend = {
signup = {
mode = "invite";
new-invite-password = "dsinvite2";
invite-time = "30 days";
};
jdbc = {
url = "jdbc:postgresql://localhost:5432/docspell";
user = "docspell";
password = "docspell";
};
};
};
# install postgresql and initially create user/database
services.postgresql =
let
pginit = pkgs.writeText "pginit.sql" ''
CREATE USER docspell WITH PASSWORD 'docspell' LOGIN CREATEDB;
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO docspell;
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO docspell;
CREATE DATABASE docspell OWNER docspell;
'';
in {
enable = true;
package = pkgs.postgresql_11;
enableTCPIP = true;
initialScript = pginit;
port = 5432;
authentication = ''
host all all 0.0.0.0/0 md5
'';
};
networking = {
hostName = "docspellexample";
firewall.allowedTCPPorts = [7880];
};
}
```

+++
title = "Prerequisites"
weight = 10
+++
The two components have one prerequisite in common: they both require
Java to run. While this is the only requirement for the *REST
server*, the *Joex* component requires some more external programs.
The rest server and joex components are not required to "see" each
other, though it is recommended.
# Java
Very often, Java is already installed. You can check this by opening a
terminal and typing `java -version`. Otherwise install Java using your
package manager or see [this site](https://adoptopenjdk.net/) for
other options.
It is enough to install the JRE. The JDK is required if you want to
build docspell from source.
Docspell has been tested with Java version 1.8 (sometimes referred to
as JRE 8 and JDK 8, respectively). The pre-built packages are also
built using JDK 8, but a later version of Java should work as well.
The next tools are only required on machines running the *Joex*
component.
# External Programs for Joex
- [Ghostscript](http://pages.cs.wisc.edu/~ghost/) (the `gs` command)
is used to extract/convert PDF files into images that are then fed
to ocr. It is available on most GNU/Linux distributions.
- [Unpaper](https://github.com/Flameeyes/unpaper) is a program that
pre-processes images to yield better results when doing ocr. If this
is not installed, docspell tries without it. However, it is
recommended to install it, because it [improves text
extraction](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
(at the expense of a longer runtime).
- [Tesseract](https://github.com/tesseract-ocr/tesseract) is the tool
doing the OCR (converts images into text). It can also convert
images into pdf files. It is a widely used open source OCR engine.
Tesseract 3 and 4 should work with docspell; you can adapt the
command line in the configuration file, if necessary.
- [Unoconv](https://github.com/unoconv/unoconv) is used to convert
office documents into PDF files. It uses libreoffice/openoffice.
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
PDF files.
- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
used to convert PDF to PDF files. It adds an OCR layer to scanned
PDF files to make them searchable. It also creates PDF/A files from
the input pdf.
The performance of `unoconv` can be improved by starting `unoconv -l`
in a separate process. This runs a libreoffice/openoffice listener and
therefore avoids starting one each time `unoconv` is called.
## Example Debian
On Debian this should install all joex requirements:
``` bash
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
```
# Apache SOLR
SOLR is used to provide the fulltext search feature. This feature can
be disabled, so installing SOLR is optional. But without it, there is
no fulltext search.
When installing manually (i.e. not via docker), just install solr and
create a core as described in the [solr
documentation](https://lucene.apache.org/solr/guide/8_4/installing-solr.html).
That will provide you with the connection url (the last part is the
core name).
When using the provided `docker-compose.yml` setup, SOLR is already setup.
SOLR must be reachable from all joex and all rest server components.
# Database
Both components must have access to a SQL database. The SQL database
contains all data (including binary files) and is the central
component of docspell. Docspell supports these databases:
- PostgreSQL
- MariaDB
- H2
The H2 database is an interesting option for personal and mid-size
setups, as it requires no additional work. It is integrated into
docspell and works really well out of the box. It is also configured
as the default database.
When using H2, make sure that all components access the same
database: the jdbc url must point to the same file. Also, it is
important to add the options
`;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE` at the end
of the url. See the [config page](@/docs/configure/_index.md#jdbc) for
an example.
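For instance, an H2 url could then look like this (the file path is
just an example):
```
jdbc:h2:///var/docspell/docspell.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE
```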
For large installations, PostgreSQL or MariaDB is recommended. Create
a database and a user with enough privileges (read, write, create
table) on that database.

+++
title = "Quickstart"
weight = 0
+++
To get started, here are some quick links:
- Using [docker and
docker-compose](@/docs/install/installing.md#docker). This sets up
everything: all prerequisites, both docspell components and a
container running the [consumedir.sh](@/docs/tools/consumedir.md)
script to import files that are dropped in a folder.
- [Download, Unpack and
  Run](@/docs/install/installing.md#download-unpack-run). This option
  is also very quick, but you need to check the
  [prerequisites](@/docs/install/prereq.md) yourself. The database is
  already set up, but you'd need to set up SOLR (when using fulltext
  search) and install some programs for the joex component. This
  applies to the `zip` and `deb` files. The files can be downloaded
  from the [release page](https://github.com/eikek/docspell/releases/latest).
- via the [nix package manager](@/docs/install/installing.md#nix) and/or as a [NixOS
module](@/docs/install/installing.md#nixos). If you use nix/nixos, you
know what to do. The linked page contains some examples.

+++
title = "Reverse Proxy"
weight = 50
+++
This contains examples for how to use docspell behind a reverse proxy.
For the examples below, assume the following:
- Docspell app is available at `192.168.1.11:7880`. If it is running
on the same machine as the reverse proxy server, you can set
`localhost:7880` instead.
- The external domain/hostname is `docspell.example.com`
## Configuring Docspell
These settings require a complement config part in the docspell
configuration file:
- First, if the Docspell REST server is on a different machine, you
  need to change the `bind.address` setting to be either `0.0.0.0` or
  the ip address of the network interface that the reverse proxy
  server connects to.
``` conf
docspell.server {
# Where the server binds to.
bind {
address = "192.168.1.11"
port = 7880
}
}
```
Note that a value of `0.0.0.0` instead of `192.168.1.11` will bind
the server to every network interface.
- Docspell needs to know the external url. The `base-url` setting
must point to the external address. Using above values, it must be
set to `https://docspell.example.com`.
``` conf
docspell.server {
# This is the base URL this application is deployed to. This is used
# to create absolute URLs and to configure the cookie.
base-url = "https://docspell.example.com"
...
}
```
Note that this example assumes that the docspell-joex component is on
the same machine. This page only deals with exposing the REST server
and the web application.
If you have examples for more servers, please let me know or add them
to this site.
## Nginx
This defines two servers: one listens for http traffic and redirects
to the https variant. Additionally, it defines the Let's Encrypt
`.well-known` folder.
The https server endpoint is configured with the Let's Encrypt
certificates and acts as a proxy for the application at
`192.168.1.11:7880`.
``` conf
# $connection_upgrade is not predefined by nginx; it must be mapped
# from $http_upgrade for the websocket upgrade headers below to work
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

server {
listen 0.0.0.0:80 ;
listen [::]:80 ;
server_name docspell.example.com ;
location /.well-known/acme-challenge {
root /var/data/nginx/ACME-PUBLIC;
auth_basic off;
}
location / {
return 301 https://$host$request_uri;
}
}
server {
listen 0.0.0.0:443 ssl http2 ;
listen [::]:443 ssl http2 ;
server_name docspell.example.com ;
location /.well-known/acme-challenge {
root /var/data/nginx/ACME-PUBLIC;
auth_basic off;
}
ssl_certificate /var/lib/acme/docspell.example.com/fullchain.pem;
ssl_certificate_key /var/lib/acme/docspell.example.com/key.pem;
ssl_trusted_certificate /var/lib/acme/docspell.example.com/full.pem;
location / {
proxy_pass http://192.168.1.11:7880;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection $connection_upgrade;
}
}
```

+++
title = "Raspberry-Pi and Similiar"
weight = 40
+++
# Raspberry Pi, and similar
Both components can run next to each other on a Raspberry Pi or
similar device.
## REST Server
The REST server component runs very well on the Raspberry Pi and
similar devices. It doesn't require many resources, because the heavy
work is done by the joex components.
## Joex
Running the joex component on the Raspberry Pi is possible, but will
result in long processing times for OCR. Files that don't require OCR
are no problem.
Tested on an RPi model 3 (4 cores, 1G RAM): processing a two-page PDF
(scanned at 300dpi) took 9:52. You can speed this up considerably by
uninstalling the `unpaper` command, because this step takes quite
long. This, of course, reduces the quality of the OCR results. But
without `unpaper` the same sample PDF was processed in 1:24, a
speedup of more than 8 minutes.
You should limit the joex pool size to 1 and, depending on your model
and the amount of RAM, set a heap size of at least 500M
(`-J-Xmx500M`).
For personal setups, when you don't need the processing results asap,
this can work well enough.

+++
title = "Running"
weight = 30
+++
# Running
Run the start script (in the corresponding `bin/` directory when using
the zip files):
```
$ ./docspell-restserver*/bin/docspell-restserver
$ ./docspell-joex*/bin/docspell-joex
```
This will start up both components using the default configuration.
The configuration should be adapted to your needs. For example, the
database connection is configured to use an H2 database in the `/tmp`
directory. Please refer to the [configuration
page](@/docs/configure/_index.md) for how to create a custom config
file. Once you have your config file, simply pass it as an argument to
the command:
```
$ ./docspell-restserver*/bin/docspell-restserver /path/to/server-config.conf
$ ./docspell-joex*/bin/docspell-joex /path/to/joex-config.conf
```
After starting the rest server, you can reach the web application at
the path `/app`, so using default values it would be
`http://localhost:7880/app`. There is also a redirect from `/` to
`/app`.
You should be able to create a new account and sign in. Check the
[configuration page](@/docs/configure/_index.md) to further customize
docspell.
## Options
The start scripts support some options to configure the JVM. One
often-used setting is the maximum heap size of the JVM. By default,
Java determines it based on properties of the current machine. You can
specify it by giving Java startup options to the command:
```
$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -- /path/to/server-config.conf
```
This would limit the maximum heap to 1GB. The double dash separates
internal options from the arguments to the program. Another frequently
used option is to change the default temp directory. Usually it is
`/tmp`, but it may be desired to have a dedicated temp directory,
which can be configured:
```
$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -Djava.io.tmpdir=/path/to/othertemp -- /path/to/server-config.conf
```
The command:
```
$ ./docspell-restserver*/bin/docspell-restserver -h
```
gives an overview of supported options.

@@ -0,0 +1,133 @@
+++
title = "Introduction"
weight = 0
description = "Gives a short introduction to the goals of docspell and an overview of the components involved when running docspell."
insert_anchor_links = "right"
[extra]
mktoc = true
+++
# Introduction
Docspell aims to be a simple yet effective document organizer that
makes stowing documents away very quick and finding them later
reliable (and also fast). It doesn't require a technical background or
studying huge manuals in order to use it. With this in mind, it is
rather opinionated and targeted more at home use and small/medium
organizations.
Docspell analyzes the text of your files and tries to find metadata
that will be annotated automatically. This metadata is taken from an
address book that must be maintained manually. Docspell then looks for
candidates for:
- Correspondents
- Concerned person or things
- A date
It will propose a few candidates and set the most likely one on your
item.
This might be wrong, so it is recommended to curate the results.
However, very often the correct one is either already set or among the
proposals, where you fix it with a single click.
Besides these properties, there is more metadata you can use to
organize your files, for example tags, folders and notes.
Docspell is also for programmers. Everything is available via a
REST/HTTP api and can be easily used within your own scripts and
tools, for example using `curl`. There are also features for "advanced
use" and many configuration options.
# Components
Docspell consists of multiple components that run in separate
processes:
- REST server
- JOEX, short for *job executor*
- Fulltext Search Index (optional, currently Apache SOLR)
The REST server provides the API and the web application. The web
application is a
[SPA](https://en.wikipedia.org/wiki/Single-page_application) written
in [Elm](https://elm-lang.org) and is a client to the REST api. All
features are available via the HTTP/REST api.
The *joex* is the component that does the “heavy work”, executing
long-running tasks, like processing files or importing your mails
periodically. While the joex component also exposes a small REST api
for controlling it, the user interface is all inside the rest server
api.
The rest server and the job executor can be started multiple times in
order to scale out. It must be ensured that all of them connect to the
same database.
The fulltext search index is another separate component, where
currently only SOLR is supported. SOLR also supports running in a
distributed way. Fulltext search is optional, so the SOLR component is
not required if docspell is run without fulltext search support.
# Terms
In order to better understand the following pages, some terms are
explained.
## Item
An **Item** is roughly your document, only that an item may span
multiple files, which are called **attachments**. An item has
associated **metadata**:
- a **correspondent**: the other side of the communication. It can be
an organization or a person.
- a **concerning person** or **equipment**: a person or thing that
this item is about. Maybe it is an insurance contract about your
car.
- **tag**: an item can be tagged with one or more tags (or labels). A
tag can have a *category*. This is intended for grouping tags, for
example a category `doctype` could be used to group tags like
`bill`, `contract`, `receipt` etc. Usually an item is not tagged
with more than one tag of a category.
- a **folder**: a folder is similar to a tag, but an item can only be
  in exactly one folder (or none). Furthermore, folders allow
  associating users, so that items are only visible to the users who
  are members of the folder.
- an **item date**: this is the date of the document; if it is not
  set, the created date of the item is used.
- a **due date**: an optional date indicating that something has to be
done (e.g. paying a bill, submitting it) about this item until this
date
- a **direction**: one of "incoming" or "outgoing"
- a **name**: some item name, defaults to the file name of the
attachments
- some **notes**: arbitrary descriptive text. You can use markdown
here, which is properly formatted in the web application.
## Collective
The users of the application are part of a **collective**. A
**collective** is a group of users that share access to the same
items. The account name is therefore comprised of a *collective name*
and a *user name*.
All users of a collective are equal; they have the same permissions to
access all items. The items don't belong to a user, but to the
collective.
That means, to identify yourself when signing in, you have to give the
collective name and your user name. By default they are separated by a
slash `/`, for example `smith/john`. If your user name is the same as
the collective name, you can omit one; so `smith/smith` can be
abbreviated to just `smith`.
By default, all users can see all items of their collective. A
*folder* can be used to restrict visibility: every user can create a
folder and associate members. It is possible to put items into these
folders, and docspell then shows only items that are either in no
specific folder or in a folder where the current user is owner or
member.

@@ -0,0 +1,178 @@
+++
title = "Joex"
description = "More information about the job executor component."
weight = 90
insert_anchor_links = "right"
[extra]
mktoc = true
+++
Joex is short for *Job Executor* and it is the component managing long
running tasks in docspell. One of these long running tasks is the file
processing task.
One joex component handles the processing of all files of all
collectives/users. It requires far more resources than the rest server
component. Therefore, the number of jobs that can run in parallel is
limited with respect to the hardware it is running on.
For larger installations, it is probably better to run several joex
components on different machines. That works out of the box, as long
as all components point to the same database and use different
`app-id`s (see [configuring
docspell](@/docs/configure/_index.md#app-id)).
When files are submitted to docspell, they are stored in the database
and all known joex components are notified about new work. They then
compete for the next job from the queue. After a job finishes and no
job is waiting in the queue, joex will sleep until notified again. It
will also periodically notify itself as a fallback.
## Task vs Job
Just for the sake of this document, a task denotes the code that has
to be executed or the thing that has to be done. It becomes a job once
it is submitted into the queue, from where it will eventually be
picked up and executed. A job maintains a state and other things,
while a task is just code.
## Scheduler and Queue
The scheduler is the part that runs and monitors the long running
jobs. It works together with the job queue, which defines what job to
take next.
To create a somewhat fair distribution among multiple collectives, a
collective is first chosen in a simple round-robin way. Then a job
from this collective is chosen by priority.
There are only two priorities: low and high. A simple *counting
scheme* determines whether a low prio or high prio job is selected
next. The default is `4, 1`, meaning to first select 4 high priority
jobs and then 1 low priority job, then start over. If no job of the
current priority exists, it falls back to the other priority.
The priority can be set on a *Source* (see
[uploads](@/docs/webapp/uploading.md)). Uploading through the web
application will always use priority *high*. The idea is that while
logged in, jobs are more important than those submitted when not
logged in.
## Scheduler Config
The relevant part of the config file regarding the scheduler is shown
below with some explanations.
```
docspell.joex {
# other settings left out for brevity
scheduler {
# Number of processing allowed in parallel.
pool-size = 2
# A counting scheme determines the ratio of how high- and low-prio
# jobs are run. For example: 4,1 means run 4 high prio jobs, then
# 1 low prio and then start over.
counting-scheme = "4,1"
# How often a failed job should be retried until it enters failed
# state. If a job fails, it becomes "stuck" and will be retried
# after a delay.
retries = 5
# The delay until the next try is performed for a failed job. This
# delay is increased exponentially with the number of retries.
retry-delay = "1 minute"
# The queue size of log statements from a job.
log-buffer-size = 500
# If no job is left in the queue, the scheduler will wait until a
# notify is requested (using the REST interface). To also retry
# stuck jobs, it will notify itself periodically.
wakeup-period = "30 minutes"
}
}
```
The `pool-size` setting determines how many jobs run in parallel. You
need to play with this setting on your machine to find an optimal
value.
The `counting-scheme` determines for all collectives how to select
between high and low priority jobs, as explained above. It is
currently not possible to define this per collective.
If a job fails, it will be set to the *stuck* state and retried by the
scheduler. The `retries` setting defines how many times a job is
retried until it enters the final *failed* state. The scheduler waits
some time until running the next try. This delay is given by
`retry-delay` and is the initial delay, the time until the first retry
(the second attempt). This time increases exponentially with the
number of retries.
Jobs log what they are doing; this output is picked up and stored in
the database asynchronously. The log events are buffered in a queue,
and another thread consumes this queue and stores them in the
database. The `log-buffer-size` determines the size of that queue.
Finally, there is a `wakeup-period` that determines at what interval
the joex component notifies itself to look for new jobs. If jobs get
stuck and joex is not notified externally, it could miss retrying
them. Also, since networks are not reliable, a notification may not
reach a joex component. This periodic wakeup just ensures that jobs
are eventually run.
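If you want to trigger such a notification manually, joex's small REST
api can be used. A sketch, assuming the default joex port `7878` and
that the endpoint is called `notify` (an assumption based on the
endpoint scheme of the shutdown request shown below):

```
curl -XPOST "http://localhost:7878/api/v1/notify"
```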
## Periodic Tasks
The job executor can execute tasks periodically. These tasks are
stored in the database such that they can be submitted into the job
queue. Multiple job executors can run at once, but only one ever works
on a given task. So a periodic task is never submitted twice. It is
also not submitted if a previous run has not finished yet.
## Starting on demand
The job executor and rest server can be started multiple times. This
is especially useful for the job executor. For example, when
submitting a lot of files in a short time, you can simply start up
more job executors on other computers on your network. Maybe use your
laptop to help with processing for a while.
You have to make sure that all of them connect to the same database
and that all have unique `app-id`s.
Once the files have been processed, you can stop the additional
executors.
## Shutting down
If a job executor is sleeping and not executing any jobs, you can just
quit using SIGTERM or `Ctrl-C` when running in a terminal. But if
there are jobs currently executing, it is advisable to initiate a
graceful shutdown. The job executor will then stop taking new jobs
from the queue but it will wait until all running jobs have completed
before shutting down.
This can be done by sending an HTTP POST request to the api of this
job executor:
```
curl -XPOST "http://localhost:7878/api/v1/shutdownAndExit"
```
If joex receives this request it will immediately stop taking new jobs
and it will quit when all running jobs are done.
If a job executor gets terminated while there are running jobs, the
jobs remain in their current state, marked to be executed by this job
executor. To fix this, start the job executor again: it will find all
jobs that are marked with its id and put them back into waiting state.
Then send a graceful shutdown request as shown above.

@@ -0,0 +1,11 @@
+++
title = "Tools"
description = "There are several tools distributed with docspell, like a program to watch a folder and import files to docspell."
weight = 60
insert_anchor_links = "right"
template = "pages.html"
redirect_to = "docs/tools/ds"
sort_by = "weight"
[extra]
mktoc = false
+++

@@ -0,0 +1,80 @@
+++
title = "Browser Extension (Firefox)"
description = "An extension for firefox to upload files from your browser via right-click → upload to docspell."
weight = 30
+++
The idea is to right-click a file link in Firefox and send it to
docspell. The file is downloaded in the context of your current page
and then handed to an application that pushes it to docspell. There is
a browser add-on implementing this in `tools/webextension`. This
add-on only works with Firefox.
Installation is a bit complicated, since you need to install external
tools and the web extension. Both work together.
# Install `ds.sh`
First copy the `ds.sh` tool somewhere in your `PATH`, maybe
`/usr/local/bin` as described above.
# Install the native part
Then install the "native" part of the web extension:
Copy or symlink the `native.py` script into some known location. For
example:
``` bash
ln -s ~/docspell-checkout/tools/webextension/native/native.py /usr/local/share/docspell/native.py
```
Then copy the `app_manifest.json` to
`$HOME/.mozilla/native-messaging-hosts/docspell.json`. For example:
``` bash
cp ~/docspell-checkout/tools/webextension/native/app_manifest.json ~/.mozilla/native-messaging-hosts/docspell.json
```
See
[here](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/Native_manifests#Manifest_location)
for details.
You might want to modify this JSON file, so that the path to the
`native.py` script is correct (it must be absolute).
If the `ds.sh` script is in your `$PATH`, this should work. Otherwise,
edit the `native.py` script and change the path to the tool. Or create
a file `$HOME/.config/docspell/ds.cmd` whose content is the path to
the `ds.sh` script.
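For example, assuming `ds.sh` was copied to `/usr/local/bin` (a
placeholder path):

``` bash
mkdir -p ~/.config/docspell
echo "/usr/local/bin/ds.sh" > ~/.config/docspell/ds.cmd
```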
# Install the extension
An extension file can be built using the `make-xpi.sh` script. But
installing it in "standard" Firefox won't work, because [Mozilla
requires extensions to be signed by
them](https://wiki.mozilla.org/Add-ons/Extension_Signing). This means
creating an account and going through some process, so here are two
alternatives:
1. Open Firefox and type `about:debugging` in the address bar. Then
   click on *'Load Temporary Add-on...'* and select the
   `manifest.json` file. The extension is now installed. The downside
   is that the extension will be removed once Firefox is closed.
2. Use Firefox ESR, which allows installing add-ons not signed by
   Mozilla. But it has to be configured: Open Firefox and type
   `about:config` in the address bar. Search for the key
   `xpinstall.signatures.required` and set it to `false`. This is
   described in the last paragraph of [this
   page](https://support.mozilla.org/en-US/kb/add-on-signing-in-firefox).
When you right-click on a file link, there should be a context menu
entry *'Docspell Upload Helper'*. The add-on will download this file
using the browser and then send the file path to the `native.py`
script. This script will in turn call `ds.sh`, which finally uploads
the file to your configured URLs.
Open the Add-ons page (`Ctrl`+`Shift`+`A`); the new add-on should be
listed there.

@@ -0,0 +1,140 @@
+++
title = "Consume Directory"
description = "A script to watch a directory for new files and upload them to docspell."
weight = 20
+++
`consumedir.sh` is a bash script that works in two modes:
- Go through all files in the given directories (recursively, if `-r`
  is specified) and send each to docspell.
- Watch one or more directories for new files and upload them to
  docspell.
It can watch or go through one or more directories. Files can be
uploaded to multiple URLs.
Run the script with the `-h` or `--help` option to see a short help
text. The help text also shows the values for any given options.
The script requires `curl` for uploading, and the `inotifywait`
command if directories should be watched for new files.
Example for watching two directories:
``` bash
./tools/consumedir.sh --path ~/Downloads --path ~/pdfs -m -dv http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
```
The script by default watches the given directories. If the `-o` or
`--once` option is used, it will instead go through these directories
and upload all files found there.
Example for uploading everything immediately (the same as above, only
with `-o` added):
``` bash
$ consumedir.sh -o --path ~/Downloads --path ~/pdfs/ -m -dv http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
```
The URL can be any docspell url that accepts uploads without
authentication. This is usually a [source
url](@/docs/webapp/uploading.md#anonymous-upload). It is also possible
to use the script with the [integration
endpoint](@/docs/webapp/uploading.md#integration-endpoint).
## Integration Endpoint
When given the `-i` or `--integration` option, the script changes its
behaviour slightly to work with the [integration
endpoint](@/docs/webapp/uploading.md#integration-endpoint).
First, if `-i` is given, it implies `-r`, so the directories are
watched or traversed recursively. The script then assumes that there
is a subfolder with the collective name. Files must not be placed
directly into a folder given by `-p`, but below a subdirectory that
matches a collective name. To know which collective a file belongs to,
the script uses the name of the first subfolder.
If the endpoint is protected, the credentials can be specified with
the `--iuser` and `--iheader` arguments, respectively. The format for
both is `<name>:<value>`, so the username cannot contain a colon
character (but the password can).
Example:
``` bash
$ consumedir.sh -i -iheader 'Docspell-Integration:test123' -m -p ~/Downloads/ http://localhost:7880/api/v1/open/integration/item
```
The URL is the integration endpoint URL without the collective, as
this is appended by the script.
This watches the folder `~/Downloads`. If a file is placed in this
folder directly, say `~/Downloads/test.pdf`, the upload will fail,
because the collective cannot be determined. Create a subfolder below
`~/Downloads` with the name of a collective, for example
`~/Downloads/family`, and place files somewhere below this `family`
subfolder, like `~/Downloads/family/test.pdf`.
## Duplicates
With the `-m` option, the script will not upload files that already
exist in docspell. For this, the `sha256sum` command is required.
So you can move and rename files in those folders without worrying
about duplicates. This lets you keep your files organized using the
file system and have them mirrored into docspell as well.
## Systemd
The script can be used with systemd to run as a service. This is an
example unit file:
``` systemd
[Unit]
After=networking.target
Description=Docspell Consumedir
[Service]
Environment="PATH=/set/a/path"
ExecStart=/bin/su -s /bin/bash someuser -c "consumedir.sh --path '/a/path/' -m 'http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ'"
```
This unit file is just an example and needs some fiddling. It assumes
an existing user `someuser` that is used to run this service. The URL
`http://localhost:7880/api/v1/open/upload/...` is an anonymous upload
URL as described [here](@/docs/webapp/uploading.md#anonymous-upload).
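To actually run it, you would typically save the unit file (for
example as `/etc/systemd/system/consumedir.service`, a placeholder
path), add an `[Install]` section with `WantedBy=multi-user.target` so
it can be enabled at boot, and then start it:

``` bash
systemctl daemon-reload
systemctl enable --now consumedir.service
```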
## Docker
The provided docker image runs this script to watch a single
directory, `./docs` relative to the current directory, for new files.
If a new file is detected, it is pushed to docspell.
This utilizes the [integration
endpoint](@/docs/webapp/uploading.md#integration-endpoint), which is
enabled in the config file, to allow uploading documents for all
collectives. A subfolder must be created for each registered
collective. The docker containers are configured to use http-header
protection for the integration endpoint. This requires you to provide
a secret that is shared between the rest-server and the
`consumedir.sh` script. This can be done by defining an environment
variable which gets picked up by the containers defined in
`docker-compose.yml`:
``` bash
export DOCSPELL_HEADER_VALUE="my-secret"
docker-compose up
```
Now you can create a folder `./docs/<collective-name>` and place all
files you want to import in there. Once dropped into this folder, the
`consumedir` container will push them to docspell.

@@ -0,0 +1,44 @@
+++
title = "Upload CLI"
description = "A script to quickly upload files from the command line."
weight = 10
+++
A bash script to quickly upload files from the command line. It reads
a configuration file containing the URLs to upload to. Each file given
to the script will then be uploaded to all URLs in the config.
The config file is expected in
`$XDG_CONFIG_HOME/docspell/ds.conf`. `$XDG_CONFIG_HOME` defaults to
`~/.config`.
The config file contains lines with key-value pairs, separated by a
`=` sign. Lines starting with `#` are ignored. Example:
```
# Config file
url.1 = http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
url.2 = http://localhost:7880/api/v1/open/upload/item/6DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
```
The key must start with `url`. The urls should be [anonymous upload
urls](@/docs/webapp/uploading.md#anonymous-upload).
## Usage
- The `-c` option allows specifying a different config file.
- The `-h` option shows a help overview.
- The `-d` option deletes files after a successful upload.
- The `-e` option can be used to check for file existence in docspell:
  instead of uploading, the script only checks whether the file is
  already in docspell.
The script takes a list of files as arguments.
Example:
``` bash
./ds.sh ~/Downloads/*.pdf
```
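Options can be combined with the file list. For example, to delete the
local copies after a successful upload (using the `-d` option
described above):

``` bash
./ds.sh -d ~/Downloads/*.pdf
```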

@@ -0,0 +1,190 @@
+++
title = "SMTP Gateway with Exim"
description = "Start a SMTP server that forwards all mails to docspell."
weight = 40
+++
One possible use case for the [integration
endpoint](@/docs/webapp/uploading.md#integration-endpoint) is an SMTP
server that forwards all local mail to docspell. This way there is no
periodic polling involved and documents (e-mails) get into docspell
without delay.
The `tools/exim` folder contains a docker file and a sample
`exim.conf` to help get started with this setup. Note that these files
provide a minimal setup; you might want to add TLS and spam protection
when opening it to the public.
## What you need
You need to own a domain and add the appropriate MX records to point
to your server. In this document, the domain `test.org` is used.
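For example, a zone-file entry for this setup might look like the
following sketch (host name and IP address are placeholders):

```
test.org.       IN  MX  10  mail.test.org.
mail.test.org.  IN  A   203.0.113.10
```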
You need to enable the [integration
endpoint](@/docs/webapp/uploading.md#integration-endpoint) in the
docspell configuration.
## Exim
[Exim](http://exim.org/) is a popular SMTP server (message transfer
agent). It is used here only because of prior familiarity; the same
can be achieved with other MTAs.
## The Config File
Here is the example config file for exim:
{{ incl_conf(path="templates/shortcodes/sample-exim.conf") }}
Exim has good [documentation](https://www.exim.org/docs.html); look
there for more info. The following is only a quick summary of the file
above.
The `domainlist local_domains` should list your domain. Only mails to
this domain are allowed, as specified in the first rule in
`acl_check_rcpt`. So mails to `name@test.org` are ok, but mails to
`name@someother.org` are not.
Another rule in `acl_check_rcpt` executes a `GET` request against the
integration endpoint. If that fails, the recipient is wrong (or the
endpoint disabled) and the mail is rejected right away.
Then the `routers` define how a mail is handled. There is only one
router that accepts all mails (that have not been rejected by a rule
in the ACLs) and uses the `docspell` transport to deliver it. The
transport specifies a command via the `pipe` driver that is run with
the mail. The mail itself is provided via stdin, so a simple `curl`
command can upload it to the integration endpoint. Here are some quick
notes about the used options (see `man curl`); a sketch of the full
command follows the list:
- `--silent` and `--output /dev/null` don't print upload progress
  information and discard the response output
- `--fail` return non-zero if the http status code is not a success
- `-F` use a multipart/form-data request (defaults to a POST request)
- `"file=@-;filename=\"$h_subject:\""` add one part with name `file`
  and take the data from stdin (`@-`). Since there is no filename, we
  use the subject of the mail. This is [supported by
  exim](http://exim.org/exim-html-current/doc/html/spec_html/ch-string_expansions.html)
  by expanding the subject mail header via `$h_subject:` (the colon is
  required).
- `$local_part` this is expanded by exim to the local part of the
  recipient address, i.e. the part before the `@` sign.
- `${env{DS_HEADER}{$value} fail}` looks up an environment variable by
key `DS_HEADER`. This is usually defined in `docker-compose.yml`.
The value must be the "secret" header value as defined in docspell's
configuration file.
- `${env{DS_URL}{$value} fail}` the url to docspell. It is looked up
from the environment with key `DS_URL`, which is usually defined in
`docker-compose.yml`. Adding the `$local_part` at the end means that
mails to `somename@test.org` are uploaded to the collective
`somename`.
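Putting these options together, the transport's pipe command looks
roughly like the following sketch. This is a reconstruction from the
notes above, not necessarily identical to the shipped `exim.conf`; the
header name `Docspell-Integration` matches docspell's default for
http-header protection:

```
command = /usr/bin/curl --silent --output /dev/null --fail \
  -H "Docspell-Integration: ${env{DS_HEADER}{$value} fail}" \
  -F "file=@-;filename=\"$h_subject:\"" \
  "${env{DS_URL}{$value} fail}/$local_part"
```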
## Install with Docker
Go into the `tools/exim` directory and build the docker image:
``` bash
docker build -t ds-exim:latest -f exim.dockerfile .
```
Then start docspell somewhere and configure the integration endpoint
to use http-header protection; i.e. set this in the config file:
``` conf
docspell.server {
integration-endpoint {
enabled = true
http-header = {
enabled = true
header-value = "test123"
}
}
}
```
Then edit the `docker-compose.yml` and change the environment
variables as needed.
Finally start the container:
``` bash
docker-compose up
```
## Test Run
Now it is possible to send mails to this MTA, which will be
immediately uploaded to docspell for the collective corresponding to
the `$local_part` of the recipient's address. Here is a quick telnet
session (the collective is named `family`):
```
~> telnet localhost 25
Trying ::1...
Connected to localhost.
Escape character is '^]'.
220 test.org ESMTP Exim 4.93 Sun, 14 Jun 2020 19:03:51 +0000
ehlo localhost
250-test.org Hello localhost [::1]
250-SIZE 31457280
250-8BITMIME
250-PIPELINING
250-CHUNKING
250 HELP
mail from:<me@test.org>
250 OK
rcpt to:<family@test.org>
250 Accepted
data
354 Enter message, ending with "." on a line by itself
From: me@test.org
To: family@test.org
Subject: This is a test
Test,
this is just a test mail.
.
250 OK id=1jkXwf-000007-0d
quit
221 test.org closing connection
Connection closed by foreign host.
~>
```
The mail is processed and results in an item:
{{ figure(file="exim-mail.png") }}
However, if a mail is addressed to an unknown collective, or not to
the configured local domain, the server rejects it immediately:
```
~> telnet localhost 25
Trying ::1...
Connected to localhost.
Escape character is '^]'.
220 test.org ESMTP Exim 4.93 Sun, 14 Jun 2020 19:07:04 +0000
ehlo localhost
250-test.org Hello localhost [::1]
250-SIZE 31457280
250-8BITMIME
250-PIPELINING
250-CHUNKING
250 HELP
mail from:<me@test.org>
250 OK
rcpt to:<family22@test.org>
550 Recipient unknown
rcpt to:<family@gmail.com>
550 Administrative prohibition
quit
221 test.org closing connection
Connection closed by foreign host.
~>
```

@@ -0,0 +1,12 @@
+++
title = "Web-UI"
summary = true
description = "This section describes the features of the web application."
weight = 50
insert_anchor_links = "right"
template = "pages.html"
sort_by = "weight"
redirect_to = "docs/webapp/uploading"
+++
No content here.

@@ -0,0 +1,66 @@
+++
title = "Curate Items"
weight = 20
+++
Curating the items' metadata helps with finding them later. This page
describes how you can quickly go through those items and correct or
amend them with existing data.
## Select New items
After files have been uploaded and the job executor has created the
corresponding items, they will show up on the main page. All items the
job executor has created are initially marked as *New*. The option
*only New* in the left search menu can be used to select only new
items:
{{ figure(file="docspell-curate-1.jpg") }}
## Check selected items
Then you can go through all new items and check their metadata: click
on the first item to open the detail view. This shows the documents
and the metadata in the header.
{{ figure(file="docspell-curate-2.jpg") }}
## Modify if necessary
To change something, click the *Edit* button in the menu above the
document view. This will open a form next to your documents. You can
compare the data with the documents and change it as you like. Since
the item status is *New*, you'll see the suggestions docspell found
during processing. If there were multiple candidates, you can select
another one by clicking its name in the suggestion list.
{{ figure(file="docspell-curate-3.jpg") }}
When you change something in the form, it is immediately applied. Only
when changing text fields is a click on the *Save* symbol next to the
field required.
## Confirm
If everything looks good, click the *Confirm* button to confirm the
current data. The *New* status goes away and the suggestions are
hidden in this state. You can always go back by clicking the
*Unconfirm* button.
{{ figure(file="docspell-curate-5.jpg") }}
## Proceed with next item
To look at the next item in the search results, click the *Next*
button in the menu (next to the *Edit* button). Clicking *Next* keeps
the current view, so you can continue checking the data. If you are on
the last item, the view switches to the listing view when clicking
*Next*.
{{ figure(file="docspell-curate-6.jpg") }}

@@ -0,0 +1,234 @@
+++
title = "E-Mail Settings"
weight = 40
[extra]
mktoc = true
+++
Docspell has good e-mail integration. You can send e-mails related to
an item, and you can import e-mails from your mailbox into docspell.
This requires defining settings for sending and receiving e-mails.
E-mails are commonly sent via
[SMTP](https://en.wikipedia.org/wiki/Simple_Mail_Transfer_Protocol),
and for receiving,
[IMAP](https://en.wikipedia.org/wiki/Internet_Message_Access_Protocol)
is quite common. Docspell has support for SMTP and IMAP. These
settings are associated with a user, so that each user can specify
their own settings separately from others in the collective.
*Note: Passwords to your e-mail accounts are stored in plain text in
docspell's database. This is necessary for docspell to connect to your
e-mail account to send mails on your behalf and receive your mails.*
## SMTP Settings
For sending mail, you need to provide information to connect to an
SMTP server. Every e-mail provider makes this information available
somewhere.
Configure this in *User Settings -> E-Mail Settings (SMTP)*:
{{ figure(file="mail-settings-1.png") }}
First, you need to provide a name that is used to recognize this
account. This name is also used in URLs to docspell and so it must not
contain whitespace or any special characters. A good value is the
domain of your provider, for example `gmail.com`, or something like
that.
This information should be available from your e-mail provider. For
example, for Gmail it is:
- SMTP Host: `smtp.gmail.com`
- SMTP Port: `587` or `465`
- SMTP User: Your Gmail address (for example, example@gmail.com)
- SMTP Password: Your Gmail password
- SSL: use `SSL` for port `465` and `StartTLS` for port `587`
Then you need to define the e-mail address that is used for the `From`
field. This is in most cases the same address as used for the SMTP
User field.
The `Reply-To` field is optional and can be set to define a different
e-mail address that your recipients should use to answer a mail.
Once this is set up, you can start sending mails within docspell. It
is possible to configure multiple providers, so you can choose from
which account you want to send mails.
## IMAP Settings
For receiving e-mails, you need to provide information to connect to
an IMAP server. Your e-mail provider should make this information
available somewhere.
Configure this in *User Settings -> E-Mail Settings (IMAP)*:
{{ figure(file="mail-settings-2.png") }}
First you need to define a *Name* to recognize this connection inside
docspell. This name is also used in URLs to docspell and so it must
not contain whitespace or any special characters. A good value is the
domain of your provider, for example `gmail.com`, or something like
that.
You can provide IMAP connections to multiple mailboxes.
Here is an example for posteo.de:
- IMAP Server: `posteo.de`
- IMAP Port: 143
- IMAP User: Your posteo address
- IMAP Password: Your posteo password
- SSL: use `StartTLS`
## SSL / TLS / StartTLS
*Please note: If `SSL` is set to `None`, then mails will be sent
unencrypted to your mail provider! If `Ignore certificate check` is
enabled, connections to your mail provider will succeed even if the
provider is wrongly configured for SSL/TLS. This flag should only be
enabled if you know why you need it.*
## GMail
Authenticating with Gmail may not be so simple. Gmail implements an
authentication scheme called *XOAUTH2* (at least for IMAP). It will
not work with your normal password. This is to avoid giving an
application full access to your Gmail account.
The e-mail integration in docspell relies on the
[JavaMail](https://javaee.github.io/javamail) library which has
support for XOAUTH2. It also has documentation on what you need to do
on your gmail account: <https://javaee.github.io/javamail/OAuth2>.
First you need to go to the [Google Developers
Console](https://console.developers.google.com) and create an "App" to
get a Client-Id and a Client-Secret. This "App" will be your instance
of docspell. You tell Google that this app may send and read your
mails, and then you get an *access token* that should be used instead
of the password.
Once you setup an App in Google Developers Console, you get the
Client-Id and the Client-Secret, which look something like this:
- Client-Id: 106701....d8c.apps.googleusercontent.com
- Client-Secret: 5Z1...Kir_t
Google has a python tool to help with getting this access token.
Download the `oauth2.py` script from
[here](https://github.com/google/gmail-oauth2-tools) and first create
an *oauth2-token*:
``` bash
./oauth2.py --user=your.name@gmail.com \
--client_id=106701....d8c.apps.googleusercontent.com \
--client_secret=5Z1...Kir_t \
--generate_oauth2_token
```
This will "redirect you" to an URL where you have to authenticate with
google. Afterwards it lets you add permissions to the app for
accessing your mail account. The result is another code you need to
give to the script to proceed:
```
4/zwE....q0QBAb-99yD7lw
```
Then the script produces this:
```
Refresh Token: 1//09zH.........Lj6oc2SmFlZww
Access Token: ya29.a0........SECDQ
Access Token Expiration Seconds: 3599
```
The access token can be used to sign in via IMAP with Google. The
refresh token doesn't expire and can be used to generate new access
tokens:
```
./oauth2.py --user=your.name@gmail.com \
--client_id=106701....d8c.apps.googleusercontent.com \
--client_secret=5Z1...Kir_t \
--refresh_token=1//09zH.........Lj6oc2SmFlZww
```
Output:
```
Access Token: ya29.a0....._q-lX3ypntk3ln0h9Yk
Access Token Expiration Seconds: 3599
```
The problem is that the access token expires. Docspell doesn't support
updating the access token. It can be worked around by setting up a
cron job or similar, which uses the `oauth2.py` tool to generate new
access tokens and update your IMAP settings via a
[REST](@/docs/api/_index.md) call, as the following script does:
``` bash
#!/usr/bin/env bash
set -e
## Change this to your values:
DOCSPELL_USER="[docspell-user]"
DOCSPELL_PASSWORD="[docspell-password]"
DOCSPELL_URL="http://localhost:7880"
DOCSPELL_IMAP_NAME="gmail.com"
GMAIL_USER="your.name@gmail.com"
CLIENT_ID="106701....d8c.apps.googleusercontent.com"
CLIENT_SECRET="5Z1...Kir_t"
REFRESH_TOKEN="1//09zH.........Lj6oc2SmFlZww"
# Path to the oauth2.py tool
OAUTH_TOOL="./oauth2.py"
##############################################################################
## Script
# Login to docspell and store the auth-token
AUTH_DATA=$(curl --silent -XPOST \
-H 'Content-Type: application/json' \
--data-binary "{\"account\":\"$DOCSPELL_USER\",\"password\":\"$DOCSPELL_PASSWORD\"}" \
$DOCSPELL_URL/api/v1/open/auth/login)
if [ $(echo $AUTH_DATA | jq .success) == "false" ]; then
    echo "Auth failed"
    echo $AUTH_DATA
    # abort here, the following calls cannot succeed without a token
    exit 1
fi
TOKEN="$(echo $AUTH_DATA | jq -r .token)"
# Get the imap settings
UPDATE_URL="$DOCSPELL_URL/api/v1/sec/email/settings/imap/$DOCSPELL_IMAP_NAME"
IMAP_DATA=$(curl -s -H "X-Docspell-Auth: $TOKEN" "$UPDATE_URL")
echo "Current Settings:"
echo $IMAP_DATA | jq
# Get the new access token
ACCESS_TOKEN=$($OAUTH_TOOL --user=$GMAIL_USER \
--client_id="$CLIENT_ID" \
--client_secret="$CLIENT_SECRET" \
--refresh_token="$REFRESH_TOKEN" | head -n1 | cut -d':' -f2 | xargs)
# Update settings
echo "Updating IMAP settings"
NEW_IMAP=$(echo $IMAP_DATA | jq ".imapPassword |= \"$ACCESS_TOKEN\"")
curl -s -XPUT -H "X-Docspell-Auth: $TOKEN" \
-H 'Content-Type: application/json' \
--data-binary "$NEW_IMAP" "$UPDATE_URL"
echo
echo "New Settings:"
curl -s -H "X-Docspell-Auth: $TOKEN" "$UPDATE_URL" | jq
```
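Assuming the script above is saved as
`/usr/local/bin/refresh-gmail-imap.sh` (a placeholder path), a
crontab entry that refreshes the token every 30 minutes (well below
the ~3600 second expiry) could look like:

```
*/30 * * * * /usr/local/bin/refresh-gmail-imap.sh >/dev/null 2>&1
```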

@@ -0,0 +1,178 @@
+++
title = "Finding Items"
weight = 30
[extra]
mktoc = true
+++
Items can be searched by their annotated metadata and their contents
using full text search. The landing page shows a list of current
items. Items are displayed sorted by their date, newest first.
Docspell has two modes for searching: a simple search bar and a search
menu with many options. Both are active at the same time, but only one
is visible. You can switch between them without affecting the results.
## Search Bar
{{ imgright(file="search-bar.png") }}
By default, the search bar is shown. It provides a reduced view of the
search menu. The dropdown contains different options to do a quick
search.
### *All Names* and *Contents*
These two options correspond to the fields of the same name in the
search menu. If you switch between search menu and search bar (by
clicking the icon on the left), you'll see that they are the same
fields. Typing in the search bar also fills the corresponding field in
the search menu (and vice versa).
- The *All Names* option searches in the item name, item notes, names
  of the correspondent organization and person, and names of the
  concerning person and equipment. It uses a simple substring search.
- The *Contents* option searches the contents of all attachments
  (documents), attachment names, the item name and item notes. It uses
  full text search. However, it does not search the names of attached
  metadata.
When searching with one of these fields active, it simply submits the
(hidden) search menu. So if the menu has other fields filled out, they
will affect the result, too. Using one of these fields, the bar is
just a reduced view of the search menu.
So you can choose tags or correspondents in the search menu and
further restrict the results using full text search. The results will
be returned sorted by the item date, newest first.
If the left button in the search bar shows a little blue bubble, it
means that there are more search fields filled out in the search menu
that you currently can't see. In this case, the results are not only
restricted by the search term given in the search bar, but also by
what is specified in the search menu.
### *Contents Only*
This option has no corresponding part in the search menu. Searching
with this option active performs only a full text search in the
attachment contents, attachment names, item name and item notes.
The results are not ordered by item date, but by relevance with
respect to the search term. This ordering is returned from the full
text search engine and is simply transferred unmodified.
## Search Menu
{{ imgright(file="search-menu.png") }}
The search menu can be opened by clicking the left icon in the top
bar. It shows some options to constrain the item list:
### Show new items
Clicking the checkbox "Only new" shows items that have not been
"Confirmed". All items that have been created by docspell and not
looked at are marked as "new" automatically.
### Names
Searches in names of certain properties. The `All Names` field is the
same as the search in the search bar (see above).
The `Name` field only searches in the name property of an item.
### Folder
Set a folder to only show items in that folder. If no folder is set,
all accessible items are shown. These are all items that either have
no folder set, or a folder where the current user is a member.
### Tags
Specify a list of tags that the items must have. When adding tags to
the "Include" list, an item must have all these tags in order to be
included in the results.
When adding tags to the "Exclude" list, then an item is removed from
the results if it has at least one of these tags.
### Correspondent
Pick a correspondent to show only these items.
### Concerned
Pick a concerned entity to show only these items.
### Date
Specify a date range to show only items whose date property is within
this range. If you want to see items of a specific day, choose the
same day for both fields.
For items that don't have an explicit date property set, the created
date is used.
### Due Date
Specify a date range to show only items whose due date property is
within this range. Items without a due date are not shown.
### Direction
Specify whether to show only incoming, only outgoing or all items.
## Customize Substring Search
The substring search of the *All Names* and *Name* field can be
customized in the following way: A wildcard `*` can be used at the
start or end of a search term to do a substring match. A `*` means
"everything". So a term `*company` matches all names ending in
`company` and `*company*` matches all names containing the word
`company`. The matching is case insensitive.
Docspell adds a `*` to the front and end of a term automatically,
unless one of the following is true:
- The term already has a wildcard.
- The term is enclosed in quotes `"`.
## Full Text Search
### The Query
The query string for full text search is very powerful. Docspell
currently supports [Apache SOLR](https://lucene.apache.org/solr/) as
full text search backend, so you may want to have a look at their
[documentation on query
syntax](https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing)
for an in-depth guide.
- Wildcards: `?` matches any single character, `*` matches zero or
  more characters
- Fuzzy search: Appending a `~` to a term results in a fuzzy search
  (searching this term and similarly spelled ones)
- Proximity Search: Search for terms that are "near" each other, again
  using `~` appended to a search phrase. Example: `"cheese cake"~5`.
- Boosting: apply more weight to a term with `^`. Example: `cheese^4
  cake` makes cheese 4x more important than cake.
Docspell will preprocess the search query to prepare a query for SOLR.
It will by default search all indexed fields, which are: attachment
contents, attachment names, item name and item notes.
### The Results
When using full text search, each item in the result list is annotated
with the highlighted occurrence of the match.
{{ figure(file="search-content-results.png") }}
