mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	
							
								
								
									
										255
									
								
								tools/export-files.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										255
									
								
								tools/export-files.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,255 @@ | ||||
| #!/usr/bin/env bash | ||||
| # | ||||
| # Simple script for downloading all your files. It goes through all | ||||
| # items visible to the logged in user and downloads the attachments | ||||
| # (the original files). | ||||
| # | ||||
| # The item's metadata are stored next to the files to provide more | ||||
| # information about the item: tags, dates, custom fields etc. This | ||||
| # contains most of your user supplied data. | ||||
| # | ||||
| # This script is intended for having your data outside and independent | ||||
| # of docspell. Another good idea for a backup strategy is to take | ||||
| # database dumps *and* storing the releases of docspell next to this | ||||
| # dump. | ||||
| # | ||||
| # Usage: | ||||
| # | ||||
| # export-files.sh <docspell-base-url> <target-directory> | ||||
| # | ||||
| # The docspell base url is required as well as a directory to store | ||||
| # all the files into. | ||||
| # | ||||
| # Example: | ||||
| # | ||||
| #    export-files.sh http://localhost:7880 /tmp/ds-download | ||||
| # | ||||
| # The script then asks for username and password and starts | ||||
| # downloading. Files are downloaded into the following structure | ||||
| # (below the given target directory): | ||||
| # | ||||
| # - yyyy-mm (item date) | ||||
| #   - A3…XY (item id) | ||||
| #     - somefile.pdf (attachments with name) | ||||
| #     - metadata.json (json file with items metadata) | ||||
| # | ||||
| # By default, files are not overwritten, it stops if existing files | ||||
| # are encountered. Configuration can be specified using environment | ||||
| # variables: | ||||
| # | ||||
| # - OVERWRITE_FILE= if `y` then overwriting existing files is ok. | ||||
| # - SKIP_FILE= if `y` then existing files are skipped (supersedes | ||||
| #   OVERWRITE_FILE). | ||||
| # - DROP_ITEM= if `y` the item folder is removed before attempting to | ||||
| #   download it. If this is set to `y` then the above options don't | ||||
| #   make sense, since they operate on the files inside the item folder | ||||
| # | ||||
| # Docspell sends with each file its sha256 checksum via the ETag | ||||
# header. This is used to do an integrity check after downloading.
|  | ||||
|  | ||||
| if [ -z "$1" ]; then | ||||
|     echo "The base-url to docspell is required." | ||||
|     exit 1 | ||||
| else | ||||
|     BASE_URL="$1" | ||||
|     shift | ||||
| fi | ||||
|  | ||||
| if [ -z "$1" ]; then | ||||
|     echo "A directory is required to store the files into." | ||||
|     exit 1 | ||||
| else | ||||
|     TARGET="$1" | ||||
|     shift | ||||
| fi | ||||
|  | ||||
set -o errexit -o pipefail -o noclobber -o nounset

# REST endpoints, derived from the base url given on the command line.
LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
SEARCH_URL="$BASE_URL/api/v1/sec/item/searchWithTags"
INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights"
DETAIL_URL="$BASE_URL/api/v1/sec/item"
ATTACH_URL="$BASE_URL/api/v1/sec/attachment"

# Behaviour flags (see header comment). Every flag needs a default
# because `nounset` is active above: referencing an unset variable
# aborts the script. SKIP_FILE previously had no default, so the
# script died at the first existing-file check when it was unset.
OVERWRITE_FILE=${OVERWRITE_FILE:-n}
DROP_ITEM=${DROP_ITEM:-n}
SKIP_FILE=${SKIP_FILE:-n}
|  | ||||
# Print all arguments to stderr, keeping stdout free for data.
errout() {
    >&2 echo "$@"
}

# Remove leftover temp files on exit. Note ${TMPDIR:-/tmp}: the
# previous ${TMPDIR-:/tmp} expanded to the literal ':/tmp' whenever
# TMPDIR was unset, so cleanup never touched the real temp directory.
trap '{ rm -f "${TMPDIR:-/tmp}"/ds-export.*; }' EXIT
|  | ||||
# Authenticated curl wrapper: performs the request with the current
# auth token and prints the response body on stdout. On failure the
# curl invocation and its captured output are echoed to stderr and 2
# is returned. All arguments are passed through to curl verbatim.
mcurl() {
    tmpfile1=$(mktemp -t "ds-export.XXXXX")
    tmpfile2=$(mktemp -t "ds-export.XXXXX")
    set +e
    curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" \
         -H "X-Docspell-Auth: $auth_token" "$@"
    status=$?
    set -e
    if [ $status -ne 0 ]; then
        errout "curl -H 'X-Docspell-Auth: …' $*"
        errout "Curl command failed (rc=$status)! Output is below."
        cat "$tmpfile1" >&2
        cat "$tmpfile2" >&2
        rm -f "$tmpfile1" "$tmpfile2"
        return 2
    else
        # Emit the body verbatim: the previous unquoted `echo $ret`
        # collapsed all whitespace/newlines in the response.
        cat "$tmpfile2"
        rm -f "$tmpfile2" "$tmpfile1"
    fi
}
|  | ||||
|  | ||||
| errout "Login to Docspell." | ||||
| errout "Using url: $BASE_URL" | ||||
| if [ -z "$DS_USER" ]; then | ||||
|     errout -n "Account: " | ||||
|     read DS_USER | ||||
| fi | ||||
| if [ -z "$DS_PASS" ]; then | ||||
|     errout -n "Password: " | ||||
|     read -s DS_PASS | ||||
| fi | ||||
| echo | ||||
|  | ||||
| declare auth | ||||
| declare auth_token | ||||
| declare auth_time | ||||
|  | ||||
|  | ||||
# Authenticate against the open login endpoint. On success the token
# and login timestamp are stored in the globals declared above; on
# failure the script terminates with exit code 1.
login() {
    # Build the payload with jq so special characters in the
    # credentials (quotes, backslashes) are escaped as valid JSON;
    # naive string interpolation broke on such passwords.
    local payload
    payload=$(jq -n --arg a "$DS_USER" --arg p "$DS_PASS" \
                 '{account: $a, password: $p}')
    auth=$(curl -s --fail -XPOST --data-binary "$payload" "$LOGIN_URL")

    if [ "$(echo "$auth" | jq .success)" == "true" ]; then
        errout "Login successful"
        auth_token=$(echo "$auth" | jq -r .token)
        auth_time=$(date +%s)
    else
        errout "Login failed."
        exit 1
    fi
}
|  | ||||
# Re-login when the session token is close to expiry. The server
# reports the token lifetime in milliseconds (validMs); a one second
# safety margin is added to the elapsed time before comparing.
checkLogin() {
    local now lifetime_ms elapsed_ms
    now=$(date +%s)
    lifetime_ms=$(echo "$auth" | jq .validMs)
    elapsed_ms=$(( (now - auth_time) * 1000 + 1000 ))
    if [ "$elapsed_ms" -gt "$lifetime_ms" ]; then
        errout "Need to re-login $elapsed_ms > $lifetime_ms"
        login
    fi
}
|  | ||||
# Print the ids of one page of items, one id per line.
# $1 - offset into the result set (default 0)
# $2 - maximum number of items to return (default 50)
listItems() {
    OFFSET="${1:-0}"
    LIMIT="${2:-50}"
    errout "Get next items with offset=$OFFSET, limit=$LIMIT"
    REQ="{\"offset\":$OFFSET, \"limit\":$LIMIT, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}"

    # The header was previously misspelled 'ContentType:', so the
    # request went out without a proper Content-Type header.
    mcurl -XPOST -H 'Content-Type: application/json' -d "$REQ" "$SEARCH_URL" | jq -r '.groups[].items[]|.id'
}
|  | ||||
# Total number of items visible to the logged-in user: the insights
# endpoint reports incoming and outgoing counts separately, sum them.
fetchItemCount() {
    mcurl -XGET "$INSIGHT_URL" | jq '[.incomingCount, .outgoingCount] | add'
}
|  | ||||
# Fetch the full metadata (json) of a single item. $1 - the item id.
fetchItem() {
    mcurl -XGET "$DETAIL_URL/$1"
}
|  | ||||
# Download a single original attachment file to $attachOut, honouring
# SKIP_FILE/OVERWRITE_FILE, and verify the sha256 checksum the server
# advertises via the ETag header.
# $1 - the attachment id; $attachName and $attachOut are expected to
# be set by the caller (downloadItem).
downloadAttachment() {
    attachId="$1"
    errout " - Download '$attachName' ($attachId)"

    if [ -f "$attachOut" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file '$attachOut' since it already exists"
    else
        if [ -f "$attachOut" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing attachment file as requested: $attachOut"
            rm -f "$attachOut"
        fi

        # The ETag value is a quoted json string. Match the header
        # name case-insensitively (HTTP/2 lower-cases it), strip the
        # trailing CR of the header line, and unquote via jq. Note
        # `jq -r .`: jq requires an explicit filter argument.
        checksum1=$(curl --fail -s -I -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original" | \
                        grep -i '^etag' | cut -d' ' -f2 | tr -d '\r' | jq -r .)
        curl --fail -s -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original"
        checksum2=$(sha256sum "$attachOut" | cut -d' ' -f1 | xargs)
        if [ "$checksum1" == "$checksum2" ]; then
            errout " - Checksum ok."
        else
            errout " - WARNING: Checksum mismatch! Server: $checksum1 Downloaded: $checksum2"
            return 3
        fi
    fi
}
|  | ||||
# Download one item: store its metadata as metadata.json and all of
# its original files below "$TARGET/yyyy-mm/<item-id>/".
# $1 - the item id. Honours DROP_ITEM, SKIP_FILE and OVERWRITE_FILE.
downloadItem() {
    checkLogin
    itemData=$(fetchItem "$1")
    errout "Get item $(echo "$itemData" | jq -r .id)"
    # 'created' is epoch milliseconds; convert to seconds, rounding.
    # (A duplicate, immediately-overwritten assignment was removed.)
    created=$((($(echo "$itemData" | jq '.created') + 500) / 1000))
    itemId=$(echo "$itemData" | jq -r '.id')
    out="$TARGET/$(date -d @"$created" +%Y-%m)/$itemId"

    if [ -d "$out" ] && [ "$DROP_ITEM" == "y" ]; then
        errout "Removing item folder as requested: $out"
        rm -rf "$out"
    fi

    mkdir -p "$out"
    if [ -f "$out/metadata.json" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file 'metadata.json' since it already exists"
    else
        if [ -f "$out/metadata.json" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing metadata.json as requested"
            rm -f "$out/metadata.json"
        fi
        echo "$itemData" | jq > "$out/metadata.json"
    fi

    # Each line is "<attachment-id> <file name>"; the name may contain
    # spaces, so everything after the first word lands in attachName.
    # -r keeps backslashes in file names literal.
    while read -r attachId attachName; do
        attachOut="$out/$attachName"
        checkLogin
        downloadAttachment "$attachId"
    done < <(echo "$itemData" | jq -r '.sources[] | [.id,.name] | join(" ")')
}
|  | ||||
login

allCount=$(fetchItemCount)
errout "Downloading $allCount items…"

allCounter=0 innerCounter=0 limit=100 offset=0 done=n

# Page through the search results; the last page is detected by it
# containing fewer entries than the requested limit.
while [ "$done" = "n" ]; do
    checkLogin

    innerCounter=0
    while read -r id; do
        downloadItem "$id"
        innerCounter=$(($innerCounter + 1))
    done < <(listItems $offset $limit)

    allCounter=$(($allCounter + $innerCounter))
    offset=$(($offset + $limit))

    if [ $innerCounter -lt $limit ]; then
        done=y
    fi

done
errout "Downloaded $allCounter/$allCount items"
# -lt forces a numeric comparison; the previous [[ a < b ]] compared
# lexicographically (e.g. it would claim 100 < 73).
if [ "$allCounter" -lt "$allCount" ]; then
    errout
    errout "  Downloaded less items than were reported as available. This"
    errout "  may be due to items in folders that you cannot see. Or it"
    errout "  may be a bug."
    errout
fi
| @@ -82,14 +82,9 @@ documentation, too. | ||||
|  | ||||
| In order to move to a different tool, it is necessary to get the data | ||||
| out of Docspell in a machine readable/automatic way. Currently, there | ||||
| is no *easy way* for this. However, it is possible to get to all data | ||||
| with some scripting effort. Everything can be queried using a | ||||
| [HTTP/REST api](@/docs/api/_index.md) and so you can write a | ||||
| script/program that, for example, queries all items and downloads the | ||||
| files (something like this might be provided soon, for now there are | ||||
| starting points in the `/tools` folder). It is planned to provide a | ||||
| more convenient way to export the data into the file system. But there | ||||
| is no ETA for this. | ||||
| is a [export-files.sh](@/docs/tools/export-files.md) script provided | ||||
| (in the `tools/` folder) that can be used to download all your files | ||||
| and item metadata. | ||||
|  | ||||
| My recommendation is to run periodic database backups and also store | ||||
| the binaries/docker images. This lets you re-create the current state | ||||
|   | ||||
							
								
								
									
										208
									
								
								website/site/content/docs/tools/export-files.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										208
									
								
								website/site/content/docs/tools/export-files.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,208 @@ | ||||
| +++ | ||||
| title = "Export Files" | ||||
| description = "Downloads all files from docspell." | ||||
| weight = 65 | ||||
| +++ | ||||
|  | ||||
| # export-files.sh | ||||
|  | ||||
| This script can be used to download all files from docspell that have | ||||
| been uploaded before and the item metadata. | ||||
|  | ||||
| It downloads the original files, those that have been uploaded and not | ||||
| the converted pdf files. | ||||
|  | ||||
| The item's metadata are stored next to the files to provide more | ||||
information about the item: correspondent, tags, dates, custom fields
| etc. This contains most of your user supplied data. | ||||
|  | ||||
| This script is intended for having your data outside and independent | ||||
| of docspell. Another good idea for a backup strategy is to take | ||||
| database dumps *and* storing the releases of docspell next to this | ||||
| dump. | ||||
|  | ||||
| Files are stored into the following folder structure (below the given | ||||
| target directory): | ||||
|  | ||||
| ``` | ||||
| - yyyy-mm (item date) | ||||
|   - A3…XY (item id) | ||||
|     - somefile.pdf (attachments with name) | ||||
|     - metadata.json (json file with items metadata) | ||||
| ``` | ||||
|  | ||||
| By default, files are not overwritten, it stops if existing files are | ||||
| encountered. This and some other things can be changed using | ||||
| environment variables: | ||||
|  | ||||
| - `DS_USER` the account name for login, it is asked if not available | ||||
| - `DS_PASS` the password for login, it is asked if not available | ||||
| - `OVERWRITE_FILE=` if `y` then overwriting existing files is ok. | ||||
|   Default is `n`. | ||||
| - `SKIP_FILE=` if `y` then existing files are skipped (supersedes | ||||
|   `OVERWRITE_FILE`). Default is `n`. | ||||
| - `DROP_ITEM=` if `y` the item folder is removed before attempting to | ||||
|   download it. If this is set to `y` then the above options don't make | ||||
|   sense, since they operate on the files inside the item folder. | ||||
|   Default is `n`. | ||||
|  | ||||
| Docspell sends the sha256 hash with each file via the ETag header. | ||||
This is used to do an integrity check after downloading.
|  | ||||
|  | ||||
| # Requirements | ||||
|  | ||||
| It is a bash script that additionally needs | ||||
| [curl](https://curl.haxx.se/) and [jq](https://stedolan.github.io/jq/) | ||||
| to be available. | ||||
|  | ||||
| # Usage | ||||
|  | ||||
| ``` | ||||
| ./export-files.sh <docspell-base-url> <target-directory> | ||||
| ``` | ||||
|  | ||||
| For example, if docspell is at `http://localhost:7880`: | ||||
|  | ||||
| ``` | ||||
| ./export-files.sh http://localhost:7880 /tmp/ds-downloads | ||||
| ``` | ||||
|  | ||||
| The script asks for your account name and password. It then logs in | ||||
| and goes through all items downloading the metadata as json and the | ||||
| attachments. | ||||
|  | ||||
|  | ||||
| # Example Run | ||||
|  | ||||
| ``` bash | ||||
| fish> env SKIP_FILE=y DS_USER=demo DS_PASS=test ./export-files.sh http://localhost:7880 /tmp/download | ||||
| Login to Docspell. | ||||
| Using url: http://localhost:7880 | ||||
|  | ||||
| Login successful | ||||
| Downloading 73 items… | ||||
| Get next items with offset=0, limit=100 | ||||
| Get item 57Znskthf3g-X7RP1fxzE2U-dwr4vM6Yjnn-b7s1PoCznhz | ||||
|  - Download 'something.txt' (8HbeFornAUN-kBCyc8bHSVr-bnLBYDzgRQ7-peMZzyTzM2X) | ||||
|  - Checksum ok. | ||||
| Get item 94u5Pt39q6N-7vKu3LugoRj-zohGS4ie4jb-68bW5gXU6Jd | ||||
|  - Download 'letter-en.pdf' (6KNNmoyqpew-RAkdwEmQgBT-QDqdY97whZA-4k2rmbssdfQ) | ||||
|  - Checksum ok. | ||||
| Get item 7L9Fh53RVG4-vGSt2G2YUcY-cvpBKRXQgBn-omYpg6xQXyD | ||||
|  - Download 'mail.html' (A6yTYKrDc7y-xU3whmLB1kB-TGhEAVb12mo-RUw5u9PsYMo) | ||||
|  - Checksum ok. | ||||
| Get item DCn9UtWUtvF-2qjxB5PXGEG-vqRUUU7JUJH-zBBrmSeGYPe | ||||
|  - Download 'Invoice_7340224.pdf' (6FWdjxJh7yB-CCjY39p6uH9-uVLbmGfm25r-cw6RksrSx4n) | ||||
|  - Checksum ok. | ||||
| … | ||||
| ``` | ||||
|  | ||||
| The resulting directory looks then like this: | ||||
|  | ||||
| ``` bash | ||||
| … | ||||
| ├── 2020-08 | ||||
| │   ├── 6t27gQQ4TfW-H4uAmkYyiSe-rBnerFE2v5F-9BdqbGEhMcv | ||||
| │   │   ├── 52241.pdf | ||||
| │   │   └── metadata.json | ||||
| │   └── 9qwT2GuwEvV-s9UuBQ4w7o9-uE8AdMc7PwL-GFDd62gduAm | ||||
| │       ├── DOC-20191223-155707.jpg | ||||
| │       └── metadata.json | ||||
| ├── 2020-09 | ||||
| │   ├── 2CM8C9VaVAT-sVJiKyUPCvR-Muqr2Cqvi6v-GXhRtg6eomA | ||||
| │   │   ├── letter with spaces.pdf | ||||
| │   │   └── metadata.json | ||||
| │   ├── 4sXpX2Sc9Ex-QX1M6GtjiXp-DApuDDzGQXR-7pg1QPW9pbs | ||||
| │   │   ├── analyse.org | ||||
| │   │   ├── 201703.docx | ||||
| │   │   ├── 11812_120719.pdf | ||||
| │   │   ├── letter-de.pdf | ||||
| │   │   ├── letter-en.pdf | ||||
| │   │   └── metadata.json | ||||
| │   ├── 5VhP5Torsy1-15pwJBeRjPi-es8BGnxhWn7-3pBQTJv3zPb | ||||
| │   │   └── metadata.json | ||||
| │   ├── 7ePWmK4xCNk-gmvnTDdFwG8-JcN5MDSUNPL-NTZZrho2Jc6 | ||||
| │   │   ├── metadata.json | ||||
| │   │   └── Rechnung.pdf | ||||
| … | ||||
| ``` | ||||
|  | ||||
| The `metadata.json` file contains all the item metadata. This may be | ||||
| useful when importing into other tools. | ||||
|  | ||||
| ``` json | ||||
| { | ||||
|   "id": "AWCNx7tJgUw-SdrNtRouNJB-FGs6Y2VP5bV-218sFN8mjjk", | ||||
|   "direction": "incoming", | ||||
|   "name": "Ruecksendung.pdf", | ||||
|   "source": "integration", | ||||
|   "state": "confirmed", | ||||
|   "created": 1606171810005, | ||||
|   "updated": 1606422917826, | ||||
|   "itemDate": null, | ||||
|   "corrOrg": null, | ||||
|   "corrPerson": null, | ||||
|   "concPerson": null, | ||||
|   "concEquipment": null, | ||||
|   "inReplyTo": null, | ||||
|   "folder": null, | ||||
|   "dueDate": null, | ||||
|   "notes": null, | ||||
|   "attachments": [ | ||||
|     { | ||||
|       "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", | ||||
|       "name": "Ruecksendung.converted.pdf", | ||||
|       "size": 57777, | ||||
|       "contentType": "application/pdf", | ||||
|       "converted": true | ||||
|     } | ||||
|   ], | ||||
|   "sources": [ | ||||
|     { | ||||
|       "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", | ||||
|       "name": "Ruecksendung.pdf", | ||||
|       "size": 65715, | ||||
|       "contentType": "application/pdf" | ||||
|     } | ||||
|   ], | ||||
|   "archives": [], | ||||
|   "tags": [ | ||||
|     { | ||||
|       "id": "EQvJ6AHw19Y-Cdg3gF78zZk-BY2zFtNTwes-J95jpXpzhfw", | ||||
|       "name": "Hupe", | ||||
|       "category": "state", | ||||
|       "created": 1606427083171 | ||||
|     }, | ||||
|     { | ||||
|       "id": "4xyZoeeELdJ-tJ91GiRLinJ-7bdauy3U1jR-Bzr4VS96bGS", | ||||
|       "name": "Invoice", | ||||
|       "category": "doctype", | ||||
|       "created": 1594249709473 | ||||
|     } | ||||
|   ], | ||||
|   "customfields": [ | ||||
|     { | ||||
|       "id": "5tYmDHin3Kx-HomKkeEVtJN-v99oKxQ8ot6-yFVrEmMayoo", | ||||
|       "name": "amount", | ||||
|       "label": "EUR", | ||||
|       "ftype": "money", | ||||
|       "value": "151.55" | ||||
|     }, | ||||
|     { | ||||
|       "id": "3jbwbep8rDs-hNJ9ePRE7gv-21nYMbUj3eb-mKRWAr4xSS2", | ||||
|       "name": "invoice-number", | ||||
|       "label": "Invoice-Nr", | ||||
|       "ftype": "text", | ||||
|       "value": "I454602" | ||||
|     }, | ||||
|     { | ||||
|       "id": "AH4p4NUCa9Y-EUkH66wLzxE-Rf2wJPxTAYd-DeGDm4AT4Yg", | ||||
|       "name": "number", | ||||
|       "label": "Number", | ||||
|       "ftype": "numeric", | ||||
|       "value": "0.10" | ||||
|     } | ||||
|   ] | ||||
| } | ||||
| ``` | ||||
		Reference in New Issue
	
	Block a user