From 0dafe57034362e1f6ee4200cc26768fdc116eac9 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 29 Nov 2020 01:30:15 +0100 Subject: [PATCH 1/2] Add an export-files script --- tools/export-files.sh | 189 ++++++++++++++++++ website/site/content/docs/faq/_index.md | 13 +- .../site/content/docs/tools/export-files.md | 33 +++ 3 files changed, 227 insertions(+), 8 deletions(-) create mode 100755 tools/export-files.sh create mode 100644 website/site/content/docs/tools/export-files.md diff --git a/tools/export-files.sh b/tools/export-files.sh new file mode 100755 index 00000000..f335c1b8 --- /dev/null +++ b/tools/export-files.sh @@ -0,0 +1,189 @@ +#!/usr/bin/env bash +# +# Simple script for downloading all your files. It goes through all +# items visible to the logged in user and downloads the attachments +# (the original files). +# +# The item's metadata are stored next to the files to provide more +# information about the item. It is not meant to be imported back into +# docspell. +# +# Usage: +# +# export-files.sh <docspell-base-url> <target-directory> +# +# The docspell base url is required as well as a directory to store +# all the files into. +# +# Example: +# +# export-files.sh http://localhost:7880 /tmp/ds-download +# +# +# The script then asks for username and password and starts downloading. + +if [ -z "$1" ]; then + echo "The base-url to docspell is required." + exit 1 +else + BASE_URL="$1" + shift +fi + +if [ -z "$1" ]; then + echo "A directory is required to store the files into."
+ exit 1 +else + TARGET="$1" + shift +fi + +set -o errexit -o pipefail -o noclobber -o nounset + +LOGIN_URL="$BASE_URL/api/v1/open/auth/login" +SEARCH_URL="$BASE_URL/api/v1/sec/item/search" +INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights" +DETAIL_URL="$BASE_URL/api/v1/sec/item" +ATTACH_URL="$BASE_URL/api/v1/sec/attachment" + +errout() { + >&2 echo "$@" +} + +trap "{ rm -f ${TMPDIR:-/tmp}/ds-export.*; }" EXIT + +mcurl() { + tmpfile1=$(mktemp -t "ds-export.XXXXX") + tmpfile2=$(mktemp -t "ds-export.XXXXX") + set +e + curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" -H "X-Docspell-Auth: $auth_token" "$@" + status=$? + set -e + if [ $status -ne 0 ]; then + errout "curl -H 'X-Docspell-Auth: …' $@" + errout "Curl command failed (rc=$status)! Output is below." + cat "$tmpfile1" >&2 + cat "$tmpfile2" >&2 + rm -f "$tmpfile1" "$tmpfile2" + return 2 + else + ret=$(cat "$tmpfile2") + rm "$tmpfile2" "$tmpfile1" + echo "$ret" + fi +} + + +errout "Login to Docspell." +errout "Using url: $BASE_URL" +if [ -z "${DS_USER:-}" ]; then + errout -n "Account: " + read DS_USER +fi +if [ -z "${DS_PASS:-}" ]; then + errout -n "Password: " + read -s DS_PASS +fi +echo + +declare auth +declare auth_token +declare auth_time + + +login() { + auth=$(curl -s --fail -XPOST \ + --data-binary "$(jq -n --arg account "$DS_USER" --arg password "$DS_PASS" '{account: $account, password: $password}')" "$LOGIN_URL") + + if [ "$(echo $auth | jq .success)" == "true" ]; then + errout "Login successful" + auth_token=$(echo $auth | jq -r .token) + auth_time=$(date +%s) + else + errout "Login failed."
+ exit 1 + fi +} + +checkLogin() { + elapsed=$((1000 * ($(date +%s) - $auth_time))) + maxtime=$(echo $auth | jq .validMs) + + elapsed=$(($elapsed + 1000)) + if [ $elapsed -gt $maxtime ]; then + errout "Need to re-login $elapsed > $maxtime" + login + fi +} + +listItems() { + OFFSET="${1:-0}" + LIMIT="${2:-50}" + errout "Get next items with offset=$OFFSET, limit=$LIMIT" + REQ="{\"offset\":$OFFSET, \"limit\":$LIMIT, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}" + + mcurl -XPOST -H 'Content-Type: application/json' -d "$REQ" "$SEARCH_URL" | jq -r '.groups[].items[]|.id' +} + +fetchItemCount() { + mcurl -XGET "$INSIGHT_URL" | jq '[.incomingCount, .outgoingCount] | add' +} + +fetchItem() { + mcurl -XGET "$DETAIL_URL/$1" +} + +downloadItem() { + checkLogin + itemData=$(fetchItem "$1") + errout "Get item $(echo $itemData | jq -r .id)" + created=$(echo $itemData|jq '.created') + created=$((($(echo $itemData|jq '.created') + 500) / 1000)) + itemId=$(echo $itemData | jq -r '.id') + out="$TARGET/$(date -d @$created +%Y-%m)/$itemId" + + mkdir -p "$out" + echo $itemData | jq > "$out/metadata.json" + + while read attachId attachName; do + errout " - download $attachName ($attachId)" + attachOut="$out/$attachName" + checkLogin + curl --fail -# -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId" + done < <(echo $itemData | jq -r '.sources[] | [.id,.name] | join(" ")') + +} + +login + +allCount=$(fetchItemCount) +errout "Downloading $allCount items…" + +allCounter=0 innerCounter=0 limit=100 offset=0 done=n + +while [ "$done" = "n" ]; do + checkLogin + + innerCounter=0 + while read id; do + downloadItem "$id" + innerCounter=$(($innerCounter + 1)) + done < <(listItems $offset $limit) + + allCounter=$(($allCounter + $innerCounter)) + offset=$(($offset + $limit)) + + + if [ $innerCounter -lt $limit ]; then + done=y + fi + +done +errout "Downloaded $allCounter/$allCount items" +if
[[ $allCounter -lt $allCount ]]; then + errout + errout " Downloaded less items than were reported as available. This" + errout " may be due to items in folders that you cannot see. Or it" + errout " may be a bug." + errout +fi diff --git a/website/site/content/docs/faq/_index.md b/website/site/content/docs/faq/_index.md index cf7015c3..5d867c04 100644 --- a/website/site/content/docs/faq/_index.md +++ b/website/site/content/docs/faq/_index.md @@ -82,14 +82,11 @@ documentation, too. In order to move to a different tool, it is necessary to get the data out of Docspell in a machine readable/automatic way. Currently, there -is no *easy way* for this. However, it is possible to get to all data -with some scripting effort. Everything can be queried using a -[HTTP/REST api](@/docs/api/_index.md) and so you can write a -script/program that, for example, queries all items and downloads the -files (something like this might be provided soon, for now there are -starting points in the `/tools` folder). It is planned to provide a -more convenient way to export the data into the file system. But there -is no ETA for this. +is no *easy way* for this. However, everything can be queried using a +[HTTP/REST api](@/docs/api/_index.md) and so it is possible to get to +all data with some scripting effort. There exists a script in the +`tools/` folder that at least can go and download all files that have +been uploaded to docspell. My recommendation is to run periodic database backups and also store the binaries/docker images. This lets you re-create the current state diff --git a/website/site/content/docs/tools/export-files.md b/website/site/content/docs/tools/export-files.md new file mode 100644 index 00000000..78fc5432 --- /dev/null +++ b/website/site/content/docs/tools/export-files.md @@ -0,0 +1,33 @@ ++++ +title = "Export Files" +description = "Downloads all files from docspell."
+weight = 65 ++++ + +# export-files.sh + +This script can be used to download all files from docspell that have +been uploaded before. + +# Requirements + +It is a bash script that additionally needs +[curl](https://curl.haxx.se/) and +[jq](https://stedolan.github.io/jq/). + +# Usage + +``` +./export-files.sh <docspell-base-url> <target-directory> +``` + +For example, if docspell is at `http://localhost:7880`: + +``` +./export-files.sh http://localhost:7880 /tmp/ds-downloads +``` + +The script asks for your account name and password. It then logs in +and goes through all items downloading the metadata as json and the +attachments. It will fetch the original files (not the converted +ones). From 8c8dd2fb40d1a2f9327c43ce27c3fa2255d9e497 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 30 Nov 2020 00:59:24 +0100 Subject: [PATCH 2/2] Improve export-file.sh script and add docs --- tools/export-files.sh | 80 +++++++- website/site/content/docs/faq/_index.md | 8 +- .../site/content/docs/tools/export-files.md | 185 +++++++++++++++++- 3 files changed, 256 insertions(+), 17 deletions(-) diff --git a/tools/export-files.sh b/tools/export-files.sh index f335c1b8..ce7583e3 100755 --- a/tools/export-files.sh +++ b/tools/export-files.sh @@ -5,8 +5,13 @@ # (the original files). # # The item's metadata are stored next to the files to provide more -# information about the item. It is not meant to be imported back into -# docspell. +# information about the item: tags, dates, custom fields etc. This +# contains most of your user supplied data. +# +# This script is intended for having your data outside and independent +# of docspell. Another good idea for a backup strategy is to take +# database dumps *and* store the releases of docspell next to this +# dump. # # Usage: # @@ -19,8 +24,29 @@ # # export-files.sh http://localhost:7880 /tmp/ds-download # +# The script then asks for username and password and starts +# downloading.
Files are downloaded into the following structure +# (below the given target directory): # -# The script then asks for username and password and starts downloading. +# - yyyy-mm (item date) +# - A3…XY (item id) +# - somefile.pdf (attachments with name) +# - metadata.json (json file with items metadata) +# +# By default, files are not overwritten; the script stops if existing files +# are encountered. Configuration can be specified using environment +# variables: +# +# - OVERWRITE_FILE= if `y` then overwriting existing files is ok. +# - SKIP_FILE= if `y` then existing files are skipped (supersedes +# OVERWRITE_FILE). +# - DROP_ITEM= if `y` the item folder is removed before attempting to +# download it. If this is set to `y` then the above options don't +# make sense, since they operate on the files inside the item folder +# +# Docspell sends with each file its sha256 checksum via the ETag +# header. This is used to do an integrity check after downloading. + if [ -z "$1" ]; then echo "The base-url to docspell is required."
@@ -41,11 +67,14 @@ fi set -o errexit -o pipefail -o noclobber -o nounset LOGIN_URL="$BASE_URL/api/v1/open/auth/login" -SEARCH_URL="$BASE_URL/api/v1/sec/item/search" +SEARCH_URL="$BASE_URL/api/v1/sec/item/searchWithTags" INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights" DETAIL_URL="$BASE_URL/api/v1/sec/item" ATTACH_URL="$BASE_URL/api/v1/sec/attachment" +OVERWRITE_FILE=${OVERWRITE_FILE:-n} +DROP_ITEM=${DROP_ITEM:-n} + errout() { >&2 echo "$@" } @@ -133,6 +162,31 @@ fetchItem() { mcurl -XGET "$DETAIL_URL/$1" } +downloadAttachment() { + attachId="$1" + errout " - Download '$attachName' ($attachId)" + + if [ -f "$attachOut" ] && [ "${SKIP_FILE:-n}" == "y" ]; then + errout " - Skipping file '$attachOut' since it already exists" + else + if [ -f "$attachOut" ] && [ "$OVERWRITE_FILE" == "y" ]; then + errout " - Removing attachment file as requested: $attachOut" + rm -f "$attachOut" + fi + + checksum1=$(curl --fail -s -I -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original" | \ + grep 'ETag' | cut -d' ' -f2 | jq -r) + curl --fail -s -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original" + checksum2=$(sha256sum "$attachOut" | cut -d' ' -f1 | xargs) + if [ "$checksum1" == "$checksum2" ]; then + errout " - Checksum ok." + else + errout " - WARNING: Checksum mismatch!
Server: $checksum1 Downloaded: $checksum2" + return 3 + fi + fi +} + downloadItem() { checkLogin itemData=$(fetchItem "$1") @@ -142,14 +196,26 @@ downloadItem() { itemId=$(echo $itemData | jq -r '.id') out="$TARGET/$(date -d @$created +%Y-%m)/$itemId" + if [ -d "$out" ] && [ "$DROP_ITEM" == "y" ]; then + errout "Removing item folder as requested: $out" + rm -rf "$out" + fi + mkdir -p "$out" - echo $itemData | jq > "$out/metadata.json" + if [ -f "$out/metadata.json" ] && [ "${SKIP_FILE:-n}" == "y" ]; then + errout " - Skipping file 'metadata.json' since it already exists" + else + if [ -f "$out/metadata.json" ] && [ "$OVERWRITE_FILE" == "y" ]; then + errout " - Removing metadata.json as requested" + rm -f "$out/metadata.json" + fi + echo "$itemData" | jq . > "$out/metadata.json" + fi while read attachId attachName; do - errout " - download $attachName ($attachId)" attachOut="$out/$attachName" checkLogin - curl --fail -# -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId" + downloadAttachment "$attachId" done < <(echo $itemData | jq -r '.sources[] | [.id,.name] | join(" ")') } diff --git a/website/site/content/docs/faq/_index.md b/website/site/content/docs/faq/_index.md index 5d867c04..799d0a7d 100644 --- a/website/site/content/docs/faq/_index.md +++ b/website/site/content/docs/faq/_index.md @@ -82,11 +82,9 @@ documentation, too. In order to move to a different tool, it is necessary to get the data out of Docspell in a machine readable/automatic way. Currently, there -is no *easy way* for this. However, everything can be queried using a -[HTTP/REST api](@/docs/api/_index.md) and so it is possible to get to -all data with some scripting effort. There exists a script in the -`tools/` folder that at least can go and download all files that have -been uploaded to docspell. +is an [export-files.sh](@/docs/tools/export-files.md) script provided +(in the `tools/` folder) that can be used to download all your files +and item metadata.
My recommendation is to run periodic database backups and also store the binaries/docker images. This lets you re-create the current state diff --git a/website/site/content/docs/tools/export-files.md b/website/site/content/docs/tools/export-files.md index 78fc5432..0e4a4d27 100644 --- a/website/site/content/docs/tools/export-files.md +++ b/website/site/content/docs/tools/export-files.md @@ -7,13 +7,54 @@ weight = 65 # export-files.sh This script can be used to download all files from docspell that have -been uploaded before. +been uploaded before and the item metadata. + +It downloads the original files, those that have been uploaded and not +the converted pdf files. + +The item's metadata are stored next to the files to provide more +information about the item: correspondent, tags, dates, custom fields +etc. This contains most of your user supplied data. + +This script is intended for having your data outside and independent +of docspell. Another good idea for a backup strategy is to take +database dumps *and* store the releases of docspell next to this +dump. + +Files are stored into the following folder structure (below the given +target directory): + +``` +- yyyy-mm (item date) + - A3…XY (item id) + - somefile.pdf (attachments with name) + - metadata.json (json file with items metadata) +``` + +By default, files are not overwritten; the script stops if existing files are +encountered. This and some other things can be changed using +environment variables: + +- `DS_USER` the account name for login; the script asks for it if not set +- `DS_PASS` the password for login; the script asks for it if not set +- `OVERWRITE_FILE=` if `y` then overwriting existing files is ok. + Default is `n`. +- `SKIP_FILE=` if `y` then existing files are skipped (supersedes + `OVERWRITE_FILE`). Default is `n`. +- `DROP_ITEM=` if `y` the item folder is removed before attempting to + download it.
If this is set to `y` then the above options don't make + sense, since they operate on the files inside the item folder. + Default is `n`. + +Docspell sends the sha256 hash with each file via the ETag header. +This is used to do an integrity check after downloading. + # Requirements It is a bash script that additionally needs -[curl](https://curl.haxx.se/) and -[jq](https://stedolan.github.io/jq/). +[curl](https://curl.haxx.se/) and [jq](https://stedolan.github.io/jq/) +to be available. # Usage @@ -29,5 +70,139 @@ For example, if docspell is at `http://localhost:7880`: The script asks for your account name and password. It then logs in and goes through all items downloading the metadata as json and the -attachments. It will fetch the original files (not the converted -ones). +attachments. + + +# Example Run + +``` bash +fish> env SKIP_FILE=y DS_USER=demo DS_PASS=test ./export-files.sh http://localhost:7880 /tmp/download +Login to Docspell. +Using url: http://localhost:7880 + +Login successful +Downloading 73 items… +Get next items with offset=0, limit=100 +Get item 57Znskthf3g-X7RP1fxzE2U-dwr4vM6Yjnn-b7s1PoCznhz + - Download 'something.txt' (8HbeFornAUN-kBCyc8bHSVr-bnLBYDzgRQ7-peMZzyTzM2X) + - Checksum ok. +Get item 94u5Pt39q6N-7vKu3LugoRj-zohGS4ie4jb-68bW5gXU6Jd + - Download 'letter-en.pdf' (6KNNmoyqpew-RAkdwEmQgBT-QDqdY97whZA-4k2rmbssdfQ) + - Checksum ok. +Get item 7L9Fh53RVG4-vGSt2G2YUcY-cvpBKRXQgBn-omYpg6xQXyD + - Download 'mail.html' (A6yTYKrDc7y-xU3whmLB1kB-TGhEAVb12mo-RUw5u9PsYMo) + - Checksum ok. +Get item DCn9UtWUtvF-2qjxB5PXGEG-vqRUUU7JUJH-zBBrmSeGYPe + - Download 'Invoice_7340224.pdf' (6FWdjxJh7yB-CCjY39p6uH9-uVLbmGfm25r-cw6RksrSx4n) + - Checksum ok.
+… +``` + +The resulting directory looks then like this: + +``` bash +… +├── 2020-08 +│   ├── 6t27gQQ4TfW-H4uAmkYyiSe-rBnerFE2v5F-9BdqbGEhMcv +│   │   ├── 52241.pdf +│   │   └── metadata.json +│   └── 9qwT2GuwEvV-s9UuBQ4w7o9-uE8AdMc7PwL-GFDd62gduAm +│   ├── DOC-20191223-155707.jpg +│   └── metadata.json +├── 2020-09 +│   ├── 2CM8C9VaVAT-sVJiKyUPCvR-Muqr2Cqvi6v-GXhRtg6eomA +│   │   ├── letter with spaces.pdf +│   │   └── metadata.json +│   ├── 4sXpX2Sc9Ex-QX1M6GtjiXp-DApuDDzGQXR-7pg1QPW9pbs +│   │   ├── analyse.org +│   │   ├── 201703.docx +│   │   ├── 11812_120719.pdf +│   │   ├── letter-de.pdf +│   │   ├── letter-en.pdf +│   │   └── metadata.json +│   ├── 5VhP5Torsy1-15pwJBeRjPi-es8BGnxhWn7-3pBQTJv3zPb +│   │   └── metadata.json +│   ├── 7ePWmK4xCNk-gmvnTDdFwG8-JcN5MDSUNPL-NTZZrho2Jc6 +│   │   ├── metadata.json +│   │   └── Rechnung.pdf +… +``` + +The `metadata.json` file contains all the item metadata. This may be +useful when importing into other tools. + +``` json +{ + "id": "AWCNx7tJgUw-SdrNtRouNJB-FGs6Y2VP5bV-218sFN8mjjk", + "direction": "incoming", + "name": "Ruecksendung.pdf", + "source": "integration", + "state": "confirmed", + "created": 1606171810005, + "updated": 1606422917826, + "itemDate": null, + "corrOrg": null, + "corrPerson": null, + "concPerson": null, + "concEquipment": null, + "inReplyTo": null, + "folder": null, + "dueDate": null, + "notes": null, + "attachments": [ + { + "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", + "name": "Ruecksendung.converted.pdf", + "size": 57777, + "contentType": "application/pdf", + "converted": true + } + ], + "sources": [ + { + "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", + "name": "Ruecksendung.pdf", + "size": 65715, + "contentType": "application/pdf" + } + ], + "archives": [], + "tags": [ + { + "id": "EQvJ6AHw19Y-Cdg3gF78zZk-BY2zFtNTwes-J95jpXpzhfw", + "name": "Hupe", + "category": "state", + "created": 1606427083171 + }, + { + "id": "4xyZoeeELdJ-tJ91GiRLinJ-7bdauy3U1jR-Bzr4VS96bGS", + 
"name": "Invoice", + "category": "doctype", + "created": 1594249709473 + } + ], + "customfields": [ + { + "id": "5tYmDHin3Kx-HomKkeEVtJN-v99oKxQ8ot6-yFVrEmMayoo", + "name": "amount", + "label": "EUR", + "ftype": "money", + "value": "151.55" + }, + { + "id": "3jbwbep8rDs-hNJ9ePRE7gv-21nYMbUj3eb-mKRWAr4xSS2", + "name": "invoice-number", + "label": "Invoice-Nr", + "ftype": "text", + "value": "I454602" + }, + { + "id": "AH4p4NUCa9Y-EUkH66wLzxE-Rf2wJPxTAYd-DeGDm4AT4Yg", + "name": "number", + "label": "Number", + "ftype": "numeric", + "value": "0.10" + } + ] +} +```