Add an export-files script

Eike Kettner 2020-11-29 01:30:15 +01:00
parent 45e4035e07
commit 0dafe57034
3 changed files with 227 additions and 8 deletions

tools/export-files.sh Executable file

@@ -0,0 +1,189 @@
#!/usr/bin/env bash
#
# Simple script for downloading all your files. It goes through all
# items visible to the logged-in user and downloads the attachments
# (the original files).
#
# Each item's metadata is stored next to its files to provide more
# information about the item; it is not meant to be imported back into
# docspell.
#
# Usage:
#
# export-files.sh <docspell-base-url> <target-directory>
#
# The docspell base url is required as well as a directory to store
# all the files into.
#
# Example:
#
# export-files.sh http://localhost:7880 /tmp/ds-download
#
#
# The script then asks for username and password and starts downloading.
if [ -z "$1" ]; then
    echo "The base-url to docspell is required."
    exit 1
else
    BASE_URL="$1"
    shift
fi

if [ -z "$1" ]; then
    echo "A directory is required to store the files into."
    exit 1
else
    TARGET="$1"
    shift
fi
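
# Strict mode: abort on errors and unset variables, fail a pipeline if
# any stage fails, and never overwrite existing files (noclobber).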
set -o errexit -o pipefail -o noclobber -o nounset
LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
SEARCH_URL="$BASE_URL/api/v1/sec/item/search"
INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights"
DETAIL_URL="$BASE_URL/api/v1/sec/item"
ATTACH_URL="$BASE_URL/api/v1/sec/attachment"

errout() {
    >&2 echo "$@"
}

# Remove temporary files from mcurl when the script exits.
trap "{ rm -f ${TMPDIR:-/tmp}/ds-export.*; }" EXIT

# Wrapper around curl that adds the auth header, captures stderr and
# the response body in temp files, and prints the body only on success.
mcurl() {
    tmpfile1=$(mktemp -t "ds-export.XXXXX")
    tmpfile2=$(mktemp -t "ds-export.XXXXX")
    set +e
    curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" -H "X-Docspell-Auth: $auth_token" "$@"
    status=$?
    set -e
    if [ $status -ne 0 ]; then
        errout "curl -H 'X-Docspell-Auth: …' $*"
        errout "Curl command failed (rc=$status)! Output is below."
        cat "$tmpfile1" >&2
        cat "$tmpfile2" >&2
        rm -f "$tmpfile1" "$tmpfile2"
        return 2
    else
        ret=$(cat "$tmpfile2")
        rm "$tmpfile2" "$tmpfile1"
        echo "$ret"
    fi
}
errout "Login to Docspell."
errout "Using url: $BASE_URL"
if [ -z "$DS_USER" ]; then
errout -n "Account: "
read DS_USER
fi
if [ -z "$DS_PASS" ]; then
errout -n "Password: "
read -s DS_PASS
fi
echo
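
# auth holds the raw login response, auth_token the session token that
# is sent with every request, auth_time the epoch seconds of the last
# login.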
declare auth
declare auth_token
declare auth_time

login() {
    auth=$(curl -s --fail -XPOST \
        --data-binary "{\"account\":\"$DS_USER\", \"password\":\"$DS_PASS\"}" "$LOGIN_URL")
    if [ "$(echo "$auth" | jq .success)" == "true" ]; then
        errout "Login successful"
        auth_token=$(echo "$auth" | jq -r .token)
        auth_time=$(date +%s)
    else
        errout "Login failed."
        exit 1
    fi
}

# Re-login before the session token expires (it is valid for .validMs
# milliseconds).
checkLogin() {
    elapsed=$((1000 * ($(date +%s) - auth_time)))
    maxtime=$(echo "$auth" | jq .validMs)
    elapsed=$((elapsed + 1000))
    if [ "$elapsed" -gt "$maxtime" ]; then
        errout "Need to re-login: $elapsed > $maxtime"
        login
    fi
}

# Print the ids of one page of items, given offset and limit.
listItems() {
    OFFSET="${1:-0}"
    LIMIT="${2:-50}"
    errout "Get next items with offset=$OFFSET, limit=$LIMIT"
    REQ="{\"offset\":$OFFSET, \"limit\":$LIMIT, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}"
    mcurl -XPOST -H 'Content-Type: application/json' -d "$REQ" "$SEARCH_URL" | jq -r '.groups[].items[]|.id'
}

fetchItemCount() {
    mcurl -XGET "$INSIGHT_URL" | jq '[.incomingCount, .outgoingCount] | add'
}

fetchItem() {
    mcurl -XGET "$DETAIL_URL/$1"
}

# Download one item: store its metadata as json and fetch all original
# attachment files into a directory per month and item id.
downloadItem() {
    checkLogin
    itemData=$(fetchItem "$1")
    errout "Get item $(echo "$itemData" | jq -r .id)"
    # convert the created timestamp from milliseconds to seconds
    created=$((($(echo "$itemData" | jq '.created') + 500) / 1000))
    itemId=$(echo "$itemData" | jq -r '.id')
    out="$TARGET/$(date -d @$created +%Y-%m)/$itemId"
    mkdir -p "$out"
    echo "$itemData" | jq '.' > "$out/metadata.json"
    while read -r attachId attachName; do
        errout " - download $attachName ($attachId)"
        attachOut="$out/$attachName"
        checkLogin
        curl --fail -# -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId"
    done < <(echo "$itemData" | jq -r '.sources[] | [.id,.name] | join(" ")')
}
login
allCount=$(fetchItemCount)
errout "Downloading $allCount items…"
allCounter=0 innerCounter=0 limit=100 offset=0 done=n
while [ "$done" = "n" ]; do
checkLogin
innerCounter=0
while read id; do
downloadItem "$id"
innerCounter=$(($innerCounter + 1))
done < <(listItems $offset $limit)
allCounter=$(($allCounter + $innerCounter))
offset=$(($offset + $limit))
if [ $innerCounter -lt $limit ]; then
done=y
fi
done
errout "Downloaded $allCounter/$allCount items"
if [ "$allCounter" -lt "$allCount" ]; then
    errout
    errout "  Downloaded fewer items than were reported as available."
    errout "  This may be due to items in folders that you cannot see,"
    errout "  or it may be a bug."
    errout
fi


@@ -82,14 +82,11 @@ documentation, too.
 In order to move to a different tool, it is necessary to get the data
 out of Docspell in a machine readable/automatic way. Currently, there
-is no *easy way* for this. However, it is possible to get to all data
-with some scripting effort. Everything can be queried using a
-[HTTP/REST api](@/docs/api/_index.md) and so you can write a
-script/program that, for example, queries all items and downloads the
-files (something like this might be provided soon, for now there are
-starting points in the `/tools` folder). It is planned to provide a
-more convenient way to export the data into the file system. But there
-is no ETA for this.
+is no *easy way* for this. However, everything can be queried using a
+[HTTP/REST api](@/docs/api/_index.md) and so it is possible to get to
+all data with some scripting effort. There exists a script in the
+`tools/` folder that at least can go and download all files that have
+been uploaded to docspell.
 
 My recommendation is to run periodic database backups and also store
 the binaries/docker images. This lets you re-create the current state


@@ -0,0 +1,33 @@
+++
title = "Export Files"
description = "Downloads all files from docspell."
weight = 65
+++
# export-files.sh
This script can be used to download all files that have been uploaded
to docspell.
# Requirements
It is a bash script that additionally needs
[curl](https://curl.haxx.se/) and
[jq](https://stedolan.github.io/jq/).
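
On Debian-based systems, for example, both are typically available
from the package repositories (package names may differ elsewhere):

```
sudo apt install curl jq
```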
# Usage
```
./export-files.sh <docspell-base-url> <target-directory>
```
For example, if docspell is at `http://localhost:7880`:
```
./export-files.sh http://localhost:7880 /tmp/ds-downloads
```
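
The script reads the environment variables `DS_USER` and `DS_PASS`
first and only prompts for values that are not set. So a
non-interactive run could look like this (`demo`/`test` being
placeholder credentials):

```
DS_USER=demo DS_PASS=test ./export-files.sh http://localhost:7880 /tmp/ds-downloads
```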
The script asks for your account name and password. It then logs in
and goes through all items, downloading each item's metadata as JSON
together with its attachments. It fetches the original files, not the
converted ones.
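
Judging from the script, the result is one directory per month
(derived from the item's created date) containing one directory per
item id, with the metadata stored next to the files. The item id and
file names below are made-up examples:

```
/tmp/ds-downloads/2020-11/AbCdEf123/metadata.json
/tmp/ds-downloads/2020-11/AbCdEf123/letter.pdf
```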