Merge pull request #485 from eikek/export-files

Export files
This commit is contained in:
mergify[bot] 2020-11-30 00:23:32 +00:00 committed by GitHub
commit 3ea2d20823
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 466 additions and 8 deletions

255
tools/export-files.sh Executable file
View File

@ -0,0 +1,255 @@
#!/usr/bin/env bash
#
# Simple script for downloading all your files. It goes through all
# items visible to the logged in user and downloads the attachments
# (the original files).
#
# The item's metadata are stored next to the files to provide more
# information about the item: tags, dates, custom fields etc. This
# contains most of your user supplied data.
#
# This script is intended for having your data outside and independent
# of docspell. Another good idea for a backup strategy is to take
# database dumps *and* storing the releases of docspell next to this
# dump.
#
# Usage:
#
# export-files.sh <docspell-base-url> <target-directory>
#
# The docspell base url is required as well as a directory to store
# all the files into.
#
# Example:
#
# export-files.sh http://localhost:7880 /tmp/ds-download
#
# The script then asks for username and password and starts
# downloading. Files are downloaded into the following structure
# (below the given target directory):
#
# - yyyy-mm (item date)
# - A3…XY (item id)
# - somefile.pdf (attachments with name)
# - metadata.json (json file with items metadata)
#
# By default, files are not overwritten, it stops if existing files
# are encountered. Configuration can be specified using environment
# variables:
#
# - OVERWRITE_FILE= if `y` then overwriting existing files is ok.
# - SKIP_FILE= if `y` then existing files are skipped (supersedes
# OVERWRITE_FILE).
# - DROP_ITEM= if `y` the item folder is removed before attempting to
# download it. If this is set to `y` then the above options don't
# make sense, since they operate on the files inside the item folder
#
# Docspell sends with each file its sha256 checksum via the ETag
# header. This is used to do an integrity check after downloading.
# --- Arguments and configuration --------------------------------------
if [ -z "$1" ]; then
    echo "The base-url to docspell is required."
    exit 1
else
    BASE_URL="$1"
    shift
fi

if [ -z "$1" ]; then
    echo "A directory is required to store the files into."
    exit 1
else
    TARGET="$1"
    shift
fi

set -o errexit -o pipefail -o noclobber -o nounset

# API endpoints derived from the base url.
LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
SEARCH_URL="$BASE_URL/api/v1/sec/item/searchWithTags"
INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights"
DETAIL_URL="$BASE_URL/api/v1/sec/item"
ATTACH_URL="$BASE_URL/api/v1/sec/attachment"

# Behavior flags (see header comment). All must be defaulted: they are
# read later and would abort the script under `nounset` if unset.
OVERWRITE_FILE=${OVERWRITE_FILE:-n}
DROP_ITEM=${DROP_ITEM:-n}
SKIP_FILE=${SKIP_FILE:-n}
# Pre-declare credentials so the login prompt below is `nounset`-safe
# when they are not provided via the environment.
DS_USER=${DS_USER:-}
DS_PASS=${DS_PASS:-}
# Write all arguments to stderr, so stdout stays clean for command
# substitution (flags like -n are passed straight through to echo).
errout() {
    echo "$@" >&2
}
# Remove mcurl's temp files on any exit. Fix: the original used
# ${TMPDIR-:/tmp}, whose default branch expands to ':/tmp' (note the
# stray leading colon) instead of '/tmp'.
trap 'rm -f "${TMPDIR:-/tmp}"/ds-export.*' EXIT
# Authenticated curl wrapper: runs the request with the cached session
# token, capturing curl's progress/errors and the response body into
# temp files. On success prints the body to stdout; on failure dumps
# both captures to stderr and returns 2.
mcurl() {
    local tmpfile1 tmpfile2 status body
    tmpfile1=$(mktemp -t "ds-export.XXXXX")
    tmpfile2=$(mktemp -t "ds-export.XXXXX")
    # Temporarily disable errexit so a failing curl doesn't kill the
    # script before we can report it.
    set +e
    curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" -H "X-Docspell-Auth: $auth_token" "$@"
    status=$?
    set -e
    if [ $status -ne 0 ]; then
        errout "curl -H 'X-Docspell-Auth: …' $*"
        errout "Curl command failed (rc=$status)! Output is below."
        cat "$tmpfile1" >&2
        cat "$tmpfile2" >&2
        rm -f "$tmpfile1" "$tmpfile2"
        return 2
    else
        body=$(cat "$tmpfile2")
        rm "$tmpfile2" "$tmpfile1"
        # Quote the body: the original's unquoted `echo $ret` would
        # word-split and glob-expand the JSON response.
        printf '%s\n' "$body"
    fi
}
errout "Login to Docspell."
errout "Using url: $BASE_URL"

# Prompt for credentials unless supplied via DS_USER / DS_PASS.
# ${VAR:-} keeps the tests safe under `set -o nounset` when the
# variables were never exported.
if [ -z "${DS_USER:-}" ]; then
    errout -n "Account: "
    read -r DS_USER
fi
if [ -z "${DS_PASS:-}" ]; then
    errout -n "Password: "
    read -r -s DS_PASS
fi
echo

# Session state, populated by login() and refreshed by checkLogin().
declare auth
declare auth_token
declare auth_time
# Log in with DS_USER/DS_PASS; on success cache the raw response
# ($auth), the session token ($auth_token) and the login timestamp
# ($auth_time, seconds) for expiry checks. Exits the script on failure.
login() {
    auth=$(curl -s --fail -XPOST \
                --data-binary "{\"account\":\"$DS_USER\", \"password\":\"$DS_PASS\"}" "$LOGIN_URL")
    # Quote "$auth": unquoted expansion would word-split the JSON and
    # collapse whitespace inside string values.
    if [ "$(echo "$auth" | jq .success)" == "true" ]; then
        errout "Login successful"
        auth_token=$(echo "$auth" | jq -r .token)
        auth_time=$(date +%s)
    else
        errout "Login failed."
        exit 1
    fi
}
# Re-login when the cached session token is (close to) expired.
# Compares the session age against the server-reported validity
# window, padded by one extra second to renew slightly early.
checkLogin() {
    local elapsed maxtime
    # Session age in milliseconds.
    elapsed=$(( 1000 * ($(date +%s) - auth_time) ))
    maxtime=$(echo "$auth" | jq .validMs)
    elapsed=$(( elapsed + 1000 ))
    if [ "$elapsed" -gt "$maxtime" ]; then
        errout "Need to re-login $elapsed > $maxtime"
        login
    fi
}
# Print the ids of one page of items, given offset ($1, default 0)
# and limit ($2, default 50).
listItems() {
    local offset="${1:-0}"
    local limit="${2:-50}"
    errout "Get next items with offset=$offset, limit=$limit"
    local req="{\"offset\":$offset, \"limit\":$limit, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}"
    # Fix: the original sent a bogus 'ContentType' header; the real
    # HTTP header name is 'Content-Type'.
    mcurl -XPOST -H 'Content-Type: application/json' -d "$req" "$SEARCH_URL" | jq -r '.groups[].items[]|.id'
}
# Total number of items in the collective (incoming + outgoing),
# taken from the insights endpoint.
fetchItemCount() {
    local insights
    insights=$(mcurl -XGET "$INSIGHT_URL") || return
    echo "$insights" | jq '[.incomingCount, .outgoingCount] | add'
}
# Fetch the full item detail JSON for the given item id ($1).
fetchItem() {
    local item_id="$1"
    mcurl -XGET "$DETAIL_URL/$item_id"
}
# Download one original attachment (id in $1) to $attachOut and verify
# its sha256 against the checksum the server advertises in the ETag
# header. Relies on the caller (downloadItem) to have set $attachName
# and $attachOut. Returns 3 on checksum mismatch.
downloadAttachment() {
    local attachId="$1"
    errout " - Download '$attachName' ($attachId)"
    # ${SKIP_FILE:-n} keeps this `nounset`-safe even if no default was
    # assigned at startup.
    if [ -f "$attachOut" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file '$attachOut' since it already exists"
    else
        if [ -f "$attachOut" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing attachment file as requested: $attachOut"
            rm -f "$attachOut"
        fi
        local checksum1 checksum2
        # Case-insensitive match: HTTP/2 responses carry lowercase
        # header names ('etag'), which a plain `grep 'ETag'` misses.
        # The ETag value is a JSON-quoted string; `jq -r .` unquotes it.
        checksum1=$(curl --fail -s -I -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original" | \
                        grep -i 'etag' | cut -d' ' -f2 | jq -r .)
        curl --fail -s -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original"
        checksum2=$(sha256sum "$attachOut" | cut -d' ' -f1 | xargs)
        if [ "$checksum1" == "$checksum2" ]; then
            errout " - Checksum ok."
        else
            errout " - WARNING: Checksum mismatch! Server: $checksum1 Downloaded: $checksum2"
            return 3
        fi
    fi
}
# Download one item (id in $1): writes metadata.json and all original
# attachment files into $TARGET/<yyyy-mm>/<item-id>/.
downloadItem() {
    checkLogin
    local itemData itemId created out
    itemData=$(fetchItem "$1")
    # Quote "$itemData" everywhere: unquoted expansion would collapse
    # whitespace inside the JSON (e.g. in file names or notes).
    errout "Get item $(echo "$itemData" | jq -r .id)"
    # Item creation date: milliseconds since epoch, rounded to seconds.
    # (The original assigned `created` twice; the first was dead.)
    created=$(( ($(echo "$itemData" | jq '.created') + 500) / 1000 ))
    itemId=$(echo "$itemData" | jq -r '.id')
    # NOTE(review): `date -d @…` is GNU date; this will not work with
    # BSD/macOS date.
    out="$TARGET/$(date -d @$created +%Y-%m)/$itemId"

    if [ -d "$out" ] && [ "$DROP_ITEM" == "y" ]; then
        errout "Removing item folder as requested: $out"
        rm -rf "$out"
    fi
    mkdir -p "$out"

    if [ -f "$out/metadata.json" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file 'metadata.json' since it already exists"
    else
        if [ -f "$out/metadata.json" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing metadata.json as requested"
            rm -f "$out/metadata.json"
        fi
        # `noclobber` is set: if the file still exists here, the
        # redirect fails and the script stops — by design.
        echo "$itemData" | jq . > "$out/metadata.json"
    fi

    # One "<id> <name>" line per original source file; `read` puts the
    # remainder (names may contain spaces) into attachName.
    local attachId attachName attachOut
    while read -r attachId attachName; do
        attachOut="$out/$attachName"
        checkLogin
        downloadAttachment "$attachId"
    done < <(echo "$itemData" | jq -r '.sources[] | [.id,.name] | join(" ")')
}
# --- Main loop --------------------------------------------------------
login

allCount=$(fetchItemCount)
errout "Downloading $allCount items…"

allCounter=0 innerCounter=0 limit=100 offset=0 done=n

# Page through the search results until a page comes back short.
while [ "$done" = "n" ]; do
    checkLogin
    innerCounter=0
    while read -r id; do
        downloadItem "$id"
        innerCounter=$((innerCounter + 1))
    done < <(listItems $offset $limit)
    allCounter=$((allCounter + innerCounter))
    offset=$((offset + limit))
    if [ "$innerCounter" -lt "$limit" ]; then
        done=y
    fi
done

errout "Downloaded $allCounter/$allCount items"
# Fix: use a numeric comparison. Inside [[ ]], `<` compares strings
# lexicographically, so e.g. [[ 9 < 10 ]] is false.
if [ "$allCounter" -lt "$allCount" ]; then
    errout
    errout " Downloaded less items than were reported as available. This"
    errout " may be due to items in folders that you cannot see. Or it"
    errout " may be a bug."
    errout
fi

View File

@ -82,14 +82,9 @@ documentation, too.
In order to move to a different tool, it is necessary to get the data
out of Docspell in a machine readable/automatic way. Currently, there
is no *easy way* for this. However, it is possible to get to all data
with some scripting effort. Everything can be queried using a
[HTTP/REST api](@/docs/api/_index.md) and so you can write a
script/program that, for example, queries all items and downloads the
files (something like this might be provided soon, for now there are
starting points in the `/tools` folder). It is planned to provide a
more convenient way to export the data into the file system. But there
is no ETA for this.
is a [export-files.sh](@/docs/tools/export-files.md) script provided
(in the `tools/` folder) that can be used to download all your files
and item metadata.
My recommendation is to run periodic database backups and also store
the binaries/docker images. This lets you re-create the current state

View File

@ -0,0 +1,208 @@
+++
title = "Export Files"
description = "Downloads all files from docspell."
weight = 65
+++
# export-files.sh
This script can be used to download all files that have been uploaded
to docspell before, along with the item metadata.
It downloads the original files, those that have been uploaded and not
the converted pdf files.
The item's metadata are stored next to the files to provide more
information about the item: correspondent, tags, dates, custom fields
etc. This contains most of your user supplied data.
This script is intended for having your data outside and independent
of docspell. Another good idea for a backup strategy is to take
database dumps *and* storing the releases of docspell next to this
dump.
Files are stored into the following folder structure (below the given
target directory):
```
- yyyy-mm (item date)
- A3…XY (item id)
- somefile.pdf (attachments with name)
- metadata.json (json file with items metadata)
```
By default, files are not overwritten, it stops if existing files are
encountered. This and some other things can be changed using
environment variables:
- `DS_USER` the account name for login, it is asked if not available
- `DS_PASS` the password for login, it is asked if not available
- `OVERWRITE_FILE=` if `y` then overwriting existing files is ok.
Default is `n`.
- `SKIP_FILE=` if `y` then existing files are skipped (supersedes
`OVERWRITE_FILE`). Default is `n`.
- `DROP_ITEM=` if `y` the item folder is removed before attempting to
download it. If this is set to `y` then the above options don't make
sense, since they operate on the files inside the item folder.
Default is `n`.
Docspell sends the sha256 hash with each file via the ETag header.
This is used to do an integrity check after downloading.
# Requirements
It is a bash script that additionally needs
[curl](https://curl.haxx.se/) and [jq](https://stedolan.github.io/jq/)
to be available.
# Usage
```
./export-files.sh <docspell-base-url> <target-directory>
```
For example, if docspell is at `http://localhost:7880`:
```
./export-files.sh http://localhost:7880 /tmp/ds-downloads
```
The script asks for your account name and password. It then logs in
and goes through all items downloading the metadata as json and the
attachments.
# Example Run
``` bash
fish> env SKIP_FILE=y DS_USER=demo DS_PASS=test ./export-files.sh http://localhost:7880 /tmp/download
Login to Docspell.
Using url: http://localhost:7880
Login successful
Downloading 73 items…
Get next items with offset=0, limit=100
Get item 57Znskthf3g-X7RP1fxzE2U-dwr4vM6Yjnn-b7s1PoCznhz
- Download 'something.txt' (8HbeFornAUN-kBCyc8bHSVr-bnLBYDzgRQ7-peMZzyTzM2X)
- Checksum ok.
Get item 94u5Pt39q6N-7vKu3LugoRj-zohGS4ie4jb-68bW5gXU6Jd
- Download 'letter-en.pdf' (6KNNmoyqpew-RAkdwEmQgBT-QDqdY97whZA-4k2rmbssdfQ)
- Checksum ok.
Get item 7L9Fh53RVG4-vGSt2G2YUcY-cvpBKRXQgBn-omYpg6xQXyD
- Download 'mail.html' (A6yTYKrDc7y-xU3whmLB1kB-TGhEAVb12mo-RUw5u9PsYMo)
- Checksum ok.
Get item DCn9UtWUtvF-2qjxB5PXGEG-vqRUUU7JUJH-zBBrmSeGYPe
- Download 'Invoice_7340224.pdf' (6FWdjxJh7yB-CCjY39p6uH9-uVLbmGfm25r-cw6RksrSx4n)
- Checksum ok.
```
The resulting directory looks then like this:
``` bash
├── 2020-08
│   ├── 6t27gQQ4TfW-H4uAmkYyiSe-rBnerFE2v5F-9BdqbGEhMcv
│   │   ├── 52241.pdf
│   │   └── metadata.json
│   └── 9qwT2GuwEvV-s9UuBQ4w7o9-uE8AdMc7PwL-GFDd62gduAm
│   ├── DOC-20191223-155707.jpg
│   └── metadata.json
├── 2020-09
│   ├── 2CM8C9VaVAT-sVJiKyUPCvR-Muqr2Cqvi6v-GXhRtg6eomA
│   │   ├── letter with spaces.pdf
│   │   └── metadata.json
│   ├── 4sXpX2Sc9Ex-QX1M6GtjiXp-DApuDDzGQXR-7pg1QPW9pbs
│   │   ├── analyse.org
│   │   ├── 201703.docx
│   │   ├── 11812_120719.pdf
│   │   ├── letter-de.pdf
│   │   ├── letter-en.pdf
│   │   └── metadata.json
│   ├── 5VhP5Torsy1-15pwJBeRjPi-es8BGnxhWn7-3pBQTJv3zPb
│   │   └── metadata.json
│   ├── 7ePWmK4xCNk-gmvnTDdFwG8-JcN5MDSUNPL-NTZZrho2Jc6
│   │   ├── metadata.json
│   │   └── Rechnung.pdf
```
The `metadata.json` file contains all the item metadata. This may be
useful when importing into other tools.
``` json
{
"id": "AWCNx7tJgUw-SdrNtRouNJB-FGs6Y2VP5bV-218sFN8mjjk",
"direction": "incoming",
"name": "Ruecksendung.pdf",
"source": "integration",
"state": "confirmed",
"created": 1606171810005,
"updated": 1606422917826,
"itemDate": null,
"corrOrg": null,
"corrPerson": null,
"concPerson": null,
"concEquipment": null,
"inReplyTo": null,
"folder": null,
"dueDate": null,
"notes": null,
"attachments": [
{
"id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q",
"name": "Ruecksendung.converted.pdf",
"size": 57777,
"contentType": "application/pdf",
"converted": true
}
],
"sources": [
{
"id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q",
"name": "Ruecksendung.pdf",
"size": 65715,
"contentType": "application/pdf"
}
],
"archives": [],
"tags": [
{
"id": "EQvJ6AHw19Y-Cdg3gF78zZk-BY2zFtNTwes-J95jpXpzhfw",
"name": "Hupe",
"category": "state",
"created": 1606427083171
},
{
"id": "4xyZoeeELdJ-tJ91GiRLinJ-7bdauy3U1jR-Bzr4VS96bGS",
"name": "Invoice",
"category": "doctype",
"created": 1594249709473
}
],
"customfields": [
{
"id": "5tYmDHin3Kx-HomKkeEVtJN-v99oKxQ8ot6-yFVrEmMayoo",
"name": "amount",
"label": "EUR",
"ftype": "money",
"value": "151.55"
},
{
"id": "3jbwbep8rDs-hNJ9ePRE7gv-21nYMbUj3eb-mKRWAr4xSS2",
"name": "invoice-number",
"label": "Invoice-Nr",
"ftype": "text",
"value": "I454602"
},
{
"id": "AH4p4NUCa9Y-EUkH66wLzxE-Rf2wJPxTAYd-DeGDm4AT4Yg",
"name": "number",
"label": "Number",
"ftype": "numeric",
"value": "0.10"
}
]
}
```