Add an export-files script

2025-07-04 16:48:26 +00:00 · 2020-11-29 01:30:15 +01:00
parent 45e4035e07
commit 0dafe57034
3 changed files with 227 additions and 8 deletions
--- a/tools/export-files.sh
+++ b/tools/export-files.sh
@ -0,0 +1,189 @@
 #!/usr/bin/env bash
 #
 # Simple script for downloading all your files. It goes through all
 # items visible to the logged in user and downloads the attachments
 # (the original files).
 #
 # The item's metadata are stored next to the files to provide more
 # information about the item. It is not meant to be imported back into
 # docspell.
 #
 # Usage:
 #
 # export-files.sh <docspell-base-url> <target-directory>
 #
 # The docspell base url is required as well as a directory to store
 # all the files into.
 #
 # Example:
 #
 #    export-files.sh http://localhost:7880 /tmp/ds-download
 #
 #
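 # The script then asks for username and password and starts
 # downloading. If the DS_USER and DS_PASS environment variables are
 # set, they are used instead of prompting.
 #
 # Requires curl and jq.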
 # --- Command line arguments and endpoint URLs -------------------------
 # Both positional arguments are mandatory. They are checked before
 # `nounset` is enabled below, and with a `:-` default, so a missing
 # argument yields the usage message rather than an "unbound variable"
 # error. Diagnostics go to stderr like every other message of the script.
 if [ -z "${1:-}" ]; then
    echo "The base-url to docspell is required." >&2
    exit 1
 else
    # Drop one trailing slash so the endpoint URLs built below do not
    # contain a double slash ("http://host//api/...").
    BASE_URL="${1%/}"
    shift
 fi
 if [ -z "${1:-}" ]; then
    echo "A directory is required to store the files into." >&2
    exit 1
 else
    TARGET="$1"
    shift
 fi
 # Strict mode: stop on errors (also inside pipelines), refuse to
 # overwrite existing files through `>` redirections and treat the use
 # of unset variables as an error.
 set -o errexit -o pipefail -o noclobber -o nounset
 # REST endpoints used by the functions below.
 LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
 SEARCH_URL="$BASE_URL/api/v1/sec/item/search"
 INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights"
 DETAIL_URL="$BASE_URL/api/v1/sec/item"
 ATTACH_URL="$BASE_URL/api/v1/sec/attachment"
 # Print all arguments to stderr, exactly as `echo` would print them.
 # Leading echo options such as -n are honoured, so `errout -n "text"`
 # prints a prompt without a trailing newline; without arguments an
 # empty line is printed.
 errout() {
    echo "$@" 1>&2
 }
 # Remove the temporary files created by `mktemp -t "ds-export.XXXXX"`.
 # They are looked up in $TMPDIR, falling back to /tmp when TMPDIR is
 # unset or empty. The expansion happens when the function runs (at
 # exit), not when the trap is installed.
 cleanupTempFiles() {
    rm -f -- "${TMPDIR:-/tmp}"/ds-export.*
 }
 trap cleanupTempFiles EXIT
 # Run an authenticated curl request and print the response body.
 #
 # Globals:   auth_token (read) - sent as the X-Docspell-Auth header
 # Arguments: passed on to curl unchanged (method, headers, data, url)
 # Outputs:   the response body on stdout (trailing newlines normalised
 #            to exactly one); on failure curl's messages and whatever
 #            body was received are written to stderr instead
 # Returns:   0 on success, 2 if curl failed (e.g. HTTP error, --fail)
 mcurl() {
    local tmpfile1 tmpfile2 status ret
    # tmpfile1 captures curl's stderr, tmpfile2 the response body.
    tmpfile1=$(mktemp -t "ds-export.XXXXX")
    tmpfile2=$(mktemp -t "ds-export.XXXXX")
    # Capture the exit status without tripping errexit and without
    # toggling the caller's shell options.
    status=0
    curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" -H "X-Docspell-Auth: $auth_token" "$@" || status=$?
    if [ "$status" -ne 0 ]; then
        # The token is deliberately not logged.
        errout "curl -H 'X-Docspell-Auth: …' $*"
        errout "Curl command failed (rc=$status)! Output is below."
        cat "$tmpfile1" >&2
        cat "$tmpfile2" >&2
        rm -f "$tmpfile1" "$tmpfile2"
        return 2
    else
        ret=$(cat "$tmpfile2")
        rm -f "$tmpfile2" "$tmpfile1"
        # Quoted on purpose: the body must not be word-split or glob
        # expanded (JSON text may contain "*" or runs of spaces).
        printf '%s\n' "$ret"
    fi
 }
 errout "Login to Docspell."
 errout "Using url: $BASE_URL"
 # Credentials may be supplied through the DS_USER / DS_PASS environment
 # variables; otherwise they are asked for interactively. The `:-`
 # default is required because `nounset` is active: a plain "$DS_USER"
 # would abort the script when the variable is not set.
 if [ -z "${DS_USER:-}" ]; then
    errout -n "Account: "
    read -r DS_USER
 fi
 if [ -z "${DS_PASS:-}" ]; then
    errout -n "Password: "
    # -s: do not echo the input; IFS= and -r keep the password verbatim
    # (no whitespace trimming, no backslash processing).
    IFS= read -r -s DS_PASS
 fi
 # Finish the prompt line on the same stream the prompts were written to.
 errout
 # State shared between login and checkLogin:
 #   auth       - raw JSON response of the last login
 #   auth_token - token sent with every authenticated request
 #   auth_time  - time of the last login (seconds since the epoch)
 declare auth
 declare auth_token
 declare auth_time
 # Log in to docspell and remember the session.
 #
 # Globals:   DS_USER, DS_PASS, LOGIN_URL (read)
 #            auth, auth_token, auth_time (written)
 # Outputs:   a status message on stderr
 # Returns:   0 on success; exits the script with status 1 if the
 #            request fails or the server does not report success
 login() {
    local payload
    # Let jq build the JSON body so that quotes, backslashes etc. in the
    # account name or password are escaped correctly.
    payload=$(jq -n --arg account "$DS_USER" --arg password "$DS_PASS" \
                 '{account: $account, password: $password}')
    # A failing curl must not abort via errexit here; it is reported
    # through the "Login failed." branch below.
    auth=$(curl -s --fail -XPOST --data-binary "$payload" "$LOGIN_URL") || auth=""
    if [ "$(printf '%s' "$auth" | jq .success)" == "true" ]; then
        errout "Login successful"
        auth_token=$(printf '%s' "$auth" | jq -r .token)
        auth_time=$(date +%s)
    else
        errout "Login failed."
        exit 1
    fi
 }
 # Log in again when the current session is about to expire.
 #
 # The age of the session in milliseconds, plus a safety margin of one
 # second, is compared with the `validMs` field of the last login
 # response; `login` is called once that limit is exceeded.
 #
 # Globals: auth, auth_time (read); elapsed, maxtime (written)
 checkLogin() {
    maxtime=$(echo $auth | jq .validMs)
    elapsed=$((1000 * ($(date +%s) - $auth_time) + 1000))
    # Nothing to do while the session is still valid.
    [ $elapsed -gt $maxtime ] || return 0
    errout "Need to re-login $elapsed > $maxtime"
    login
 }
 # Print the ids of one page of search results, one id per line.
 #
 # Globals:   SEARCH_URL (read)
 # Arguments: $1 - offset of the first item (default 0)
 #            $2 - maximum number of items to return (default 50)
 # Outputs:   item ids on stdout, a progress message on stderr
 listItems() {
    local OFFSET="${1:-0}"
    local LIMIT="${2:-50}"
    local REQ
    errout "Get next items with offset=$OFFSET, limit=$LIMIT"
    REQ="{\"offset\":$OFFSET, \"limit\":$LIMIT, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}"
    # The header name is "Content-Type"; without the hyphen curl sends an
    # unknown header and labels the JSON body as form data.
    mcurl -XPOST -H 'Content-Type: application/json' -d "$REQ" "$SEARCH_URL" | jq -r '.groups[].items[]|.id'
 }
 # Print the number of items reported by the insights endpoint: the sum
 # of its `incomingCount` and `outgoingCount` fields.
 fetchItemCount() {
    mcurl -XGET "$INSIGHT_URL" | jq '[.incomingCount, .outgoingCount] | add'
 }
 # Print the response of the item detail endpoint for the item whose id
 # is given as $1.
 fetchItem() {
    mcurl -XGET "$DETAIL_URL/$1"
 }
 # Download one item: its metadata and all of its original files.
 #
 # Files are stored in "$TARGET/<YYYY-MM of creation>/<item id>/"; the
 # item details go into `metadata.json` in that directory.
 #
 # Globals:   TARGET, ATTACH_URL, auth_token (read)
 # Arguments: $1 - id of the item
 # Outputs:   progress messages on stderr
 downloadItem() {
    local itemData itemId created out attachId attachName attachOut
    checkLogin
    itemData=$(fetchItem "$1")
    itemId=$(printf '%s\n' "$itemData" | jq -r '.id')
    errout "Get item $itemId"
    # `created` is in milliseconds; round to the nearest second.
    created=$((($(printf '%s\n' "$itemData" | jq '.created') + 500) / 1000))
    out="$TARGET/$(date -d "@$created" +%Y-%m)/$itemId"
    mkdir -p "$out"
    # Explicit "." filter: jq 1.5 rejects a missing filter when its
    # output is redirected. NOTE: `noclobber` is active, so this
    # redirection fails if metadata.json already exists.
    printf '%s\n' "$itemData" | jq . > "$out/metadata.json"
    while read -r attachId attachName; do
        [ -n "$attachId" ] || continue
        # The name is used as a file name: keep it inside "$out" and fall
        # back to the attachment id when no usable name is available.
        attachName="${attachName//\//_}"
        case "$attachName" in
            "" | "." | "..") attachName="$attachId" ;;
        esac
        errout " - download $attachName ($attachId)"
        attachOut="$out/$attachName"
        checkLogin
        curl --fail -# -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId"
    done < <(printf '%s\n' "$itemData" | jq -r '.sources[] | "\(.id) \(.name // "")"')
 }
 # --- Main: page through all items and download each one ---------------
 login
 allCount=$(fetchItemCount)
 errout "Downloading $allCount items…"
 allCounter=0 innerCounter=0 limit=100 offset=0 done=n
 while [ "$done" = "n" ]; do
    checkLogin
    innerCounter=0
    while read -r id; do
        downloadItem "$id"
        innerCounter=$(($innerCounter + 1))
    done < <(listItems "$offset" "$limit")
    allCounter=$(($allCounter + $innerCounter))
    offset=$(($offset + $limit))
    # A page that is not full is the last one.
    if [ "$innerCounter" -lt "$limit" ]; then
        done=y
    fi
 done
 errout "Downloaded $allCounter/$allCount items"
 # Numeric comparison: `[[ a < b ]]` compares strings lexicographically,
 # which is wrong for numbers (e.g. "9" < "10" is false).
 if [ "$allCounter" -lt "$allCount" ]; then
    errout
    errout "  Downloaded less items than were reported as available. This"
    errout "  may be due to items in folders that you cannot see. Or it"
    errout "  may be a bug."
    errout
 fi
--- a/website/site/content/docs/faq/_index.md
+++ b/website/site/content/docs/faq/_index.md
@ -82,14 +82,11 @@ documentation, too.
 In order to move to a different tool, it is necessary to get the data
 out of Docspell in a machine readable/automatic way. Currently, there
-is no *easy way* for this. However, it is possible to get to all data
+is no *easy way* for this. However, everything can be queried using an
-with some scripting effort. Everything can be queried using a
+[HTTP/REST api](@/docs/api/_index.md) and so it is possible to get to
-[HTTP/REST api](@/docs/api/_index.md) and so you can write a
+all data with some scripting effort. There exists a script in the
-script/program that, for example, queries all items and downloads the
+`tools/` folder that can at least download all files that have
-files (something like this might be provided soon, for now there are
+been uploaded to docspell.
 starting points in the `/tools` folder). It is planned to provide a
 more convenient way to export the data into the file system. But there
 is no ETA for this.
 My recommendation is to run periodic database backups and also store
 the binaries/docker images. This lets you re-create the current state
--- a/website/site/content/docs/tools/export-files.md
+++ b/website/site/content/docs/tools/export-files.md
@ -0,0 +1,33 @@
 +++
 title = "Export Files"
 description = "Downloads all files from docspell."
 weight = 65
 +++
 # export-files.sh
 This script can be used to download all files from docspell that have
 been uploaded before.
 # Requirements
 It is a bash script that additionally needs
 [curl](https://curl.haxx.se/) and
 [jq](https://stedolan.github.io/jq/).
 # Usage
 ```
 ./export-files.sh <docspell-base-url> <target-directory>
 ```
 For example, if docspell is at `http://localhost:7880`:
 ```
 ./export-files.sh http://localhost:7880 /tmp/ds-downloads
 ```
 The script asks for your account name and password. It then logs in
 and goes through all items downloading the metadata as json and the
 attachments. It will fetch the original files (not the converted
 ones).