#!/usr/bin/env bash
#
# Script for downloading files (the PDF versions) flat in the current
# directory. It takes a search query for selecting what to download.
# Metadata is not downloaded, only the files.
#
# Usage:
#
# download-files.sh <docspell-base-url> <query>
#
# The docspell base url is required as well as a search query. The
# output directory is the current directory, and can be defined via
# env variable "TARGET_DIR".
#
# Example:
#
# download-files.sh http://localhost:7880 "tag:todo folder:work"
#
# The script then asks for username and password and starts
# downloading. For more details about the query, please see the docs
# here: https://docspell.org/docs/query/
#
# Environment variables read by this script:
#
#   DS_USER, DS_PASS  credentials; prompted for when unset or empty
#   TARGET_DIR        output directory (default: current directory)
#   SKIP_FILE=y       do not download files that already exist
#   OVERWRITE_FILE=y  remove an existing file before downloading it
#   DROP_ITEM=y       remove the output directory before each item

set -o errexit -o pipefail -o noclobber -o nounset

CURL_CMD="curl"
JQ_CMD="jq"

# Print all arguments to stderr. Options understood by `echo` (such as
# -n for prompts) are passed through.
errout() {
  >&2 echo "$@"
}

if [ -z "${1:-}" ]; then
  errout "The base-url to docspell is required."
  exit 1
fi
BASE_URL="$1"
shift

if [ -z "${1:-}" ]; then
  errout "A search query is required"
  exit 1
fi
QUERY="$1"
shift

LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
SEARCH_URL="$BASE_URL/api/v1/sec/item/search"
DETAIL_URL="$BASE_URL/api/v1/sec/item"
ATTACH_URL="$BASE_URL/api/v1/sec/attachment"

# Defaults are required here because the script runs with `nounset`.
OVERWRITE_FILE=${OVERWRITE_FILE:-n}
SKIP_FILE=${SKIP_FILE:-n}
TARGET=${TARGET_DIR:-"$(pwd)"}

# All scratch files live in one private directory so that cleanup can
# never touch the temp files of another run.
WORK_DIR=$(mktemp -d -t "ds-download.XXXXXX")
trap 'rm -rf -- "$WORK_DIR"' EXIT

# Login state shared by the functions below.
auth=""
auth_token=""
auth_time=0

# Run curl with the current auth token and print the response body to
# stdout.
# Arguments: passed to curl unchanged
# Returns:   0 on success; 2 if curl fails, after printing the command
#            line, curl's stderr and any response body to stderr
mcurl() {
  local errfile outfile status
  errfile=$(mktemp "$WORK_DIR/curl-err.XXXXXX")
  outfile=$(mktemp "$WORK_DIR/curl-out.XXXXXX")
  status=0
  # `|| status=$?` captures the exit code without tripping errexit.
  "$CURL_CMD" -# --fail --stderr "$errfile" -o "$outfile" \
    -H "X-Docspell-Auth: $auth_token" "$@" || status=$?
  if [ "$status" -ne 0 ]; then
    errout "$CURL_CMD -H 'X-Docspell-Auth: …' $*"
    errout "curl command failed (rc=$status)! Output is below."
    cat "$errfile" >&2
    cat "$outfile" >&2
    rm -f -- "$errfile" "$outfile"
    return 2
  fi
  # Emit the body verbatim; an unquoted echo would word-split and
  # glob-expand the JSON.
  cat "$outfile"
  rm -f -- "$errfile" "$outfile"
}

# Authenticate with DS_USER/DS_PASS and store the response in `auth`,
# the token in `auth_token` and the login time in `auth_time`.
# Exits the script with status 1 if the login does not succeed.
login() {
  local payload
  # Let jq build the JSON so quotes or backslashes in the credentials
  # cannot break the request body.
  payload=$("$JQ_CMD" -n --arg account "$DS_USER" --arg password "$DS_PASS" \
    '{account: $account, password: $password}')

  if ! auth=$("$CURL_CMD" -s --fail -XPOST --data-binary "$payload" "$LOGIN_URL"); then
    errout "Login failed."
    exit 1
  fi

  if [ "$(printf '%s' "$auth" | "$JQ_CMD" -r .success)" == "true" ]; then
    errout "Login successful"
    auth_token=$(printf '%s' "$auth" | "$JQ_CMD" -r .token)
    auth_time=$(date +%s)
  else
    errout "Login failed."
    exit 1
  fi
}

# Log in again when the time since the last login, plus one second of
# slack, exceeds the `validMs` value of the last login response.
checkLogin() {
  local elapsed maxtime
  elapsed=$((1000 * ($(date +%s) - auth_time) + 1000))
  maxtime=$(printf '%s' "$auth" | "$JQ_CMD" .validMs)

  if [ "$elapsed" -gt "$maxtime" ]; then
    errout "Need to re-login $elapsed > $maxtime"
    login
  fi
}

# Print the ids of one page of search results, one id per line.
# Arguments: $1 - offset (default 0)
#            $2 - limit (default 50)
#            $3 - search query
listItems() {
  local offset="${1:-0}"
  local limit="${2:-50}"
  local query="${3:-}"
  local req
  errout "Get next items with offset=$offset, limit=$limit"
  # The query is padded with one space on each side, as in the request
  # format this script has always sent.
  req=$("$JQ_CMD" -n -c --argjson offset "$offset" --argjson limit "$limit" \
    --arg query "$query" \
    '{offset: $offset, limit: $limit, query: (" " + $query + " ")}')

  mcurl -XPOST -H 'Content-Type: application/json' -d "$req" "$SEARCH_URL" \
    | "$JQ_CMD" -r '.groups[].items[]|.id'
}

# Print the detail JSON of the item with id $1.
fetchItem() {
  mcurl -XGET "$DETAIL_URL/$1"
}

# Download one attachment and compare its SHA-256 with the value of
# the server's ETag header.
# Arguments: $1 - attachment id
# Globals:   attachName (display name) and attachOut (output path) are
#            read from the caller's scope
# Returns:   0 on success or skip, 3 on checksum mismatch
downloadAttachment() {
  local attachId="$1"
  local dlUrl checksum1 checksum2
  errout " - Download '$attachName' ($attachId)"

  if [ -f "$attachOut" ] && [ "$SKIP_FILE" == "y" ]; then
    errout " - Skipping file '$attachOut' since it already exists"
    return 0
  fi

  if [ -f "$attachOut" ] && [ "$OVERWRITE_FILE" == "y" ]; then
    errout " - Removing attachment file as requested: $attachOut"
    rm -f -- "$attachOut"
  fi

  dlUrl="$ATTACH_URL/$attachId"

  # A missing ETag header must not abort the script through
  # pipefail/errexit; it surfaces as a checksum mismatch below.
  checksum1=$("$CURL_CMD" -s -I -H "X-Docspell-Auth: $auth_token" "$dlUrl" \
    | grep -i 'etag' | cut -d' ' -f2 | "$JQ_CMD" -r .) || checksum1=""
  "$CURL_CMD" -s -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$dlUrl"
  checksum2=$(sha256sum "$attachOut" | cut -d' ' -f1 | xargs)
  if [ "$checksum1" == "$checksum2" ]; then
    errout " - Checksum ok."
  else
    errout " - WARNING: Checksum mismatch! Server: $checksum1 Downloaded: $checksum2"
    return 3
  fi
}

# Fetch the item with id $1 and download all of its attachments into
# the target directory.
downloadItem() {
  local itemData itemId out attachId attachName attachOut
  checkLogin
  itemData=$(fetchItem "$1")
  itemId=$(printf '%s' "$itemData" | "$JQ_CMD" -r '.id')
  errout "Get item $itemId"
  # Files are stored flat: every item shares the target directory.
  out="$TARGET"

  # NOTE(review): because $out is the shared target directory,
  # DROP_ITEM=y removes everything downloaded so far before each item
  # - confirm this is intended.
  if [ -d "$out" ] && [ "${DROP_ITEM:-}" == "y" ]; then
    errout "Removing item folder as requested: $out"
    rm -rf -- "${out:?}"
  fi

  mkdir -p -- "$out"

  while IFS=' ' read -r attachId attachName; do
    # An attachment without a name would otherwise be written to the
    # directory path itself.
    if [ -z "$attachName" ]; then
      attachName="$attachId"
    fi
    # The name comes from the server; keep it inside the target
    # directory.
    attachName=${attachName//\//_}
    attachOut="$out/$attachName"
    checkLogin
    downloadAttachment "$attachId"
  done < <(printf '%s' "$itemData" \
    | "$JQ_CMD" -r '.attachments[] | [.id,.name] | join(" ")')
}

errout "Login to Docspell."
errout "Using url: $BASE_URL"
if [ -z "${DS_USER:-}" ]; then
  errout -n "Account: "
  read -r DS_USER
fi
if [ -z "${DS_PASS:-}" ]; then
  errout -n "Password: "
  read -r -s DS_PASS
  # Finish the prompt line, since the silent read echoes nothing.
  errout
fi

login

errout "Downloading files…"

allCounter=0
innerCounter=0
limit=100
offset=0
finished=n

# Page through the search results until a page comes back with fewer
# items than the page size.
while [ "$finished" = "n" ]; do
  checkLogin

  innerCounter=0
  while IFS= read -r id; do
    downloadItem "$id"
    innerCounter=$((innerCounter + 1))
  done < <(listItems "$offset" "$limit" "$QUERY")

  allCounter=$((allCounter + innerCounter))
  offset=$((offset + limit))

  if [ "$innerCounter" -lt "$limit" ]; then
    finished=y
  fi
done
errout "Downloaded $allCounter items"