mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	
							
								
								
									
										255
									
								
								tools/export-files.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										255
									
								
								tools/export-files.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,255 @@ | ||||
| #!/usr/bin/env bash | ||||
| # | ||||
| # Simple script for downloading all your files. It goes through all | ||||
| # items visible to the logged in user and downloads the attachments | ||||
| # (the original files). | ||||
| # | ||||
| # The item's metadata are stored next to the files to provide more | ||||
| # information about the item: tags, dates, custom fields etc. This | ||||
| # contains most of your user supplied data. | ||||
| # | ||||
| # This script is intended for having your data outside and independent | ||||
| # of docspell. Another good idea for a backup strategy is to take | ||||
| # database dumps *and* storing the releases of docspell next to this | ||||
| # dump. | ||||
| # | ||||
| # Usage: | ||||
| # | ||||
| # export-files.sh <docspell-base-url> <target-directory> | ||||
| # | ||||
| # The docspell base url is required as well as a directory to store | ||||
| # all the files into. | ||||
| # | ||||
| # Example: | ||||
| # | ||||
| #    export-files.sh http://localhost:7880 /tmp/ds-download | ||||
| # | ||||
| # The script then asks for username and password and starts | ||||
| # downloading. Files are downloaded into the following structure | ||||
| # (below the given target directory): | ||||
| # | ||||
| # - yyyy-mm (item date) | ||||
| #   - A3…XY (item id) | ||||
| #     - somefile.pdf (attachments with name) | ||||
| #     - metadata.json (json file with items metadata) | ||||
| # | ||||
| # By default, files are not overwritten, it stops if existing files | ||||
| # are encountered. Configuration can be specified using environment | ||||
| # variables: | ||||
| # | ||||
| # - OVERWRITE_FILE= if `y` then overwriting existing files is ok. | ||||
| # - SKIP_FILE= if `y` then existing files are skipped (supersedes | ||||
| #   OVERWRITE_FILE). | ||||
| # - DROP_ITEM= if `y` the item folder is removed before attempting to | ||||
| #   download it. If this is set to `y` then the above options don't | ||||
| #   make sense, since they operate on the files inside the item folder | ||||
| # | ||||
| # Docspell sends with each file its sha256 checksum via the ETag | ||||
# header. This is used to do an integrity check after downloading.
|  | ||||
|  | ||||
| if [ -z "$1" ]; then | ||||
|     echo "The base-url to docspell is required." | ||||
|     exit 1 | ||||
| else | ||||
|     BASE_URL="$1" | ||||
|     shift | ||||
| fi | ||||
|  | ||||
| if [ -z "$1" ]; then | ||||
|     echo "A directory is required to store the files into." | ||||
|     exit 1 | ||||
| else | ||||
|     TARGET="$1" | ||||
|     shift | ||||
| fi | ||||
|  | ||||
set -o errexit -o pipefail -o noclobber -o nounset

# REST endpoints, derived from the base url given on the command line.
LOGIN_URL="$BASE_URL/api/v1/open/auth/login"
SEARCH_URL="$BASE_URL/api/v1/sec/item/searchWithTags"
INSIGHT_URL="$BASE_URL/api/v1/sec/collective/insights"
DETAIL_URL="$BASE_URL/api/v1/sec/item"
ATTACH_URL="$BASE_URL/api/v1/sec/attachment"

# Behaviour flags (see header comment). Every flag needs a default
# because `nounset` is active above: referencing an unset variable
# aborts the script. SKIP_FILE previously had no default, so the
# script died at the first existing-file check when it was unset.
OVERWRITE_FILE=${OVERWRITE_FILE:-n}
DROP_ITEM=${DROP_ITEM:-n}
SKIP_FILE=${SKIP_FILE:-n}
|  | ||||
# Print all arguments to stderr, keeping stdout free for data.
errout() {
    >&2 echo "$@"
}

# Remove leftover temp files on exit. Note ${TMPDIR:-/tmp}: the
# previous ${TMPDIR-:/tmp} expanded to the literal ':/tmp' whenever
# TMPDIR was unset, so cleanup never touched the real temp directory.
trap '{ rm -f "${TMPDIR:-/tmp}"/ds-export.*; }' EXIT
|  | ||||
# Authenticated curl wrapper: performs the request with the current
# auth token and prints the response body on stdout. On failure the
# curl invocation and its captured output are echoed to stderr and 2
# is returned. All arguments are passed through to curl verbatim.
mcurl() {
    tmpfile1=$(mktemp -t "ds-export.XXXXX")
    tmpfile2=$(mktemp -t "ds-export.XXXXX")
    set +e
    curl -# --fail --stderr "$tmpfile1" -o "$tmpfile2" \
         -H "X-Docspell-Auth: $auth_token" "$@"
    status=$?
    set -e
    if [ $status -ne 0 ]; then
        errout "curl -H 'X-Docspell-Auth: …' $*"
        errout "Curl command failed (rc=$status)! Output is below."
        cat "$tmpfile1" >&2
        cat "$tmpfile2" >&2
        rm -f "$tmpfile1" "$tmpfile2"
        return 2
    else
        # Emit the body verbatim: the previous unquoted `echo $ret`
        # collapsed all whitespace/newlines in the response.
        cat "$tmpfile2"
        rm -f "$tmpfile2" "$tmpfile1"
    fi
}
|  | ||||
|  | ||||
| errout "Login to Docspell." | ||||
| errout "Using url: $BASE_URL" | ||||
| if [ -z "$DS_USER" ]; then | ||||
|     errout -n "Account: " | ||||
|     read DS_USER | ||||
| fi | ||||
| if [ -z "$DS_PASS" ]; then | ||||
|     errout -n "Password: " | ||||
|     read -s DS_PASS | ||||
| fi | ||||
| echo | ||||
|  | ||||
| declare auth | ||||
| declare auth_token | ||||
| declare auth_time | ||||
|  | ||||
|  | ||||
# Authenticate against the open login endpoint. On success the token
# and login timestamp are stored in the globals declared above; on
# failure the script terminates with exit code 1.
login() {
    # Build the payload with jq so special characters in the
    # credentials (quotes, backslashes) are escaped as valid JSON;
    # naive string interpolation broke on such passwords.
    local payload
    payload=$(jq -n --arg a "$DS_USER" --arg p "$DS_PASS" \
                 '{account: $a, password: $p}')
    auth=$(curl -s --fail -XPOST --data-binary "$payload" "$LOGIN_URL")

    if [ "$(echo "$auth" | jq .success)" == "true" ]; then
        errout "Login successful"
        auth_token=$(echo "$auth" | jq -r .token)
        auth_time=$(date +%s)
    else
        errout "Login failed."
        exit 1
    fi
}
|  | ||||
# Re-login when the session token is close to expiry. The server
# reports the token lifetime in milliseconds (validMs); a one second
# safety margin is added to the elapsed time before comparing.
checkLogin() {
    local now lifetime_ms elapsed_ms
    now=$(date +%s)
    lifetime_ms=$(echo "$auth" | jq .validMs)
    elapsed_ms=$(( (now - auth_time) * 1000 + 1000 ))
    if [ "$elapsed_ms" -gt "$lifetime_ms" ]; then
        errout "Need to re-login $elapsed_ms > $lifetime_ms"
        login
    fi
}
|  | ||||
# Print the ids of one page of items, one id per line.
# $1 - offset into the result set (default 0)
# $2 - maximum number of items to return (default 50)
listItems() {
    OFFSET="${1:-0}"
    LIMIT="${2:-50}"
    errout "Get next items with offset=$OFFSET, limit=$LIMIT"
    REQ="{\"offset\":$OFFSET, \"limit\":$LIMIT, \"tagsInclude\":[],\"tagsExclude\":[],\"tagCategoriesInclude\":[], \"tagCategoriesExclude\":[],\"customValues\":[],\"inbox\":false}"

    # The header was previously misspelled 'ContentType:', so the
    # request went out without a proper Content-Type header.
    mcurl -XPOST -H 'Content-Type: application/json' -d "$REQ" "$SEARCH_URL" | jq -r '.groups[].items[]|.id'
}
|  | ||||
# Total number of items visible to the logged-in user: the insights
# endpoint reports incoming and outgoing counts separately, sum them.
fetchItemCount() {
    mcurl -XGET "$INSIGHT_URL" | jq '[.incomingCount, .outgoingCount] | add'
}
|  | ||||
# Fetch the full metadata (json) of a single item. $1 - the item id.
fetchItem() {
    mcurl -XGET "$DETAIL_URL/$1"
}
|  | ||||
# Download a single original attachment file to $attachOut, honouring
# SKIP_FILE/OVERWRITE_FILE, and verify the sha256 checksum the server
# advertises via the ETag header.
# $1 - the attachment id; $attachName and $attachOut are expected to
# be set by the caller (downloadItem).
downloadAttachment() {
    attachId="$1"
    errout " - Download '$attachName' ($attachId)"

    if [ -f "$attachOut" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file '$attachOut' since it already exists"
    else
        if [ -f "$attachOut" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing attachment file as requested: $attachOut"
            rm -f "$attachOut"
        fi

        # The ETag value is a quoted json string. Match the header
        # name case-insensitively (HTTP/2 lower-cases it), strip the
        # trailing CR of the header line, and unquote via jq. Note
        # `jq -r .`: jq requires an explicit filter argument.
        checksum1=$(curl --fail -s -I -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original" | \
                        grep -i '^etag' | cut -d' ' -f2 | tr -d '\r' | jq -r .)
        curl --fail -s -o "$attachOut" -H "X-Docspell-Auth: $auth_token" "$ATTACH_URL/$attachId/original"
        checksum2=$(sha256sum "$attachOut" | cut -d' ' -f1 | xargs)
        if [ "$checksum1" == "$checksum2" ]; then
            errout " - Checksum ok."
        else
            errout " - WARNING: Checksum mismatch! Server: $checksum1 Downloaded: $checksum2"
            return 3
        fi
    fi
}
|  | ||||
# Download one item: store its metadata as metadata.json and all of
# its original files below "$TARGET/yyyy-mm/<item-id>/".
# $1 - the item id. Honours DROP_ITEM, SKIP_FILE and OVERWRITE_FILE.
downloadItem() {
    checkLogin
    itemData=$(fetchItem "$1")
    errout "Get item $(echo "$itemData" | jq -r .id)"
    # 'created' is epoch milliseconds; convert to seconds, rounding.
    # (A duplicate, immediately-overwritten assignment was removed.)
    created=$((($(echo "$itemData" | jq '.created') + 500) / 1000))
    itemId=$(echo "$itemData" | jq -r '.id')
    out="$TARGET/$(date -d @"$created" +%Y-%m)/$itemId"

    if [ -d "$out" ] && [ "$DROP_ITEM" == "y" ]; then
        errout "Removing item folder as requested: $out"
        rm -rf "$out"
    fi

    mkdir -p "$out"
    if [ -f "$out/metadata.json" ] && [ "${SKIP_FILE:-n}" == "y" ]; then
        errout " - Skipping file 'metadata.json' since it already exists"
    else
        if [ -f "$out/metadata.json" ] && [ "$OVERWRITE_FILE" == "y" ]; then
            errout " - Removing metadata.json as requested"
            rm -f "$out/metadata.json"
        fi
        echo "$itemData" | jq > "$out/metadata.json"
    fi

    # Each line is "<attachment-id> <file name>"; the name may contain
    # spaces, so everything after the first word lands in attachName.
    # -r keeps backslashes in file names literal.
    while read -r attachId attachName; do
        attachOut="$out/$attachName"
        checkLogin
        downloadAttachment "$attachId"
    done < <(echo "$itemData" | jq -r '.sources[] | [.id,.name] | join(" ")')
}
|  | ||||
login

allCount=$(fetchItemCount)
errout "Downloading $allCount items…"

allCounter=0 innerCounter=0 limit=100 offset=0 done=n

# Page through the search results; the last page is detected by it
# containing fewer entries than the requested limit.
while [ "$done" = "n" ]; do
    checkLogin

    innerCounter=0
    while read -r id; do
        downloadItem "$id"
        innerCounter=$(($innerCounter + 1))
    done < <(listItems $offset $limit)

    allCounter=$(($allCounter + $innerCounter))
    offset=$(($offset + $limit))

    if [ $innerCounter -lt $limit ]; then
        done=y
    fi

done
errout "Downloaded $allCounter/$allCount items"
# -lt forces a numeric comparison; the previous [[ a < b ]] compared
# lexicographically (e.g. it would claim 100 < 73).
if [ "$allCounter" -lt "$allCount" ]; then
    errout
    errout "  Downloaded less items than were reported as available. This"
    errout "  may be due to items in folders that you cannot see. Or it"
    errout "  may be a bug."
    errout
fi
| @@ -82,14 +82,9 @@ documentation, too. | ||||
|  | ||||
| In order to move to a different tool, it is necessary to get the data | ||||
| out of Docspell in a machine readable/automatic way. Currently, there | ||||
| is no *easy way* for this. However, it is possible to get to all data | ||||
| with some scripting effort. Everything can be queried using a | ||||
| [HTTP/REST api](@/docs/api/_index.md) and so you can write a | ||||
| script/program that, for example, queries all items and downloads the | ||||
| files (something like this might be provided soon, for now there are | ||||
| starting points in the `/tools` folder). It is planned to provide a | ||||
| more convenient way to export the data into the file system. But there | ||||
| is no ETA for this. | ||||
| is a [export-files.sh](@/docs/tools/export-files.md) script provided | ||||
| (in the `tools/` folder) that can be used to download all your files | ||||
| and item metadata. | ||||
|  | ||||
| My recommendation is to run periodic database backups and also store | ||||
| the binaries/docker images. This lets you re-create the current state | ||||
|   | ||||
							
								
								
									
										208
									
								
								website/site/content/docs/tools/export-files.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										208
									
								
								website/site/content/docs/tools/export-files.md
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,208 @@ | ||||
| +++ | ||||
| title = "Export Files" | ||||
| description = "Downloads all files from docspell." | ||||
| weight = 65 | ||||
| +++ | ||||
|  | ||||
| # export-files.sh | ||||
|  | ||||
| This script can be used to download all files from docspell that have | ||||
| been uploaded before and the item metadata. | ||||
|  | ||||
| It downloads the original files, those that have been uploaded and not | ||||
| the converted pdf files. | ||||
|  | ||||
| The item's metadata are stored next to the files to provide more | ||||
information about the item: correspondent, tags, dates, custom fields
| etc. This contains most of your user supplied data. | ||||
|  | ||||
| This script is intended for having your data outside and independent | ||||
| of docspell. Another good idea for a backup strategy is to take | ||||
| database dumps *and* storing the releases of docspell next to this | ||||
| dump. | ||||
|  | ||||
| Files are stored into the following folder structure (below the given | ||||
| target directory): | ||||
|  | ||||
| ``` | ||||
| - yyyy-mm (item date) | ||||
|   - A3…XY (item id) | ||||
|     - somefile.pdf (attachments with name) | ||||
|     - metadata.json (json file with items metadata) | ||||
| ``` | ||||
|  | ||||
| By default, files are not overwritten, it stops if existing files are | ||||
| encountered. This and some other things can be changed using | ||||
| environment variables: | ||||
|  | ||||
| - `DS_USER` the account name for login, it is asked if not available | ||||
| - `DS_PASS` the password for login, it is asked if not available | ||||
| - `OVERWRITE_FILE=` if `y` then overwriting existing files is ok. | ||||
|   Default is `n`. | ||||
| - `SKIP_FILE=` if `y` then existing files are skipped (supersedes | ||||
|   `OVERWRITE_FILE`). Default is `n`. | ||||
| - `DROP_ITEM=` if `y` the item folder is removed before attempting to | ||||
|   download it. If this is set to `y` then the above options don't make | ||||
|   sense, since they operate on the files inside the item folder. | ||||
|   Default is `n`. | ||||
|  | ||||
| Docspell sends the sha256 hash with each file via the ETag header. | ||||
This is used to do an integrity check after downloading.
|  | ||||
|  | ||||
| # Requirements | ||||
|  | ||||
| It is a bash script that additionally needs | ||||
| [curl](https://curl.haxx.se/) and [jq](https://stedolan.github.io/jq/) | ||||
| to be available. | ||||
|  | ||||
| # Usage | ||||
|  | ||||
| ``` | ||||
| ./export-files.sh <docspell-base-url> <target-directory> | ||||
| ``` | ||||
|  | ||||
| For example, if docspell is at `http://localhost:7880`: | ||||
|  | ||||
| ``` | ||||
| ./export-files.sh http://localhost:7880 /tmp/ds-downloads | ||||
| ``` | ||||
|  | ||||
| The script asks for your account name and password. It then logs in | ||||
| and goes through all items downloading the metadata as json and the | ||||
| attachments. | ||||
|  | ||||
|  | ||||
| # Example Run | ||||
|  | ||||
| ``` bash | ||||
| fish> env SKIP_FILE=y DS_USER=demo DS_PASS=test ./export-files.sh http://localhost:7880 /tmp/download | ||||
| Login to Docspell. | ||||
| Using url: http://localhost:7880 | ||||
|  | ||||
| Login successful | ||||
| Downloading 73 items… | ||||
| Get next items with offset=0, limit=100 | ||||
| Get item 57Znskthf3g-X7RP1fxzE2U-dwr4vM6Yjnn-b7s1PoCznhz | ||||
|  - Download 'something.txt' (8HbeFornAUN-kBCyc8bHSVr-bnLBYDzgRQ7-peMZzyTzM2X) | ||||
|  - Checksum ok. | ||||
| Get item 94u5Pt39q6N-7vKu3LugoRj-zohGS4ie4jb-68bW5gXU6Jd | ||||
|  - Download 'letter-en.pdf' (6KNNmoyqpew-RAkdwEmQgBT-QDqdY97whZA-4k2rmbssdfQ) | ||||
|  - Checksum ok. | ||||
| Get item 7L9Fh53RVG4-vGSt2G2YUcY-cvpBKRXQgBn-omYpg6xQXyD | ||||
|  - Download 'mail.html' (A6yTYKrDc7y-xU3whmLB1kB-TGhEAVb12mo-RUw5u9PsYMo) | ||||
|  - Checksum ok. | ||||
| Get item DCn9UtWUtvF-2qjxB5PXGEG-vqRUUU7JUJH-zBBrmSeGYPe | ||||
|  - Download 'Invoice_7340224.pdf' (6FWdjxJh7yB-CCjY39p6uH9-uVLbmGfm25r-cw6RksrSx4n) | ||||
|  - Checksum ok. | ||||
| … | ||||
| ``` | ||||
|  | ||||
| The resulting directory looks then like this: | ||||
|  | ||||
| ``` bash | ||||
| … | ||||
| ├── 2020-08 | ||||
| │   ├── 6t27gQQ4TfW-H4uAmkYyiSe-rBnerFE2v5F-9BdqbGEhMcv | ||||
| │   │   ├── 52241.pdf | ||||
| │   │   └── metadata.json | ||||
| │   └── 9qwT2GuwEvV-s9UuBQ4w7o9-uE8AdMc7PwL-GFDd62gduAm | ||||
| │       ├── DOC-20191223-155707.jpg | ||||
| │       └── metadata.json | ||||
| ├── 2020-09 | ||||
| │   ├── 2CM8C9VaVAT-sVJiKyUPCvR-Muqr2Cqvi6v-GXhRtg6eomA | ||||
| │   │   ├── letter with spaces.pdf | ||||
| │   │   └── metadata.json | ||||
| │   ├── 4sXpX2Sc9Ex-QX1M6GtjiXp-DApuDDzGQXR-7pg1QPW9pbs | ||||
| │   │   ├── analyse.org | ||||
| │   │   ├── 201703.docx | ||||
| │   │   ├── 11812_120719.pdf | ||||
| │   │   ├── letter-de.pdf | ||||
| │   │   ├── letter-en.pdf | ||||
| │   │   └── metadata.json | ||||
| │   ├── 5VhP5Torsy1-15pwJBeRjPi-es8BGnxhWn7-3pBQTJv3zPb | ||||
| │   │   └── metadata.json | ||||
| │   ├── 7ePWmK4xCNk-gmvnTDdFwG8-JcN5MDSUNPL-NTZZrho2Jc6 | ||||
| │   │   ├── metadata.json | ||||
| │   │   └── Rechnung.pdf | ||||
| … | ||||
| ``` | ||||
|  | ||||
| The `metadata.json` file contains all the item metadata. This may be | ||||
| useful when importing into other tools. | ||||
|  | ||||
| ``` json | ||||
| { | ||||
|   "id": "AWCNx7tJgUw-SdrNtRouNJB-FGs6Y2VP5bV-218sFN8mjjk", | ||||
|   "direction": "incoming", | ||||
|   "name": "Ruecksendung.pdf", | ||||
|   "source": "integration", | ||||
|   "state": "confirmed", | ||||
|   "created": 1606171810005, | ||||
|   "updated": 1606422917826, | ||||
|   "itemDate": null, | ||||
|   "corrOrg": null, | ||||
|   "corrPerson": null, | ||||
|   "concPerson": null, | ||||
|   "concEquipment": null, | ||||
|   "inReplyTo": null, | ||||
|   "folder": null, | ||||
|   "dueDate": null, | ||||
|   "notes": null, | ||||
|   "attachments": [ | ||||
|     { | ||||
|       "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", | ||||
|       "name": "Ruecksendung.converted.pdf", | ||||
|       "size": 57777, | ||||
|       "contentType": "application/pdf", | ||||
|       "converted": true | ||||
|     } | ||||
|   ], | ||||
|   "sources": [ | ||||
|     { | ||||
|       "id": "4aPmhrjfR9Z-AgknoW6yVoE-YkffioD2KXV-E6Vm6snH17Q", | ||||
|       "name": "Ruecksendung.pdf", | ||||
|       "size": 65715, | ||||
|       "contentType": "application/pdf" | ||||
|     } | ||||
|   ], | ||||
|   "archives": [], | ||||
|   "tags": [ | ||||
|     { | ||||
|       "id": "EQvJ6AHw19Y-Cdg3gF78zZk-BY2zFtNTwes-J95jpXpzhfw", | ||||
|       "name": "Hupe", | ||||
|       "category": "state", | ||||
|       "created": 1606427083171 | ||||
|     }, | ||||
|     { | ||||
|       "id": "4xyZoeeELdJ-tJ91GiRLinJ-7bdauy3U1jR-Bzr4VS96bGS", | ||||
|       "name": "Invoice", | ||||
|       "category": "doctype", | ||||
|       "created": 1594249709473 | ||||
|     } | ||||
|   ], | ||||
|   "customfields": [ | ||||
|     { | ||||
|       "id": "5tYmDHin3Kx-HomKkeEVtJN-v99oKxQ8ot6-yFVrEmMayoo", | ||||
|       "name": "amount", | ||||
|       "label": "EUR", | ||||
|       "ftype": "money", | ||||
|       "value": "151.55" | ||||
|     }, | ||||
|     { | ||||
|       "id": "3jbwbep8rDs-hNJ9ePRE7gv-21nYMbUj3eb-mKRWAr4xSS2", | ||||
|       "name": "invoice-number", | ||||
|       "label": "Invoice-Nr", | ||||
|       "ftype": "text", | ||||
|       "value": "I454602" | ||||
|     }, | ||||
|     { | ||||
|       "id": "AH4p4NUCa9Y-EUkH66wLzxE-Rf2wJPxTAYd-DeGDm4AT4Yg", | ||||
|       "name": "number", | ||||
|       "label": "Number", | ||||
|       "ftype": "numeric", | ||||
|       "value": "0.10" | ||||
|     } | ||||
|   ] | ||||
| } | ||||
| ``` | ||||
		Reference in New Issue
	
	Block a user