Merge pull request #377 from totti4ever/pr-paperless_import

paperless-importer (v0.3 beta) - added SKIP_EXISTING_DOCS variable and improved error handling
eikek 2020-10-22 23:38:39 +02:00 committed by GitHub
commit b2b5ddef97
2 changed files with 191 additions and 126 deletions

@@ -0,0 +1,31 @@
# Paperless to Docspell Importer
_by totti4ever_
:warning: **BE AWARE** You should test this script on an empty database (back up yours!) or at least on a collective of its own :warning:
## Information
After using [Paperless](https://github.com/the-paperless-project/paperless/) for quite a while, I figured out that there was some room for improvement, but only little work was still being done on the project, which is totally fine, as it is a private and open-source project!
Then I came across Docspell and found it to have quite some potential, especially with its growing set of AI and AI-like features.
Still, I wanted to transfer the tags and structure from Paperless to Docspell, and not just import the files and start the managing process all over again.
That is why I put my dirty bash scripting skills to use and wrote a script which reads the files from Paperless' internal documents folder, extracts the tags and correspondents from the Paperless database, and imports them into Docspell using the official API. So no dirty DB writes or anything like that!
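Under the hood it is nothing more than plain `sqlite3` queries plus calls to the Docspell REST API. A simplified sketch of the idea (not the actual script; `$db_path`, `$ds_url` and `$ds_token` stand in for the database path, the Docspell URL and a session token):

```bash
# 1. Read metadata directly from the Paperless SQLite database (read-only)
sqlite3 -header "$db_path" "select id, name from documents_correspondent;"

# 2. Push each file through the official Docspell upload endpoint
curl -s -X POST "$ds_url/api/v1/sec/upload/item" \
  -H "X-Docspell-Auth: $ds_token" \
  -H 'Content-Type: multipart/form-data' \
  -F 'file=@/path/to/document.pdf;type=application/pdf'
```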
## Usage
1. Clone the project or simply copy the `import-paperless.sh` script to the machine where Paperless is installed
2. Run `import-paperless.sh` with the following parameters (see the example call below):
1. URL of Docspell, including http(s)
2. Username for Docspell, optionally prefixed with the collective (if its name differs from the username)
3. Password for Docspell
4. Path to Paperless' database file (`db.sqlite3`). When using Paperless with Docker, it is inside the mapped directory `/usr/src/paperless/data`
5. Path to Paperless' document base directory. When using Paperless with Docker, it is the mapped directory `/usr/src/paperless/media/documents/origin/`
3. You can adjust the following variables inside the script (right at the top):
* `LIMIT="LIMIT 0"` (default: inactive)
For testing purposes; limits the number of tags and correspondents read from Paperless (this will most likely lead to warnings when processing the documents)
* `LIMIT_DOC="LIMIT 5"` (default: inactive)
For testing purposes; limits the number of documents and document-to-tag relations read from Paperless
* `SKIP_EXISTING_DOCS=true` (default: true)
If `true`, documents that already exist in Docspell are not touched at all. If set to `false`, existing documents won't be uploaded again either, but the tags, correspondent, date and title from Paperless will still be applied.
:warning: In case you have already set this information in Docspell, it will be overwritten!
I found it quite useful to start with 5 documents and no tags, then continue without a tag limit but with 20-50 documents. Afterwards I removed both limits.
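For reference, a call can look like this (all values are placeholders, in the parameter order described above):

```bash
./import-paperless.sh \
  "https://docspell.example.com" \
  "mycollective/myuser" \
  "mypassword" \
  "/path/to/paperless/data/db.sqlite3" \
  "/path/to/paperless/media/documents/origin/"
```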

@@ -1,11 +1,13 @@
#!/usr/bin/env bash
# allows to start small - but this also affects tags and correspondents, so they might be missing when linking them!
# LIMIT=LIMIT 150
# LIMIT="LIMIT 0"
# LIMIT_DOC="LIMIT 5"
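# when true, documents that already exist in Docspell are skipped entirely (see README)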
SKIP_EXISTING_DOCS=true
echo "##################### START #####################"
echo " Docspell - Import from Paperless v '0.2 beta'"
echo " Docspell - Import from Paperless v '0.3 beta'"
echo " by totti4ever" && echo
echo " $(date)"
echo
@@ -43,7 +45,7 @@ modes=("documents_correspondent" "documents_document" "documents_tag" "documents
# the columns per table we need
declare -A columns
#documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename
columns[documents_document]="id, title, datetime(created,'localtime') as created, added, correspondent_id, file_type, filename"
columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename"
#documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
columns[documents_correspondent]="id, name"
#documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
@@ -56,14 +58,15 @@ declare -A corr2name
declare -A tag2name
declare -A doc2name
declare -A pl2ds_id
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi
############# FUCNTIONS
############# FUNCTIONS
function curl_call() {
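# wrapper around curl: appends the auth header and re-logs-in on an expired session or timeout before retrying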
curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'"
curl_result=$(eval $curl_cmd)
if [ "$curl_result" == '"Authentication failed."' ]; then
printf "\nNew login required... "
if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then
printf "\nNew login required ($curl_result)... "
login
printf "%${#len_resultset}s" " "; printf " .."
curl_call $1
@@ -102,7 +105,13 @@ for mode in "${modes[@]}"; do
OLDIFS=$IFS
IFS=$'\n'
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 $LIMIT;"`)
if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then
tmp_limit=$LIMIT_DOC
else
tmp_limit=$LIMIT
fi
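# newest rows first (order by id desc); the optional limit keeps test runs small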
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"`)
tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n'))
len_resultset=${#tmp_resultset[@]}
@@ -111,7 +120,7 @@ for mode in "${modes[@]}"; do
for ((i=1;i<$len_resultset;i++)); do
# split result into array
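# an empty column ('||') is padded to '| |' so field positions survive the split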
tmp_result=($(echo "${tmp_resultset[$i]}" | tr '|' '\n'))
tmp_result=($(echo "${tmp_resultset[$i]/'||'/'| |'}" | tr '|' '\n'))
# process single result array
len_result=${#tmp_result[@]}
@@ -167,131 +176,143 @@ for mode in "${modes[@]}"; do
# upload if not existent
if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then
echo -n "File does not exist, uploading... "
echo -n "File does not exist, uploading.."
curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo "done"
printf ". ."
else
echo "FATAL upload failed"
echo -e "FATAL upload failed\nCmd: $curl_cmd\nResp: $curl_result\nStatus: $curl_status"
exit 4
fi
else
echo "File already exists, nothing to upload"
printf "File already exists"
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then
echo ", skipping this item for all types" && echo
doc_skip[${tmp_result_arr[id]}]="true"
else
printf ", nothing to upload.Fetching ID.."
fi
fi
# link orga to document
printf "%${#len_resultset}s" " "; printf " "
printf "Waiting for document to link organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
count=0
countMax=10
while [ $count -le $countMax ]; do
# get Docspell id of document
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
curl_status=$(echo $curl_result | jq -r ".exists")
res=$?
# skip if needed (SKIP_EXISTING_DOCS)
if [ ! ${doc_skip[${tmp_result_arr[id]}]+abc} ]; then
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
curl_status=$(echo $curl_result | jq -r ".items[0].id")
# paperless id to docspell id for later use
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
# waiting for document and getting its document id
count=0
countMax=25
while [ $count -le $countMax ]; do
# get Docspell id of document
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
curl_status=$(echo $curl_result | jq -r ".exists")
res=$?
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
count2=0
count2Max=5
while [ $count2 -le $count2Max ]; do
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
# file id returned
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
curl_status=$(echo $curl_result | jq -r ".items[0].id")
# paperless id to docspell id for later use
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
echo ".done"
break
# Search for exact match of paperless correspondent in docspell organizations
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
# unknown error
elif [ $res -ne 0 ]; then
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
exit 7
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
# Set actual link to document
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo ". done"
else
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
# Set name of document
printf "%${#len_resultset}s" " "; printf " "
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo "Set name of item: \"${tmp_result_arr[title]}\""
else
echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
# Set created date of document
printf "%${#len_resultset}s" " "; printf " "
tmp_date=${tmp_result_arr[created]:0:10}
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo "Set creation date of item: \"$tmp_date\""
else
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
break
elif [ $count2 -ge $count2Max ]; then
echo "FATAL Upload failed (or processing too slow)"
exit 6
# FIXME I think, the loop is not needed here - organizations seem to be there immediately
else
printf "."
fi
sleep $(( count2*count2 ))
((count2++))
done
# counter too high
elif [ $count -ge $countMax ]; then
echo "FATAL Upload failed (or processing too slow)"
exit 8
else
echo "Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]})"
printf "."
fi
break
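# quadratic back-off before polling again (0s, 1s, 4s, 9s, ...)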
sleep $(( count * count ))
((count++))
done
elif [ $res -ne 0 ]; then
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
exit 7
elif [ $count -ge $countMax ]; then
echo "FATAL Upload failed (or processing too slow)"
exit 8
# link orga to document
printf "%${#len_resultset}s" " "; printf " "
if [ ! "${tmp_result_arr[correspondent_id]/' '/''}" == "" ]; then
# check for availability of document id and name of organization
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
printf "Set link to organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
# get organizations matching doc's orga (can be several when parts match)
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
# Search for exact match of paperless correspondent in fetched organizations from Docspell
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
# double-check that found organization matches doc's correspondent
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
# Set actual link to document
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo ". done"
# unknown error
else
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
else
echo "FATAL Unknown error"
exit 6
fi
else
echo "WARNING Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]}) - Limits are $LIMIT / $LIMIT_DOC"
fi
else
echo "No correspondent set in Paperless, skipping."
fi
# Set name of document
printf "%${#len_resultset}s" " "; printf " "
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo "Set name of item: \"${tmp_result_arr[title]}\""
else
printf "."
echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
sleep $(( count * count ))
((count++))
done
echo
# TAGS
elif [ "$mode" == "documents_tag" ]; then
# Set created date of document
printf "%${#len_resultset}s" " "; printf " "
tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo "Set creation date of item: \"${tmp_date:0:10}\""
else
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
exit 5
fi
echo
fi # done with documents
# TAGS
elif [ "$mode" == "documents_tag" ]; then
if [ ! "${tmp_result_arr[name]}" == "" ] && [ ! "${tmp_result_arr[id]}" == "" ]; then
echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]"
printf "%${#len_resultset}s" " "; printf " "
@@ -309,26 +330,39 @@ for mode in "${modes[@]}"; do
echo "FATAL Error during creation of tag: $(echo $curl_result | jq -r '.message')"
exit 9
fi
# TAGS 2 DOCUMENTS
elif [ "$mode" == "documents_document_tags" ]; then
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
printf "%${#len_resultset}s" " "; printf " "
#link tags to documents
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo '...applied'
else
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
fi
else
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
fi
done
done
# TAGS 2 DOCUMENTS
elif [ "$mode" == "documents_document_tags" ]; then
# if doc_skip is not set for document_id
if [ ! ${doc_skip[${tmp_result_arr[document_id]}]+abc} ]; then
if [ ! "${tag2name[${tmp_result_arr[tag_id]}]}" == "" ] && [ ! "${tmp_result_arr[tag_id]}" == "" ]; then
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
printf "%${#len_resultset}s" " "; printf " "
#link tags to documents
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
curl_status=$(echo $curl_result | jq -r ".success")
if [ "$curl_status" == "true" ]; then
echo '...applied'
else
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
fi
else
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
fi
else
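# document was skipped earlier (doc_skip), so no tags are linked to it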
echo -en "\r"
sleep 0.1
fi
fi # done with mode processing
done # with single resultset
done # with modes
echo "################# DONE #################"
date