mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-04 14:15:59 +00:00
added SKIP_EXISTING_DOCS variable and improved error handling
* errors, especially during long initial runs, are now caught better and the import continues where possible * set `SKIP_EXISTING_DOCS` to `true` (default) to skip existing documents (if set to `false`, they won't be uploaded again, but tags, correspondent, date and title will be overwritten) * for testing purposes you can set the variables `LIMIT` and `LIMIT_DOC` to limit the tags and correspondents, and the documents and document-tag relations, respectively
This commit is contained in:
parent
e69e05fd19
commit
8aaffb0455
@ -1,11 +1,13 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# allows to start small - but affects also tags and correspondents, so they might be missing when linking them!
|
# allows to start small - but affects also tags and correspondents, so they might be missing when linking them!
|
||||||
# LIMIT=LIMIT 150
|
# LIMIT="LIMIT 0"
|
||||||
|
# LIMIT_DOC="LIMIT 5"
|
||||||
|
SKIP_EXISTING_DOCS=true
|
||||||
|
|
||||||
echo "##################### START #####################"
|
echo "##################### START #####################"
|
||||||
|
|
||||||
echo " Docspell - Import from Paperless v '0.2 beta'"
|
echo " Docspell - Import from Paperless v '0.3 beta'"
|
||||||
echo " by totti4ever" && echo
|
echo " by totti4ever" && echo
|
||||||
echo " $(date)"
|
echo " $(date)"
|
||||||
echo
|
echo
|
||||||
@ -43,7 +45,7 @@ modes=("documents_correspondent" "documents_document" "documents_tag" "documents
|
|||||||
# the columns per table we need
|
# the columns per table we need
|
||||||
declare -A columns
|
declare -A columns
|
||||||
#documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename
|
#documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename
|
||||||
columns[documents_document]="id, title, datetime(created,'localtime') as created, added, correspondent_id, file_type, filename"
|
columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename"
|
||||||
#documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
|
#documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
|
||||||
columns[documents_correspondent]="id, name"
|
columns[documents_correspondent]="id, name"
|
||||||
#documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
|
#documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
|
||||||
@ -56,14 +58,15 @@ declare -A corr2name
|
|||||||
declare -A tag2name
|
declare -A tag2name
|
||||||
declare -A doc2name
|
declare -A doc2name
|
||||||
declare -A pl2ds_id
|
declare -A pl2ds_id
|
||||||
|
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi
|
||||||
|
|
||||||
############# FUCNTIONS
|
############# FUNCTIONS
|
||||||
function curl_call() {
|
function curl_call() {
|
||||||
curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'"
|
curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'"
|
||||||
curl_result=$(eval $curl_cmd)
|
curl_result=$(eval $curl_cmd)
|
||||||
|
|
||||||
if [ "$curl_result" == '"Authentication failed."' ]; then
|
if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then
|
||||||
printf "\nNew login required... "
|
printf "\nNew login required (§curl_result)... "
|
||||||
login
|
login
|
||||||
printf "%${#len_resultset}s" " "; printf " .."
|
printf "%${#len_resultset}s" " "; printf " .."
|
||||||
curl_call $1
|
curl_call $1
|
||||||
@ -102,7 +105,13 @@ for mode in "${modes[@]}"; do
|
|||||||
OLDIFS=$IFS
|
OLDIFS=$IFS
|
||||||
IFS=$'\n'
|
IFS=$'\n'
|
||||||
|
|
||||||
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 $LIMIT;"`)
|
if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then
|
||||||
|
tmp_limit=$LIMIT_DOC
|
||||||
|
else
|
||||||
|
tmp_limit=$LIMIT
|
||||||
|
fi
|
||||||
|
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"`)
|
||||||
|
|
||||||
|
|
||||||
tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n'))
|
tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n'))
|
||||||
len_resultset=${#tmp_resultset[@]}
|
len_resultset=${#tmp_resultset[@]}
|
||||||
@ -111,7 +120,7 @@ for mode in "${modes[@]}"; do
|
|||||||
for ((i=1;i<$len_resultset;i++)); do
|
for ((i=1;i<$len_resultset;i++)); do
|
||||||
|
|
||||||
# split result into array
|
# split result into array
|
||||||
tmp_result=($(echo "${tmp_resultset[$i]}" | tr '|' '\n'))
|
tmp_result=($(echo "${tmp_resultset[$i]/'||'/'| |'}" | tr '|' '\n'))
|
||||||
|
|
||||||
# process single result array
|
# process single result array
|
||||||
len_result=${#tmp_result[@]}
|
len_result=${#tmp_result[@]}
|
||||||
@ -167,47 +176,81 @@ for mode in "${modes[@]}"; do
|
|||||||
|
|
||||||
# upload if not existent
|
# upload if not existent
|
||||||
if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then
|
if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then
|
||||||
echo -n "File does not exist, uploading... "
|
echo -n "File does not exist, uploading.."
|
||||||
curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'"
|
curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'"
|
||||||
|
|
||||||
curl_status=$(echo $curl_result | jq -r ".success")
|
curl_status=$(echo $curl_result | jq -r ".success")
|
||||||
if [ "$curl_status" == "true" ]; then
|
if [ "$curl_status" == "true" ]; then
|
||||||
echo "done"
|
printf ". ."
|
||||||
|
|
||||||
else
|
else
|
||||||
echo "FATAL upload failed"
|
echo -e "FATAL upload failed\nCmd: $curl_cmd\nResp: $curl_result\nStatus: $curl_status"
|
||||||
exit 4
|
exit 4
|
||||||
fi
|
fi
|
||||||
|
|
||||||
else
|
else
|
||||||
echo "File already exists, nothing to upload"
|
printf "File already exists"
|
||||||
|
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then
|
||||||
|
echo ", skipping this item for all types" && echo
|
||||||
|
doc_skip[${tmp_result_arr[id]}]="true"
|
||||||
|
else
|
||||||
|
printf ", nothing to upload.Fetching ID.."
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# link orga to document
|
# skip if needed (SKIP_EXISTING_DOCS)
|
||||||
printf "%${#len_resultset}s" " "; printf " "
|
if [ ! ${doc_skip[${tmp_result_arr[id]}]+abc} ]; then
|
||||||
printf "Waiting for document to link organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
|
|
||||||
|
# waiting for document and get document id
|
||||||
count=0
|
count=0
|
||||||
countMax=10
|
countMax=25
|
||||||
while [ $count -le $countMax ]; do
|
while [ $count -le $countMax ]; do
|
||||||
# get Docspell id of document
|
# get Docspell id of document
|
||||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
|
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
|
||||||
curl_status=$(echo $curl_result | jq -r ".exists")
|
curl_status=$(echo $curl_result | jq -r ".exists")
|
||||||
res=$?
|
res=$?
|
||||||
|
|
||||||
|
# file id returned
|
||||||
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
|
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
|
||||||
curl_status=$(echo $curl_result | jq -r ".items[0].id")
|
curl_status=$(echo $curl_result | jq -r ".items[0].id")
|
||||||
# paperless id to docspell id for later use
|
# paperless id to docspell id for later use
|
||||||
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
|
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
|
||||||
|
echo ".done"
|
||||||
|
break
|
||||||
|
|
||||||
|
# unknown error
|
||||||
|
elif [ $res -ne 0 ]; then
|
||||||
|
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
|
||||||
|
exit 7
|
||||||
|
|
||||||
|
# counter too high
|
||||||
|
elif [ $count -ge $countMax ]; then
|
||||||
|
echo "FATAL Upload failed (or processing too slow)"
|
||||||
|
exit 8
|
||||||
|
|
||||||
|
else
|
||||||
|
printf "."
|
||||||
|
fi
|
||||||
|
sleep $(( count * count ))
|
||||||
|
((count++))
|
||||||
|
done
|
||||||
|
|
||||||
|
|
||||||
|
# link orga to document
|
||||||
|
printf "%${#len_resultset}s" " "; printf " "
|
||||||
|
if [ ! "${tmp_result_arr[correspondent_id]/' '/''}" == "" ]; then
|
||||||
|
|
||||||
|
# check for availability of document id and name of organization
|
||||||
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
|
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
|
||||||
count2=0
|
printf "Set link to organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
|
||||||
count2Max=5
|
|
||||||
while [ $count2 -le $count2Max ]; do
|
# get organizations matching doc's orga (can be several when parts match)
|
||||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
|
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
|
||||||
|
|
||||||
# Search for exact match of paperless correspondent in docspell organizations
|
# Search for exact match of paperless correspondent in fetched organizations from Docspell
|
||||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
|
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
|
||||||
|
|
||||||
|
# double-check that found organization matches doc's correspondent
|
||||||
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
|
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
|
||||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
|
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
|
||||||
|
|
||||||
@ -218,10 +261,21 @@ for mode in "${modes[@]}"; do
|
|||||||
if [ "$curl_status" == "true" ]; then
|
if [ "$curl_status" == "true" ]; then
|
||||||
echo ". done"
|
echo ". done"
|
||||||
|
|
||||||
|
# unknown error
|
||||||
else
|
else
|
||||||
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||||
exit 5
|
exit 5
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
echo "FATAL Unknown error"
|
||||||
|
exit 6
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "WARNING Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "No correspondent set in Paperless, skipping."
|
||||||
|
fi
|
||||||
|
|
||||||
# Set name of document
|
# Set name of document
|
||||||
printf "%${#len_resultset}s" " "; printf " "
|
printf "%${#len_resultset}s" " "; printf " "
|
||||||
@ -241,57 +295,24 @@ for mode in "${modes[@]}"; do
|
|||||||
# Set created date of document
|
# Set created date of document
|
||||||
printf "%${#len_resultset}s" " "; printf " "
|
printf "%${#len_resultset}s" " "; printf " "
|
||||||
|
|
||||||
tmp_date=${tmp_result_arr[created]:0:10}
|
tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations
|
||||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
|
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
|
||||||
|
|
||||||
curl_status=$(echo $curl_result | jq -r ".success")
|
curl_status=$(echo $curl_result | jq -r ".success")
|
||||||
if [ "$curl_status" == "true" ]; then
|
if [ "$curl_status" == "true" ]; then
|
||||||
echo "Set creation date of item: \"$tmp_date\""
|
echo "Set creation date of item: \"${tmp_date:0:10}\""
|
||||||
|
|
||||||
else
|
else
|
||||||
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||||
exit 5
|
exit 5
|
||||||
fi
|
fi
|
||||||
|
|
||||||
break
|
|
||||||
|
|
||||||
elif [ $count2 -ge $count2Max ]; then
|
|
||||||
echo "FATAL Upload failed (or processing too slow)"
|
|
||||||
exit 6
|
|
||||||
|
|
||||||
# FIXME I think, the loop is not needed here - organizations seem to be there immediately
|
|
||||||
else
|
|
||||||
printf "."
|
|
||||||
fi
|
|
||||||
|
|
||||||
sleep $(( count2*count2 ))
|
|
||||||
((count2++))
|
|
||||||
done
|
|
||||||
|
|
||||||
else
|
|
||||||
echo "Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]})"
|
|
||||||
|
|
||||||
fi
|
|
||||||
break
|
|
||||||
|
|
||||||
elif [ $res -ne 0 ]; then
|
|
||||||
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
|
|
||||||
exit 7
|
|
||||||
|
|
||||||
elif [ $count -ge $countMax ]; then
|
|
||||||
echo "FATAL Upload failed (or processing too slow)"
|
|
||||||
exit 8
|
|
||||||
|
|
||||||
else
|
|
||||||
printf "."
|
|
||||||
fi
|
|
||||||
sleep $(( count * count ))
|
|
||||||
((count++))
|
|
||||||
done
|
|
||||||
echo
|
echo
|
||||||
|
|
||||||
|
fi # done with documents
|
||||||
|
|
||||||
# TAGS
|
# TAGS
|
||||||
elif [ "$mode" == "documents_tag" ]; then
|
elif [ "$mode" == "documents_tag" ]; then
|
||||||
|
if [ ! "${tmp_result_arr[name]}" == "" ] && [ ! "${tmp_result_arr[id]}" == "" ]; then
|
||||||
echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]"
|
echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]"
|
||||||
printf "%${#len_resultset}s" " "; printf " "
|
printf "%${#len_resultset}s" " "; printf " "
|
||||||
|
|
||||||
@ -309,10 +330,16 @@ for mode in "${modes[@]}"; do
|
|||||||
echo "FATAL Error during creation of tag: $(echo $curl_result | jq -r '.message')"
|
echo "FATAL Error during creation of tag: $(echo $curl_result | jq -r '.message')"
|
||||||
exit 9
|
exit 9
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
# TAGS 2 DOCUMENTS
|
# TAGS 2 DOCUMENTS
|
||||||
elif [ "$mode" == "documents_document_tags" ]; then
|
elif [ "$mode" == "documents_document_tags" ]; then
|
||||||
|
# if doc_skip is not set for document_id
|
||||||
|
if [ ! ${doc_skip[${tmp_result_arr[document_id]}]+abc} ]; then
|
||||||
|
if [ ! "${tag2name[${tmp_result_arr[tag_id]}]}" == "" ] && [ ! "${tmp_result_arr[tag_id]}" == "" ]; then
|
||||||
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
|
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
|
||||||
printf "%${#len_resultset}s" " "; printf " "
|
printf "%${#len_resultset}s" " "; printf " "
|
||||||
|
|
||||||
@ -325,10 +352,17 @@ for mode in "${modes[@]}"; do
|
|||||||
else
|
else
|
||||||
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
|
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||||
fi
|
fi
|
||||||
|
else
|
||||||
|
echo -en "\r"
|
||||||
|
sleep 0.1
|
||||||
|
fi
|
||||||
|
fi # done with mode processing
|
||||||
|
|
||||||
done
|
done # with single resultset
|
||||||
done
|
done # with modes
|
||||||
|
|
||||||
echo ################# DONE #################
|
echo ################# DONE #################
|
||||||
date
|
date
|
||||||
|
Loading…
x
Reference in New Issue
Block a user