From 8aaffb0455f797bb32ab6199f00da292faf2e9c7 Mon Sep 17 00:00:00 2001 From: Malte Date: Thu, 22 Oct 2020 19:58:32 +0200 Subject: [PATCH 1/3] added SKIP_EXISTING_DOCS variable and improved error handling * errors, especially during long initial runs are caught better now and should continue, if possible * set `SKIP_EXISTING_DOCS` to `true` (default) to skip existing documents (if set to false, they won't be uploaded again, but tags, correspondent, date and title will be overwritten) * for testing purposes you can set the variables `LIMIT` and `LIMIT_DOC` to limit the tags and correspondents respectively the documents and document-tag-relations --- tools/import-paperless/import-paperless.sh | 286 ++++++++++++--------- 1 file changed, 160 insertions(+), 126 deletions(-) diff --git a/tools/import-paperless/import-paperless.sh b/tools/import-paperless/import-paperless.sh index 3fd9c6bb..2cb762cb 100755 --- a/tools/import-paperless/import-paperless.sh +++ b/tools/import-paperless/import-paperless.sh @@ -1,11 +1,13 @@ #!/usr/bin/env bash # allows to start small - but affects also tags and correspondents, so they might be missing when linking them! 
-# LIMIT=LIMIT 150 +# LIMIT="LIMIT 0" +# LIMIT_DOC="LIMIT 5" +SKIP_EXISTING_DOCS=true echo "##################### START #####################" -echo " Docspell - Import from Paperless v '0.2 beta'" +echo " Docspell - Import from Paperless v '0.3 beta'" echo " by totti4ever" && echo echo " $(date)" echo @@ -43,7 +45,7 @@ modes=("documents_correspondent" "documents_document" "documents_tag" "documents # the columns per table we need declare -A columns #documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename -columns[documents_document]="id, title, datetime(created,'localtime') as created, added, correspondent_id, file_type, filename" +columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename" #documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug columns[documents_correspondent]="id, name" #documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug @@ -56,14 +58,15 @@ declare -A corr2name declare -A tag2name declare -A doc2name declare -A pl2ds_id +if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi -############# FUCNTIONS +############# FUNCTIONS function curl_call() { curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'" curl_result=$(eval $curl_cmd) - if [ "$curl_result" == '"Authentication failed."' ]; then - printf "\nNew login required... " + if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then + printf "\nNew login required (§curl_result)... " login printf "%${#len_resultset}s" " "; printf " .." 
curl_call $1 @@ -102,7 +105,13 @@ for mode in "${modes[@]}"; do OLDIFS=$IFS IFS=$'\n' - tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 $LIMIT;"`) + if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then + tmp_limit=$LIMIT_DOC + else + tmp_limit=$LIMIT + fi + tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"`) + tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n')) len_resultset=${#tmp_resultset[@]} @@ -111,7 +120,7 @@ for mode in "${modes[@]}"; do for ((i=1;i<$len_resultset;i++)); do # split result into array - tmp_result=($(echo "${tmp_resultset[$i]}" | tr '|' '\n')) + tmp_result=($(echo "${tmp_resultset[$i]/'||'/'| |'}" | tr '|' '\n')) # process single result array len_result=${#tmp_result[@]} @@ -167,131 +176,143 @@ for mode in "${modes[@]}"; do # upload if not existent if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then - echo -n "File does not exist, uploading... " + echo -n "File does not exist, uploading.." curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'" curl_status=$(echo $curl_result | jq -r ".success") if [ "$curl_status" == "true" ]; then - echo "done" + printf ". ." else - echo "FATAL upload failed" + echo -e "FATAL upload failed\nCmd: $curl_cmd\nResp: $curl_result\nStatus: $curl_status" exit 4 fi else - echo "File already exists, nothing to upload" + printf "File already exists" + if [ "$SKIP_EXISTING_DOCS" == "true" ]; then + echo ", skipping this item for all types" && echo + doc_skip[${tmp_result_arr[id]}]="true" + else + printf ", nothing to upload.Fetching ID.." + fi fi - # link orga to document - printf "%${#len_resultset}s" " "; printf " " - printf "Waiting for document to link organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .." 
- count=0 - countMax=10 - while [ $count -le $countMax ]; do - # get Docspell id of document - curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'" - curl_status=$(echo $curl_result | jq -r ".exists") - res=$? + # skip if needed (SKIP_EXISTING_DOCS) + if [ ! ${doc_skip[${tmp_result_arr[id]}]+abc} ]; then - if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then - curl_status=$(echo $curl_result | jq -r ".items[0].id") - # paperless id to docspell id for later use - pl2ds_id[${tmp_result_arr[id]}]=$curl_status + # waitig for document and get document id + count=0 + countMax=25 + while [ $count -le $countMax ]; do + # get Docspell id of document + curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'" + curl_status=$(echo $curl_result | jq -r ".exists") + res=$? - if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then - count2=0 - count2Max=5 - while [ $count2 -le $count2Max ]; do - curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'" + # file id returned + if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then + curl_status=$(echo $curl_result | jq -r ".items[0].id") + # paperless id to docspell id for later use + pl2ds_id[${tmp_result_arr[id]}]=$curl_status + echo ".done" + break - # Search for exact match of paperless correspondent in docspell organizations - curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name") + # unknown error + elif [ $res -ne 0 ]; then + echo -e "FATAL Error:\n Err-Code: $? 
/ $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status" + exit 7 - if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then - curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id") - - # Set actual link to document - curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'" - - curl_status=$(echo $curl_result | jq -r ".success") - if [ "$curl_status" == "true" ]; then - echo ". done" - - else - echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" - exit 5 - fi - - # Set name of document - printf "%${#len_resultset}s" " "; printf " " - - curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'" - - curl_status=$(echo $curl_result | jq -r ".success") - if [ "$curl_status" == "true" ]; then - echo "Set name of item: \"${tmp_result_arr[title]}\"" - - else - echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" - exit 5 - fi - - - # Set created date of document - printf "%${#len_resultset}s" " "; printf " " - - tmp_date=${tmp_result_arr[created]:0:10} - curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'" - - curl_status=$(echo $curl_result | jq -r ".success") - if [ "$curl_status" == "true" ]; then - echo "Set creation date of item: \"$tmp_date\"" - - else - echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" - exit 5 - fi - - break - - elif [ $count2 -ge $count2Max ]; then - echo "FATAL Upload failed (or processing too slow)" - 
exit 6 - - # FIXME I think, the loop is not needed here - organizations seem to be there immediately - else - printf "." - fi - - sleep $(( count2*count2 )) - ((count2++)) - done + # counter too high + elif [ $count -ge $countMax ]; then + echo "FATAL Upload failed (or processing too slow)" + exit 8 else - echo "Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]})" - + printf "." fi - break + sleep $(( count * count )) + ((count++)) + done - elif [ $res -ne 0 ]; then - echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status" - exit 7 - elif [ $count -ge $countMax ]; then - echo "FATAL Upload failed (or processing too slow)" - exit 8 + # link orga to document + printf "%${#len_resultset}s" " "; printf " " + if [ ! "${tmp_result_arr[correspondent_id]/' '/''}" == "" ]; then + + # check for availability of document id and name of organization + if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then + printf "Set link to organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .." 
+ + # get organizations matching doc's orga (can be several when parts match) + curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'" + + # Search for exact match of paperless correspondent in fetched organizations from Docspell + curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name") + + # double-check that found organization matches doc's correspondent + if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then + curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id") + + # Set actual link to document + curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'" + + curl_status=$(echo $curl_result | jq -r ".success") + if [ "$curl_status" == "true" ]; then + echo ". done" + + # unknown error + else + echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" + exit 5 + fi + else + echo "FATAL Unknown error" + exit 6 + fi + else + echo "WARNING Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]}) - Limits are $LIMIT / $LIMIT_DOC" + fi + else + echo "No correspondent set in Paperless, skipping." + fi + + # Set name of document + printf "%${#len_resultset}s" " "; printf " " + + curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'" + + curl_status=$(echo $curl_result | jq -r ".success") + if [ "$curl_status" == "true" ]; then + echo "Set name of item: \"${tmp_result_arr[title]}\"" else - printf "." 
+ echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" + exit 5 fi - sleep $(( count * count )) - ((count++)) - done - echo - # TAGS - elif [ "$mode" == "documents_tag" ]; then + + # Set created date of document + printf "%${#len_resultset}s" " "; printf " " + + tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations + curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'" + + curl_status=$(echo $curl_result | jq -r ".success") + if [ "$curl_status" == "true" ]; then + echo "Set creation date of item: \"${tmp_date:0:10}\"" + + else + echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})" + exit 5 + fi + echo + + fi # done with documents + + # TAGS + elif [ "$mode" == "documents_tag" ]; then + if [ ! "${tmp_result_arr[name]}" == "" ] && [ ! 
"${tmp_result_arr[id]}" == "" ]; then echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]" printf "%${#len_resultset}s" " "; printf " " @@ -309,26 +330,39 @@ for mode in "${modes[@]}"; do echo "FATAL Error during creation of tag: $(echo $curl_result | jq -r '.message')" exit 9 fi - - - # TAGS 2 DOCUMENTS - elif [ "$mode" == "documents_document_tags" ]; then - echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})" - printf "%${#len_resultset}s" " "; printf " " - - #link tags to documents - curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'" - - curl_status=$(echo $curl_result | jq -r ".success") - if [ "$curl_status" == "true" ]; then - echo '...applied' - else - echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})" - fi + else + echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC" fi - done -done + + # TAGS 2 DOCUMENTS + elif [ "$mode" == "documents_document_tags" ]; then + # if doc_skip is not set for document_id + if [ ! ${doc_skip[${tmp_result_arr[document_id]}]+abc} ]; then + if [ ! "${tag2name[${tmp_result_arr[tag_id]}]}" == "" ] && [ ! 
"${tmp_result_arr[tag_id]}" == "" ]; then + echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})" + printf "%${#len_resultset}s" " "; printf " " + + #link tags to documents + curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'" + + curl_status=$(echo $curl_result | jq -r ".success") + if [ "$curl_status" == "true" ]; then + echo '...applied' + else + echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})" + fi + else + echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC" + fi + else + echo -en "\r" + sleep 0.1 + fi + fi # done with mode processing + + done # with single resultset +done # with modes echo ################# DONE ################# date From 9fa93a3ba17a5f48a23292ae9e31ece3fb9d645a Mon Sep 17 00:00:00 2001 From: totti4ever <49901208+totti4ever@users.noreply.github.com> Date: Thu, 22 Oct 2020 20:36:00 +0200 Subject: [PATCH 2/3] Create README.md --- tools/import-paperless/README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tools/import-paperless/README.md diff --git a/tools/import-paperless/README.md b/tools/import-paperless/README.md new file mode 100644 index 00000000..2ddb5c98 --- /dev/null +++ b/tools/import-paperless/README.md @@ -0,0 +1,31 @@ +# Paperless to Docspell Importer +_by totti4ever_ + +:warning: **BE AWARE** You should test this script on an empty database (backup yours) or at least an own collective :warning: + +## Information +After using [Paperless](https://github.com/the-paperless-project/paperless/) for quite a while, I figured out that there is some room for improvement but only 
little work still done on the project, which is totally fine as it is a private and open-source project! +So I came across Docspell and found it to have quite a potential, especially regarding the AI and AI-like features growing. + +Still I wanted to transfer the tagging and structure from Paperless to Docspell and not only import the files and start over the managing process once again. +This is why I put in my dirty bash scripting skills and made a script, which reads the files from the internal documents folder of Paperless and extracts tags and correspondents from Paperless and imports them to Docspell using the official API, so no dirty DB writes or something like that! + +## Usage + +1. Clone the project or simply copy the `import-paperless.sh` script to the machine, where Paperless is installed +2. run import-paperless.sh with the following parameters + 1. URL of Docspell, including http(s) + 2. Username for Docspell, possibly including Collective (if named differently than the user) + 3. Password for Docspell + 4. Path to Paperless' database file (`db.sqlite3`). When using Paperless with docker, it is in the mapped directory `/usr/src/paperless/data` + 5. Path to Paperless' document base directory. When using Paperless with docker, it is the mapped directory `/usr/src/paperless/media/documents/origin/` +3. You can use the following variables inside the script (right at the top) + * LIMIT="LIMIT 0" (default: inactive) + For testing purposes, limits the number of tags and correspondents read from Paperless (this will most likely lead to warnings when processing the documents) + * LIMIT_DOC="LIMIT 5" (default: inactive) + For testing purposes, limits the number of documents and document-to-tag relations read from Paperless + * SKIP_EXISTING_DOCS=true (default: true) + Won't touch already existing documents. If set to `false` documents, which exist already, won't be uploaded again, but the tags, correspondent, date and title from Paperless will be applied. 
+ :warning: In case you already had set this information in Docspell, it will be overwritten! + +I found it quite useful, to start with 5 documents and no tags and then continue without a tag limit, but with 20-50 documents. Afterwards I removed both limits. From f08fcce4c755460324a27fcef56b8039cdd3a168 Mon Sep 17 00:00:00 2001 From: totti4ever <49901208+totti4ever@users.noreply.github.com> Date: Thu, 22 Oct 2020 23:05:44 +0200 Subject: [PATCH 3/3] Fixed typo at error detail spooling --- tools/import-paperless/import-paperless.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/import-paperless/import-paperless.sh b/tools/import-paperless/import-paperless.sh index 2cb762cb..aa75dac0 100755 --- a/tools/import-paperless/import-paperless.sh +++ b/tools/import-paperless/import-paperless.sh @@ -66,7 +66,7 @@ function curl_call() { curl_result=$(eval $curl_cmd) if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then - printf "\nNew login required (§curl_result)... " + printf "\nNew login required ($curl_result)... " login printf "%${#len_resultset}s" " "; printf " .." curl_call $1