From 8aaffb0455f797bb32ab6199f00da292faf2e9c7 Mon Sep 17 00:00:00 2001
From: Malte <git@jacobsen.online>
Date: Thu, 22 Oct 2020 19:58:32 +0200
Subject: [PATCH] added SKIP_EXISTING_DOCS variable and improved error handling

* errors, specially during long initial runs are caught better now and should continue, if possible
* set `SKIP_EXISTING_DOCS` to `true` (default) to skip existing documents (if set to false, they won't be uploaded again, but tags, correspondent, date and title will be overwritten)
* for testing purposes you can set the variables `LIMIT` and `LIMIT_DOC` to limit the tags and correspondents respetively the documents and document-tag-relations
---
 tools/import-paperless/import-paperless.sh | 286 ++++++++++++---------
 1 file changed, 160 insertions(+), 126 deletions(-)

diff --git a/tools/import-paperless/import-paperless.sh b/tools/import-paperless/import-paperless.sh
index 3fd9c6bb..2cb762cb 100755
--- a/tools/import-paperless/import-paperless.sh
+++ b/tools/import-paperless/import-paperless.sh
@@ -1,11 +1,13 @@
 #!/usr/bin/env bash
 
 # allows to start small - but affects also tags and correspondents, so they might be missing when linking them!
-# LIMIT=LIMIT 150
+# LIMIT="LIMIT 0"
+# LIMIT_DOC="LIMIT 5"
+SKIP_EXISTING_DOCS=true
 
 echo "##################### START #####################"
 
-echo "  Docspell - Import from Paperless v '0.2 beta'"
+echo "  Docspell - Import from Paperless v '0.3 beta'"
 echo "         by totti4ever" && echo
 echo "  $(date)"
 echo
@@ -43,7 +45,7 @@ modes=("documents_correspondent" "documents_document" "documents_tag" "documents
 # the columns per table we need
 declare -A columns
 #documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum,  storage_type, filename
-columns[documents_document]="id, title, datetime(created,'localtime') as created, added, correspondent_id, file_type, filename"
+columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename"
 #documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
 columns[documents_correspondent]="id, name"
 #documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
@@ -56,14 +58,15 @@ declare -A corr2name
 declare -A tag2name
 declare -A doc2name
 declare -A pl2ds_id
+if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi
 
-############# FUCNTIONS
+############# FUNCTIONS
 function curl_call() {
   curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'"
   curl_result=$(eval $curl_cmd)
 
-  if [ "$curl_result" == '"Authentication failed."' ]; then
-    printf "\nNew login required... "
+  if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then
+    printf "\nNew login required (§curl_result)... "
     login
     printf "%${#len_resultset}s" " "; printf "           .."
     curl_call $1
@@ -102,7 +105,13 @@ for mode in "${modes[@]}"; do
   OLDIFS=$IFS
   IFS=$'\n'
 
-  tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 $LIMIT;"`)
+  if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then
+    tmp_limit=$LIMIT_DOC
+  else
+    tmp_limit=$LIMIT
+  fi
+  tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"`)
+
 
   tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n'))
   len_resultset=${#tmp_resultset[@]}
@@ -111,7 +120,7 @@ for mode in "${modes[@]}"; do
   for ((i=1;i<$len_resultset;i++)); do
 
     # split result into array
-    tmp_result=($(echo "${tmp_resultset[$i]}" | tr '|' '\n'))
+    tmp_result=($(echo "${tmp_resultset[$i]/'||'/'| |'}" | tr '|' '\n'))
 
     # process single result array
     len_result=${#tmp_result[@]}
@@ -167,131 +176,143 @@ for mode in "${modes[@]}"; do
 
       # upload if not existent
       if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then
-        echo -n "File does not exist, uploading... "
+        echo -n "File does not exist, uploading.."
         curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'"
 
         curl_status=$(echo $curl_result | jq -r ".success")
         if [ "$curl_status" == "true" ]; then
-          echo "done"
+          printf ". ."
 
         else
-          echo "FATAL  upload failed"
+          echo -e "FATAL  upload failed\nCmd: $curl_cmd\nResp: $curl_result\nStatus: $curl_status"
           exit 4
         fi
 
       else
-        echo "File already exists, nothing to upload"
+        printf "File already exists"
+        if [ "$SKIP_EXISTING_DOCS" == "true" ]; then
+          echo ", skipping this item for all types" && echo
+          doc_skip[${tmp_result_arr[id]}]="true"
+        else
+          printf ", nothing to upload.Fetching ID.."
+        fi
       fi
 
-      # link orga to document
-      printf "%${#len_resultset}s" " "; printf "           "
-      printf "Waiting for document to link organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
-      count=0
-      countMax=10
-      while [ $count -le $countMax ]; do
-        # get Docspell id of document
-        curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
-        curl_status=$(echo $curl_result | jq -r ".exists")
-        res=$?
+      # skip if needed (SKIP_EXISTING_DOCS)
+      if [ ! ${doc_skip[${tmp_result_arr[id]}]+abc} ]; then
 
-        if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
-          curl_status=$(echo $curl_result | jq -r ".items[0].id")
-          # paperless id to docspell id for later use
-          pl2ds_id[${tmp_result_arr[id]}]=$curl_status
+        # waitig for document and get document id
+        count=0
+        countMax=25
+        while [ $count -le $countMax ]; do
+          # get Docspell id of document
+          curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
+          curl_status=$(echo $curl_result | jq -r ".exists")
+          res=$?
 
-          if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
-            count2=0
-            count2Max=5
-            while [ $count2 -le $count2Max ]; do
-              curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
+          # file id returned
+          if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
+            curl_status=$(echo $curl_result | jq -r ".items[0].id")
+            # paperless id to docspell id for later use
+            pl2ds_id[${tmp_result_arr[id]}]=$curl_status
+            echo ".done"
+            break
 
-              # Search for exact match of paperless correspondent in docspell organizations
-              curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
+          # unknown error
+          elif [ $res -ne 0 ]; then
+            echo -e "FATAL  Error:\n  Err-Code: $? / $res\n  Command: $curl_cmd\n  Result: $curl_result\n  Status: $curl_status"
+            exit 7
 
-              if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
-                curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
-
-                # Set actual link to document
-                curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
-
-                curl_status=$(echo $curl_result | jq -r ".success")
-                if [ "$curl_status" == "true" ]; then
-                  echo ". done"
-
-                else
-                  echo "FATAL  Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
-                  exit 5
-                fi
-
-                # Set name of document
-                printf "%${#len_resultset}s" " "; printf "           "
-
-                curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
-
-                curl_status=$(echo $curl_result | jq -r ".success")
-                if [ "$curl_status" == "true" ]; then
-                  echo "Set name of item: \"${tmp_result_arr[title]}\""
-
-                else
-                  echo "FATAL  Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
-                  exit 5
-                fi
-
-
-                # Set created date of document
-                printf "%${#len_resultset}s" " "; printf "           "
-
-                tmp_date=${tmp_result_arr[created]:0:10}
-                curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
-
-                curl_status=$(echo $curl_result | jq -r ".success")
-                if [ "$curl_status" == "true" ]; then
-                  echo "Set creation date of item: \"$tmp_date\""
-
-                else
-                  echo "FATAL  Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
-                  exit 5
-                fi
-
-                break
-
-              elif [ $count2 -ge $count2Max ]; then
-                echo "FATAL  Upload failed (or processing too slow)"
-                exit 6
-
-              # FIXME I think, the loop is not needed here - organizations seem to be there immediately
-              else
-                printf "."
-              fi
-
-              sleep $(( count2*count2 ))
-              ((count2++))
-            done
+          # counter too high
+          elif [ $count -ge $countMax ]; then
+            echo "FATAL  Upload failed (or processing too slow)"
+            exit 8
 
           else
-            echo "Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]})"
-
+              printf "."
           fi
-          break
+          sleep $(( count * count ))
+          ((count++))
+        done
 
-        elif [ $res -ne 0 ]; then
-          echo -e "FATAL  Error:\n  Err-Code: $? / $res\n  Command: $curl_cmd\n  Result: $curl_result\n  Status: $curl_status"
-          exit 7
 
-        elif [ $count -ge $countMax ]; then
-          echo "FATAL  Upload failed (or processing too slow)"
-          exit 8
+        # link orga to document
+        printf "%${#len_resultset}s" " "; printf "           "
+        if [ ! "${tmp_result_arr[correspondent_id]/' '/''}" == "" ]; then
+
+          # check for availability of document id and name of organization
+          if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
+            printf "Set link to organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
+
+            # get organizations matching doc's orga (can be several when parts match)
+            curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
+
+            # Search for exact match of paperless correspondent in fetched organizations from Docspell
+            curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
+
+            # double-check that found organization matches doc's correspondent
+            if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
+              curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
+
+              # Set actual link to document
+              curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
+
+              curl_status=$(echo $curl_result | jq -r ".success")
+              if [ "$curl_status" == "true" ]; then
+                echo ". done"
+
+              # unknown error
+              else
+                echo "FATAL  Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
+                exit 5
+              fi
+            else
+              echo "FATAL  Unknown error"
+              exit 6
+            fi
+          else
+            echo "WARNING  Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]}) - Limits are $LIMIT / $LIMIT_DOC"
+          fi
+        else
+          echo "No correspondent set in Paperless, skipping."
+        fi
+
+        # Set name of document
+        printf "%${#len_resultset}s" " "; printf "           "
+
+        curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
+
+        curl_status=$(echo $curl_result | jq -r ".success")
+        if [ "$curl_status" == "true" ]; then
+          echo "Set name of item: \"${tmp_result_arr[title]}\""
 
         else
-            printf "."
+          echo "FATAL  Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
+          exit 5
         fi
-        sleep $(( count * count ))
-        ((count++))
-      done
-      echo
 
-      # TAGS
-      elif [ "$mode" == "documents_tag" ]; then
+
+        # Set created date of document
+        printf "%${#len_resultset}s" " "; printf "           "
+
+        tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations
+        curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
+
+        curl_status=$(echo $curl_result | jq -r ".success")
+        if [ "$curl_status" == "true" ]; then
+          echo "Set creation date of item: \"${tmp_date:0:10}\""
+
+        else
+          echo "FATAL  Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
+          exit 5
+        fi
+        echo
+
+      fi  # done with documents
+
+    # TAGS
+    elif [ "$mode" == "documents_tag" ]; then
+      if [ ! "${tmp_result_arr[name]}" == "" ] && [ ! "${tmp_result_arr[id]}" == "" ]; then
         echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]"
         printf "%${#len_resultset}s" " "; printf "           "
 
@@ -309,26 +330,39 @@ for mode in "${modes[@]}"; do
           echo "FATAL  Error during creation of tag: $(echo $curl_result | jq -r '.message')"
           exit 9
         fi
-
-
-      # TAGS 2 DOCUMENTS
-      elif [ "$mode" == "documents_document_tags" ]; then
-        echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
-        printf "%${#len_resultset}s" " "; printf "           "
-
-        #link tags to documents
-        curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
-
-        curl_status=$(echo $curl_result | jq -r ".success")
-        if [ "$curl_status" == "true" ]; then
-          echo '...applied'
-        else
-          echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
-        fi
+      else
+        echo "WARNING  Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
       fi
 
-  done
-done
+
+    # TAGS 2 DOCUMENTS
+    elif [ "$mode" == "documents_document_tags" ]; then
+      # if doc_skip is not set for document_id
+      if [ ! ${doc_skip[${tmp_result_arr[document_id]}]+abc} ]; then
+        if [ ! "${tag2name[${tmp_result_arr[tag_id]}]}" == "" ] && [ ! "${tmp_result_arr[tag_id]}" == "" ]; then
+          echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
+          printf "%${#len_resultset}s" " "; printf "           "
+
+          #link tags to documents
+          curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
+
+          curl_status=$(echo $curl_result | jq -r ".success")
+          if [ "$curl_status" == "true" ]; then
+            echo '...applied'
+          else
+            echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
+          fi
+        else
+          echo "WARNING  Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
+        fi
+      else
+        echo -en "\r"
+        sleep 0.1
+      fi
+    fi  # done with mode processing
+
+  done  # with single resultset
+done  # with modes
 
 echo ################# DONE #################
 date