#!/usr/bin/env bash
#
# Imports correspondents, documents, tags and document/tag links from a
# Paperless SQLite database into Docspell through its REST API.
#
# Usage: <script> ds_url ds_user ds_password db_path file_path

# Optional SQL LIMIT clauses that are appended to the SELECT statements.
# They allow to start small - but affect also tags and correspondents, so they
# might be missing when linking them!
#   LIMIT     - used for every table except the two below
#   LIMIT_DOC - used for documents_document and documents_document_tags
# LIMIT="LIMIT 0"
# LIMIT_DOC="LIMIT 5"

# When "true", a document whose file already exists in Docspell is skipped
# completely: its metadata is not updated and no tags are linked to it.
SKIP_EXISTING_DOCS=true

# Names of the external tools the script invokes.
CURL_CMD="curl"
JQ_CMD="jq"
SQLITE_CMD="sqlite3"
# Startup banner with the current date.
echo "##################### START #####################"
echo " Docspell - Import from Paperless v '0.3 beta'"
echo " by totti4ever" && echo
echo " $(date) "
echo
echo "#################################################"
echo && echo
# Abort early when jq cannot be executed; the API responses are parsed with it.
if ! "$JQ_CMD" --version > /dev/null; then
  echo "please install 'jq'"
  exit 252 # portable spelling of the status `exit -4` yields in bash
fi
# Positional parameters:
#   $1 ds_url       base URL of Docspell; "/api/v1/..." is appended to it
#   $2 ds_user      account used for the login request
#   $3 ds_password  password used for the login request
#   $4 db_path      database file handed to sqlite3
#   $5 file_path    directory that holds the files named in the database
ds_url=$1
ds_user=$2
ds_password=$3
db_path=$4
file_path=$5

if [ $# -ne 5 ]; then
  echo "FATAL Exactly five parameters needed"
  exit 253 # portable spelling of the status `exit -3` yields in bash
elif [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ] || [ -z "$5" ]; then
  echo "FATAL Parameter missing"
  echo " ds_url: $ds_url "
  echo " ds_user: $ds_user "
  # Only report whether a password was given; never print the secret itself.
  echo " ds_password: ${ds_password:+<set>} "
  echo " db_path: $db_path "
  echo " file_path: $file_path "
  exit 254 # portable spelling of the status `exit -2` yields in bash
fi
# the tables we need, in processing order: correspondents and tags have to be
# read before the rows that refer to them by id
modes=("documents_correspondent" "documents_document" "documents_tag" "documents_document_tags")

# the columns per table we need (the value is pasted into the SELECT list)
declare -A columns
#documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename
columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename"
#documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
columns[documents_correspondent]="id, name"
#documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
columns[documents_tag]="id, name"
#documents_document_tags: id, document_id, tag_id
columns[documents_document_tags]="document_id, tag_id"

# lookup tables filled while importing
declare -A document2orga
declare -A corr2name # paperless correspondent id -> name
declare -A tag2name  # paperless tag id -> name
declare -A doc2name  # paperless document id -> filename
declare -A pl2ds_id  # paperless document id -> Docspell item id
# paperless document ids that were skipped because the file already exists
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi

############# FUNCTIONS
#######################################
# Runs curl with the given arguments plus the Docspell auth header and stores
# the response body in curl_result. On an expired session it logs in again and
# repeats the same request.
# Globals:
#   CURL_CMD, ds_token, len_resultset (read)
#   curl_cmd     (written) the full command line, kept for error messages
#   curl_result  (written) raw response body
# Arguments:
#   $1 - curl arguments as ONE string in shell syntax. The string is passed
#        through eval, so the caller has to quote every value inside it.
# Exits:
#   255 when the server answers "Bad Gateway" or "404 page not found"
#######################################
function curl_call() {
  curl_cmd="$CURL_CMD $1 -H 'X-Docspell-Auth: $ds_token'"
  # Quoted so that eval parses the string exactly once, independent of IFS.
  curl_result=$(eval "$curl_cmd")

  if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then
    printf ' \nNew login required ( %s )... ' "$curl_result"
    login
    # pad to the width of the row counter printed by the main loop
    printf " %${#len_resultset}s " " "
    printf " .."
    curl_call "$1"
  elif [ "$curl_result" == "Bad Gateway" ] || [ "$curl_result" == '404 page not found' ]; then
    echo "FATAL Connection to server failed"
    exit 255 # portable spelling of the status `exit -1` yields in bash
  fi
}
#######################################
# Logs in to Docspell and stores the session token.
# Globals:
#   ds_url, ds_user, ds_password, JQ_CMD (read)
#   ds_token     (written) token sent by curl_call with every request
#   curl_status  (written) value of ".success" in the login response
# Outputs:
#   a status line on stdout
#   NOTE(review): the success line prints the token - confirm this is wanted.
# Exits:
#   1 when the response does not report success
#######################################
function login() {
  local payload payload_quoted
  local sq="'" sq_escaped="'\\''"

  # Let jq build the JSON so quotes or backslashes in the credentials cannot
  # break the request body.
  payload=$("$JQ_CMD" -cn --arg account "$ds_user" --arg password "$ds_password" \
    '{account: $account, password: $password}')
  # curl_call evals its argument: close/escape/reopen every single quote so
  # the payload stays one literal word.
  payload_quoted=${payload//"$sq"/"$sq_escaped"}

  curl_call "-s -X POST -d '${payload_quoted}' '${ds_url}/api/v1/open/auth/login'"

  curl_status=$("$JQ_CMD" -r ".success" <<< "$curl_result")
  if [ "$curl_status" == "true" ]; then
    ds_token=$("$JQ_CMD" -r ".token" <<< "$curl_result")
    echo " Login successfull ( Token: $ds_token ) "
  else
    echo "FATAL Login not succesfull"
    exit 1
  fi
}
# Wraps $1 in single quotes (escaping embedded ones) so that it survives the
# eval inside curl_call as one literal word.
function sh_quote() {
  local sq="'" esc="'\\''" out
  out=${1//"$sq"/"$esc"}
  printf "'%s'" "$out"
}

# Prints the value selected by the jq filter $1 from the last API response.
function api_field() {
  "$JQ_CMD" -r "$1" <<< "$curl_result"
}

# Prints the padding that aligns follow-up lines with the row counter.
function print_indent() {
  printf " %${#len_resultset}s " " "
  printf " "
}

# CORRESPONDENTS: creates a Docspell organization for the current row and
# remembers the paperless id -> name mapping. Exits 2 on an unexpected error.
function import_correspondent() {
  local name="${tmp_result_arr[name]}"
  local payload message

  echo " \" $name \" [id: ${tmp_result_arr[id]} ] "
  print_indent

  payload=$("$JQ_CMD" -cn --arg name "$name" \
    '{id: "", name: $name, address: {street: "", zip: "", city: "", country: ""}, contacts: [], created: 0}')
  curl_call "-s -X POST $(sh_quote "$ds_url/api/v1/sec/organization") -H 'Content-Type: application/json' -d $(sh_quote "$payload")"
  curl_status=$(api_field ".success")

  if [ "$curl_status" == "true" ]; then
    echo "Organization successfully created from correspondent"
  else
    message=$(api_field '.message')
    if [ "$message" == "Adding failed, because the entity already exists." ]; then
      echo "Organization already exists, nothing to do"
    else
      echo " FATAL Error during creation of organization: $message "
      exit 2
    fi
  fi
  echo

  # paperless id to name for later purposes
  corr2name[${tmp_result_arr[id]}]=$name
}

# DOCUMENTS: uploads the file of the current row unless Docspell already has
# it, waits until Docspell reports an item id, then sets correspondent, name
# and creation date. Exits 3-8 on the errors reported in the messages below.
function import_document() {
  local doc_id="${tmp_result_arr[id]}"
  local filename="${tmp_result_arr[filename]}"
  local corr_id="${tmp_result_arr[correspondent_id]}"
  local title="${tmp_result_arr[title]}"
  local res count countMax item_url corr_name org_id payload epoch_s

  echo " \" $filename \" [id: $doc_id ] "
  print_indent

  doc2name[$doc_id]=$filename
  tmp_filepath=$file_path/$filename
  if [ ! -f "$tmp_filepath" ]; then
    echo " FATAL no access to file: $tmp_filepath "
    exit 3
  fi

  # check for checksum
  tmp_checksum=$(sha256sum "$tmp_filepath" | awk '{print $1}')
  curl_call "-s -X GET $(sh_quote "$ds_url/api/v1/sec/checkfile/$tmp_checksum")"
  curl_status=$(api_field ".exists")
  res=$?

  # upload if not existent
  if [ "$res" -eq 0 ] && [ "$curl_status" == "false" ]; then
    echo -n "File does not exist, uploading.."
    curl_call "-s -X POST $(sh_quote "$ds_url/api/v1/sec/upload/item") -H 'Content-Type: multipart/form-data' -F $(sh_quote "file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}")"
    curl_status=$(api_field ".success")
    if [ "$curl_status" == "true" ]; then
      printf ". ."
    else
      echo -e " FATAL upload failed\nCmd: $curl_cmd \nResp: $curl_result \nStatus: $curl_status "
      exit 4
    fi
  else
    printf "File already exists"
    if [ "$SKIP_EXISTING_DOCS" == "true" ]; then
      echo ", skipping this item for all types" && echo
      doc_skip[$doc_id]="true"
    else
      printf ", nothing to upload.Fetching ID.."
    fi
  fi

  # skip if needed (SKIP_EXISTING_DOCS)
  if [ "${doc_skip[$doc_id]+abc}" ]; then
    return
  fi

  # waiting for document and get document id; the pause grows quadratically
  count=0
  countMax=25
  while [ "$count" -le "$countMax" ]; do
    # get Docspell id of document
    curl_call "-s -X GET $(sh_quote "$ds_url/api/v1/sec/checkfile/$tmp_checksum")"
    curl_status=$(api_field ".exists")
    res=$?

    if [ "$res" -eq 0 ] && [ "$curl_status" == "true" ]; then
      # file id returned: paperless id to docspell id for later use
      curl_status=$(api_field ".items[0].id")
      pl2ds_id[$doc_id]=$curl_status
      echo ".done"
      break
    elif [ "$res" -ne 0 ]; then
      # unknown error
      echo -e " FATAL Error:\n Err-Code: $res \n Command: $curl_cmd \n Result: $curl_result \n Status: $curl_status "
      exit 7
    elif [ "$count" -ge "$countMax" ]; then
      # counter too high
      echo "FATAL Upload failed (or processing too slow)"
      exit 8
    else
      printf "."
    fi
    sleep $((count * count))
    count=$((count + 1))
  done

  item_url="$ds_url/api/v1/sec/item/${pl2ds_id[$doc_id]}"

  # link orga to document (an empty column arrives as a single space)
  print_indent
  if [ -n "${corr_id// /}" ]; then
    corr_name="${corr2name[$corr_id]}"

    # check for availability of document id and name of organization
    if [ -n "${pl2ds_id[$doc_id]}" ] && [ -n "$corr_name" ]; then
      printf ' Set link to organization " %s " .. ' "$corr_name"

      # get organizations matching doc's orga (can be several when parts match)
      curl_call "-s -X GET $(sh_quote "$ds_url/api/v1/sec/organization") -G --data-urlencode $(sh_quote "q=$corr_name")"

      # id of the organization whose name matches the correspondent exactly
      org_id=$("$JQ_CMD" -r --arg name "$corr_name" \
        '[.items[] | select(.name == $name)][0].id // empty' <<< "$curl_result")

      if [ -n "$org_id" ]; then
        # Set actual link to document
        payload=$("$JQ_CMD" -cn --arg id "$org_id" '{id: $id}')
        curl_call "-s -X PUT $(sh_quote "$item_url/corrOrg") -H 'Content-Type: application/json' -d $(sh_quote "$payload")"
        curl_status=$(api_field ".success")
        if [ "$curl_status" == "true" ]; then
          echo ". done"
        else
          # unknown error
          echo " FATAL Failed to link orga \" $corr_name \" (doc_id: ${pl2ds_id[$doc_id]} ) "
          exit 5
        fi
      else
        echo "FATAL Unknown error"
        exit 6
      fi
    else
      echo " WARNING Something went wrong, no information on doc_id and/or org_id ( ${pl2ds_id[$doc_id]} // $corr_name ) - Limits are $LIMIT / $LIMIT_DOC "
    fi
  else
    echo "No correspondent set in Paperless, skipping."
  fi

  # Set name of document
  print_indent
  payload=$("$JQ_CMD" -cn --arg text "$title" '{text: $text}')
  curl_call "-s -X PUT $(sh_quote "$item_url/name") -H 'Content-Type: application/json' -d $(sh_quote "$payload")"
  curl_status=$(api_field ".success")
  if [ "$curl_status" == "true" ]; then
    echo " Set name of item: \" $title \" "
  else
    echo " FATAL Failed to set item's name \" $title \" (doc_id: ${pl2ds_id[$doc_id]} ) "
    exit 5
  fi

  # Set created date of document
  print_indent
  tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations
  curl_status=""
  # an unparsable date is reported through the same FATAL branch as a
  # rejected request
  if epoch_s=$(date -d "$tmp_date" +%s); then
    curl_call "-s -X PUT $(sh_quote "$item_url/date") -H 'Content-Type: application/json' -d '{\"date\":$((epoch_s * 1000))}'"
    curl_status=$(api_field ".success")
  fi
  if [ "$curl_status" == "true" ]; then
    echo " Set creation date of item: \" ${tmp_date:0:10} \" "
  else
    echo " FATAL Failed to set item's creation date \" $tmp_date \" (doc_id: ${pl2ds_id[$doc_id]} ) "
    exit 5
  fi
  echo
}

# TAGS: creates a Docspell tag for the current row and remembers the
# paperless id -> name mapping. Exits 9 on an unexpected error.
function import_tag() {
  local tag_id="${tmp_result_arr[id]}"
  local name="${tmp_result_arr[name]}"
  local payload message

  if [ -z "$name" ] || [ -z "$tag_id" ]; then
    echo " WARNING Error on tag processing, no id and/or name ( $tag_id / $name ) - Limits are $LIMIT / $LIMIT_DOC "
    return
  fi

  echo " \" $name \" [id: $tag_id ] "
  print_indent

  # paperless tag id to name for later use
  tag2name[$tag_id]=$name

  payload=$("$JQ_CMD" -cn --arg name "$name" \
    '{id: "ignored", name: $name, category: "imported (pl)", created: 0}')
  curl_call "-s -X POST $(sh_quote "$ds_url/api/v1/sec/tag") -H 'Content-Type: application/json' -d $(sh_quote "$payload")"
  curl_status=$(api_field ".success")

  if [ "$curl_status" == "true" ]; then
    echo "Tag successfully created"
  else
    message=$(api_field '.message')
    if [ "$message" == "A tag '$name' already exists" ]; then
      echo "Tag already exists, nothing to do"
    else
      echo " FATAL Error during creation of tag: $message "
      exit 9
    fi
  fi
}

# TAGS 2 DOCUMENTS: links the tag of the current row to its document unless
# the document was skipped. A failed link is reported but not fatal.
function link_document_tag() {
  local doc_id="${tmp_result_arr[document_id]}"
  local tag_id="${tmp_result_arr[tag_id]}"
  local tag_name="" payload

  # document was skipped: only rewind the cursor so the counter is overwritten
  if [ "${doc_skip[$doc_id]+abc}" ]; then
    echo -en "\r"
    sleep 0.1
    return
  fi

  if [ -n "${tag_id// /}" ]; then
    tag_name="${tag2name[$tag_id]}"
  fi
  if [ -z "$tag_name" ]; then
    echo " WARNING Error on tag processing, no id and/or name ( $tag_id / $tag_name ) - Limits are $LIMIT / $LIMIT_DOC "
    return
  fi

  echo " Tag \" $tag_name \" (id: $tag_id ) for \" ${doc2name[$doc_id]} \" (id: $doc_id ) "
  print_indent

  #link tags to documents
  payload=$("$JQ_CMD" -cn --arg name "$tag_name" '{items: [$name]}')
  curl_call "-s -X PUT $(sh_quote "$ds_url/api/v1/sec/item/${pl2ds_id[$doc_id]}/taglink") -H 'Content-Type: application/json' -d $(sh_quote "$payload")"
  curl_status=$(api_field ".success")
  if [ "$curl_status" == "true" ]; then
    echo '...applied'
  else
    echo " Failed to link tag \" $tag_id \" (doc_id: ${pl2ds_id[$doc_id]} ) "
  fi
}

############# END

# login, get token
login

# Unquoted expansions must not be split on blanks while importing; the
# previous IFS is restored once all modes are done.
OLDIFS=$IFS
IFS=$'\n'

# go through modes
for mode in "${modes[@]}"; do
  echo && echo " ### $mode ### "

  if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then
    tmp_limit=$LIMIT_DOC
  else
    tmp_limit=$LIMIT
  fi

  if ! tmp_raw=$("$SQLITE_CMD" -header "$db_path" "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"); then
    echo " FATAL Query on table $mode failed "
    exit 10
  fi
  # one array element per output line; line 0 holds the column names
  mapfile -t tmp_resultset <<< "$tmp_raw"
  IFS='|' read -r -a tmp_headers <<< "${tmp_resultset[0]}"
  len_resultset=${#tmp_resultset[@]}

  # go through resultset
  for ((i = 1; i < len_resultset; i++)); do
    # split result into array; read keeps empty columns in place
    IFS='|' read -r -a tmp_result <<< "${tmp_resultset[$i]}"

    # write array to named array, starting clean so that no column of the
    # previous row leaks into this one; empty columns become a single space
    unset tmp_result_arr
    declare -A tmp_result_arr
    for ((j = 0; j < ${#tmp_headers[@]}; j++)); do
      tmp_result_arr[${tmp_headers[$j]}]="${tmp_result[$j]:- }"
    done

    # row counter, right-aligned to the width of the total
    printf " %${#len_resultset}s " "$i"
    printf ' / %s ' "$((len_resultset - 1))"

    case "$mode" in
      documents_correspondent) import_correspondent ;;
      documents_document) import_document ;;
      documents_tag) import_tag ;;
      documents_document_tags) link_document_tag ;;
    esac
  done # with single resultset
done   # with modes

IFS=$OLDIFS

# quoted: an unquoted '#' would start a comment and print an empty line
echo "################# DONE #################"
date