mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-25 16:45:05 +00:00
Merge pull request #377 from totti4ever/pr-paperless_import
paperless-importer (v0.3 beta) - added SKIP_EXISTING_DOCS variable and improved error handling
This commit is contained in:
commit
b2b5ddef97
31
tools/import-paperless/README.md
Normal file
31
tools/import-paperless/README.md
Normal file
@ -0,0 +1,31 @@
|
||||
# Paperless to Docspell Importer
|
||||
_by totti4ever_
|
||||
|
||||
:warning: **BE AWARE** You should test this script on an empty database (backup yours) or at least an own collective :warning:
|
||||
|
||||
## Information
|
||||
After using [Paperless](https://github.com/the-paperless-project/paperless/) for quite a while, I figured out that there is some room for improvement, but only little work is still being done on the project — which is totally fine, as it is a private, open-source project!
|
||||
So I came around Docspell and found it to have quite a potential, especially regarding the AI and AI-like features growing.
|
||||
|
||||
Still I wanted to transfer the tagging and structure from Paperless to Docspell and not only import the files and start over the managing process once again.
|
||||
This is why I put in my dirty bash scripting skills and made a script, which reads the files from the internal documents folder of Paperless and extracts tags and correspondents from Paperless and imports them to Docspell using the official API, so no dirty DB writes or something like that!
|
||||
|
||||
## Usage
|
||||
|
||||
1. Clone the project or simply copy the `import-paperless.sh` script to the machine, where Paperless is installed
|
||||
2. run import-paperless.sh with the following parameters
|
||||
1. URL of Docspell, including http(s)
|
||||
2. Username for Docspell, possibly including the Collective (if its name differs from the username)
|
||||
3. Password for Docspell
|
||||
4. Path to Paperless' database file (`db.sqlite3`). When using Paperless with docker, it is in the mapped directory `/usr/src/paperless/data`
|
||||
5. Path to Paperless' document base directory. When using Paperless with docker, it is the mapped directory `/usr/src/paperless/media/documents/origin/`
|
||||
3. You can use the following variables inside the script (right at the top)
|
||||
* LIMIT="LIMIT 0" (default: inactive)
|
||||
For testing purposes, limits the number of tags and correspondents read from Paperless (this will most likely lead to warnings when processing the documents)
|
||||
* LIMIT_DOC="LIMIT 5" (default: inactive)
|
||||
For testing purposes, limits the number of documents and document-to-tag relations read from Paperless
|
||||
* SKIP_EXISTING_DOCS=true (default: true)
|
||||
Won't touch already existing documents. If set to `false`, documents that already exist won't be uploaded again, but the tags, correspondent, date and title from Paperless will be applied.
|
||||
:warning: In case you had already set this information in Docspell, it will be overwritten!
|
||||
|
||||
I found it quite useful to start with 5 documents and no tags, and then continue without a tag limit but with 20-50 documents. Afterwards I removed both limits.
|
@ -1,11 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# allows to start small - but affects also tags and correspondents, so they might be missing when linking them!
|
||||
# LIMIT=LIMIT 150
|
||||
# LIMIT="LIMIT 0"
|
||||
# LIMIT_DOC="LIMIT 5"
|
||||
SKIP_EXISTING_DOCS=true
|
||||
|
||||
echo "##################### START #####################"
|
||||
|
||||
echo " Docspell - Import from Paperless v '0.2 beta'"
|
||||
echo " Docspell - Import from Paperless v '0.3 beta'"
|
||||
echo " by totti4ever" && echo
|
||||
echo " $(date)"
|
||||
echo
|
||||
@ -43,7 +45,7 @@ modes=("documents_correspondent" "documents_document" "documents_tag" "documents
|
||||
# the columns per table we need
|
||||
declare -A columns
|
||||
#documents_document: id, title, content, created, modified, added, correspondent_id, file_type, checksum, storage_type, filename
|
||||
columns[documents_document]="id, title, datetime(created,'localtime') as created, added, correspondent_id, file_type, filename"
|
||||
columns[documents_document]="id, title, datetime(created,'localtime') as created, correspondent_id, file_type, filename"
|
||||
#documents_correspondent: id, name, match, matching_algorithm, is_insensitive, slug
|
||||
columns[documents_correspondent]="id, name"
|
||||
#documents_tag: id, name, colour, match, matching_algorithm, is_insensitive, slug
|
||||
@ -56,14 +58,15 @@ declare -A corr2name
|
||||
declare -A tag2name
|
||||
declare -A doc2name
|
||||
declare -A pl2ds_id
|
||||
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then declare -A doc_skip; fi
|
||||
|
||||
############# FUCNTIONS
|
||||
############# FUNCTIONS
|
||||
function curl_call() {
|
||||
curl_cmd="$1 -H 'X-Docspell-Auth: $ds_token'"
|
||||
curl_result=$(eval $curl_cmd)
|
||||
|
||||
if [ "$curl_result" == '"Authentication failed."' ]; then
|
||||
printf "\nNew login required... "
|
||||
if [ "$curl_result" == '"Authentication failed."' ] || [ "$curl_result" == 'Response timed out' ]; then
|
||||
printf "\nNew login required ($curl_result)... "
|
||||
login
|
||||
printf "%${#len_resultset}s" " "; printf " .."
|
||||
curl_call $1
|
||||
@ -102,7 +105,13 @@ for mode in "${modes[@]}"; do
|
||||
OLDIFS=$IFS
|
||||
IFS=$'\n'
|
||||
|
||||
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 $LIMIT;"`)
|
||||
if [ "$mode" == "documents_document" ] || [ "$mode" == "documents_document_tags" ]; then
|
||||
tmp_limit=$LIMIT_DOC
|
||||
else
|
||||
tmp_limit=$LIMIT
|
||||
fi
|
||||
tmp_resultset=(`sqlite3 -header $db_path "select ${columns[$mode]} from $mode order by 1 DESC $tmp_limit;"`)
|
||||
|
||||
|
||||
tmp_headers=($(echo "${tmp_resultset[0]}" | tr '|' '\n'))
|
||||
len_resultset=${#tmp_resultset[@]}
|
||||
@ -111,7 +120,7 @@ for mode in "${modes[@]}"; do
|
||||
for ((i=1;i<$len_resultset;i++)); do
|
||||
|
||||
# split result into array
|
||||
tmp_result=($(echo "${tmp_resultset[$i]}" | tr '|' '\n'))
|
||||
tmp_result=($(echo "${tmp_resultset[$i]/'||'/'| |'}" | tr '|' '\n'))
|
||||
|
||||
# process single result array
|
||||
len_result=${#tmp_result[@]}
|
||||
@ -167,131 +176,143 @@ for mode in "${modes[@]}"; do
|
||||
|
||||
# upload if not existent
|
||||
if [ $? -eq 0 ] && [ "$curl_status" == "false" ]; then
|
||||
echo -n "File does not exist, uploading... "
|
||||
echo -n "File does not exist, uploading.."
|
||||
curl_call "curl -s -X POST '$ds_url/api/v1/sec/upload/item' -H 'Content-Type: multipart/form-data' -F 'file=@$tmp_filepath;type=application/${tmp_result_arr[file_type]}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo "done"
|
||||
printf ". ."
|
||||
|
||||
else
|
||||
echo "FATAL upload failed"
|
||||
echo -e "FATAL upload failed\nCmd: $curl_cmd\nResp: $curl_result\nStatus: $curl_status"
|
||||
exit 4
|
||||
fi
|
||||
|
||||
else
|
||||
echo "File already exists, nothing to upload"
|
||||
printf "File already exists"
|
||||
if [ "$SKIP_EXISTING_DOCS" == "true" ]; then
|
||||
echo ", skipping this item for all types" && echo
|
||||
doc_skip[${tmp_result_arr[id]}]="true"
|
||||
else
|
||||
printf ", nothing to upload.Fetching ID.."
|
||||
fi
|
||||
fi
|
||||
|
||||
# link orga to document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
printf "Waiting for document to link organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
|
||||
count=0
|
||||
countMax=10
|
||||
while [ $count -le $countMax ]; do
|
||||
# get Docspell id of document
|
||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
|
||||
curl_status=$(echo $curl_result | jq -r ".exists")
|
||||
res=$?
|
||||
# skip if needed (SKIP_EXISTING_DOCS)
|
||||
if [ ! ${doc_skip[${tmp_result_arr[id]}]+abc} ]; then
|
||||
|
||||
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
|
||||
curl_status=$(echo $curl_result | jq -r ".items[0].id")
|
||||
# paperless id to docspell id for later use
|
||||
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
|
||||
# waitig for document and get document id
|
||||
count=0
|
||||
countMax=25
|
||||
while [ $count -le $countMax ]; do
|
||||
# get Docspell id of document
|
||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/checkfile/$tmp_checksum'"
|
||||
curl_status=$(echo $curl_result | jq -r ".exists")
|
||||
res=$?
|
||||
|
||||
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
|
||||
count2=0
|
||||
count2Max=5
|
||||
while [ $count2 -le $count2Max ]; do
|
||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
|
||||
# file id returned
|
||||
if [ $res -eq 0 ] && [ "$curl_status" == "true" ]; then
|
||||
curl_status=$(echo $curl_result | jq -r ".items[0].id")
|
||||
# paperless id to docspell id for later use
|
||||
pl2ds_id[${tmp_result_arr[id]}]=$curl_status
|
||||
echo ".done"
|
||||
break
|
||||
|
||||
# Search for exact match of paperless correspondent in docspell organizations
|
||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
|
||||
# unknown error
|
||||
elif [ $res -ne 0 ]; then
|
||||
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
|
||||
exit 7
|
||||
|
||||
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
|
||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
|
||||
|
||||
# Set actual link to document
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo ". done"
|
||||
|
||||
else
|
||||
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
|
||||
# Set name of document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo "Set name of item: \"${tmp_result_arr[title]}\""
|
||||
|
||||
else
|
||||
echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
|
||||
|
||||
# Set created date of document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
tmp_date=${tmp_result_arr[created]:0:10}
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo "Set creation date of item: \"$tmp_date\""
|
||||
|
||||
else
|
||||
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
|
||||
break
|
||||
|
||||
elif [ $count2 -ge $count2Max ]; then
|
||||
echo "FATAL Upload failed (or processing too slow)"
|
||||
exit 6
|
||||
|
||||
# FIXME I think, the loop is not needed here - organizations seem to be there immediately
|
||||
else
|
||||
printf "."
|
||||
fi
|
||||
|
||||
sleep $(( count2*count2 ))
|
||||
((count2++))
|
||||
done
|
||||
# counter too high
|
||||
elif [ $count -ge $countMax ]; then
|
||||
echo "FATAL Upload failed (or processing too slow)"
|
||||
exit 8
|
||||
|
||||
else
|
||||
echo "Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]})"
|
||||
|
||||
printf "."
|
||||
fi
|
||||
break
|
||||
sleep $(( count * count ))
|
||||
((count++))
|
||||
done
|
||||
|
||||
elif [ $res -ne 0 ]; then
|
||||
echo -e "FATAL Error:\n Err-Code: $? / $res\n Command: $curl_cmd\n Result: $curl_result\n Status: $curl_status"
|
||||
exit 7
|
||||
|
||||
elif [ $count -ge $countMax ]; then
|
||||
echo "FATAL Upload failed (or processing too slow)"
|
||||
exit 8
|
||||
# link orga to document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
if [ ! "${tmp_result_arr[correspondent_id]/' '/''}" == "" ]; then
|
||||
|
||||
# check for availability of document id and name of organization
|
||||
if [ ! "${pl2ds_id[${tmp_result_arr[id]}]}" == "" ] && [ ! "${corr2name[${tmp_result_arr[correspondent_id]}]}" == "" ]; then
|
||||
printf "Set link to organization \"${corr2name[${tmp_result_arr[correspondent_id]}]}\" .."
|
||||
|
||||
# get organizations matching doc's orga (can be several when parts match)
|
||||
curl_call "curl -s -X GET '$ds_url/api/v1/sec/organization' -G --data-urlencode 'q=${corr2name[${tmp_result_arr[correspondent_id]}]}'"
|
||||
|
||||
# Search for exact match of paperless correspondent in fetched organizations from Docspell
|
||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .name")
|
||||
|
||||
# double-check that found organization matches doc's correspondent
|
||||
if [ "$curl_status" == "${corr2name[${tmp_result_arr[correspondent_id]}]}" ]; then
|
||||
curl_status=$(echo $curl_result | jq -r ".items[] | select(.name==\"${corr2name[${tmp_result_arr[correspondent_id]}]}\") | .id")
|
||||
|
||||
# Set actual link to document
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/corrOrg' -H 'Content-Type: application/json' -d '{\"id\":\"$curl_status\"}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo ". done"
|
||||
|
||||
# unknown error
|
||||
else
|
||||
echo "FATAL Failed to link orga \"${tmp_result_arr[orga_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
else
|
||||
echo "FATAL Unknown error"
|
||||
exit 6
|
||||
fi
|
||||
else
|
||||
echo "WARNING Something went wrong, no information on doc_id and/or org_id (${pl2ds_id[${tmp_result_arr[id]}]} // ${corr2name[${tmp_result_arr[correspondent_id]}]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||
fi
|
||||
else
|
||||
echo "No correspondent set in Paperless, skipping."
|
||||
fi
|
||||
|
||||
# Set name of document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/name' -H 'Content-Type: application/json' -d '{\"text\":\"${tmp_result_arr[title]}\"}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo "Set name of item: \"${tmp_result_arr[title]}\""
|
||||
|
||||
else
|
||||
printf "."
|
||||
echo "FATAL Failed to set item's name \"${tmp_result_arr[title]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
sleep $(( count * count ))
|
||||
((count++))
|
||||
done
|
||||
echo
|
||||
|
||||
# TAGS
|
||||
elif [ "$mode" == "documents_tag" ]; then
|
||||
|
||||
# Set created date of document
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
tmp_date="${tmp_result_arr[created]:0:10} 12:00:00" #fix for timezone variations
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[id]}]}/date' -H 'Content-Type: application/json' -d '{\"date\":$( echo "$(date -d "$tmp_date" +%s) * 1000" | bc )}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo "Set creation date of item: \"${tmp_date:0:10}\""
|
||||
|
||||
else
|
||||
echo "FATAL Failed to set item's creation date \"$tmp_date\" (doc_id: ${pl2ds_id[${tmp_result_arr[id]}]})"
|
||||
exit 5
|
||||
fi
|
||||
echo
|
||||
|
||||
fi # done with documents
|
||||
|
||||
# TAGS
|
||||
elif [ "$mode" == "documents_tag" ]; then
|
||||
if [ ! "${tmp_result_arr[name]}" == "" ] && [ ! "${tmp_result_arr[id]}" == "" ]; then
|
||||
echo "\"${tmp_result_arr[name]}\" [id: ${tmp_result_arr[id]}]"
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
@ -309,26 +330,39 @@ for mode in "${modes[@]}"; do
|
||||
echo "FATAL Error during creation of tag: $(echo $curl_result | jq -r '.message')"
|
||||
exit 9
|
||||
fi
|
||||
|
||||
|
||||
# TAGS 2 DOCUMENTS
|
||||
elif [ "$mode" == "documents_document_tags" ]; then
|
||||
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
#link tags to documents
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo '...applied'
|
||||
else
|
||||
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
|
||||
fi
|
||||
else
|
||||
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||
fi
|
||||
|
||||
done
|
||||
done
|
||||
|
||||
# TAGS 2 DOCUMENTS
|
||||
elif [ "$mode" == "documents_document_tags" ]; then
|
||||
# if doc_skip is not set for document_id
|
||||
if [ ! ${doc_skip[${tmp_result_arr[document_id]}]+abc} ]; then
|
||||
if [ ! "${tag2name[${tmp_result_arr[tag_id]}]}" == "" ] && [ ! "${tmp_result_arr[tag_id]}" == "" ]; then
|
||||
echo "Tag \"${tag2name[${tmp_result_arr[tag_id]}]}\" (id: ${tmp_result_arr[tag_id]}) for \"${doc2name[${tmp_result_arr[document_id]}]}\" (id: ${tmp_result_arr[document_id]})"
|
||||
printf "%${#len_resultset}s" " "; printf " "
|
||||
|
||||
#link tags to documents
|
||||
curl_call "curl -s -X PUT '$ds_url/api/v1/sec/item/${pl2ds_id[${tmp_result_arr[document_id]}]}/taglink' -H 'Content-Type: application/json' -d '{\"items\":[\"${tag2name[${tmp_result_arr[tag_id]}]}\"]}'"
|
||||
|
||||
curl_status=$(echo $curl_result | jq -r ".success")
|
||||
if [ "$curl_status" == "true" ]; then
|
||||
echo '...applied'
|
||||
else
|
||||
echo "Failed to link tag \"${tmp_result_arr[tag_id]}\" (doc_id: ${pl2ds_id[${tmp_result_arr[document_id]}]})"
|
||||
fi
|
||||
else
|
||||
echo "WARNING Error on tag processing, no id and/or name (${tmp_result_arr[id]} / ${tmp_result_arr[name]}) - Limits are $LIMIT / $LIMIT_DOC"
|
||||
fi
|
||||
else
|
||||
echo -en "\r"
|
||||
sleep 0.1
|
||||
fi
|
||||
fi # done with mode processing
|
||||
|
||||
done # with single resultset
|
||||
done # with modes
|
||||
|
||||
echo ################# DONE #################
|
||||
date
|
||||
|
Loading…
x
Reference in New Issue
Block a user