Remove old site
@ -1,159 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="14.60939mm"
|
||||
height="14.738198mm"
|
||||
viewBox="0 0 14.60939 14.738198"
|
||||
version="1.1"
|
||||
id="svg5908"
|
||||
inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
|
||||
sodipodi:docname="search.svg">
|
||||
<defs
|
||||
id="defs5902">
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient6185">
|
||||
<stop
|
||||
style="stop-color:#90b4bc;stop-opacity:1"
|
||||
offset="0"
|
||||
id="stop6181" />
|
||||
<stop
|
||||
style="stop-color:#cbf4f2;stop-opacity:0"
|
||||
offset="1"
|
||||
id="stop6183" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient5984">
|
||||
<stop
|
||||
style="stop-color:#172651;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop5980" />
|
||||
<stop
|
||||
style="stop-color:#172651;stop-opacity:0;"
|
||||
offset="1"
|
||||
id="stop5982" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient5984"
|
||||
id="linearGradient5986"
|
||||
x1="83.017578"
|
||||
y1="47.484482"
|
||||
x2="124.26878"
|
||||
y2="46.241692"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient5984"
|
||||
id="linearGradient5993"
|
||||
gradientUnits="userSpaceOnUse"
|
||||
x1="83.017578"
|
||||
y1="47.484482"
|
||||
x2="124.26878"
|
||||
y2="46.241692" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient6185"
|
||||
id="linearGradient6195"
|
||||
x1="13.525695"
|
||||
y1="77.657806"
|
||||
x2="52.648178"
|
||||
y2="59.159847"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="10.24"
|
||||
inkscape:cx="-7.723604"
|
||||
inkscape:cy="30.408526"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="g5991"
|
||||
showgrid="false"
|
||||
inkscape:window-width="3838"
|
||||
inkscape:window-height="2141"
|
||||
inkscape:window-x="1"
|
||||
inkscape:window-y="18"
|
||||
inkscape:window-maximized="0" />
|
||||
<metadata
|
||||
id="metadata5905">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Ebene 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(-13.241106,-79.788687)">
|
||||
<g
|
||||
id="g5991"
|
||||
transform="matrix(0.20833464,0,0,0.2961247,10.471793,65.245691)">
|
||||
<rect
|
||||
transform="rotate(27.148744)"
|
||||
ry="4.4855337"
|
||||
y="40.750698"
|
||||
x="70.522202"
|
||||
height="12.160764"
|
||||
width="45.302185"
|
||||
id="rect5929"
|
||||
style="opacity:1;fill:url(#linearGradient5993);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
d="M 35.603664,83.599529 A 17.29241,17.29241 0 0 1 14.40183,71.443624 17.29241,17.29241 0 0 1 26.530387,50.226135 17.29241,17.29241 0 0 1 47.763498,62.327325 17.29241,17.29241 0 0 1 35.689695,83.57602"
|
||||
sodipodi:open="true"
|
||||
sodipodi:end="1.3014728"
|
||||
sodipodi:start="1.3066303"
|
||||
sodipodi:ry="17.29241"
|
||||
sodipodi:rx="17.29241"
|
||||
sodipodi:cy="66.906982"
|
||||
sodipodi:cx="31.088541"
|
||||
sodipodi:type="arc"
|
||||
id="path5912"
|
||||
style="opacity:1;fill:#172651;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
d="M 35.622879,82.440124 A 15.823282,15.823283 0 0 1 16.22231,71.316959 15.823282,15.823283 0 0 1 27.32045,51.902063 15.823282,15.823283 0 0 1 46.749639,62.975161 15.823282,15.823283 0 0 1 35.701602,82.418612"
|
||||
sodipodi:open="true"
|
||||
sodipodi:end="1.3014728"
|
||||
sodipodi:start="1.3066303"
|
||||
sodipodi:ry="15.823283"
|
||||
sodipodi:rx="15.823282"
|
||||
sodipodi:cy="67.165741"
|
||||
sodipodi:cx="31.491352"
|
||||
sodipodi:type="arc"
|
||||
id="path5912-7-3"
|
||||
style="opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.91504204;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
d="M 35.048067,82.374388 A 15.823282,15.823282 0 0 1 15.647498,71.251224 15.823282,15.823282 0 0 1 26.745638,51.836329 15.823282,15.823282 0 0 1 46.174827,62.909427 15.823282,15.823282 0 0 1 35.12679,82.352877"
|
||||
sodipodi:open="true"
|
||||
sodipodi:end="1.3014728"
|
||||
sodipodi:start="1.3066303"
|
||||
sodipodi:ry="15.823282"
|
||||
sodipodi:rx="15.823282"
|
||||
sodipodi:cy="67.100006"
|
||||
sodipodi:cx="30.91654"
|
||||
sodipodi:type="arc"
|
||||
id="path5912-7"
|
||||
style="opacity:1;fill:url(#linearGradient6195);fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:0.91504204;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 5.9 KiB |
161
artwork/stow.svg
@ -1,161 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="14.522366mm"
|
||||
height="14.70852mm"
|
||||
viewBox="0 0 14.522366 14.70852"
|
||||
version="1.1"
|
||||
id="svg5293"
|
||||
inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
|
||||
sodipodi:docname="stow.svg">
|
||||
<defs
|
||||
id="defs5287">
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient5805">
|
||||
<stop
|
||||
style="stop-color:#000000;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop5801" />
|
||||
<stop
|
||||
style="stop-color:#000000;stop-opacity:0;"
|
||||
offset="1"
|
||||
id="stop5803" />
|
||||
</linearGradient>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient5805"
|
||||
id="linearGradient5807"
|
||||
x1="97.004578"
|
||||
y1="152.66513"
|
||||
x2="97.004578"
|
||||
y2="138.76712"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
</defs>
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="3.959798"
|
||||
inkscape:cx="167.88961"
|
||||
inkscape:cy="92.281344"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="layer1"
|
||||
showgrid="false"
|
||||
inkscape:window-width="3838"
|
||||
inkscape:window-height="2141"
|
||||
inkscape:window-x="1"
|
||||
inkscape:window-y="18"
|
||||
inkscape:window-maximized="0" />
|
||||
<metadata
|
||||
id="metadata5290">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Ebene 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(-76.696143,-143.14098)">
|
||||
<flowRoot
|
||||
xml:space="preserve"
|
||||
id="flowRoot5295"
|
||||
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;line-height:1.25;font-family:'Anonymous Pro';-inkscape-font-specification:'Anonymous Pro';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-feature-settings:normal;text-align:start;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#000000;fill-opacity:1;stroke:none"
|
||||
transform="scale(0.26458333)"><flowRegion
|
||||
id="flowRegion5297"><rect
|
||||
id="rect5299"
|
||||
width="302.85715"
|
||||
height="200"
|
||||
x="151.42857"
|
||||
y="396.80536" /></flowRegion><flowPara
|
||||
id="flowPara5301" /></flowRoot> <g
|
||||
id="g5862"
|
||||
transform="matrix(0.38327373,0,0,0.33829508,47.227117,104.41207)">
|
||||
<g
|
||||
transform="rotate(25.881625,86.124624,321.22969)"
|
||||
id="g5777">
|
||||
<g
|
||||
id="g5763">
|
||||
<g
|
||||
id="g5750">
|
||||
<g
|
||||
id="g5738">
|
||||
<g
|
||||
id="g5727">
|
||||
<g
|
||||
id="g5717">
|
||||
<g
|
||||
id="g5708">
|
||||
<g
|
||||
id="g5700">
|
||||
<rect
|
||||
ry="0.077411793"
|
||||
y="136.3042"
|
||||
x="9.8106909"
|
||||
height="20.178848"
|
||||
width="14.499372"
|
||||
id="rect5317"
|
||||
style="opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.04044545;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
inkscape:connector-curvature="0"
|
||||
id="path5319"
|
||||
d="m 12.160764,141.11504 c 9.68852,-0.13364 9.755337,0 9.755337,0 v -0.13364 0"
|
||||
style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
|
||||
<path
|
||||
inkscape:connector-curvature="0"
|
||||
id="path5319-3"
|
||||
d="m 11.961895,151.16736 c 9.688519,-0.13364 9.755338,0 9.755338,0 v -0.13364 0"
|
||||
style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
|
||||
<path
|
||||
inkscape:connector-curvature="0"
|
||||
id="path5319-6"
|
||||
d="m 11.82826,147.8265 c 9.68852,-0.13364 9.755338,0 9.755338,0 v -0.13364 0"
|
||||
style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
|
||||
<path
|
||||
inkscape:connector-curvature="0"
|
||||
id="path5319-7"
|
||||
d="m 12.09553,144.21835 c 9.688519,-0.13364 9.755338,0 9.755338,0 v -0.13364 0"
|
||||
style="fill:none;stroke:#000000;stroke-width:0.26458332px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" />
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
<rect
|
||||
ry="0.097748853"
|
||||
y="138.67198"
|
||||
x="77.277542"
|
||||
height="18.899141"
|
||||
width="37.110588"
|
||||
id="rect5636"
|
||||
style="opacity:1;fill:url(#linearGradient5807);fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.77973491;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
inkscape:connector-curvature="0"
|
||||
id="path5693"
|
||||
d="m 77.432168,138.91961 23.336802,-24.13695 v 0 0"
|
||||
style="fill:none;stroke:#000000;stroke-width:0.86332273;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 6.4 KiB |
@ -1,90 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="14.836508mm"
|
||||
height="14.954134mm"
|
||||
viewBox="0 0 14.836508 14.954134"
|
||||
version="1.1"
|
||||
id="svg5870"
|
||||
inkscape:version="0.92.4 (5da689c313, 2019-01-14)"
|
||||
sodipodi:docname="tag.svg">
|
||||
<defs
|
||||
id="defs5864" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="1.4"
|
||||
inkscape:cx="98.18385"
|
||||
inkscape:cy="-142.24281"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="layer1"
|
||||
showgrid="false"
|
||||
inkscape:window-width="3838"
|
||||
inkscape:window-height="2141"
|
||||
inkscape:window-x="1"
|
||||
inkscape:window-y="18"
|
||||
inkscape:window-maximized="0" />
|
||||
<metadata
|
||||
id="metadata5867">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Ebene 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(-62.826115,-84.959244)">
|
||||
<g
|
||||
id="g5881"
|
||||
transform="matrix(0.16069429,0.16229614,-0.16069429,0.16229614,65.169697,65.422356)">
|
||||
<rect
|
||||
transform="matrix(0.77499969,-0.63196161,0.77499969,0.63196161,0,0)"
|
||||
ry="4.4855332"
|
||||
y="87.517372"
|
||||
x="-19.262943"
|
||||
height="32.940388"
|
||||
width="33.972054"
|
||||
id="rect5874"
|
||||
style="opacity:1;fill:#172651;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.86083698;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<rect
|
||||
ry="12.185672"
|
||||
y="47.226063"
|
||||
x="70.695381"
|
||||
height="40.323494"
|
||||
width="60.637527"
|
||||
id="rect5872"
|
||||
style="opacity:1;fill:#172651;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1.08249819;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
<path
|
||||
d="m 80.186024,74.676744 a 7.0870538,7.5595236 0 0 1 -8.689277,-5.314057 7.0870538,7.5595236 0 0 1 4.970721,-9.275405 7.0870538,7.5595236 0 0 1 8.702095,5.290137 7.0870538,7.5595236 0 0 1 -4.94828,9.289048"
|
||||
sodipodi:open="true"
|
||||
sodipodi:end="1.3014728"
|
||||
sodipodi:start="1.3066303"
|
||||
sodipodi:ry="7.5595236"
|
||||
sodipodi:rx="7.0870538"
|
||||
sodipodi:cy="67.379456"
|
||||
sodipodi:cx="78.335564"
|
||||
sodipodi:type="arc"
|
||||
id="path5876"
|
||||
style="opacity:1;fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:round;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:stroke fill markers" />
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
Before Width: | Height: | Size: 3.4 KiB |
@ -1 +0,0 @@
|
||||
docspell.org
|
@ -1,92 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Api
|
||||
permalink: api
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
Docspell is designed as a REST server that uses JSON to exchange
|
||||
data. The REST api can be used to integrate docspell into your
|
||||
workflow.
|
||||
|
||||
[Docspell REST Api Doc](openapi/docspell-openapi.html)
|
||||
|
||||
The "raw" `openapi.yml` specification file can be found
|
||||
[here](openapi/docspell-openapi.yml).
|
||||
|
||||
The routes can be divided into protected and unprotected routes. The
|
||||
unprotected, or open routes are at `/open/*` while the protected
|
||||
routes are at `/sec/*`. Open routes don't require authenticated access
|
||||
and can be used by any user. The protected routes require an
|
||||
authenticated user.
|
||||
|
||||
## Authentication
|
||||
|
||||
The unprotected route `/open/auth/login` can be used to login with
|
||||
account name and password. The response contains a token that can be
|
||||
used for accessing protected routes. The token is only valid for a
|
||||
restricted time which can be configured (default is 5 minutes).
|
||||
|
||||
New tokens can be generated using an existing valid token and the
|
||||
protected route `/sec/auth/session`. This will return the same
|
||||
response as above, giving a new token.
|
||||
|
||||
This token can be added to requests in two ways: as a cookie header or
|
||||
a "normal" http header. If a cookie header is used, the cookie name
|
||||
must be `docspell_auth` and a custom header must be named
|
||||
`X-Docspell-Auth`.
|
||||
|
||||
## Live Api
|
||||
|
||||
Besides the statically generated documentation at this site, the rest
|
||||
server provides a swagger generated api documenation, that allows
|
||||
playing around with the api. It requires a running docspell rest
|
||||
server. If it is deployed at `http://localhost:7880`, then check this
|
||||
url:
|
||||
|
||||
```
|
||||
http://localhost:7880/api/doc
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
These examples use the great command line tool
|
||||
[curl](https://curl.haxx.se/).
|
||||
|
||||
### Login
|
||||
|
||||
```
|
||||
$ curl -X POST -d '{"account": "smith", "password": "test"}' http://localhost:7880/api/v1/open/auth/login
|
||||
{"collective":"smith"
|
||||
,"user":"smith"
|
||||
,"success":true
|
||||
,"message":"Login successful"
|
||||
,"token":"1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI="
|
||||
,"validMs":300000
|
||||
}
|
||||
```
|
||||
|
||||
### Get new token
|
||||
|
||||
```
|
||||
$ curl -XPOST -H 'X-Docspell-Auth: 1568142350115-ZWlrZS9laWtl-$2a$10$rGZUFDAVNIKh4Tj6u6tlI.-O2euwCvmBT0TlyDmIHR1ZsLQPAI=' http://localhost:7880/api/v1/sec/auth/session
|
||||
{"collective":"smith"
|
||||
,"user":"smith"
|
||||
,"success":true
|
||||
,"message":"Login successful"
|
||||
,"token":"1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc="
|
||||
,"validMs":300000
|
||||
}
|
||||
```
|
||||
|
||||
### Get some insights
|
||||
|
||||
```
|
||||
$ curl -H 'X-Docspell-Auth: 1568142446077-ZWlrZS9laWtl-$2a$10$3B0teJ9rMpsBJPzHfZZPoO-WeA1bkfEONBN8fyzWE8DeaAHtUc=' http://localhost:7880/api/v1/sec/collective/insights
|
||||
{"incomingCount":3
|
||||
,"outgoingCount":1
|
||||
,"itemSize":207310
|
||||
,"tagCloud":{"items":[]}
|
||||
}
|
||||
```
|
@ -1,22 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Demo
|
||||
permalink: demo
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Finding Items
|
||||
|
||||
<video width="100%" controls>
|
||||
<source src="../static/docspell-search-2020-06-24.webm" type="video/webm">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
|
||||
|
||||
## Basic Idea (First Version)
|
||||
|
||||
<video width="100%" controls>
|
||||
<source src="../static/docspell-demo.webm" type="video/webm">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
@ -1,128 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Development
|
||||
permalink: dev
|
||||
---
|
||||
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
|
||||
## Building
|
||||
|
||||
[Sbt](https://scala-sbt.org) is used to build the application. Clone
|
||||
the sources and run:
|
||||
|
||||
- `make` to compile all sources (Elm + Scala)
|
||||
- `make-zip` to create zip packages
|
||||
- `make-deb` to create debian packages
|
||||
- `make-tools` to create a zip containing the script in `tools/`
|
||||
- `make-pkg` for a clean compile + building all packages (zip + deb)
|
||||
|
||||
The zip files can be found afterwards in:
|
||||
|
||||
```
|
||||
modules/restserver/target/universal
|
||||
modules/joex/target/universal
|
||||
```
|
||||
|
||||
|
||||
## Starting Servers with `reStart`
|
||||
|
||||
When developing, it's very convenient to use the [revolver sbt
|
||||
plugin](https://github.com/spray/sbt-revolver). Start the sbt console
|
||||
and then run:
|
||||
|
||||
```
|
||||
sbt:docspell-root> restserver/reStart
|
||||
```
|
||||
|
||||
This starts a REST server. Once this started up, type:
|
||||
|
||||
```
|
||||
sbt:docspell-root> joex/reStart
|
||||
```
|
||||
|
||||
if also a joex component is required. Prefixing the commads with `~`,
|
||||
results in recompile+restart once a source file is modified.
|
||||
|
||||
It is possible to start both in the root project:
|
||||
|
||||
```
|
||||
sbt:docspell-root> reStart
|
||||
```
|
||||
|
||||
|
||||
## Custom config file
|
||||
|
||||
The sbt build is setup such that a file `dev.conf` in the directory
|
||||
`local` (at root of the source tree) is picked up as config file, if
|
||||
it exists. So you can create a custom config file for development. For
|
||||
example, a custom database for development may be setup this way:
|
||||
|
||||
```
|
||||
#jdbcurl = "jdbc:h2:///home/dev/workspace/projects/docspell/local/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
|
||||
jdbcurl = "jdbc:postgresql://localhost:5432/docspelldev"
|
||||
#jdbcurl = "jdbc:mariadb://localhost:3306/docspelldev"
|
||||
|
||||
docspell.server {
|
||||
backend {
|
||||
jdbc {
|
||||
url = ${jdbcurl}
|
||||
user = "dev"
|
||||
password = "dev"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docspell.joex {
|
||||
jdbc {
|
||||
url = ${jdbcurl}
|
||||
user = "dev"
|
||||
password = "dev"
|
||||
}
|
||||
scheduler {
|
||||
pool-size = 1
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Nix Expressions
|
||||
|
||||
The directory `/nix` contains nix expressions to install docspell via
|
||||
the nix package manager and to integrate it into NixOS.
|
||||
|
||||
### Testing NixOS Modules
|
||||
|
||||
The modules can be build by building the `configuration-test.nix` file
|
||||
together with some nixpkgs version. For example:
|
||||
|
||||
``` bash
|
||||
nixos-rebuild build-vm -I nixos-config=./configuration-test.nix \
|
||||
-I nixpkgs=https://github.com/NixOS/nixpkgs-channels/archive/nixos-19.09.tar.gz
|
||||
```
|
||||
|
||||
This will build all modules imported in `configuration-test.nix` and
|
||||
create a virtual machine containing the system. After that completes,
|
||||
the system configuration can be found behind the `./result/system`
|
||||
symlink. So it is possible to look at the generated systemd config for
|
||||
example:
|
||||
|
||||
``` bash
|
||||
cat result/system/etc/systemd/system/docspell-joex.service
|
||||
```
|
||||
|
||||
And with some more commands (there probably is an easier way…) the
|
||||
config file can be checked:
|
||||
|
||||
``` bash
|
||||
cat result/system/etc/systemd/system/docspell-joex.service | grep ExecStart | cut -d'=' -f2 | xargs cat | tail -n1 | awk '{print $NF}'| sed 's/.$//' | xargs cat | jq
|
||||
```
|
||||
|
||||
To see the module in action, the vm can be started (the first line
|
||||
sets more memory for the vm):
|
||||
|
||||
``` bash
|
||||
export QEMU_OPTS="-m 2048"
|
||||
./result/bin/run-docspelltest-vm
|
||||
```
|
@ -1,26 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: ADRs
|
||||
permalink: dev/adr
|
||||
---
|
||||
|
||||
# ADR
|
||||
|
||||
Some early information about certain details can be found in a few
|
||||
[ADR](https://adr.github.io/) that exist:
|
||||
|
||||
- [0001 Components](adr/0001_components)
|
||||
- [0002 Component Interaction](adr/0002_component_interaction)
|
||||
- [0003 Encryption](adr/0003_encryption)
|
||||
- [0004 ISO8601 vs Unix](adr/0004_iso8601vsEpoch)
|
||||
- [0005 Job Executor](adr/0005_job-executor)
|
||||
- [0006 More File Types](adr/0006_more-file-types)
|
||||
- [0007 Convert HTML](adr/0007_convert_html_files)
|
||||
- [0008 Convert Text](adr/0008_convert_plain_text)
|
||||
- [0009 Convert Office Files](adr/0009_convert_office_docs)
|
||||
- [0010 Convert Image Files](adr/0010_convert_image_files)
|
||||
- [0011 Extract Text](adr/0011_extract_text)
|
||||
- [0012 Periodic Tasks](adr/0012_periodic_tasks)
|
||||
- [0013 Archive Files](adr/0013_archive_files)
|
||||
- [0014 Full-Text Search](adr/0014_fulltext_search_engine)
|
||||
- [0015 Convert PDF files](adr/0015_convert_pdf_files)
|
@ -1,39 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Use Markdown Architectural Decision Records
|
||||
permalink: dev/adr/0000
|
||||
---
|
||||
|
||||
# Use Markdown Architectural Decision Records
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
We want to [record architectural decisions](https://adr.github.io/)
|
||||
made in this project. Which format and structure should these records
|
||||
follow?
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [MADR](https://adr.github.io/madr/) 2.1.0 - The Markdown Architectural Decision Records
|
||||
* [Michael Nygard's template](http://thinkrelevance.com/blog/2011/11/15/documenting-architecture-decisions) - The first incarnation of the term "ADR"
|
||||
* [Sustainable Architectural
|
||||
Decisions](https://www.infoq.com/articles/sustainable-architectural-design-decisions) -
|
||||
The Y-Statements
|
||||
* Other templates listed at
|
||||
<https://github.com/joelparkerhenderson/architecture_decision_record>
|
||||
* Formless - No conventions for file format and structure
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Chosen option: "MADR 2.1.0", because
|
||||
|
||||
* Implicit assumptions should be made explicit. Design documentation
|
||||
is important to enable people understanding the decisions later on.
|
||||
See also [A rational design process: How and why to fake
|
||||
it](https://doi.org/10.1109/TSE.1986.6312940).
|
||||
* The MADR format is lean and fits our development style.
|
||||
* The MADR structure is comprehensible and facilitates usage &
|
||||
maintenance.
|
||||
* The MADR project is vivid.
|
||||
* Version 2.1.0 is the latest one available when starting to document
|
||||
ADRs.
|
@ -1,67 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Components
|
||||
permalink: dev/adr/0001_components
|
||||
---
|
||||
|
||||
# Components
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
How should the application be structured into its main components? The
|
||||
goal is to be able to have multiple rest servers/webapps and multiple
|
||||
document processor components working togehter.
|
||||
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
The following are the "main" modules. There may be more helper modules
|
||||
and libraries that support implementing a feature.
|
||||
|
||||
### store
|
||||
|
||||
The code related to database access. It also provides the job
|
||||
queue. It is designed as a library.
|
||||
|
||||
### joex
|
||||
|
||||
Joex stands for "job executor".
|
||||
|
||||
An application that executes jobs from the queue and therefore depends
|
||||
on the `store` module. It provides the code for all tasks that can be
|
||||
submitted as jobs. If no jobs are in the queue, the joex "sleeps"
|
||||
and must be waked via an external request.
|
||||
|
||||
It provides the document processing code.
|
||||
|
||||
It provides a http rest server to get insight into the joex state
|
||||
and also to be notified for new jobs.
|
||||
|
||||
### backend
|
||||
|
||||
It provides all the logic, except document processing, as a set of
|
||||
"operations". An operation can be directly mapped to a rest
|
||||
endpoint.
|
||||
|
||||
It is designed as a library.
|
||||
|
||||
### rest api
|
||||
|
||||
This module contains the specification for the rest server as an
|
||||
`openapi.yml` file. It is packaged as a scala library that also
|
||||
provides types and conversions to/from json.
|
||||
|
||||
The idea is that the `rest server` module can depend on it as well as
|
||||
rest clients.
|
||||
|
||||
### rest server
|
||||
|
||||
This is the main application. It directly depends on the `backend`
|
||||
module, and each rest endpoint maps to a "backend operation". It is
|
||||
also responsible for converting the json data inside http requests
|
||||
to/from types recognized by the `backend` module.
|
||||
|
||||
|
||||
### webapp
|
||||
|
||||
This module provides the user interface as a web application.
|
@ -1,66 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Component Interaction
|
||||
permalink: dev/adr/0002_component_interaction
|
||||
---
|
||||
|
||||
# Component Interaction
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
There are multiple web applications with their rest servers and there
|
||||
are multiple document processors. These processes must communicate:
|
||||
|
||||
- once a new job is added to the queue the rest server must somehow
|
||||
notify processors to wake up
|
||||
- once a processor takes a job, it must propagate the progress and
|
||||
outcome to all rest servers only that the rest server can notify the
|
||||
user that is currently logged in. Since it's not known which
|
||||
rest-server the user is using right now, all must be notified.
|
||||
|
||||
## Considered Options
|
||||
|
||||
1. JMS (ActiveMQ or similiar): Message Broker as another active
|
||||
component
|
||||
2. Akka: using a cluster
|
||||
3. DB: Register with "call back urls"
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Choosing option 3: DB as central synchronisation point.
|
||||
|
||||
The reason is that this is the simplest solution and doesn't require
|
||||
external libraries or more processes. The other options seem too big
|
||||
of a weapon for the task at hand. They are both large components
|
||||
itself and require more knowledge to use them efficiently.
|
||||
|
||||
It works roughly like this:
|
||||
|
||||
- rest servers and processors register at the database on startup each
|
||||
with a unique call-back url
|
||||
- and deregister on shutdown
|
||||
- each component has db access
|
||||
- rest servers can list all processors and vice versa
|
||||
|
||||
### Positive Consequences
|
||||
|
||||
- complexity of the whole application is not touched
|
||||
- since a lot of data must be transferred to the document processors,
|
||||
this is solved by simply accessing the db. So the protocol for data
|
||||
exchange is set. There is no need for other protocols that handle
|
||||
large data (http chunking etc)
|
||||
- uses the already exsting db as synchronisation point
|
||||
- no additional knowledge required
|
||||
- simple to understand and so not hard to debug
|
||||
|
||||
### Negative Consequences
|
||||
|
||||
- all components must have db access. this also is a security con,
|
||||
because if one of those processes is hacked, db access is
|
||||
possible. and it simply is another dependency that is not really
|
||||
required for the joex component
|
||||
- the joex component cannot be in an untrusted environment (untrusted
|
||||
from the db's point of view). For example, it is not possible to
|
||||
create "personal joex" that only receive your own jobs…
|
||||
- in order to know if a component is really active, one must run a
|
||||
ping against the call-back url
|
@ -1,96 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Encryption
|
||||
permalink: dev/adr/0003_encryption
|
||||
---
|
||||
|
||||
# Encryption
|
||||
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Since docspell may store important documents, it should be possible to
|
||||
encrypt them on the server. It should be (almost) transparent to the
|
||||
user, for example, a user must be able to login and download a file in
|
||||
clear form. That is, the server must also decrypt them.
|
||||
|
||||
Then all users of a collective should have access to the files. This
|
||||
requires to share the key among users of a collective.
|
||||
|
||||
But, even when files are encrypted, the associated meta data is not!
|
||||
So especially access to the database would allow to see tags,
|
||||
associated persons and correspondents of documents.
|
||||
|
||||
So in short, encryption means:
|
||||
|
||||
- file contents (the blobs and extracted text) is encrypted
|
||||
- metadata is not
|
||||
- secret keys are stored at the server (protected by a passphrase),
|
||||
such that files can be downloaded in clear form
|
||||
|
||||
|
||||
## Decision Drivers
|
||||
|
||||
* major driver is to provide most possible privacy for users
|
||||
* even at the expense of less features; currently I think that the
|
||||
associated meta data is enough for finding documents (i.e. full text
|
||||
search is not needed)
|
||||
|
||||
## Considered Options
|
||||
|
||||
It is clear, that only blobs (file contents) can be encrypted, but not
|
||||
the associated metadata. And the extracted text must be encrypted,
|
||||
too, obviously.
|
||||
|
||||
|
||||
### Public Key Encryption (PKE)
|
||||
|
||||
With PKE the server can automatically encrypt files using
|
||||
publicly available key data. It wouldn't require a user to provide a
|
||||
passphrase for encryption, only for decryption.
|
||||
|
||||
This would allow for first processing files (extracting text, doing
|
||||
text analysis) and encrypting them (and the text) afterwards.
|
||||
|
||||
The public and secret keys are stored at the database. The secret key
|
||||
must be protected. This can be done by encrypting the passphrase to
|
||||
the secret key using each users login password. If a user logs in, he
|
||||
or she must provide the correct password. Using this password, the
|
||||
private key can be unlocked. This requires to store the private key
|
||||
passphrase encrypted with every users password in the database. So the
|
||||
whole security then depends on users password quality.
|
||||
|
||||
There are plenty of other difficulties with this approach (how about
|
||||
password change, new secret keys, adding users etc).
|
||||
|
||||
Using this kind of encryption would protect the data against offline
|
||||
attacks and also for accidental leakage (for example, if a bug in the
|
||||
software would access a file of another user).
|
||||
|
||||
|
||||
### No Encryption
|
||||
|
||||
If only blobs are encrypted, against which type of attack would it
|
||||
provide protection?
|
||||
|
||||
The users must still trust the server. First, in order to provide the
|
||||
wanted features (document processing), the server must see the file
|
||||
contents. Then, it will receive and serve files in clear form, so it
|
||||
has access to them anyways.
|
||||
|
||||
With that in mind, the "only" feature is to protect against "stolen
|
||||
database" attacks. If the database is somehow leaked, the attackers
|
||||
would only see the metadata, but not real documents. It also protects
|
||||
against leakage, maybe caused by a programming error.
|
||||
|
||||
But the downside is, that it increases complexity *a lot*. And since
|
||||
this is a personal tool for personal use, is it worth the effort?
|
||||
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
No encryption, because of its complexity.
|
||||
|
||||
For now, this tool is only meant for "self deployment" and personal
|
||||
use. If this changes or there is enough time, this decision should be
|
||||
reconsidered.
|
@ -1,43 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: ISO8601 vs Millis
|
||||
permalink: dev/adr/0004_iso8601vsEpoch
|
||||
---
|
||||
|
||||
# ISO8601 vs Millis as Date-Time transfer
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
The question is whether the REST Api should return an ISO8601
|
||||
formatted string in UTC timezone, or the unix time (number of
|
||||
milliseconds since 1970-01-01).
|
||||
|
||||
There is quite some controversy about it.
|
||||
|
||||
- <https://stackoverflow.com/questions/47426786/epoch-or-iso8601-date-format>
|
||||
- <https://nbsoftsolutions.com/blog/designing-a-rest-api-unix-time-vs-iso-8601>
|
||||
|
||||
In my opinion, the ISO8601 format (always UTC) is better. The reason
|
||||
is the better readability. But elm folks are on the other side:
|
||||
|
||||
- <https://package.elm-lang.org/packages/elm/time/1.0.0#iso-8601>
|
||||
- <https://package.elm-lang.org/packages/rtfeldman/elm-iso8601-date-strings/latest/>
|
||||
|
||||
One can convert from an ISO8601 date-time string in UTC time into the
|
||||
epoch millis and vice versa. So it is the same to me. There is no less
|
||||
information in a ISO8601 string than in the epoch millis.
|
||||
|
||||
To avoid confusion, all date/time values should use the same encoding.
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
I go with the epoch time. Every timestamp/date-time value is
|
||||
transferred as Unix timestamp.
|
||||
|
||||
Reasons:
|
||||
|
||||
- the Elm application needs to frequently calculate with these values
|
||||
to render the current waiting time etc. This is better if there are
|
||||
numbers without requiring to parse dates first
|
||||
- Since the UI is written with Elm, it's probably good to adopt their
|
||||
style
|
@ -1,137 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Joex - Job Executor
|
||||
permalink: dev/adr/0005_job-executor
|
||||
---
|
||||
|
||||
# Job Executor
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Docspell is a multi-user application. When processing user's
|
||||
documents, there must be some thought on how to distribute all the
|
||||
processing jobs on a much more restricted set of resources. There
|
||||
maybe 100 users but only 4 cores that can process documents at a
|
||||
time. Doing simply FIFO is not enough since it provides an unfair
|
||||
distribution. The first user who submits 20 documents will then occupy
|
||||
all cores for quite some time and all other users would need to wait.
|
||||
|
||||
This tries to find a more fair distribution among the users (strictly
|
||||
meaning collectives here) of docspell.
|
||||
|
||||
The job executor is a separate component that will run in its own
|
||||
process. It takes the next job from the "queue" and executes the
|
||||
associated task. This is used to run the document processing jobs
|
||||
(text extraction, text analysis etc).
|
||||
|
||||
1. The task execution should survive restarts. State and task code
|
||||
must be recreated from some persisted state.
|
||||
|
||||
2. The processing should be fair with respect to collectives.
|
||||
|
||||
3. It must be possible to run many job executors, possibly on
|
||||
different machines. This can be used to quickly enable more
|
||||
processing power and removing it once the peak is over.
|
||||
|
||||
4. Task execution can fail and it should be able to retry those
|
||||
tasks. Reasons are that errors may be temporarily (for example
|
||||
talking to a third party service), and to enable repairing without
|
||||
stopping the job executor. Some errors might be easily repaired (a
|
||||
program was not installed or whatever). In such a case it is good
|
||||
to know that the task will be retried later.
|
||||
|
||||
## Considered Options
|
||||
|
||||
In contrast to other ADRs this is just some sketching of thoughts for
|
||||
the current implementation.
|
||||
|
||||
1. Job descriptions are serialized and written to the database into a
|
||||
table. This becomes the queue. Tasks are identified by names and a
|
||||
job executor implementation must have a map of names to code to
|
||||
lookup the task to perform. The tasks arguments are serialized into
|
||||
a string and written to the database. Tasks must decode the
|
||||
string. This can be conveniently done using JSON and the provided
|
||||
circe decoders.
|
||||
|
||||
2. To provide a fair execution jobs are organized into groups. When a
|
||||
new job is requested from the queue, first a group is selected
|
||||
using a round-robin strategy. This should ensure good enough
|
||||
fairness among groups. A group maps to a collective. Within a
|
||||
group, a job is selected based on priority, submitted time (fifo)
|
||||
and job state (see notes about stuck jobs).
|
||||
|
||||
3. Allowing multiple job executors means that getting the next job can
|
||||
fail due to simultaneous running transactions. It is retried until
|
||||
it succeeds. Taking a job puts it into _scheduled_ state. Each job
|
||||
executor has a unique (manually supplied) id and jobs are marked
|
||||
with that id once it is handed to the executor.
|
||||
|
||||
4. When a task fails, its state is updated to state _stuck_. Stuck
|
||||
jobs are retried in the future. The queue prefers to return stuck
|
||||
jobs that are due at the specific point in time ignoring the
|
||||
priority hint.
|
||||
|
||||
### More Details
|
||||
|
||||
A job has these properties
|
||||
|
||||
- id (something random)
|
||||
- group
|
||||
- taskname (to choose task to run)
|
||||
- submitted-date
|
||||
- worker (the id of the job executor)
|
||||
- state, one of: waiting, scheduled, running, stuck, cancelled,
|
||||
failed, success
|
||||
- waiting: job has been inserted into the queue
|
||||
  - scheduled: job has been handed over to some executor and is
|
||||
marked with the job executor id
|
||||
- running: a task is currently executing
|
||||
- stuck: a task has failed and is being retried eventually
|
||||
- cancelled: task has finished and there was a cancel request
|
||||
  - failed: task has failed, exceeded the retries
|
||||
- success: task has completed successfully
|
||||
|
||||
The queue has a `take` or `nextJob` operation that takes the worker-id
|
||||
and a priority hint and goes roughly like this:
|
||||
|
||||
- select the next group using round-robin strategy
|
||||
- select all jobs with that group, where
|
||||
- state is stuck and waiting time has elapsed
|
||||
- state is waiting and have the given priority if possible
|
||||
- jobs are ordered by submitted time, but stuck jobs whose waiting
|
||||
time elapsed are preferred
|
||||
|
||||
There are two priorities within a group: high and low. A configured
|
||||
counting scheme determines when to select certain priority. For
|
||||
example, counting scheme of `(2,1)` would select two high priority
|
||||
jobs and then 1 low priority job. The `take` operation tries to prefer
|
||||
this priority but falls back to the other if no job with this priority
|
||||
is available.
|
||||
|
||||
A group corresponds to a collective. Then all collectives get
|
||||
(roughly) equal treatment.
|
||||
|
||||
Once there are no jobs in the queue the executor goes into sleep and
|
||||
must be woken to run again. If a job is submitted, the executors are
|
||||
notified.
|
||||
|
||||
### Stuck Jobs
|
||||
|
||||
A job is going into _stuck_ state, if the task has failed. In this
|
||||
state, the task is rerun after a while until a maximum retry count is
|
||||
reached.
|
||||
|
||||
The problem is how to notify all executors when the waiting time has
|
||||
elapsed. If one executor puts a job into stuck state, it means that
|
||||
all others should start looking into the queue again after `x`
|
||||
minutes. It would be possible to tell all existing executors to
|
||||
schedule themselves to wake up in the future, but this would miss all
|
||||
executors that show up later.
|
||||
|
||||
The waiting time is increased exponentially after each retry (`2 ^
|
||||
retry`) and it is meant as the minimum waiting time. So it is ok if
|
||||
all executors wake up periodically and check for new work. Most of the
|
||||
time this should not be necessary and is just a fallback if only stuck
|
||||
jobs are in the queue and nothing is submitted for a long time. If the
|
||||
system is used, jobs get submitted once in a while and would awake all
|
||||
executors.
|
@ -1,155 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: More File Types
|
||||
permalink: dev/adr/0006_more-file-types
|
||||
---
|
||||
|
||||
# More File Types
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Docspell currently only supports PDF files. This has simplified early
|
||||
development and design a lot and so helped with starting the project.
|
||||
Handling pdf files is usually easy (to view, to extract text, print
|
||||
etc).
|
||||
|
||||
The pdf format has been chosen, because PDF files are very common and
|
||||
can be viewed with many tools on many systems (i.e. non-proprietary
|
||||
tools). Docspell also is a document archive and from this perspective,
|
||||
it is important that documents can be viewed in 10 years and more. The
|
||||
hope is, that the PDF format is best suited for this. Therefore all
|
||||
documents in Docspell must be accessible as PDF. The trivial solution
|
||||
to this requirement is to only allow PDF files.
|
||||
|
||||
Support for more document types, must then take care of the following:
|
||||
|
||||
- extracting text
|
||||
- converting into pdf
|
||||
- access original file
|
||||
|
||||
Text should be extracted from the source file, in case conversion is
|
||||
not lossless. Since Docspell can already extract text from PDF files
|
||||
using OCR, text can also be extracted from the converted file as a
|
||||
fallback.
|
||||
|
||||
The original file must always be accessible. The main reason is that
|
||||
all uploaded data should be accessible without any modification. And
|
||||
since the conversion may not always create best results, the original
|
||||
file should be kept.
|
||||
|
||||
|
||||
## Decision Drivers
|
||||
|
||||
People expect that software like Docspell support the most common
|
||||
document types, like all the “office documents” (`docx`, `rtf`, `odt`,
|
||||
`xlsx`, …) and images. For many people it is more common to create
|
||||
those files instead of PDF. Some (older) scanners may not be able to
|
||||
scan into PDF files but only to image files.
|
||||
|
||||
|
||||
## Considered Options
|
||||
|
||||
This ADR does not evaluate different options. It rather documents why
|
||||
this feature is realized and the thoughts that lead to how it is
|
||||
implemented.
|
||||
|
||||
## Realization
|
||||
|
||||
### Data Model
|
||||
|
||||
The `attachment` table holds one file. There will be another table
|
||||
`attachment_source` that holds the original file. It looks like this:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE "attachment_source" (
|
||||
"id" varchar(254) not null primary key,
|
||||
"file_id" varchar(254) not null,
|
||||
"filename" varchar(254),
|
||||
"created" timestamp not null,
|
||||
foreign key ("file_id") references "filemeta"("id"),
|
||||
foreign key ("id") references "attachment"("attachid")
|
||||
);
|
||||
```
|
||||
|
||||
The `id` is the primary key and is the same as the associated
|
||||
`attachment`, creating a `1-1` relationship (well, more correct is
|
||||
`0..1-1`) between `attachment` and `attachment_source`.
|
||||
|
||||
There will always be a `attachment_source` record for every
|
||||
`attachment` record. If the original file is a PDF already, then both
|
||||
table's `file_id` columns point to the same file. But now the user can
|
||||
change the filename of an `attachment` while the original filename is
|
||||
preserved in `attachment_source`. It must not be possible for the user
|
||||
to change anything in `attachment_source`.
|
||||
|
||||
The `attachment` table is not touched in order to keep current code
|
||||
mostly unchanged and to have a simpler data migration. The downside
|
||||
is, that the data model allows to have an `attachment` record without
|
||||
an `attachment_source` record. OTOH, a foreign key inside `attachment`
|
||||
pointing to an `attachment_source` is also not correct, because it
|
||||
allows the same `attachment_source` record to be associated with many
|
||||
`attachment` records. This would do even more harm, in my opinion.
|
||||
|
||||
### Migration
|
||||
|
||||
Creating a new table and not altering existing ones, should simplify
|
||||
data migration.
|
||||
|
||||
Since only PDF files were allowed and the user could not change
|
||||
anything in the `attachment` table, the existing data can simply be
|
||||
inserted into the new table. This presents the trivial case where the
|
||||
attachment and source are the same.
|
||||
|
||||
|
||||
### Processing
|
||||
|
||||
The first step in processing is now converting the file into a pdf. If
|
||||
it already is a pdf, nothing is done. This step is before text
|
||||
extraction, so text can first be tried to extract from the source file
|
||||
and only if that fails (or is not supported), text can be extracted
|
||||
from the converted pdf file. All remaining steps are untouched.
|
||||
|
||||
If conversion is not supported for the input file, it is skipped. If
|
||||
conversion fails, the error is propagated to let the retry mechanism
|
||||
take care.
|
||||
|
||||
#### What types?
|
||||
|
||||
Which file types should be supported? At a first step, all major
|
||||
office documents, common images, plain text (i.e. markdown) and html
|
||||
should be supported. In terms of file extensions: `doc`, `docx`,
|
||||
`xls`, `xlsx`, `odt`, `md`, `html`, `txt`, `jpg`, `png`, `tif`.
|
||||
|
||||
There is always the preference to use jvm internal libraries in order
|
||||
to be more platform independent and to reduce external dependencies.
|
||||
But this is not always possible (like doing OCR).
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/process-files.png" title="Overview processing files">
|
||||
</div>
|
||||
|
||||
#### Conversion
|
||||
|
||||
- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
|
||||
unoconv (see [ADR 9](0009_convert_office_docs))
|
||||
- HTML (`html`): wkhtmltopdf (see [ADR 7](0007_convert_html_files))
|
||||
- Text/Markdown (`txt`, `md`): Java-Lib flexmark + wkhtmltopdf
|
||||
- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
|
||||
10](0010_convert_image_files))
|
||||
|
||||
#### Text Extraction
|
||||
|
||||
- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache Poi
|
||||
- Office documents (`odt`, `ods`): Apache Tika (including the sources)
|
||||
- HTML: not supported, extract text from converted PDF
|
||||
- Images (`jpg`, `png`, `tif`): Tesseract
|
||||
- Text/Markdown: n.a.
|
||||
- PDF: Apache PDFBox or Tesseract
|
||||
|
||||
## Links
|
||||
|
||||
* [Convert HTML Files](0007_convert_html_files)
|
||||
* [Convert Plain Text](0008_convert_plain_text)
|
||||
* [Convert Office Documents](0009_convert_office_docs)
|
||||
* [Convert Image Files](0010_convert_image_files)
|
||||
* [Extract Text from Files](0011_extract_text)
|
@ -1,72 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert HTML Files
|
||||
permalink: dev/adr/0007_convert_html_files
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
How can HTML documents be converted into a PDF file that looks as much
|
||||
as possible like the original?
|
||||
|
||||
It would be nice to have a java-only solution. But if an external tool
|
||||
has a better outcome, then an external tool is fine, too.
|
||||
|
||||
Since Docspell is free software, the tools must also be free.
|
||||
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [pandoc](https://pandoc.org/) external command
|
||||
* [wkhtmltopdf](https://wkhtmltopdf.org/) external command
|
||||
* [Unoconv](https://github.com/unoconv/unoconv) external command
|
||||
|
||||
Native (firefox) view:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-html-native.jpg" title="Native view of an HTML example file">
|
||||
</div>
|
||||
|
||||
Note: the example html is from
|
||||
[here](https://www.sparksuite.com/open-source/invoice.html).
|
||||
|
||||
I downloaded the HTML file to disk together with its resources (using
|
||||
*Save as...* in the browser).
|
||||
|
||||
|
||||
### Pandoc
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-html-pandoc-latex.jpg" title="Pandoc (Latex) HTML->PDF">
|
||||
</div>
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-html-pandoc-html.jpg" title="Pandoc (html) HTML->PDF">
|
||||
</div>
|
||||
|
||||
Not showing the version using `context` pdf-engine, since it looked
|
||||
very similar to the latex variant.
|
||||
|
||||
|
||||
### wkhtmltopdf
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-html-wkhtmltopdf.jpg" title="wkhtmltopdf HTML->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
### Unoconv
|
||||
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-html-unoconv.jpg" title="Unoconv HTML->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
wkhtmltopdf.
|
||||
|
||||
It shows the best results.
|
@ -1,192 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert Text Files
|
||||
permalink: dev/adr/0008_convert_plain_text
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
How can plain text and markdown documents be converted into a PDF
|
||||
files?
|
||||
|
||||
Rendering images is not important here, since the files must be self
|
||||
contained when uploaded to Docspell.
|
||||
|
||||
The test file is the current documentation page of Docspell, found in
|
||||
`microsite/docs/doc.md`.
|
||||
|
||||
```
|
||||
---
|
||||
layout: docs
|
||||
position: 4
|
||||
title: Documentation
|
||||
---
|
||||
|
||||
# {page .title}
|
||||
|
||||
|
||||
Docspell assists in organizing large amounts of PDF files that are
|
||||
...
|
||||
|
||||
## How it works
|
||||
|
||||
Documents have two ...
|
||||
|
||||
1. You maintain a kind of address book. It should list all possible
|
||||
correspondents and the concerning people/things. This grows
|
||||
incrementally with each new unknown document.
|
||||
2. When docspell analyzes a document, it tries to find matches within
|
||||
your address ...
|
||||
3. You can inspect ...
|
||||
|
||||
The set of meta data that docspell uses to draw suggestions from, must
|
||||
be maintained ...
|
||||
|
||||
|
||||
## Terms
|
||||
|
||||
In order to better understand these pages, some terms should be
|
||||
explained first.
|
||||
|
||||
### Item
|
||||
|
||||
An **Item** is roughly your (pdf) document, only that an item may span
|
||||
multiple files, which are called **attachments**. And an item has
|
||||
**meta data** associated:
|
||||
|
||||
- a **correspondent**: the other side of the communication. It can be
|
||||
an organization or a person.
|
||||
- a **concerning person** or **equipment**: a person or thing that
|
||||
this item is about. Maybe it is an insurance contract about your
|
||||
car.
|
||||
- ...
|
||||
|
||||
### Collective
|
||||
|
||||
The users of the application are part of a **collective**. A
|
||||
**collective** is a group of users that share access to the same
|
||||
items. The account name is therefore comprised of a *collective name*
|
||||
and a *user name*.
|
||||
|
||||
All users of a collective are equal; they have same permissions to
|
||||
access all...
|
||||
```
|
||||
|
||||
Then a plain text file is tried, too (without any markup).
|
||||
|
||||
```
|
||||
Maecenas mauris lectus, lobortis et purus mattis
|
||||
|
||||
Duis vehicula mi vel mi pretium
|
||||
|
||||
In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu.
|
||||
|
||||
Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut.
|
||||
Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros
|
||||
efficitur tincidunt. Cras justo mi, porttitor quis mattis vel,
|
||||
ultricies ut purus. Ut facilisis et lacus eu cursus.
|
||||
|
||||
In eleifend velit vitae libero sollicitudin euismod:
|
||||
|
||||
- Fusce vitae vestibulum velit,
|
||||
- Pellentesque vulputate lectus quis pellentesque commodo
|
||||
|
||||
the end.
|
||||
```
|
||||
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [flexmark](https://github.com/vsch/flexmark-java) for markdown to
|
||||
HTML, then use existing machinery described in [adr
|
||||
7](./0007_convert_html_files)
|
||||
* [pandoc](https://pandoc.org/) external command
|
||||
|
||||
|
||||
### flexmark markdown library for java
|
||||
|
||||
Process files with [flexmark](https://github.com/vsch/flexmark-java)
|
||||
and then create a PDF from the resulting html.
|
||||
|
||||
Using the following snippet:
|
||||
|
||||
``` scala
|
||||
def renderMarkdown(): ExitCode = {
|
||||
val opts = new MutableDataSet()
|
||||
opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
|
||||
util.Arrays.asList(TablesExtension.create(),
|
||||
StrikethroughExtension.create()));
|
||||
|
||||
val parser = Parser.builder(opts).build()
|
||||
val renderer = HtmlRenderer.builder(opts).build()
|
||||
val reader = Files.newBufferedReader(Paths.get("in.txt|md"))
|
||||
val doc = parser.parseReader(reader)
|
||||
val html = renderer.render(doc)
|
||||
val body = "<html><head></head><body style=\"padding: 0 5em;\">" + html + "</body></html>"
|
||||
Files.write(
|
||||
Paths.get("test.html"),
|
||||
body.getBytes(StandardCharsets.UTF_8))
|
||||
|
||||
ExitCode.Success
|
||||
}
|
||||
```
|
||||
|
||||
Then run the result through `wkhtmltopdf`.
|
||||
|
||||
Markdown file:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-md-java.jpg" title="Flexmark/wkhtmltopdf MD->PDF">
|
||||
</div>
|
||||
|
||||
TXT file:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-txt-java.jpg" title="Flexmark/wkhtmltopdf TXT->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
### pandoc
|
||||
|
||||
Command:
|
||||
|
||||
```
|
||||
pandoc -f markdown -t html -o test.pdf microsite/docs/doc.md
|
||||
```
|
||||
|
||||
Markdown/Latex:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-md-pandoc-latex.jpg" title="Pandoc (Latex) MD->PDF">
|
||||
</div>
|
||||
|
||||
Markdown/Html:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-md-pandoc-html.jpg" title="Pandoc (html) MD->PDF">
|
||||
</div>
|
||||
|
||||
Text/Latex:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-txt-pandoc-latex.jpg" title="Pandoc (Latex) TXT->PDF">
|
||||
</div>
|
||||
|
||||
Text/Html:
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-txt-pandoc-html.jpg" title="Pandoc (html) TXT->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Java library "flexmark".
|
||||
|
||||
I think all results are great. It depends on the type of document and
|
||||
what one expects to see. I guess that most people expect something
|
||||
like pandoc-html produces for the kind of files docspell is for (it is
|
||||
not for newspaper articles, where pandoc-latex would be best fit).
|
||||
|
||||
But choosing pandoc means yet another external command to depend on.
|
||||
And the results from flexmark are really good, too. One can fiddle
|
||||
with options and css to make it look better.
|
||||
|
||||
To not introduce another external command, decision is to use flexmark
|
||||
and then the already existing html->pdf conversion.
|
@ -1,232 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert Office Documents
|
||||
permalink: dev/adr/0009_convert_office_docs
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
How can office documents, like `docx` or `odt` be converted into a PDF
|
||||
file that looks as much as possible like the original?
|
||||
|
||||
It would be nice to have a java-only solution. But if an external tool
|
||||
has a better outcome, then an external tool is fine, too.
|
||||
|
||||
Since Docspell is free software, the tools must also be free.
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [Apache POI](https://poi.apache.org) together with
|
||||
[this](https://search.maven.org/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.pdf/1.0.6/jar)
|
||||
library
|
||||
* [pandoc](https://pandoc.org/) external command
|
||||
* [abiword]() external command
|
||||
* [Unoconv](https://github.com/unoconv/unoconv) external command
|
||||
|
||||
To choose an option, some documents are converted to pdf and compared.
|
||||
Only the formats `docx` and `odt` are considered here. These are the
|
||||
most used formats. They have to look good; if an `xlsx` or `pptx`
|
||||
doesn't look so great, that is ok.
|
||||
|
||||
Here is the native view to compare with:
|
||||
|
||||
ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-native.jpg" title="Native view of an ODT example file">
|
||||
</div>
|
||||
|
||||
### `XWPFConverter`
|
||||
|
||||
I couldn't get any example to work. There were exceptions:
|
||||
|
||||
```
|
||||
java.lang.IllegalArgumentException: Value for parameter 'id' was out of bounds
|
||||
at org.apache.poi.util.IdentifierManager.reserve(IdentifierManager.java:80)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:101)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:146)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFParagraph.buildRunsInOrderFromXml(XWPFParagraph.java:135)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFParagraph.<init>(XWPFParagraph.java:88)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFDocument.onDocumentRead(XWPFDocument.java:147)
|
||||
at org.apache.poi.POIXMLDocument.load(POIXMLDocument.java:159)
|
||||
at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:124)
|
||||
at docspell.convert.Testing$.withPoi(Testing.scala:17)
|
||||
at docspell.convert.Testing$.$anonfun$run$1(Testing.scala:12)
|
||||
at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:87)
|
||||
at cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:355)
|
||||
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:376)
|
||||
at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:316)
|
||||
at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
|
||||
at cats.effect.internals.PoolUtils$$anon$2$$anon$3.run(PoolUtils.scala:51)
|
||||
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
|
||||
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
|
||||
at java.lang.Thread.run(Thread.java:748)
|
||||
```
|
||||
|
||||
The project (not Apache Poi, the other) seems unmaintained. I could
|
||||
not find any website and the artifact in maven central is from 2016.
|
||||
|
||||
|
||||
### Pandoc
|
||||
|
||||
I know pandoc as a very great tool when converting between markup
|
||||
documents. So this tries it with office documents. It supports `docx`
|
||||
and `odt` from there `--list-input-formats`.
|
||||
|
||||
From the pandoc manual:
|
||||
|
||||
> By default, pandoc will use LaTeX to create the PDF, which requires
|
||||
> that a LaTeX engine be installed (see --pdf-engine below).
|
||||
> Alternatively, pandoc can use ConTeXt, roff ms, or HTML as an
|
||||
> intermediate format. To do this, specify an output file with a .pdf
|
||||
> extension, as before, but add the --pdf-engine option or -t context,
|
||||
> -t html, or -t ms to the command line. The tool used to generate the
|
||||
> PDF from the intermediate format may be specified using --pdf-engine.
|
||||
|
||||
Trying with latex engine:
|
||||
|
||||
```
|
||||
pandoc -f odt -o test.pdf example.odt
|
||||
```
|
||||
|
||||
Results ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-pandoc-latex.jpg" title="Pandoc (Latex) ODT->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
```
|
||||
pandoc -f odt -o test.pdf example.docx
|
||||
```
|
||||
|
||||
Results DOCX:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-docx-pandoc-latex.jpg" title="Pandoc (Latex) DOCX->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
----
|
||||
|
||||
Trying with context engine:
|
||||
|
||||
```
|
||||
pandoc -f odt -t context -o test.pdf example.odt
|
||||
```
|
||||
|
||||
Results ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-pandoc-context.jpg" title="Pandoc (Context) ODT->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
Results DOCX:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-docx-pandoc-context.jpg" title="Pandoc (Context) DOCX->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
----
|
||||
|
||||
Trying with ms engine:
|
||||
|
||||
```
|
||||
pandoc -f odt -t ms -o test.pdf example.odt
|
||||
```
|
||||
|
||||
Results ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-pandoc-ms.jpg" title="Pandoc (MS) ODT->PDF">
|
||||
</div>
|
||||
|
||||
Results DOCX:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-docx-pandoc-ms.jpg" title="Pandoc (MS) DOCX->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
---
|
||||
|
||||
Trying with html engine (this requires `wkhtmltopdf` to be present):
|
||||
|
||||
```
|
||||
$ pandoc --extract-media . -f odt -t html -o test.pdf example.odt
|
||||
```
|
||||
|
||||
Results ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-pandoc-html.jpg" title="Pandoc (html) ODT->PDF">
|
||||
</div>
|
||||
|
||||
Results DOCX:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-docx-pandoc-html.jpg" title="Pandoc (html) DOCX->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
### Abiword
|
||||
|
||||
Trying with:
|
||||
|
||||
```
|
||||
abiword --to=pdf example.odt
|
||||
```
|
||||
|
||||
Results:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-abiword.jpg" title="Abiword ODT->PDF">
|
||||
</div>
|
||||
|
||||
|
||||
Trying with a `docx` file failed. It worked with a `doc` file.
|
||||
|
||||
|
||||
### Unoconv
|
||||
|
||||
Unoconv relies on libreoffice/openoffice, so installing it will result
|
||||
in installing parts of libreoffice, which is a very large dependency.
|
||||
|
||||
Trying with:
|
||||
|
||||
```
|
||||
unoconv -f pdf example.odt
|
||||
```
|
||||
|
||||
Results ODT:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-odt-unoconv.jpg" title="Unoconv ODT->PDF">
|
||||
</div>
|
||||
|
||||
Results DOCX:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="./img/example-docx-unoconv.jpg" title="Unoconv ODT->PDF">
|
||||
</div>
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Unoconv.
|
||||
|
||||
The results from `unoconv` are really good.
|
||||
|
||||
Abiword also is not that bad, it didn't convert the chart, but all
|
||||
font markup is there. It would be great to not depend on something as
|
||||
big as libreoffice, but the results are so much better.
|
||||
|
||||
Also pandoc deals very well with DOCX files (using the `context`
|
||||
engine). The only thing that was not rendered was the embedded chart
|
||||
(like abiword). But all images and font styling was present.
|
||||
|
||||
It will be a configurable external command anyways, so users can
|
||||
exchange it at any time with a different one.
|
@ -1,193 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert Image Files
|
||||
permalink: dev/adr/0010_convert_image_files
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
How to convert image files properly to pdf?
|
||||
|
||||
Since there are thousands of different image formats, there will never
|
||||
be support for all. The most common containers should be supported,
|
||||
though:
|
||||
|
||||
- jpeg (jfif, exif)
|
||||
- png
|
||||
- tiff (baseline, single page)
|
||||
|
||||
The focus is on document images, maybe from digital cameras or
|
||||
scanners.
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [pdfbox]() library
|
||||
* [imagemagick](https://www.imagemagick.org/) external command
|
||||
* [img2pdf](https://github.com/josch/img2pdf) external command
|
||||
* [tesseract](https://github.com/tesseract-ocr/tesseract) external command
|
||||
|
||||
There are no screenshots here, because it doesn't make sense since
|
||||
they all look the same on the screen. Instead we look at the files
|
||||
properties.
|
||||
|
||||
**Input File**
|
||||
|
||||
The input files are:
|
||||
|
||||
```
|
||||
$ identify input/*
|
||||
input/jfif.jpg JPEG 2480x3514 2480x3514+0+0 8-bit sRGB 240229B 0.000u 0:00.000
|
||||
input/letter-en.jpg JPEG 1695x2378 1695x2378+0+0 8-bit Gray 256c 467341B 0.000u 0:00.000
|
||||
input/letter-en.png PNG 1695x2378 1695x2378+0+0 8-bit Gray 256c 191571B 0.000u 0:00.000
|
||||
input/letter-en.tiff TIFF 1695x2378 1695x2378+0+0 8-bit Grayscale Gray 4030880B 0.000u 0:00.000
|
||||
```
|
||||
|
||||
Size:
|
||||
- jfif.jpg 240k
|
||||
- letter-en.jpg 467k
|
||||
- letter-en.png 191k
|
||||
- letter-en.tiff 4.0M
|
||||
|
||||
### pdfbox
|
||||
|
||||
Using a java library is preferred, if the quality is good enough.
|
||||
There is an
|
||||
[example](https://github.com/apache/pdfbox/blob/2cea31cc63623fd6ece149c60d5f0cc05a696ea7/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ImageToPDF.java)
|
||||
for this exact use case.
|
||||
|
||||
This is the sample code:
|
||||
|
||||
``` scala
|
||||
def imgtopdf(file: String): ExitCode = {
|
||||
val jpg = Paths.get(file).toAbsolutePath
|
||||
if (!Files.exists(jpg)) {
|
||||
sys.error(s"file doesn't exist: $jpg")
|
||||
}
|
||||
val pd = new PDDocument()
|
||||
val page = new PDPage(PDRectangle.A4)
|
||||
pd.addPage(page)
|
||||
val bimg = ImageIO.read(jpg.toFile)
|
||||
|
||||
val img = LosslessFactory.createFromImage(pd, bimg)
|
||||
|
||||
val stream = new PDPageContentStream(pd, page)
|
||||
stream.drawImage(img, 0, 0, PDRectangle.A4.getWidth, PDRectangle.A4.getHeight)
|
||||
stream.close()
|
||||
|
||||
pd.save("test.pdf")
|
||||
pd.close()
|
||||
|
||||
ExitCode.Success
|
||||
}
|
||||
```
|
||||
|
||||
Using pdfbox 2.0.18 and twelvemonkeys 3.5. Running time: `1384ms`
|
||||
|
||||
```
|
||||
$ identify *.pdf
|
||||
jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129660B 0.000u 0:00.000
|
||||
letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
|
||||
letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
|
||||
letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
|
||||
```
|
||||
|
||||
Size:
|
||||
- jfif.jpg 1.1M
|
||||
- letter-en.jpg 142k
|
||||
- letter-en.png 142k
|
||||
- letter-en.tiff 142k
|
||||
|
||||
### img2pdf
|
||||
|
||||
This is a python tool that adds the image into the pdf without
|
||||
reencoding.
|
||||
|
||||
Using version 0.3.1. Running time: `323ms`.
|
||||
|
||||
```
|
||||
$ identify *.pdf
|
||||
jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129708B 0.000u 0:00.000
|
||||
letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
|
||||
letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
|
||||
letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
|
||||
```
|
||||
|
||||
Size:
|
||||
- jfif.jpg 241k
|
||||
- letter-en.jpg 468k
|
||||
- letter-en.png 191k
|
||||
- letter-en.tiff 192k
|
||||
|
||||
### ImageMagick
|
||||
|
||||
The well known imagemagick tool can convert images to pdfs, too.
|
||||
|
||||
Using version 6.9.10-71. Running time: `881ms`.
|
||||
|
||||
```
|
||||
$ identify *.pdf
|
||||
jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 134873B 0.000u 0:00.000
|
||||
letter-en.jpg.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 360100B 0.000u 0:00.000
|
||||
letter-en.png.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
|
||||
letter-en.tiff.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
|
||||
```
|
||||
|
||||
Size:
|
||||
- jfif.jpg 300k
|
||||
- letter-en.jpg 390k
|
||||
- letter-en.png 180k
|
||||
- letter-en.tiff 5.1M
|
||||
|
||||
|
||||
### Tesseract
|
||||
|
||||
Docspell already relies on tesseract for doing OCR. And in contrast to
|
||||
all other candidates, it can create PDFs that are searchable. Of
|
||||
course, this yields a much longer running time, which cannot be
|
||||
compared to the times of the other options.
|
||||
|
||||
```
|
||||
tesseract doc3.jpg out -l deu pdf
|
||||
```
|
||||
|
||||
It can also create both outputs in one go:
|
||||
|
||||
```
|
||||
tesseract doc3.jpg out -l deu pdf txt
|
||||
```
|
||||
|
||||
Using tesseract 4. Running time: `6661ms`
|
||||
|
||||
```
|
||||
$ identify *.pdf
|
||||
tesseract/jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 130535B 0.000u 0:00.000
|
||||
tesseract/letter-en.jpg.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
|
||||
tesseract/letter-en.png.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
|
||||
tesseract/letter-en.tiff.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
|
||||
```
|
||||
|
||||
Size:
|
||||
- jfif.jpg 246k
|
||||
- letter-en.jpg 473k
|
||||
- letter-en.png 183k
|
||||
- letter-en.tiff 183k
|
||||
|
||||
|
||||
## Decision
|
||||
|
||||
Tesseract.
|
||||
|
||||
To not use more external tools, imagemagick and img2pdf are not
|
||||
chosen, even though img2pdf shows the best results and is fastest.
|
||||
|
||||
Pdfbox library would be the favorite, because results are good and
|
||||
with the [twelvemonkeys](https://github.com/haraldk/TwelveMonkeys)
|
||||
library there is support for many images. The priority is to avoid
|
||||
more external commands if possible.
|
||||
|
||||
But since there already is a dependency to tesseract and it can create
|
||||
searchable pdfs, the decision is to use tesseract for this. Then PDFs
|
||||
with images can be converted to searchable PDFs with images. And text
|
||||
extraction is required anyways.
|
@ -1,78 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Extract Text from Files
|
||||
permalink: dev/adr/0011_extract_text
|
||||
---
|
||||
|
||||
# Extract Text from Files
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
With support for more file types there must be a way to extract text
|
||||
from all of them. It is better to extract text from the source files,
|
||||
in contrast to extracting the text from the converted pdf file.
|
||||
|
||||
There are multiple options and multiple file types. Again, most
|
||||
priority is to use a java/scala library to reduce external
|
||||
dependencies.
|
||||
|
||||
## Considered Options
|
||||
|
||||
### MS Office Documents
|
||||
|
||||
There is only one library I know: [Apache
|
||||
POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
|
||||
However, it doesn't support open-document format (odt and ods).
|
||||
|
||||
### OpenDocument Format
|
||||
|
||||
There are two libraries:
|
||||
|
||||
- [Apache Tika Parser](https://tika.apache.org/)
|
||||
- [ODFToolkit](https://github.com/tdf/odftoolkit)
|
||||
|
||||
*Tika:* The tika-parsers package contains an opendocument parser for
|
||||
extracting text. But it has a huge dependency tree, since it is a
|
||||
super-package containing a parser for almost every common file type.
|
||||
|
||||
*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
|
||||
and also pulls in quite some dependencies (while not as much as
|
||||
tika-parser). It is not too bad, since it is a library for
|
||||
manipulating opendocument files. But all I need is to only extract
|
||||
text. I created tests that extracted text from my odt/ods files. It
|
||||
worked at first sight, but running the tests in a loop resulted in
|
||||
strange nullpointer exceptions (it only worked the first run).
|
||||
|
||||
### Richtext
|
||||
|
||||
Richtext is supported by the jdk (using `RichtextEditorKit` from
|
||||
swing).
|
||||
|
||||
### PDF
|
||||
|
||||
For "image" pdf files, tesseract is used. For "text" PDF files, the
|
||||
library [Apache PDFBox](https://pdfbox.apache.org) can be used.
|
||||
|
||||
There also is [iText](https://github.com/itext/itext7) with a AGPL
|
||||
license.
|
||||
|
||||
### Images
|
||||
|
||||
For images and "image" PDF files, there is already tesseract in place.
|
||||
|
||||
### HTML
|
||||
|
||||
HTML must be converted into a PDF file before text can be extracted.
|
||||
|
||||
### Text/Markdown
|
||||
|
||||
These files can be used as-is, obviously.
|
||||
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
- MS Office files: POI library
|
||||
- Open Document files: Tika, but integrating the few source files that
|
||||
make up the open document parser. Due to its huge dependency tree,
|
||||
the library is not added.
|
||||
- PDF: Apache PDFBox. I know this library better than itext.
|
@ -1,106 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Periodic Tasks
|
||||
permalink: dev/adr/0012_periodic_tasks
|
||||
---
|
||||
|
||||
# Periodic Tasks
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Currently there is a `Scheduler` that consumes tasks off a queue in
|
||||
the database. This allows multiple job executors running in parallel
|
||||
racing for the next job to execute. This is for executing tasks
|
||||
immediately – as long as there are enough resource.
|
||||
|
||||
What is missing, is a component that maintains periodic tasks. The
|
||||
reason for this is to have house keeping tasks that run regularily and
|
||||
clean up stale or unused data. Later, users should be able to create
|
||||
periodic tasks, for example to read e-mails from an inbox or to be
|
||||
notified of due items.
|
||||
|
||||
The problem is again, that it must work with multiple job executor
|
||||
instances running at the same time. This is the same pattern as with
|
||||
the `Scheduler`: it must be ensured that only one task is used at a
|
||||
time. Multiple job exectuors must not schedule a perdiodic task more
|
||||
than once. If a periodic tasks takes longer than the time between
|
||||
runs, it must wait for the next interval.
|
||||
|
||||
|
||||
## Considered Options
|
||||
|
||||
1. Adding a `timer` and `nextrun` field to the current `job` table
|
||||
2. Creating a separate table for periodic tasks
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
The 2. option.
|
||||
|
||||
For internal housekeeping tasks, it may suffice to reuse the existing
|
||||
`job` queue by adding more fields such that a job may be considered
|
||||
periodic. But this conflates with what the `Scheduler` is doing now
|
||||
(executing tasks as soon as possible while being bound to some
|
||||
resource limits) with a completely different subject.
|
||||
|
||||
There will be a new `PeriodicScheduler` that works on a new table in
|
||||
the database that is representing periodic tasks. This table will
|
||||
share fields with the `job` table to be able to create `RJob` records.
|
||||
This new component is only taking care of periodically submitting jobs
|
||||
to the job queue such that the `Scheduler` will eventually pick it up
|
||||
and run it. If the tasks cannot run (for example due to resource
|
||||
limitation), the periodic scheduler can do nothing but wait and try
|
||||
next time.
|
||||
|
||||
```sql
|
||||
CREATE TABLE "periodic_task" (
|
||||
"id" varchar(254) not null primary key,
|
||||
"enabled" boolean not null,
|
||||
"task" varchar(254) not null,
|
||||
"group_" varchar(254) not null,
|
||||
"args" text not null,
|
||||
"subject" varchar(254) not null,
|
||||
"submitter" varchar(254) not null,
|
||||
"priority" int not null,
|
||||
"worker" varchar(254),
|
||||
"marked" timestamp,
|
||||
"timer" varchar(254) not null,
|
||||
"nextrun" timestamp not null,
|
||||
"created" timestamp not null
|
||||
);
|
||||
```
|
||||
|
||||
Preparing for other features, at some point periodic tasks will be
|
||||
created by users. It should be possible to disable/enable them. The
|
||||
next 6 properties are needed to insert jobs into the `job` table. The
|
||||
`worker` field (and `marked`) are used to mark a periodic job as
|
||||
"being worked on by a job executor".
|
||||
|
||||
The `timer` is the schedule, which is a
|
||||
[systemd-like](https://man.cx/systemd.time#heading7) calendar event
|
||||
string. This is parsed by [this
|
||||
library](https://github.com/eikek/calev). The `nextrun` field will
|
||||
store the timestamp of the next time the task would need to be
|
||||
executed. This is needed to query this table for the newest task.
|
||||
|
||||
The `PeriodicScheduler` works roughly like this:
|
||||
|
||||
On startup:
|
||||
- Remove stale worker values. If the process has been killed, there
|
||||
may be marked tasks which must be cleared now.
|
||||
|
||||
Main-Loop:
|
||||
0. Cancel current scheduled notify (see 4. below)
|
||||
1. get next (= earliest & enabled) periodic job
|
||||
2. if none: stop
|
||||
3. if triggered (= `nextrun <= 'now'`):
|
||||
- Mark periodic task. On fail: goto 1.
|
||||
- Submit new job into the jobqueue:
|
||||
- Update `nextrun` field
|
||||
- Check for non-final jobs of that name. This is required to not
|
||||
run the same periodic task multiple times concurrently.
|
||||
- if exist: goto 4.
|
||||
- if not exist: submit job
|
||||
- Unmark periodic task
|
||||
4. if future
|
||||
- schedule notify: notify self to run again next time the task
|
||||
schedule triggers
|
@ -1,45 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Archive Files
|
||||
permalink: dev/adr/0013_archive_files
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Docspell should have support for files that contain the actual files
|
||||
that matter, like zip files and other such things. It should extract
|
||||
its contents automatically.
|
||||
|
||||
Since docspell should never drop or modify user data, the archive file
|
||||
must be present in the database. And it must be possible to download
|
||||
the file unmodified.
|
||||
|
||||
On the other hand, files in there need to be text analysed and
|
||||
converted to pdf files.
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
There is currently a table `attachment_source` which holds references
|
||||
to "original" files. These are the files as uploaded by the user,
|
||||
before converted to pdf. Archive files add a subtlety to this: in case
|
||||
of an archive, an `attachment_source` is the original (non-archive)
|
||||
file inside an archive.
|
||||
|
||||
The archive file itself will be stored in a separate table `attachment_archive`.
|
||||
|
||||
Example: uploading a `files.zip` ZIP file containing `report.jpg`:
|
||||
|
||||
- `attachment_source`: report.jpg
|
||||
- `attachment`: report.pdf
|
||||
- `attachment_archive`: files.zip
|
||||
|
||||
Archive may contain other archives. Then the inner archives will not
|
||||
be saved. The archive file is extracted recursively, until there is no
|
||||
known archive file found.
|
||||
|
||||
## Initial Support
|
||||
|
||||
Initial support is implemented for ZIP and EML (e-mail files) files.
|
@ -1,50 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Fulltext Search Engine
|
||||
permalink: dev/adr/0014_fulltext_search_engine
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
It should be possible to search the contents of all documents.
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
To allow searching the documents contents efficiently, a separate
|
||||
index is necessary. The "defacto standard" for fulltext search on the
|
||||
JVM is something backed by [Lucene](https://lucene.apache.org).
|
||||
Another option is to use a RDBMS that supports fulltext search.
|
||||
|
||||
This adds another component to the mix, which increases the complexity
|
||||
of the setup and the software. Since docspell works great without this
|
||||
feature, it shouldn't have a huge impact on the application, i.e. if
|
||||
the fulltext search component is down or broken, docspell should still
|
||||
work (just the fulltext search is then not working).
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [Apache SOLR](https://lucene.apache.org/solr)
|
||||
* [ElasticSearch](https://www.elastic.co/elasticsearch/)
|
||||
* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html)
|
||||
* All of them or a subset
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
If docspell is running on PostgreSQL, it would be nice to also use it
|
||||
for fulltext search to save the cost of running another component. But
|
||||
I don't want to lock the database to PostgreSQL *only* because of the
|
||||
fulltext search feature.
|
||||
|
||||
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
|
||||
part of Lucene and therefore lives in the Apache ecosystem. I would
|
||||
choose SOLR over ElasticSearch, because I used it before.
|
||||
|
||||
The last option (supporting all) is interesting, since it would enable
|
||||
to use PostgreSQL for fulltext search for those that use PostgreSQL as
|
||||
the database for docspell.
|
||||
|
||||
In a first step, identify what docspell needs from a fulltext search
|
||||
component and create this interface and an implementation for Apache
|
||||
SOLR. This enables all users to use the fulltext search feature. As a
|
||||
later step, an implementation based on PostgreSQL and/or ElasticSearch
|
||||
could be provided, too.
|
@ -1,67 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Convert PDF Files
|
||||
permalink: dev/adr/0015_convert_pdf_files
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
Some PDFs contain only images (when coming from a scanner) and
|
||||
therefore one is not able to click into the pdf and select text for
|
||||
copy&paste. Also it is not searchable in a PDF viewer. These are
|
||||
really shortcomings that can be fixed, especially when there is
|
||||
already OCR build in.
|
||||
|
||||
For images, this works already as tesseract is used to create the PDF
|
||||
files. Tesseract creates the files with an additional text layer
|
||||
containing the OCRed text.
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
|
||||
OCR text layer to scanned PDF files, allowing them to be searched
|
||||
|
||||
|
||||
### ocrmypdf
|
||||
|
||||
This is a very nice python tool, that uses tesseract to do OCR on each
|
||||
page and add the extracted text as a pdf text layer to the page.
|
||||
Additionally it creates PDF/A type pdfs, which are great for
|
||||
archiving. This fixes exactly the things stated above.
|
||||
|
||||
#### Integration
|
||||
|
||||
Docspell already has this built in for images. When converting images
|
||||
to a PDF (which is done early in processing), the process creates a
|
||||
text and a PDF file. Docspell then sets the text in this step and the
|
||||
text extraction step skips doing its work, if there is already text
|
||||
available.
|
||||
|
||||
It would be possible to use the `--sidecar` option with ocrmypdf to
|
||||
create a text file of the extracted text with one run, too (exactly
|
||||
like it works for tesseract). But for "text" pdfs, ocrmypdf writes
|
||||
some info-message into this text file:
|
||||
|
||||
```
|
||||
[OCR skipped on page 1][OCR skipped on page 2]
|
||||
```
|
||||
|
||||
Docspell cannot reliably tell whether this is extracted text or not.
|
||||
It would be required to load the pdf and check its contents. This is a
|
||||
bit of bad luck, because everything would just work already. So it
|
||||
requires a (small) change in the text-extraction step. By default,
|
||||
text extraction happens on the source file. For PDFs, text extraction
|
||||
should now be run on the converted file, to avoid running OCR twice.
|
||||
|
||||
The converted pdf file is either a text-pdf in the first place,
|
||||
where ocrmypdf would only convert it to a PDF/A file; or it may be a
|
||||
converted file containing the OCR-ed text as a pdf layer. If ocrmypdf
|
||||
is disabled, the converted file and the source file are the same for
|
||||
PDFs.
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
|
||||
distributed under the GPL-3 license.
|
Before Width: | Height: | Size: 385 KiB |
Before Width: | Height: | Size: 443 KiB |
Before Width: | Height: | Size: 291 KiB |
Before Width: | Height: | Size: 353 KiB |
Before Width: | Height: | Size: 292 KiB |
Before Width: | Height: | Size: 145 KiB |
Before Width: | Height: | Size: 167 KiB |
Before Width: | Height: | Size: 135 KiB |
Before Width: | Height: | Size: 148 KiB |
Before Width: | Height: | Size: 142 KiB |
Before Width: | Height: | Size: 586 KiB |
Before Width: | Height: | Size: 479 KiB |
Before Width: | Height: | Size: 280 KiB |
Before Width: | Height: | Size: 270 KiB |
Before Width: | Height: | Size: 363 KiB |
Before Width: | Height: | Size: 418 KiB |
Before Width: | Height: | Size: 500 KiB |
Before Width: | Height: | Size: 349 KiB |
Before Width: | Height: | Size: 350 KiB |
Before Width: | Height: | Size: 296 KiB |
Before Width: | Height: | Size: 176 KiB |
Before Width: | Height: | Size: 174 KiB |
Before Width: | Height: | Size: 155 KiB |
Before Width: | Height: | Size: 49 KiB |
@ -1,43 +0,0 @@
|
||||
@startuml
|
||||
scale 1200 width
|
||||
title: Processing Files
|
||||
skinparam monochrome true
|
||||
skinparam backgroundColor white
|
||||
skinparam rectangle {
|
||||
roundCorner<<Input>> 25
|
||||
roundCorner<<Output>> 5
|
||||
}
|
||||
rectangle Input <<Input>> {
|
||||
file "html"
|
||||
file "plaintext"
|
||||
file "image"
|
||||
file "msoffice"
|
||||
file "rtf"
|
||||
file "odf"
|
||||
file "pdf"
|
||||
}
|
||||
|
||||
node toBoth [
|
||||
PDF + TXT
|
||||
]
|
||||
node toPdf [
|
||||
PDF
|
||||
]
|
||||
node toTxt [
|
||||
TXT
|
||||
]
|
||||
|
||||
image --> toBoth:<tesseract>
|
||||
html --> toPdf:<wkhtmltopdf>
|
||||
toPdf --> toTxt:[pdfbox]
|
||||
plaintext --> html:[flexmark]
|
||||
msoffice --> toPdf:<unoconv>
|
||||
msoffice --> toTxt:[poi]
|
||||
rtf --> toTxt:[jdk]
|
||||
rtf --> toPdf:<unoconv>
|
||||
odf --> toTxt:[tika]
|
||||
odf --> toPdf:<unoconv>
|
||||
pdf --> toTxt:<tesseract>
|
||||
pdf --> toTxt:[pdfbox]
|
||||
plaintext -> toTxt:[identity]
|
||||
@enduml
|
@ -1,77 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Short Title
|
||||
---
|
||||
|
||||
# [short title of solved problem and solution]
|
||||
|
||||
* Status: [proposed | rejected | accepted | deprecated | … | superseded by [ADR-0005](0005-example.md)] <!-- optional -->
|
||||
* Deciders: [list everyone involved in the decision] <!-- optional -->
|
||||
* Date: [YYYY-MM-DD when the decision was last updated] <!-- optional -->
|
||||
|
||||
Technical Story: [description | ticket/issue URL] <!-- optional -->
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
[Describe the context and problem statement, e.g., in free form using two to three sentences. You may want to articulate the problem in form of a question.]
|
||||
|
||||
## Decision Drivers <!-- optional -->
|
||||
|
||||
* [driver 1, e.g., a force, facing concern, …]
|
||||
* [driver 2, e.g., a force, facing concern, …]
|
||||
* … <!-- numbers of drivers can vary -->
|
||||
|
||||
## Considered Options
|
||||
|
||||
* [option 1]
|
||||
* [option 2]
|
||||
* [option 3]
|
||||
* … <!-- numbers of options can vary -->
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
Chosen option: "[option 1]", because [justification. e.g., only option, which meets k.o. criterion decision driver | which resolves force force | … | comes out best (see below)].
|
||||
|
||||
### Positive Consequences <!-- optional -->
|
||||
|
||||
* [e.g., improvement of quality attribute satisfaction, follow-up decisions required, …]
|
||||
* …
|
||||
|
||||
### Negative Consequences <!-- optional -->
|
||||
|
||||
* [e.g., compromising quality attribute, follow-up decisions required, …]
|
||||
* …
|
||||
|
||||
## Pros and Cons of the Options <!-- optional -->
|
||||
|
||||
### [option 1]
|
||||
|
||||
[example | description | pointer to more information | …] <!-- optional -->
|
||||
|
||||
* Good, because [argument a]
|
||||
* Good, because [argument b]
|
||||
* Bad, because [argument c]
|
||||
* … <!-- numbers of pros and cons can vary -->
|
||||
|
||||
### [option 2]
|
||||
|
||||
[example | description | pointer to more information | …] <!-- optional -->
|
||||
|
||||
* Good, because [argument a]
|
||||
* Good, because [argument b]
|
||||
* Bad, because [argument c]
|
||||
* … <!-- numbers of pros and cons can vary -->
|
||||
|
||||
### [option 3]
|
||||
|
||||
[example | description | pointer to more information | …] <!-- optional -->
|
||||
|
||||
* Good, because [argument a]
|
||||
* Good, because [argument b]
|
||||
* Bad, because [argument c]
|
||||
* … <!-- numbers of pros and cons can vary -->
|
||||
|
||||
## Links <!-- optional -->
|
||||
|
||||
* [Link type] [Link to ADR] <!-- example: Refined by [ADR-0005](0005-example.md) -->
|
||||
* … <!-- numbers of links can vary -->
|
@ -1,95 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Documentation
|
||||
permalink: doc
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
This is the documentation for Docspell @VERSION@.
|
||||
|
||||
Docspell assists in organizing large amounts of files that are
|
||||
typically scanned paper documents. You can associate tags, set
|
||||
correspondends, what a document is concerned with, a name, a date and
|
||||
some more. If your documents are associated with this meta data, you
|
||||
should be able to quickly find them later using the search feature.
|
||||
But adding this manually to each document is a tedious task. What if
|
||||
most of it could be attached automatically?
|
||||
|
||||
## How it works
|
||||
|
||||
Documents have two main properties: a correspondent (sender or
|
||||
receiver that is not you) and something the document is about. Usually
|
||||
it is about a person or some thing – maybe your car, or contracts
|
||||
concerning some familiy member, etc.
|
||||
|
||||
1. You maintain a kind of address book. It should list all possible
|
||||
correspondents and the concerning people/things. This grows
|
||||
incrementally with each new unknown document.
|
||||
2. When docspell analyzes a document, it tries to find matches within
|
||||
your address book. It can detect the correspondent and a concerning
|
||||
person or thing. It will then associate this data to your
|
||||
documents.
|
||||
3. You can inspect what docspell has done and correct it. If docspell
|
||||
has found multiple suggestions, they will be shown for you to
|
||||
select one. If it is not correctly associated, very often the
|
||||
correct one is just one click away.
|
||||
|
||||
The set of meta data that docspell uses to draw suggestions from, must
|
||||
be maintained manually. But usually, this data doesn't grow as fast as
|
||||
the documents. After a while there is a quite complete address book
|
||||
and only once in a while it has to be revisited.
|
||||
|
||||
Besides extracting the text from documents to analyze, docspell also
|
||||
converts all files into PDF files. This unifies the different formats
|
||||
your documents may be in originally and makes them more accessible
|
||||
from other systems and the future.
|
||||
|
||||
## Terms
|
||||
|
||||
In order to better understand these pages, some terms should be
|
||||
explained first.
|
||||
|
||||
### Item
|
||||
|
||||
An **Item** is roughly your (pdf) document, only that an item may span
|
||||
multiple files, which are called **attachments**. And an item has
|
||||
**meta data** associated:
|
||||
|
||||
- a **correspondent**: the other side of the communication. It can be
|
||||
an organization or a person.
|
||||
- a **concerning person** or **equipment**: a person or thing that
|
||||
this item is about. Maybe it is an insurance contract about your
|
||||
car.
|
||||
- **tag**: an item can be tagged with custom tags. A tag can have a
|
||||
*category*. This is intended for grouping tags, for example a
|
||||
category `doctype` could be used to group tags like `bill`,
|
||||
`contract`, `receipt` etc. Usually an item is not tagged with more
|
||||
than one tag of a category.
|
||||
- a **item date**: this is the date of the document – if this is not
|
||||
set, the created date of the item is used.
|
||||
- a **due date**: an optional date indicating that something has to be
|
||||
done (e.g. paying a bill, submitting it) about this item until this
|
||||
date
|
||||
- a **direction**: one of "incoming" or "outgoing"
|
||||
- a **name**: some item name, defaults to the file name of the
|
||||
attachments
|
||||
- some **notes**: arbitrary descriptive text. You can use markdown
|
||||
here, which is appropriately formatted in the web application.
|
||||
|
||||
### Collective
|
||||
|
||||
The users of the application are part of a **collective**. A
|
||||
**collective** is a group of users that share access to the same
|
||||
items. The account name is therefore comprised of a *collective name*
|
||||
and a *user name*.
|
||||
|
||||
All users of a collective are equal; they have same permissions to
|
||||
access all items. The items don't belong to a user, but to the
|
||||
collective.
|
||||
|
||||
That means, to identify yourself when signing in, you have to give the
|
||||
collective name and your user name. By default it is separated by a
|
||||
slash `/`, for example `smith/john`. If your user name is the same as
|
||||
the collective name, you can omit one; so `smith/smith` can be
|
||||
abbreviated to just `smith`.
|
@ -1,331 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Configuring
|
||||
permalink: doc/configure
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
Docspell's executable can take one argument – a configuration file. If
|
||||
that is not given, the defaults are used. The config file overrides
|
||||
default values, so only values that differ from the defaults are
|
||||
necessary.
|
||||
|
||||
This applies to the restserver and the joex as well.
|
||||
|
||||
## Important Config Options
|
||||
|
||||
The configuration of both components uses separate namespaces. The
|
||||
configuration for the REST server is below `docspell.server`, while
|
||||
the one for joex is below `docspell.joex`.
|
||||
|
||||
### JDBC
|
||||
|
||||
This configures the connection to the database. This has to be
|
||||
specified for the rest server and joex. By default, a H2 database in
|
||||
the current `/tmp` directory is configured.
|
||||
|
||||
The config looks like this (both components):
|
||||
|
||||
```
|
||||
docspell.joex.jdbc {
|
||||
url = ...
|
||||
user = ...
|
||||
password = ...
|
||||
}
|
||||
|
||||
docspell.server.backend.jdbc {
|
||||
url = ...
|
||||
user = ...
|
||||
password = ...
|
||||
}
|
||||
```
|
||||
|
||||
The `url` is the connection to the database. It must start with
|
||||
`jdbc`, followed by name of the database. The rest is specific to the
|
||||
database used: it is either a path to a file for H2 or a host/database
|
||||
url for MariaDB and PostgreSQL.
|
||||
|
||||
When using H2, the user and password can be chosen freely on first
|
||||
start, but must stay the same on subsequent starts. Usually, the user
|
||||
is `sa` and the password is left empty. Additionally, the url must
|
||||
include these options:
|
||||
|
||||
```
|
||||
;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
PostgreSQL:
|
||||
```
|
||||
url = "jdbc:postgresql://localhost:5432/docspelldb"
|
||||
```
|
||||
|
||||
MariaDB:
|
||||
```
|
||||
url = "jdbc:mariadb://localhost:3306/docspelldb"
|
||||
```
|
||||
|
||||
H2
|
||||
```
|
||||
url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
|
||||
```
|
||||
|
||||
|
||||
### Full-Text Search: SOLR
|
||||
|
||||
[Apache SOLR](https://lucene.apache.org/solr) is used to provide the
|
||||
full-text search. Both docspell components must provide the same
|
||||
connection setup. This is defined in the `full-text-search.solr`
|
||||
subsection:
|
||||
|
||||
```
|
||||
...
|
||||
full-text-search {
|
||||
enabled = true
|
||||
...
|
||||
solr = {
|
||||
url = "http://localhost:8983/solr/docspell"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The default configuration at the end of this page contains more
|
||||
information about each setting.
|
||||
|
||||
The `solr.url` is the mandatory setting that you need to change to
|
||||
point to your SOLR instance. Then you need to set the `enabled` flag
|
||||
to `true`.
|
||||
|
||||
When installing docspell manually, just install solr and create a core
|
||||
as described in the [solr
|
||||
documentation](https://lucene.apache.org/solr/guide/8_4/installing-solr.html).
|
||||
That will provide you with the connection url (the last part is the
|
||||
core name).
|
||||
|
||||
While the `full-text-search.solr` options are the same for joex and
|
||||
the restserver, there are some settings that differ. The restserver
|
||||
has this additional setting, that may be of interest:
|
||||
|
||||
```
|
||||
full-text-search {
|
||||
recreate-key = "test123"
|
||||
}
|
||||
```
|
||||
|
||||
This key is required if you want docspell to drop and re-create the
|
||||
entire index. This is possible via a REST call:
|
||||
|
||||
``` bash
|
||||
$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
|
||||
```
|
||||
|
||||
Here the `test123` is the key defined with `recreate-key`. If it is
|
||||
empty (the default), this REST call is disabled. Otherwise, the POST
|
||||
request will submit a system task that is executed by a joex instance
|
||||
eventually.
|
||||
|
||||
Using this endpoint, the index will be re-created. This is sometimes
|
||||
necessary, for example if you upgrade SOLR or delete the core to
|
||||
provide a new one (see
|
||||
[here](https://lucene.apache.org/solr/guide/8_4/reindexing.html) for
|
||||
details). Note that a collective can also re-index their data using a
|
||||
similiar endpoint; but this is only deleting their data and doesn't do
|
||||
a full re-index.
|
||||
|
||||
The solr index doesn't contain any new information, it can be
|
||||
regenerated any time using the above REST call. Thus it doesn't need
|
||||
to be backed up.
|
||||
|
||||
### Bind
|
||||
|
||||
The host and port the http server binds to. This applies to both
|
||||
components. The joex component also exposes a small REST api to
|
||||
inspect its state and notify the scheduler.
|
||||
|
||||
```
|
||||
docspell.server.bind {
|
||||
address = localhost
|
||||
port = 7880
|
||||
}
|
||||
docspell.joex.bind {
|
||||
address = localhost
|
||||
port = 7878
|
||||
}
|
||||
```
|
||||
|
||||
By default, it binds to `localhost` and some predefined port. This
|
||||
must be changed, if components are on different machines.
|
||||
|
||||
### baseurl
|
||||
|
||||
The base url is an important setting that defines the http URL where
|
||||
the corresponding component can be reached. It applies to both
|
||||
components. For a joex component, the url must be resolvable from a
|
||||
REST server component. The REST server also uses this url to create
|
||||
absolute urls and to configure the authenication cookie.
|
||||
|
||||
By default it is build using the information from the `bind` setting.
|
||||
|
||||
|
||||
```
|
||||
docspell.server.baseurl = ...
|
||||
docspell.joex.baseurl = ...
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
docspell.server.baseurl = "https://docspell.example.com"
|
||||
docspell.joex.baseurl = "http://192.168.101.10"
|
||||
```
|
||||
|
||||
|
||||
### app-id
|
||||
|
||||
The `app-id` is the identifier of the corresponding instance. It *must
|
||||
be unique* for all instances. By default the REST server uses `rest1`
|
||||
and joex `joex1`. It is recommended to overwrite this setting to have
|
||||
an explicit and stable identifier.
|
||||
|
||||
```
|
||||
docspell.server.app-id = "rest1"
|
||||
docspell.joex.app-id = "joex1"
|
||||
```
|
||||
|
||||
### registration options
|
||||
|
||||
This defines if and how new users can create accounts. There are 3
|
||||
options:
|
||||
|
||||
- *closed* no new user can sign up
|
||||
- *open* new users can sign up
|
||||
- *invite* new users can sign up but require an invitation key
|
||||
|
||||
This applies only to the REST sevrer component.
|
||||
|
||||
```
|
||||
docspell.server.signup {
|
||||
mode = "open"
|
||||
|
||||
# If mode == 'invite', a password must be provided to generate
|
||||
# invitation keys. It must not be empty.
|
||||
new-invite-password = ""
|
||||
|
||||
# If mode == 'invite', this is the period an invitation token is
|
||||
# considered valid.
|
||||
invite-time = "3 days"
|
||||
}
|
||||
```
|
||||
|
||||
The mode `invite` is intended to open the application only to some
|
||||
users. The admin can create these invitation keys and distribute them
|
||||
to the desired people. For this, the `new-invite-password` must be
|
||||
given. The idea is that only the person who installs docspell knows
|
||||
this. If it is not set, then invitation won't work. New invitation
|
||||
keys can be generated from within the web application or via REST
|
||||
calls (using `curl`, for example).
|
||||
|
||||
```
|
||||
curl -X POST -d '{"password":"blabla"}' "http://localhost:7880/api/v1/open/signup/newinvite"
|
||||
```
|
||||
|
||||
### Authentication
|
||||
|
||||
Authentication works in two ways:
|
||||
|
||||
- with an account-name / password pair
|
||||
- with an authentication token
|
||||
|
||||
The initial authentication must occur with an accountname/password
|
||||
pair. This will generate an authentication token which is valid for a
|
||||
some time. Subsequent calls to secured routes can use this token. The
|
||||
token can be given as a normal http header or via a cookie header.
|
||||
|
||||
These settings apply only to the REST server.
|
||||
|
||||
```
|
||||
docspell.server.auth {
|
||||
server-secret = "hex:caffee" # or "b64:Y2FmZmVlCg=="
|
||||
session-valid = "5 minutes"
|
||||
}
|
||||
```
|
||||
|
||||
The `server-secret` is used to sign the token. If multiple REST
|
||||
servers are deployed, all must share the same server secret. Otherwise
|
||||
tokens from one instance are not valid on another instance. The secret
|
||||
can be given as Base64 encoded string or in hex form. Use the prefix
|
||||
`hex:` and `b64:`, respectively. If no prefix is given, the UTF8 bytes
|
||||
of the string are used.
|
||||
|
||||
The `session-valid` deterimens how long a token is valid. This can be
|
||||
just some minutes, the web application obtains new ones
|
||||
periodically. So a short time is recommended.
|
||||
|
||||
|
||||
## File Format
|
||||
|
||||
The format of the configuration files can be
|
||||
[HOCON](https://github.com/lightbend/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation),
|
||||
JSON or whatever the used [config
|
||||
library](https://github.com/lightbend/config) understands. The default
|
||||
values below are in HOCON format, which is recommended, since it
|
||||
allows comments and has some [advanced
|
||||
features](https://github.com/lightbend/config/blob/master/README.md#features-of-hocon). Please
|
||||
refer to their documentation for more on this.
|
||||
|
||||
Here are the default configurations.
|
||||
|
||||
|
||||
## Default Config
|
||||
|
||||
### Rest Server
|
||||
|
||||
```
|
||||
{% include server.conf %}
|
||||
```
|
||||
|
||||
### Joex
|
||||
|
||||
```
|
||||
{% include joex.conf %}
|
||||
```
|
||||
|
||||
## Logging
|
||||
|
||||
By default, docspell logs to stdout. This works well, when managed by
|
||||
systemd or other inits. Logging is done by
|
||||
[logback](https://logback.qos.ch/). Please refer to its documentation
|
||||
for how to configure logging.
|
||||
|
||||
If you created your logback config file, it can be added as argument
|
||||
to the executable using this syntax:
|
||||
|
||||
```
|
||||
/path/to/docspell -Dlogback.configurationFile=/path/to/your/logging-config-file
|
||||
```
|
||||
|
||||
To get started, the default config looks like this:
|
||||
|
||||
``` xml
|
||||
<configuration>
|
||||
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<withJansi>true</withJansi>
|
||||
|
||||
<encoder>
|
||||
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<logger name="docspell" level="debug" />
|
||||
<root level="INFO">
|
||||
<appender-ref ref="STDOUT" />
|
||||
</root>
|
||||
</configuration>
|
||||
```
|
||||
|
||||
The `<root level="INFO">` means, that only log statements with level
|
||||
"INFO" will be printed. But the `<logger name="docspell"
|
||||
level="debug">` above says, that for loggers with name "docspell"
|
||||
statements with level "DEBUG" will be printed, too.
|
@ -1,78 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Curate Items
|
||||
permalink: doc/curate
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
Curating the items meta data helps finding them later. This page
|
||||
describes how you can quickly go through those items and correct or
|
||||
amend with existing data.
|
||||
|
||||
## Select New items
|
||||
|
||||
After files have been uploaded and the job executor created the
|
||||
corresponding items, they will show up on the main page. All items,
|
||||
the job executor has created are initially marked as *New*. The option
|
||||
*only New* in the left search menu can be used to select only new
|
||||
items:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/docspell-curate-1.jpg">
|
||||
</div>
|
||||
|
||||
|
||||
## Check selected items
|
||||
|
||||
Then you can go through all new items and check their metadata: Click
|
||||
on the first item to open the detail view. This shows the documents
|
||||
and the meta data in the header.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/docspell-curate-2.jpg">
|
||||
</div>
|
||||
|
||||
|
||||
## Modify if necessary
|
||||
|
||||
To change something, click the *Edit* button in the menu above the
|
||||
document view. This will open a form next to your documents. You can
|
||||
compare the data with the documents and change as you like. Since the
|
||||
item status is *New*, you'll see the suggestions docspell found during
|
||||
processing. If there were multiple candidates, you can select another
|
||||
one by clicking its name in the suggestion list.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/docspell-curate-3.jpg">
|
||||
</div>
|
||||
|
||||
|
||||
When you change something in the form, it is immediatly applied. Only
|
||||
when changing text fields, a click on the *Save* symbol next to the
|
||||
field is required.
|
||||
|
||||
|
||||
## Confirm
|
||||
|
||||
If everything looks good, click the *Confirm* button to confirm the
|
||||
current data. The *New* status goes away and also the suggestions are
|
||||
hidden in this state. You can always go back by clicking the
|
||||
*Unconfirm* button.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/docspell-curate-5.jpg">
|
||||
</div>
|
||||
|
||||
|
||||
## Proceed with next item
|
||||
|
||||
To look at the next item in the search results, click the *Next*
|
||||
button in the menu (next to the *Edit* button). Clicking next, will
|
||||
keep the current view, so you can continue checking the data. If you
|
||||
are on the last item, the view switches to the listing view when
|
||||
clicking *Next*.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/docspell-curate-6.jpg">
|
||||
</div>
|
@ -1,238 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: E-Mail Settings
|
||||
permalink: doc/emailsettings
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
Docspell has a good integration for E-Mail. You can send e-mails
|
||||
related to an item and you can import e-mails from your mailbox into
|
||||
docspell.
|
||||
|
||||
This requires to define settings to use for sending and receiving
|
||||
e-mails. E-Mails are commonly send via
|
||||
[SMTP](https://en.wikipedia.org/wiki/Simple_Mail_Transfer_Protocol)
|
||||
and for receiving
|
||||
[IMAP](https://en.wikipedia.org/wiki/Internet_Message_Access_Protocol)
|
||||
is quite common. Docspell has support for SMTP and IMAP. These
|
||||
settings are associated to a user, so that each user can specify its
|
||||
own settings separately from others in the collective.
|
||||
|
||||
*Note: Passwords to your e-mail accounts are stored in plain-text in
|
||||
docspell's database. This is necessary to have docspell connect to
|
||||
your e-mail account to send mails on behalf of you and receive your
|
||||
mails.*
|
||||
|
||||
|
||||
## SMTP Settings
|
||||
|
||||
For sending mail, you need to provide information to connect to a SMTP
|
||||
server. Every e-mail provider has this information somewhere
|
||||
available.
|
||||
|
||||
Configure this in *User Settings -> E-Mail Settings (SMTP)*:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-settings-1.png">
|
||||
</div>
|
||||
|
||||
First, you need to provide some name that is used to recognize this
|
||||
account. This name is also used in URLs to docspell and so it must not
|
||||
contain whitespace or any special characters. A good value is the
|
||||
domain of your provider, for example `gmail.com`, or something like
|
||||
that.
|
||||
|
||||
These information should be available from your e-mail provider. For
|
||||
example, for google-mail it is:
|
||||
|
||||
- SMTP Host: `smtp.gmail.com`
|
||||
- SMTP Port: `587` or `465`
|
||||
- SMTP User: Your Gmail address (for example, example@gmail.com)
|
||||
- SMTP Password: Your Gmail password
|
||||
- SSL: use `SSL` for port `465` and `StartSSL` for port `587`
|
||||
|
||||
Then you need to define the e-mail address that is used for the `From`
|
||||
field. This is in most cases the same address as used for the SMTP
|
||||
User field.
|
||||
|
||||
The `Reply-To` field is optional and can be set to define a different
|
||||
e-mail address that your recipients should use to answer a mail.
|
||||
|
||||
Once this is setup, you can start sending mails within docspell. It is
|
||||
possible to set up these settings for multiple providers, so you can
|
||||
choose from which account you want to send mails.
|
||||
|
||||
|
||||
## IMAP Settings
|
||||
|
||||
For receiving e-mails, you need to provide information to connect to
|
||||
an IMAP server. Your e-mail provider should have this information
|
||||
somewhere available.
|
||||
|
||||
Configure this in *User Settings -> E-Mail Settings (IMAP)*:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-settings-2.png">
|
||||
</div>
|
||||
|
||||
First you need to define a *Name* to recognize this connection inside
|
||||
docspell. This name is also used in URLs to docspell and so it must
|
||||
not contain whitespace or any special characters. A good value is the
|
||||
domain of your provider, for example `gmail.com`, or something like
|
||||
that.
|
||||
|
||||
You can provide imap connections to multiple mailboxes.
|
||||
|
||||
Here is an example for posteo.de:
|
||||
|
||||
- IMAP Server: `posteo.de`
|
||||
- IMAP Port: 143
|
||||
- IMAP User: Your posteo address
|
||||
- IMAP Password: Your posteo password
|
||||
- SSL: use `StartTLS`
|
||||
|
||||
|
||||
## SSL / TLS / StartTLS
|
||||
|
||||
*Please Note: If `SSL` is set to `None`, then mails will be sent
|
||||
unencrypted to your mail provider! If `Ignore certificate check` is
|
||||
enabled, connections to your mail provider will succeed even if the
|
||||
provider is wrongly configured for SSL/TLS. This flag should only be
|
||||
enabled if you know why.*
|
||||
|
||||
|
||||
## GMail
|
||||
|
||||
Authenticating with GMail may be not so simple. GMail implements an
|
||||
authentication scheme called *XOAUTH2* (at least for Imap). It will
|
||||
not work with your normal password. This is to avoid giving an
|
||||
application full access to your gmail account.
|
||||
|
||||
The e-mail integration in docspell relies on the
|
||||
[JavaMail](https://javaee.github.io/javamail) library which has
|
||||
support for XOAUTH2. It also has documentation on what you need to do
|
||||
on your gmail account: <https://javaee.github.io/javamail/OAuth2>.
|
||||
|
||||
First you need to go to the [Google Developers
|
||||
Console](https://console.developers.google.com) and create an "App" to
|
||||
get a Client-Id and a Client-Secret. This "App" will be your instance
|
||||
of docspell. You tell google that this app may send and read your
|
||||
mails and then you get an *access token* that should be used instead
|
||||
of the password.
|
||||
|
||||
Once you setup an App in Google Developers Console, you get the
|
||||
Client-Id and the Client-Secret, which look something like this:
|
||||
|
||||
- Client-Id: 106701....d8c.apps.googleusercontent.com
|
||||
- Client-Secret: 5Z1...Kir_t
|
||||
|
||||
Google has a python tool to help with getting this access token.
|
||||
Download the `oauth2.py` script from
|
||||
[here](https://github.com/google/gmail-oauth2-tools) and first create
|
||||
an *oauth2-token*:
|
||||
|
||||
``` bash
|
||||
./oauth2.py --user=your.name@gmail.com \
|
||||
--client_id=106701....d8c.apps.googleusercontent.com \
|
||||
--client_secret=5Z1...Kir_t \
|
||||
--generate_oauth2_token
|
||||
```
|
||||
|
||||
This will "redirect you" to an URL where you have to authenticate with
|
||||
google. Afterwards it lets you add permissions to the app for
|
||||
accessing your mail account. The result is another code you need to
|
||||
give to the script to proceed:
|
||||
|
||||
```
|
||||
4/zwE....q0QBAb-99yD7lw
|
||||
```
|
||||
|
||||
Then the scripts produces this:
|
||||
|
||||
```
|
||||
Refresh Token: 1//09zH.........Lj6oc2SmFlZww
|
||||
Access Token: ya29.a0........SECDQ
|
||||
Access Token Expiration Seconds: 3599
|
||||
```
|
||||
|
||||
The access token can be used to sign in via IMAP with google. The
|
||||
Refresh Token doesn't expire and can be used to generate new access
|
||||
tokens:
|
||||
|
||||
```
|
||||
./oauth2.py --user=your.name@gmail.com \
|
||||
--client_id=106701....d8c.apps.googleusercontent.com \
|
||||
--client_secret=5Z1...Kir_t \
|
||||
--refresh_token=1//09zH.........Lj6oc2SmFlZww
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Access Token: ya29.a0....._q-lX3ypntk3ln0h9Yk
|
||||
Access Token Expiration Seconds: 3599
|
||||
```
|
||||
|
||||
The problem is that the access token expires. Docspell doesn't support
|
||||
updating the access token. It could be worked around by setting up a
|
||||
cron-job or similiar which uses the `oauth2.py` tool to generate new
|
||||
access tokens and update your imap settings via a [REST](../api) call.
|
||||
|
||||
``` bash
|
||||
#!/usr/bin/env bash
|
||||
set -e
|
||||
|
||||
## Change this to your values:
|
||||
|
||||
DOCSPELL_USER="[docspell-user]"
|
||||
DOCSPELL_PASSWORD="[docspell-password]"
|
||||
DOCSPELL_URL="http://localhost:7880"
|
||||
DOCSPELL_IMAP_NAME="gmail.com"
|
||||
|
||||
GMAIL_USER="your.name@gmail.com"
|
||||
CLIENT_ID="106701....d8c.apps.googleusercontent.com"
|
||||
CLIENT_SECRET="secret=5Z1...Kir_t"
|
||||
REFRESH_TOKEN="1//09zH.........Lj6oc2SmFlZww"
|
||||
# Path to the oauth2.py tool
|
||||
OAUTH_TOOL="./oauth2.py"
|
||||
|
||||
##############################################################################
|
||||
## Script
|
||||
|
||||
|
||||
# Login to docspell and store the auth-token
|
||||
AUTH_DATA=$(curl --silent -XPOST \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data-binary "{\"account\":\"$DOCSPELL_USER\",\"password\":\"$DOCSPELL_PASSWORD\"}" \
|
||||
$DOCSPELL_URL/api/v1/open/auth/login)
|
||||
if [ $(echo $AUTH_DATA | jq .success) == "false" ]; then
|
||||
echo "Auth failed"
|
||||
echo $AUTH_DATA
|
||||
fi
|
||||
TOKEN="$(echo $AUTH_DATA | jq -r .token)"
|
||||
|
||||
|
||||
# Get the imap settings
|
||||
UPDATE_URL="$DOCSPELL_URL/api/v1/sec/email/settings/imap/$DOCSPELL_IMAP_NAME"
|
||||
IMAP_DATA=$(curl -s -H "X-Docspell-Auth: $TOKEN" "$UPDATE_URL")
|
||||
|
||||
echo "Current Settings:"
|
||||
echo $IMAP_DATA | jq
|
||||
|
||||
|
||||
# Get the new access token
|
||||
ACCESS_TOKEN=$($OAUTH_TOOL --user=$GMAIL_USER \
|
||||
--client_id="$CLIENT_ID" \
|
||||
--client_secret="$CLIENT_SECRET" \
|
||||
--refresh_token="$REFRESH_TOKEN" | head -n1 | cut -d':' -f2 | xargs)
|
||||
|
||||
# Update settings
|
||||
echo "Updating IMAP settings"
|
||||
NEW_IMAP=$(echo $IMAP_DATA | jq ".imapPassword |= \"$ACCESS_TOKEN\"")
|
||||
curl -s -XPUT -H "X-Docspell-Auth: $TOKEN" \
|
||||
-H 'Content-Type: application/json' \
|
||||
--data-binary "$NEW_IMAP" "$UPDATE_URL"
|
||||
echo
|
||||
echo "New Settings:"
|
||||
curl -s -H "X-Docspell-Auth: $TOKEN" "$UPDATE_URL" | jq
|
||||
```
|
@ -1,189 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Finding Items
|
||||
permalink: doc/finding
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
Items can be searched by their annotated meta data and their contents
|
||||
using full text search. The landing page shows a list of current
|
||||
items. Items are displayed sorted by their date, newest first.
|
||||
|
||||
Docspell has two modes for searching: a simple search bar and a search
|
||||
menu with many options. Both are active at the same time, but only one
|
||||
is visible. You can switch between them without affecting the results.
|
||||
|
||||
|
||||
## Search Bar
|
||||
|
||||
<img style="float:right;" src="../img/search-bar.png" height="50">
|
||||
|
||||
By default, the search bar is shown. It provides a refined view of the
|
||||
search menu. The dropdown contains different options to do a quick
|
||||
search.
|
||||
|
||||
### *All Names* and *Contents*
|
||||
|
||||
These two options correspond to the same named field in the search
|
||||
menu. If you switch between search menu and search bar (by clicking
|
||||
the icon on the left), you'll see that they are the same fields.
|
||||
Typing in the search bar also fills the corresponding field in the
|
||||
search menu (and vice versa).
|
||||
|
||||
- The *All Names* searches in the item name, item notes, names of
|
||||
correspondent organization and person, and names of concerning person
|
||||
and equipment. It uses a simple substring search.
|
||||
- The option *Contents* searches the contents of all attachments
|
||||
(documents), attachment names, the item name and item notes. It uses
|
||||
full text search. However, it does not search the names of attached
|
||||
meta data.
|
||||
|
||||
When searching with one of these fields active, it simply submits the
|
||||
(hidden) search menu. So if the menu has other fields filled out, they
|
||||
will affect the result, too. Using one of these fields, the bar is
|
||||
just a reduced view of the search menu.
|
||||
|
||||
So you can choose tags or correspondents in the search menu and
|
||||
further restrict the results using full text search. The results will
|
||||
be returned sorted by the item date, newest first.
|
||||
|
||||
If the left button in the search bar shows a little blue bubble, it
|
||||
means that there are more search fields filled out in the search menu
|
||||
that you currently can't see. In this case the results are not only
|
||||
restricted by the search term given in the search-bar, but also by
|
||||
what is specified in the search menu.
|
||||
|
||||
|
||||
### *Contents Only*
|
||||
|
||||
This option has no corresponding part in the search menu. Searching
|
||||
with this option active, there is only a full text search done in the
|
||||
attachments contents, attachment names, item name and item notes.
|
||||
|
||||
The results are not ordered by item date, but by relevance with
|
||||
respect to the search term. This ordering is returned from the full
|
||||
text search engine and is simply transferred unmodified.
|
||||
|
||||
|
||||
## Search Menu
|
||||
|
||||
<img style="float:right;" src="../img/search-menu.png" width="250">
|
||||
|
||||
The search menu can be opened by clicking the left icon in the top
|
||||
bar. It shows some options to constrain the item list:
|
||||
|
||||
### Show new items
|
||||
|
||||
Clicking the checkbox "Only new" shows items that have not been
|
||||
"Confirmed". All items that have been created by docspell and not
|
||||
looked at are marked as "new" automatically.
|
||||
|
||||
### Names
|
||||
|
||||
Searches in names of certain properties. The `All Names` field is the
|
||||
same as the search in the search bar (see above).
|
||||
|
||||
The `Name` field only searches in the name property of an item.
|
||||
|
||||
### Folder
|
||||
|
||||
Set a folder to only show items in that folder. If no folder is set,
|
||||
all accessible items are shown. These are all items that either have
|
||||
no folder set, or a folder where the current user is member.
|
||||
|
||||
### Tags
|
||||
|
||||
Specify a list of tags that the items must have. When adding tags to
|
||||
the "Include" list, an item must have all these tags in order to be
|
||||
included in the results.
|
||||
|
||||
When adding tags to the "Exclude" list, then an item is removed from
|
||||
the results if it has at least one of these tags.
|
||||
|
||||
### Correspondent
|
||||
|
||||
Pick a correspondent to show only these items.
|
||||
|
||||
### Concerned
|
||||
|
||||
Pick a concerned entity to show only these items.
|
||||
|
||||
### Date
|
||||
|
||||
Specify a date range to show only items whose date property is within
|
||||
this range. If you want to see items of a specific day, choose the
|
||||
same day for both fields.
|
||||
|
||||
For items that don't have an explicit date property set, the created
|
||||
date is used.
|
||||
|
||||
### Due Date
|
||||
|
||||
Specify a date range to show only items whose due date property is
|
||||
within this range. Items without a due date are not shown.
|
||||
|
||||
|
||||
### Direction
|
||||
|
||||
Specify whether to show only incoming, only outgoing or all items.
|
||||
|
||||
|
||||
## Customize Substring Search
|
||||
|
||||
The substring search of the *All Names* and *Name* field can be
|
||||
customized in the following way: A wildcard `*` can be used at the
|
||||
start or end of a search term to do a substring match. A `*` means
|
||||
"everything". So a term `*company` matches all names ending in
|
||||
`company` and `*company*` matches all names containing the word
|
||||
`company`. The matching is case insensitive.
|
||||
|
||||
Docspell adds a `*` to the front and end of a term automatically,
|
||||
unless one of the following is true:
|
||||
|
||||
- The term already has a wildcard.
|
||||
- The term is enclosed in quotes `"`.
|
||||
|
||||
|
||||
## Full Text Search
|
||||
|
||||
|
||||
### The Query
|
||||
|
||||
The query string for full text search is very powerful. Docspell
|
||||
currently supports [Apache SOLR](https://lucene.apache.org/solr/) as
|
||||
full text search backend, so you may want to have a look at their
|
||||
[documentation on query
|
||||
syntax](https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing)
|
||||
for an in-depth guide.
|
||||
|
||||
- Wildcards: `?` matches any single character, `*` matches zero or
|
||||
more characters
|
||||
- Fuzzy search: Appending a `~` to a term, results in a fuzzy search
|
||||
(search this term and similarly spelled ones)
|
||||
- Proximity Search: Search for terms that "near" each other, again
|
||||
using `~` appended to a search phrase. Example: `"cheese cake"~5`.
|
||||
- Boosting: apply more weight to a term with `^`. Example: `cheese^4
|
||||
cake` – cheese is 4x more important.
|
||||
|
||||
Docspell will preprocess the search query to prepare a query for SOLR.
|
||||
It will by default search all indexed fields, which are: attachment
|
||||
contents, attachment names, item name and item notes.
|
||||
|
||||
|
||||
### The Results
|
||||
|
||||
When using full text search, each item in the result list is annotated
|
||||
with the highlighted occurrence of the match.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/search-content-results.png">
|
||||
</div>
|
||||
|
||||
|
||||
## Screencast
|
||||
|
||||
<video width="100%" controls>
|
||||
<source src="../static/docspell-search-2020-06-24.webm" type="video/webm">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
@ -1,235 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Installation
|
||||
permalink: doc/install
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
This page contains detailed installation instructions. For a quick
|
||||
start, refer to [this page](../getit).
|
||||
|
||||
Docspell has been developed and tested on a GNU/Linux system. It may
|
||||
run on Windows and MacOS machines, too (ghostscript and tesseract are
|
||||
available on these systems). But I've never tried.
|
||||
|
||||
Docspell consists of two components that are started in separate
|
||||
processes:
|
||||
|
||||
1. *REST Server* This is the main application, providing the REST Api
|
||||
and the web application.
|
||||
2. *Joex* (job executor) This is the component that does the document
|
||||
processing.
|
||||
|
||||
They can run on multiple machines. All REST server and Joex instances
|
||||
should be on the same network. It is not strictly required that they
|
||||
can reach each other, but the components can then notify themselves
|
||||
about new or done work.
|
||||
|
||||
While this is possible, the simple setup is to start both components
|
||||
once on the same machine.
|
||||
|
||||
The [download page](https://github.com/eikek/docspell/releases)
|
||||
provides pre-compiled packages and the [development page](../dev)
|
||||
contains build instructions.
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
The two components have one prerequisite in common: they both require
|
||||
Java to run. While this is the only requirement for the *REST server*,
|
||||
the *Joex* components requires some more external programs.
|
||||
|
||||
### Java
|
||||
|
||||
Very often, Java is already installed. You can check this by opening a
|
||||
terminal and typing `java -version`. Otherwise install Java using your
|
||||
package manager or see [this site](https://adoptopenjdk.net/) for
|
||||
other options.
|
||||
|
||||
It is enough to install the JRE. The JDK is required, if you want to
|
||||
build docspell from source.
|
||||
|
||||
Docspell has been tested with Java version 1.8 (or sometimes referred
|
||||
to as JRE 8 and JDK 8, respectively). The pre-build packages are also
|
||||
build using JDK 8. But a later version of Java should work as well.
|
||||
|
||||
The next tools are only required on machines running the *Joex*
|
||||
component.
|
||||
|
||||
### External Programs for Joex
|
||||
|
||||
- [Ghostscript](http://pages.cs.wisc.edu/~ghost/) (the `gs` command)
|
||||
is used to extract/convert PDF files into images that are then fed
|
||||
to ocr. It is available on most GNU/Linux distributions.
|
||||
- [Unpaper](https://github.com/Flameeyes/unpaper) is a program that
|
||||
pre-processes images to yield better results when doing ocr. If this
|
||||
is not installed, docspell tries without it. However, it is
|
||||
recommended to install, because it [improves text
|
||||
extraction](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
|
||||
(at the expense of a longer runtime).
|
||||
- [Tesseract](https://github.com/tesseract-ocr/tesseract) is the tool
|
||||
doing the OCR (converts images into text). It can also convert
|
||||
images into pdf files. It is a widely used open source OCR engine.
|
||||
Tesseract 3 and 4 should work with docspell; you can adapt the
|
||||
command line in the configuration file, if necessary.
|
||||
- [Unoconv](https://github.com/unoconv/unoconv) is used to convert
|
||||
office documents into PDF files. It uses libreoffice/openoffice.
|
||||
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
|
||||
PDF files.
|
||||
- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
|
||||
used to convert PDF to PDF files. It adds an OCR layer to scanned
|
||||
PDF files to make them searchable. It also creates PDF/A files from
|
||||
the input pdf.
|
||||
|
||||
The performance of `unoconv` can be improved by starting `unoconv -l`
|
||||
in a separate process. This runs a libreoffice/openoffice listener
|
||||
and therefore avoids starting one each time `unoconv` is called.
|
||||
|
||||
### Example Debian
|
||||
|
||||
On Debian this should install all joex requirements:
|
||||
|
||||
``` bash
|
||||
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
|
||||
```
|
||||
|
||||
|
||||
## Database
|
||||
|
||||
Both components must have access to a SQL database. Docspell has
|
||||
support for these databases:
|
||||
|
||||
- PostgreSQL
|
||||
- MariaDB
|
||||
- H2
|
||||
|
||||
The H2 database is an interesting option for personal and mid-size
|
||||
setups, as it requires no additional work. It is integrated into
|
||||
docspell and works really well. It is also configured as the default
|
||||
database.
|
||||
|
||||
For large installations, PostgreSQL or MariaDB is recommended. Create
|
||||
a database and a user with enough privileges (read, write, create
|
||||
table) to that database.
|
||||
|
||||
When using H2, make sure that all components access the same database
|
||||
– the jdbc url must point to the same file. Then, it is important to
|
||||
add the options
|
||||
`;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE` at the end
|
||||
of the url. See the [config page](configure#jdbc) for an example.
|
||||
|
||||
|
||||
## Installing from ZIP files
|
||||
|
||||
After extracting the zip files, you'll find a start script in the
|
||||
`bin/` folder.
|
||||
|
||||
|
||||
## Installing from DEB packages
|
||||
|
||||
The DEB packages can be installed on Debian, or Debian based Distros:
|
||||
|
||||
``` bash
|
||||
$ sudo dpkg -i docspell*.deb
|
||||
```
|
||||
|
||||
Then the start scripts are in your `$PATH`. Run `docspell-restserver`
|
||||
or `docspell-joex` from a terminal window.
|
||||
|
||||
The packages come with a systemd unit file that will be installed to
|
||||
autostart the services.
|
||||
|
||||
|
||||
## Running
|
||||
|
||||
Run the start script (in the corresponding `bin/` directory when using
|
||||
the zip files):
|
||||
|
||||
```
|
||||
$ ./docspell-restserver*/bin/docspell-restserver
|
||||
$ ./docspell-joex*/bin/docspell-joex
|
||||
```
|
||||
|
||||
This will startup both components using the default configuration. The
|
||||
configuration should be adapted to your needs. For example, the
|
||||
database connection is configured to use a H2 database in the `/tmp`
|
||||
directory. Please refer to the [configuration page](configure) for how
|
||||
to create a custom config file. Once you have your config file, simply
|
||||
pass it as argument to the command:
|
||||
|
||||
```
|
||||
$ ./docspell-restserver*/bin/docspell-restserver /path/to/server-config.conf
|
||||
$ ./docspell-joex*/bin/docspell-joex /path/to/joex-config.conf
|
||||
```
|
||||
|
||||
After starting the rest server, you can reach the web application at
|
||||
path `/app`, so using default values it would be
|
||||
`http://localhost:7880/app`.
|
||||
|
||||
You should be able to create a new account and sign in. Check the
|
||||
[configuration page](configure) to further customize docspell.
|
||||
|
||||
|
||||
### Options
|
||||
|
||||
The start scripts support some options to configure the JVM. One often
|
||||
used setting is the maximum heap size of the JVM. By default, java
|
||||
determines it based on properties of the current machine. You can
|
||||
specify it by giving java startup options to the command:
|
||||
|
||||
```
|
||||
$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -- /path/to/server-config.conf
|
||||
```
|
||||
|
||||
This would limit the maximum heap to 1GB. The double dash separates
|
||||
internal options and the arguments to the program. Another frequently
|
||||
used option is to change the default temp directory. Usually it is
|
||||
`/tmp`, but it may be desired to have a dedicated temp directory,
|
||||
which can be configured:
|
||||
|
||||
```
|
||||
$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -Djava.io.tmpdir=/path/to/othertemp -- /path/to/server-config.conf
|
||||
```
|
||||
|
||||
The command:
|
||||
|
||||
```
|
||||
$ ./docspell-restserver*/bin/docspell-restserver -h
|
||||
```
|
||||
|
||||
gives an overview of supported options.
|
||||
|
||||
|
||||
## Raspberry Pi, and similar
|
||||
|
||||
Both component can run next to each other on a raspberry pi or
|
||||
similar devices.
|
||||
|
||||
|
||||
### REST Server
|
||||
|
||||
The REST server component runs very well on the Raspberry Pi and
|
||||
similar devices. It doesn't require many resources, because the heavy
|
||||
work is done by the joex components.
|
||||
|
||||
|
||||
### Joex
|
||||
|
||||
Running the joex component on the Raspberry Pi is possible, but will
|
||||
result in long processing times for OCR. Files that don't require OCR
|
||||
are no problem.
|
||||
|
||||
Tested on a RPi model 3 (4 cores, 1G RAM) processing a PDF (scanned
|
||||
with 300dpi) with two pages took 9:52. You can speed it up
|
||||
considerably by uninstalling the `unpaper` command, because this step
|
||||
takes quite long. This, of course, reduces the quality of OCR. But
|
||||
without `unpaper` the same sample pdf was then processed in 1:24, a
|
||||
speedup of 8 minutes.
|
||||
|
||||
You should limit the joex pool size to 1 and, depending on your model
|
||||
and the amount of RAM, set a heap size of at least 500M
|
||||
(`-J-Xmx500M`).
|
||||
|
||||
For personal setups, when you don't need the processing results asap,
|
||||
this can work well enough.
|
@ -1,175 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Joex
|
||||
permalink: doc/joex
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
Joex is short for *Job Executor* and it is the component managing long
|
||||
running tasks in docspell. One of these long running tasks is the file
|
||||
processing task.
|
||||
|
||||
One joex component handles the processing of all files of all
|
||||
collectives/users. It requires much more resources than the rest
|
||||
server component. Therefore the number of jobs that can run in
|
||||
parallel is limited with respect to the hardware it is running on.
|
||||
|
||||
For larger installations, it is probably better to run several joex
|
||||
components on different machines. That works out of the box, as long
|
||||
as all components point to the same database and use different
|
||||
`app-id`s (see [configuring docspell](./configure#app-id)).
|
||||
|
||||
When files are submitted to docspell, they are stored in the database
|
||||
and all known joex components are notified about new work. Then they
|
||||
compete on getting the next job from the queue. After a job finishes
|
||||
and no job is waiting in the queue, joex will sleep until notified
|
||||
again. It will also periodically notify itself as a fallback.
|
||||
|
||||
## Task vs Job
|
||||
|
||||
Just for the sake of this document, a task denotes the code that has
|
||||
to be executed or the thing that has to be done. It emerges in a job,
|
||||
once a task is submitted into the queue from where it will be picked
|
||||
up and executed eventually. A job maintains a state and other things,
|
||||
while a task is just code.
|
||||
|
||||
|
||||
## Scheduler and Queue
|
||||
|
||||
The scheduler is the part that runs and monitors the long running
|
||||
jobs. It works together with the job queue, which defines what job to
|
||||
take next.
|
||||
|
||||
To create a somewhat fair distribution among multiple collectives, a
|
||||
collective is first chosen in a simple round-robin way. Then a job
|
||||
from this collective is chosen by priority.
|
||||
|
||||
There are only two priorities: low and high. A simple *counting
|
||||
scheme* determines if a low prio or high prio job is selected
|
||||
next. The default is `4, 1`, meaning to first select 4 high priority
|
||||
jobs and then 1 low priority job, then starting over. If no such job
|
||||
exists, it falls back to the other priority.
|
||||
|
||||
The priority can be set on a *Source* (see [uploads](uploading)).
|
||||
Uploading through the web application will always use priority *high*.
|
||||
The idea is that while logged in, jobs are more important than those
|
||||
submitted when not logged in.
|
||||
|
||||
|
||||
## Scheduler Config
|
||||
|
||||
The relevant part of the config file regarding the scheduler is shown
|
||||
below with some explanations.
|
||||
|
||||
```
|
||||
docspell.joex {
|
||||
# other settings left out for brevity
|
||||
|
||||
scheduler {
|
||||
|
||||
# Number of processing allowed in parallel.
|
||||
pool-size = 2
|
||||
|
||||
# A counting scheme determines the ratio of how high- and low-prio
|
||||
# jobs are run. For example: 4,1 means run 4 high prio jobs, then
|
||||
# 1 low prio and then start over.
|
||||
counting-scheme = "4,1"
|
||||
|
||||
# How often a failed job should be retried until it enters failed
|
||||
# state. If a job fails, it becomes "stuck" and will be retried
|
||||
# after a delay.
|
||||
retries = 5
|
||||
|
||||
# The delay until the next try is performed for a failed job. This
|
||||
# delay is increased exponentially with the number of retries.
|
||||
retry-delay = "1 minute"
|
||||
|
||||
# The queue size of log statements from a job.
|
||||
log-buffer-size = 500
|
||||
|
||||
# If no job is left in the queue, the scheduler will wait until a
|
||||
# notify is requested (using the REST interface). To also retry
|
||||
# stuck jobs, it will notify itself periodically.
|
||||
wakeup-period = "30 minutes"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `pool-size` setting determines how many jobs run in parallel. You
|
||||
need to play with this setting on your machine to find an optimal
|
||||
value.
|
||||
|
||||
The `counting-scheme` determines for all collectives how to select
|
||||
between high and low priority jobs; as explained above. It is
|
||||
currently not possible to define that per collective.
|
||||
|
||||
If a job fails, it will be set to *stuck* state and retried by the
|
||||
scheduler. The `retries` setting defines how many times a job is
|
||||
retried until it enters the final *failed* state. The scheduler waits
|
||||
some time until running the next try. This delay is given by
|
||||
`retry-delay`. This is the initial delay, the time until the first
|
||||
re-try (the second attempt). This time increases exponentially with
|
||||
the number of retries.
|
||||
|
||||
The jobs will log about what they do, which is picked up and stored
|
||||
into the database asynchronously. The log events are buffered in a
|
||||
queue and another thread will consume this queue and store them in the
|
||||
database. The `log-buffer-size` determines the size of the queue.
|
||||
|
||||
At last, there is a `wakeup-period` that determines at what interval
|
||||
the joex component notifies itself to look for new jobs. If jobs get
|
||||
stuck, and joex is not notified externally it could miss to
|
||||
retry. Also, since networks are not reliable, a notification may not
|
||||
reach a joex component. This periodic wakeup is just to ensure that
|
||||
jobs are eventually run.
|
||||
|
||||
|
||||
## Periodic Tasks
|
||||
|
||||
The job executor can execute tasks periodically. These tasks are
|
||||
stored in the database such that they can be submitted into the job
|
||||
queue. Multiple job executors can run at once, only one is ever doing
|
||||
something with a task. So a periodic task is never submitted twice. It
|
||||
is also not submitted, if a previous task has not finished yet.
|
||||
|
||||
|
||||
## Starting on demand
|
||||
|
||||
The job executor and rest server can be started multiple times. This
|
||||
is especially useful for the job executor. For example, when
|
||||
submitting a lot of files in a short time, you can simply startup more
|
||||
job executors on other computers on your network. Maybe use your
|
||||
laptop to help with processing for a while.
|
||||
|
||||
You have to make sure, that all connect to the same database, and that
|
||||
all have unique `app-id`s.
|
||||
|
||||
Once the files have been processed you can stop the additional
|
||||
executors.
|
||||
|
||||
|
||||
## Shutting down
|
||||
|
||||
If a job executor is sleeping and not executing any jobs, you can just
|
||||
quit using SIGTERM or `Ctrl-C` when running in a terminal. But if
|
||||
there are jobs currently executing, it is advisable to initiate a
|
||||
graceful shutdown. The job executor will then stop taking new jobs
|
||||
from the queue but it will wait until all running jobs have completed
|
||||
before shutting down.
|
||||
|
||||
This can be done by sending a http POST request to the api of this job
|
||||
executor:
|
||||
|
||||
```
|
||||
curl -XPOST "http://localhost:7878/api/v1/shutdownAndExit"
|
||||
```
|
||||
|
||||
If joex receives this request it will immediately stop taking new jobs
|
||||
and it will quit when all running jobs are done.
|
||||
|
||||
If a job executor gets terminated while there are running jobs, the
|
||||
jobs are still in the current state marked to be executed by this job
|
||||
executor. In order to fix this, start the job executor again. It will
|
||||
search all jobs that are marked with its id and put them back into
|
||||
waiting state. Then send a graceful shutdown request as shown above.
|
@ -1,80 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Send items via E-Mail
|
||||
permalink: doc/mailitem
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
You can send e-mails from within docspell attaching the files of an
|
||||
item. This is useful to collaborate or share certain documents with
|
||||
people outside docspell.
|
||||
|
||||
All sent mails are stored attached to the item.
|
||||
|
||||
|
||||
## E-Mail Settings (SMTP)
|
||||
|
||||
To send mails, there are SMTP settings required. Please see the page
|
||||
about [e-mail settings](emailsettings#smtp-settings).
|
||||
|
||||
|
||||
## Sending Mails
|
||||
|
||||
Currently, it is possible to send mails related to only one item. You
|
||||
can define the mail body and docspell will add the attachments of an
|
||||
item, or you may choose to send the mail without any attachments.
|
||||
|
||||
In the item detail view, click on the envelope icon to open the mail
|
||||
form:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-item-1.jpg">
|
||||
</div>
|
||||
|
||||
Then write the mail. Multiple recipients may be specified. The input
|
||||
field shows completion proposals from all contacts in your address
|
||||
book (from organizations and persons). Choose an address by pressing
|
||||
*Enter* or by clicking a proposal from the list. The proposal list can
|
||||
be iterated by the *Up* and *Down* arrows. You can type in any
|
||||
address, of course, it doesn't need to match a proposal.
|
||||
|
||||
If you have multiple mail settings defined, you can choose in the top
|
||||
dropdown which account to use for sending.
|
||||
|
||||
The last checkbox allows to choose whether docspell should add all
|
||||
attachments of the item to the mail. If it is unchecked, no
|
||||
attachments will be added. It is currently not possible to pick
|
||||
specific attachments, it's all or nothing.
|
||||
|
||||
Clicking *Cancel* will delete the inputs and close the mail form, but
|
||||
clicking the envelope icon again, will only close the form without
|
||||
clearing its contents.
|
||||
|
||||
The *Send* button is active once all input fields have been filled.
|
||||
Once you click *Send*, the docspell server will send the mail using
|
||||
your connection settings. If that succeeds the mail is saved to the
|
||||
database and you'll see a message in the form.
|
||||
|
||||
## Accessing Sent Mails
|
||||
|
||||
If there is an e-mail for an item, a tab shows up at the right side,
|
||||
next to the attachments.
|
||||
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-item-2.jpg">
|
||||
</div>
|
||||
|
||||
This tab shows a list of all mails that have been sent related to this
|
||||
item.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-item-3.jpg">
|
||||
</div>
|
||||
|
||||
Clicking on a mail opens it in detail.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/mail-item-4.jpg">
|
||||
</div>
|
@ -1,117 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Meta Data
|
||||
permalink: doc/metadata
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Meta Data
|
||||
|
||||
Docspell processes each uploaded file. Processing involves extracting
|
||||
archives, extracting text, analyzing the extracted text and converting
|
||||
the file into a pdf. Text is analyzed to find metadata that can be set
|
||||
automatically. Docspell compares the extracted text against a set of
|
||||
known meta data. The *Meta Data* page allows to manage this meta data:
|
||||
|
||||
- Tags
|
||||
- Organizations
|
||||
- Persons
|
||||
- Equipments
|
||||
- Folders
|
||||
|
||||
|
||||
### Tags
|
||||
|
||||
Items can be tagged with multiple custom tags (aka labels). This
|
||||
allows to describe many different workflows people may have with their
|
||||
documents.
|
||||
|
||||
A tag can have a *category*. This is meant to group tags together. For
|
||||
example, you may want to have a tag category *doctype* that is
|
||||
comprised of tags like *bill*, *contract*, *receipt* and so on. Or for
|
||||
workflows, a tag category *state* may exist that includes tags like
|
||||
*Todo* or *Waiting*. Or you can tag items with user names to provide
|
||||
"assignment" semantics. Docspell doesn't propose any workflow, but it
|
||||
can help to implement some.
|
||||
|
||||
The tags are *not* taken into account when processing. Docspell will
|
||||
not automatically associate tags to your items. The tags are only
|
||||
meant to be used manually for now.
|
||||
|
||||
|
||||
### Organization and Person
|
||||
|
||||
The organization entity represents a non-personal (organization or
|
||||
company) correspondent of an item. Docspell will choose one or more
|
||||
organizations when processing documents and associate the "best" match
|
||||
with your item.
|
||||
|
||||
The person entity can appear in two roles: It may be a correspondent
|
||||
or the person an item is about. So a person is either a correspondent
|
||||
or a concerning person. Docspell can not know which person is which,
|
||||
therefore you need to tell this by checking the box "Use for
|
||||
concerning person suggestion only". If this is checked, docspell will
|
||||
use this person only to suggest a concerning person. Otherwise the
|
||||
person is used only for correspondent suggestions.
|
||||
|
||||
Document processing uses the following properties:
|
||||
|
||||
- name
|
||||
- websites
|
||||
- e-mails
|
||||
|
||||
The website and e-mails can be added as contact information. If these
|
||||
three are present, you should get good matches from docspell. All
|
||||
other fields of an organization and person are not used during
|
||||
document processing. They might be useful when using this as a real
|
||||
address book.
|
||||
|
||||
|
||||
### Equipment
|
||||
|
||||
The equipment entity is almost like a tag. In fact, it could be
|
||||
replaced by a tag with a specific known category. The difference is
|
||||
that docspell will try to find a match and associate it with your
|
||||
item. The equipment represents non-personal things that an item is
|
||||
about. Examples are: bills or insurances for *cars*, contracts for
|
||||
*houses* or *flats*.
|
||||
|
||||
Equipments don't have contact information, so the only property that
|
||||
is used to find matches during document processing is its name.
|
||||
|
||||
|
||||
### Folders
|
||||
|
||||
Folders provide a way to divide all documents into disjoint subsets.
|
||||
Unlike with tags, an item can have at most one folder or none. A
|
||||
folder has an owner – the user who created the folder. Additionally,
|
||||
it can have members: users of the collective that the owner can assign
|
||||
to a folder.
|
||||
|
||||
When searching for items, the results are restricted to items that
|
||||
have either no folder assigned or a folder where the current user is
|
||||
owner or member. It can be used to control visibility when searching.
|
||||
However: there are no hard access checks. For example, if the item id
|
||||
is known, any user of the collective can see it and modify its meta
|
||||
data.
|
||||
|
||||
One use case is, that you can hide items from other users, like bills
|
||||
for birthday presents. In this case it is very unlikely that someone
|
||||
can guess the item-id.
|
||||
|
||||
While folders are *not* taken into account when processing documents,
|
||||
they can be specified with the upload request or a [source
|
||||
url](uploading#anonymous-upload) to have them automatically set when
|
||||
they arrive.
|
||||
|
||||
|
||||
## Document Language
|
||||
|
||||
An important setting is the language of your documents. This helps OCR
|
||||
and text analysis. You can select between English and German
|
||||
currently.
|
||||
|
||||
Go to the *Collective Settings* page and click *Document
|
||||
Language*. This will set the language for all your documents. It is
|
||||
not (yet) possible to specify it when uploading.
|
@ -1,237 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Nix/NixOS
|
||||
permalink: doc/nix
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
## Install via Nix
|
||||
|
||||
Docspell can be installed via the [nix](https://nixos.org/nix) package
|
||||
manager, which is available for Linux and OSX. Docspell is currently not
|
||||
part of the [nixpkgs collection](https://nixos.org/nixpkgs/), but you
|
||||
can use the derivation from this repository. This is sometimes
|
||||
referred to as [import from
|
||||
derivation](https://nixos.wiki/wiki/Import_From_Derivation).
|
||||
|
||||
For example, the `builtins.fetchTarball` function can be used to
|
||||
retrieve the files; then import the `release.nix` file:
|
||||
|
||||
``` nix
|
||||
let
|
||||
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
|
||||
in
|
||||
import "${docspellsrc}/nix/release.nix";
|
||||
```
|
||||
|
||||
This creates a set containing a function for creating a derivation for
|
||||
docspell. This then needs to be called like other custom packages. For
|
||||
example, in your `~/.nixpkgs/config.nix` you could write this:
|
||||
|
||||
``` nix
|
||||
let
|
||||
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
|
||||
docspell = import "${docspellsrc}/nix/release.nix";
|
||||
in
|
||||
{ packageOverrides = pkgs:
|
||||
let
|
||||
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
|
||||
custom = {
|
||||
docspell = callPackage docspell.currentPkg {};
|
||||
};
|
||||
in custom;
|
||||
}
|
||||
```
|
||||
|
||||
The `docspell` custom package is again a set that contains derivations
|
||||
for all 3 installable docspell programs: the restserver, joex and the
|
||||
tools.
|
||||
|
||||
Then you can install docspell via `nix-shell` or `nix-env`, for example:
|
||||
|
||||
``` bash
|
||||
$ nix-env -iA nixpkgs.docspell.server nixpkgs.docspell.joex nixpkgs.docspell.tools
|
||||
```
|
||||
|
||||
You may need to replace `nixpkgs` with `nixos` when you're on NixOS.
|
||||
|
||||
The expression `docspell.currentPkg` refers to the most current release
|
||||
of Docspell. So even if you use the tarball of the current master
|
||||
branch, the `release.nix` file only contains derivations for releases.
|
||||
The expression `docspell.currentPkg` is a shortcut for selecting the
|
||||
most current release. For example it translates to `docspell.pkg
|
||||
docspell.cfg.v@PVERSION@` – if the current version is `@VERSION@`.
|
||||
|
||||
|
||||
## Docspell as a service on NixOS
|
||||
|
||||
If you are running [NixOS](https://nixos.org), there is a module
|
||||
definition for installing Docspell as a service using systemd.
|
||||
|
||||
There are the following modules provided:
|
||||
|
||||
- restserver
|
||||
- joex
|
||||
- consumedir
|
||||
|
||||
The `consumedir` module defines a systemd unit that starts the
|
||||
`consumedir.sh` script to watch one or more directories for new files.
|
||||
|
||||
You need to import the `release.nix` file as described above in your
|
||||
`configuration.nix` and then append the docspell module to your list of
|
||||
modules. Here is an example:
|
||||
|
||||
```nix
|
||||
{ config, pkgs, ... }:
|
||||
let
|
||||
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
|
||||
docspell = import "${docspellsrc}/nix/release.nix";
|
||||
in
|
||||
{
|
||||
imports = [ mymodule1 mymodule2 ] ++ docspell.modules;
|
||||
|
||||
nixpkgs = {
|
||||
config = {
|
||||
packageOverrides = pkgs:
|
||||
let
|
||||
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
|
||||
custom = {
|
||||
docspell = callPackage docspell.currentPkg {};
|
||||
};
|
||||
in custom;
|
||||
};
|
||||
};
|
||||
|
||||
services.docspell-restserver = {
|
||||
enable = true;
|
||||
base-url = "http://docspelltest:7880";
|
||||
# ... more settings here
|
||||
};
|
||||
services.docspell-joex = {
|
||||
enable = true;
|
||||
base-url = "http://docspelltest:7878";
|
||||
# ... more settings here
|
||||
};
|
||||
services.docspell-consumedir = {
|
||||
enable = true;
|
||||
watchDirs = ["/tmp/test"];
|
||||
urls = ["http://localhost:7880/api/v1/open/upload/item/the-source-id"];
|
||||
};
|
||||
|
||||
...
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
Please see the `nix/module-server.nix` and `nix/module-joex.nix` files
|
||||
for the set of options. The nixos options are modelled after the
|
||||
default configuration file.
|
||||
|
||||
The modules files are only applicable to the newest version of
|
||||
Docspell. If you really need an older version, checkout the
|
||||
appropriate commit.
|
||||
|
||||
## NixOs Example
|
||||
|
||||
This is a example system configuration that installs docspell with a
|
||||
postgres database. This snippet can be used to create a vm (using
|
||||
`nixos-rebuild build-vm` as shown above) or a container, for example.
|
||||
|
||||
``` nix
|
||||
{ config, pkgs, ... }:
|
||||
let
|
||||
docspellsrc = builtins.fetchTarball "https://github.com/eikek/docspell/archive/master.tar.gz";
|
||||
docspell = import "${docspellsrc}/nix/release.nix";
|
||||
in
|
||||
{
|
||||
imports = docspell.modules;
|
||||
|
||||
nixpkgs = {
|
||||
config = {
|
||||
packageOverrides = pkgs:
|
||||
let
|
||||
callPackage = pkgs.lib.callPackageWith(custom // pkgs);
|
||||
custom = {
|
||||
docspell = callPackage docspell.currentPkg {};
|
||||
};
|
||||
in custom;
|
||||
};
|
||||
};
|
||||
|
||||
##### just for the example…
|
||||
users.users.root = {
|
||||
password = "root";
|
||||
};
|
||||
#####
|
||||
|
||||
# install docspell-joex and enable the systemd service
|
||||
services.docspell-joex = {
|
||||
enable = true;
|
||||
base-url = "http://localhost:7878";
|
||||
bind = {
|
||||
address = "0.0.0.0";
|
||||
port = 7878;
|
||||
};
|
||||
scheduler = {
|
||||
pool-size = 1;
|
||||
};
|
||||
jdbc = {
|
||||
url = "jdbc:postgresql://localhost:5432/docspell";
|
||||
user = "docspell";
|
||||
password = "docspell";
|
||||
};
|
||||
};
|
||||
|
||||
# install docspell-restserver and enable the systemd service
|
||||
services.docspell-restserver = {
|
||||
enable = true;
|
||||
base-url = "http://localhost:7880";
|
||||
bind = {
|
||||
address = "0.0.0.0";
|
||||
port = 7880;
|
||||
};
|
||||
auth = {
|
||||
server-secret = "b64:EirgaudMyNvWg4TvxVGxTu-fgtrto4ETz--Hk9Pv2o4=";
|
||||
};
|
||||
backend = {
|
||||
signup = {
|
||||
mode = "invite";
|
||||
new-invite-password = "dsinvite2";
|
||||
invite-time = "30 days";
|
||||
};
|
||||
jdbc = {
|
||||
url = "jdbc:postgresql://localhost:5432/docspell";
|
||||
user = "docspell";
|
||||
password = "docspell";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
# install postgresql and initially create user/database
|
||||
services.postgresql =
|
||||
let
|
||||
pginit = pkgs.writeText "pginit.sql" ''
|
||||
CREATE USER docspell WITH PASSWORD 'docspell' LOGIN CREATEDB;
|
||||
GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO docspell;
|
||||
GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO docspell;
|
||||
CREATE DATABASE DOCSPELL OWNER 'docspell';
|
||||
'';
|
||||
in {
|
||||
enable = true;
|
||||
package = pkgs.postgresql_11;
|
||||
enableTCPIP = true;
|
||||
initialScript = pginit;
|
||||
port = 5432;
|
||||
authentication = ''
|
||||
host all all 0.0.0.0/0 md5
|
||||
'';
|
||||
};
|
||||
|
||||
|
||||
networking = {
|
||||
hostName = "docspellexample";
|
||||
firewall.allowedTCPPorts = [7880];
|
||||
};
|
||||
}
|
||||
```
|
@ -1,76 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Notify about due items
|
||||
permalink: doc/notifydueitems
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
A user that provides valid email (smtp) settings can be notified by
|
||||
docspell about due items. You will then receive an e-mail containing a
|
||||
list of items, sorted by their due date.
|
||||
|
||||
You first need to define smtp settings; please see [this
|
||||
page](mailitem#e-mail-settings).
|
||||
|
||||
Notifying works simply by searching for due items periodically. It
|
||||
will be submitted to the job queue and is picked up by an available
|
||||
[job executor](joex) eventually. This can be setup in the user
|
||||
settings page.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/notify-due-items.jpg">
|
||||
</div>
|
||||
|
||||
At first, the task can be disabled/enabled any time.
|
||||
|
||||
Then two settings are required for sending an e-mail. You need to
|
||||
specify the connection to use and the recipients.
|
||||
|
||||
It follows some settings to customize the query for searching items.
|
||||
You can choose to only include items that have one or more tags (these
|
||||
are `and`-ed, so all tags must exist on the item). You can also
|
||||
provide tags that must *not* appear on an item (these tags are
|
||||
`or`-ed, so only one such tag is enough to exclude an item). A common
|
||||
use-case would be to manually tag an item with *Done* once there is
|
||||
nothing more to do. Then these items can be excluded from the search.
|
||||
The somewhat inverse use-case is to always tag items with a *Todo* tag
|
||||
and remove it once completed.
|
||||
|
||||
The *Remind Days* field specifies the number of days the due date may be
|
||||
in the future. Each time the task executes, it searches for items with
|
||||
a due date lower than `today + remindDays`.
|
||||
|
||||
If you don't restrict the search using tags, then all items with a due
|
||||
date lower than this value are selected. Since items are (usually) not
|
||||
deleted, this only makes sense, if you remove the due date once you
|
||||
are done with an item.
|
||||
|
||||
The last option is to check *cap overdue items*, which uses the value
|
||||
in *Remind Days* to further restrict the due date of an item: only
|
||||
those with a due date *greater than* `today - remindDays` are
|
||||
selected. In other words, only items with an overdue time of *at most*
|
||||
*Remind Days* are included.
|
||||
|
||||
The *Schedule* field specifies the periodicity. The syntax is similar
|
||||
to a date-time string, like `2019-09-15 12:32`, where each part is a
|
||||
pattern to also match multiple values. The UI tries to help a little by
|
||||
displaying the next two date-times this task would execute. A more in
|
||||
depth help is available
|
||||
[here](https://github.com/eikek/calev#what-are-calendar-events). For
|
||||
example, to execute the task every monday at noon, you would write:
|
||||
`Mon *-*-* 12:00`. A date-time part can match all values (`*`), a list
|
||||
of values (e.g. `1,5,12,19`) or a range (e.g. `1..9`). Long lists may
|
||||
be written in a shorter way using a repetition value. It is written
|
||||
like this: `1/7` which is the same as a list with `1` and all
|
||||
multiples of `7` added to it. In other words, it matches `1`, `1+7`,
|
||||
`1+7+7`, `1+7+7+7` and so on.
|
||||
|
||||
You can click on *Start Once* to run this task right now, without
|
||||
saving the form to the database ("right now" means it is picked up by
|
||||
a free job executor).
|
||||
|
||||
If you click *Submit* these settings are saved and the task runs
|
||||
periodically.
|
||||
|
||||
You can see the task executing at the [processing page](processing).
|
@ -1,42 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Processing Queue
|
||||
permalink: doc/processing
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
|
||||
The page *Processing Queue* shows the current state of document
|
||||
processing for your uploads.
|
||||
|
||||
At the top of the page a list of running jobs is shown. Below that,
|
||||
the left column shows jobs that wait to be picked up by the job
|
||||
executor. On the right are finished jobs. The number of finished jobs
|
||||
is cut to some maximum and is also restricted by a date range. The
|
||||
page refreshes itself automatically to show the progress.
|
||||
|
||||
Example screenshot:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/processing-queue.jpg">
|
||||
</div>
|
||||
|
||||
You can cancel running jobs or remove waiting ones from the queue. If
|
||||
you click on the small file symbol on finished jobs, you can inspect
|
||||
its log messages again. A running job displays the job executor id
|
||||
that executes the job.
|
||||
|
||||
The jobs listed here are all long-running tasks for your collective.
|
||||
Most of the time it executes the document processing tasks. But user
|
||||
defined tasks, like "import mailbox", are also visible here.
|
||||
|
||||
Since job executors are shared among all collectives, it may happen
|
||||
that a job is some time waiting until it is picked up by a job
|
||||
executor. You can always start more job executors to help out.
|
||||
|
||||
If a job fails, it is retried after some time. Only if it fails too
|
||||
often (can be configured), it then is finished with *failed* state.
|
||||
|
||||
For the document-processing task, if processing finally fails or a job
|
||||
is cancelled, the item is still created, just without suggestions.
|
@ -1,98 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Reverse Proxy
|
||||
permalink: doc/reverseproxy
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
This contains examples for how to use docspell behind a reverse proxy.
|
||||
|
||||
For the examples below, assume the following:
|
||||
|
||||
- Docspell app is available at `192.168.1.11:7880`. If it is running
|
||||
on the same machine as the reverse proxy server, you can set
|
||||
`localhost:7880` instead.
|
||||
- The external domain/hostname is `docspell.example.com`
|
||||
|
||||
## Configuring Docspell
|
||||
|
||||
These settings require a complement config part in the docspell
|
||||
configuration file:
|
||||
|
||||
- First, if Docspell REST server is on a different machine, you need
|
||||
to change the `bind.address` setting to be either `0.0.0.0` or the
|
||||
ip address of the network interface that the reverse proxy server
|
||||
connects to.
|
||||
```
|
||||
docspell.server {
|
||||
# Where the server binds to.
|
||||
bind {
|
||||
address = "192.168.1.11"
|
||||
port = 7880
|
||||
}
|
||||
}
|
||||
```
|
||||
Note that a value of `0.0.0.0` instead of `192.168.1.11` will bind
|
||||
the server to every network interface.
|
||||
- Docspell needs to know the external url. The `base-url` setting
|
||||
must point to the external address. Using above values, it must be
|
||||
set to `https://docspell.example.com`.
|
||||
```
|
||||
docspell.server {
|
||||
# This is the base URL this application is deployed to. This is used
|
||||
# to create absolute URLs and to configure the cookie.
|
||||
base-url = "https://docspell.example.com"
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
Note that this example assumes that the docspell-joex component is on
|
||||
the same machine. This page is only related for exposing the REST
|
||||
server and web application.
|
||||
|
||||
If you have examples for more servers, please let me know or add it to
|
||||
this site.
|
||||
|
||||
## Nginx
|
||||
|
||||
This defines two servers: one listens for http traffic and redirects
|
||||
to the https variant. Additionally it defines the let's encrypt
|
||||
`.well-known` folder name.
|
||||
|
||||
The https server endpoint is configured with the let's encrypt
|
||||
certificates and acts as a proxy for the application at
|
||||
`192.168.1.11:7880`.
|
||||
|
||||
```
|
||||
server {
|
||||
listen 0.0.0.0:80 ;
|
||||
listen [::]:80 ;
|
||||
server_name docspell.example.com ;
|
||||
location /.well-known/acme-challenge {
|
||||
root /var/data/nginx/ACME-PUBLIC;
|
||||
auth_basic off;
|
||||
}
|
||||
location / {
|
||||
return 301 https://$host$request_uri;
|
||||
}
|
||||
}
|
||||
server {
|
||||
listen 0.0.0.0:443 ssl http2 ;
|
||||
listen [::]:443 ssl http2 ;
|
||||
server_name docspell.example.com ;
|
||||
location /.well-known/acme-challenge {
|
||||
root /var/data/nginx/ACME-PUBLIC;
|
||||
auth_basic off;
|
||||
}
|
||||
ssl_certificate /var/lib/acme/docspell.example.com/fullchain.pem;
|
||||
ssl_certificate_key /var/lib/acme/docspell.example.com/key.pem;
|
||||
ssl_trusted_certificate /var/lib/acme/docspell.example.com/full.pem;
|
||||
location / {
|
||||
proxy_pass http://192.168.1.11:7880;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection $connection_upgrade;
|
||||
}
|
||||
}
|
||||
```
|
@ -1,127 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Scan Mailboxes
|
||||
permalink: doc/scanmailbox
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
Users that provide valid email (imap) settings can import mails from
|
||||
their mailbox into docspell periodically.
|
||||
|
||||
You first need to define imap settings; please see [this
|
||||
page](emailsettings#imap-settings).
|
||||
|
||||
Go to *User Settings -> Scan Mailbox Task*. You can define periodic
|
||||
tasks that connects to your mailbox and import mails into docspell. It
|
||||
is possible to define multiple tasks, for example, if you have
|
||||
multiple e-mail accounts you want to import periodically.
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/scanmailbox-list.png">
|
||||
</div>
|
||||
|
||||
|
||||
## Details
|
||||
|
||||
Creating a task requires the following information:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/scanmailbox-detail.png">
|
||||
</div>
|
||||
|
||||
You can enable or disable this task. A disabled task will not run
|
||||
periodically. You can still choose to run it manually if you click the
|
||||
`Start Once` button.
|
||||
|
||||
Then you need to specify which [IMAP
|
||||
connection](emailsettings#imap-settings) to use.
|
||||
|
||||
A list of folders is required. Docspell will only look into these
|
||||
folders. You can specify multiple folders. The "Inbox" folder is a
|
||||
special folder, which will usually appear translated in your web-mail
|
||||
client. You can specify "INBOX" case insensitive, it will then read
|
||||
mails in your inbox. Any other folder is usually case-sensitive
|
||||
(depends on the imap server, but usually they are case sensitive
|
||||
except the INBOX folder). Type in a folder name and click the add
|
||||
button on the right.
|
||||
|
||||
Then the field *Received Since Hours* defines how many hours to go
|
||||
back and look for mails. Usually there are many mails in your inbox
|
||||
and importing them all at once is not feasible or desirable. It can
|
||||
work together with the *Schedule* field below. For example, you could
|
||||
run this task all 6 hours and read mails from 8 hours back.
|
||||
|
||||
The next two settings tell docspell what to do once a mail has been
|
||||
submitted to docspell. It can be moved into another folder in your
|
||||
mail account. This moves it out of the way for the next run. You can
|
||||
also choose to delete the mail, but *note that it will really be
|
||||
deleted and not moved to your trash folder*. If both options are off,
|
||||
nothing happens with that mail, it simply stays (and could be re-read
|
||||
on the next run).
|
||||
|
||||
When docspell creates an item from a mail, it needs to set a direction
|
||||
value (incoming or outgoing). If you know that all mails you want to
|
||||
import have a specific direction, then you can set it here. Otherwise,
|
||||
*automatic* means that docspell chooses a direction based on the
|
||||
`From` header of a mail. If the `From` header is an e-mail address
|
||||
that belongs to a “concerning” person in your address book, then it is
|
||||
set to "outgoing". Otherwise it is set to "incoming". To support this,
|
||||
you need to add your own e-mail address(es) to your address book.
|
||||
|
||||
The *Item Folder* setting is used to put all items that are created
|
||||
from mails into the specified [folder](metadata#folders). If you
|
||||
define a folder here, where you are not a member, you won't find
|
||||
resulting items.
|
||||
|
||||
The last field is the *Schedule* which defines when and how often this
|
||||
task should run. The syntax is similar to a date-time string, like
|
||||
`2019-09-15 12:32`, where each part is a pattern to also match multiple
|
||||
values. The ui tries to help a little by displaying the next two
|
||||
date-times this task would execute. A more in depth help is available
|
||||
[here](https://github.com/eikek/calev#what-are-calendar-events). For
|
||||
example, to execute the task every monday at noon, you would write:
|
||||
`Mon *-*-* 12:00`. A date-time part can match all values (`*`), a list
|
||||
of values (e.g. `1,5,12,19`) or a range (e.g. `1..9`). Long lists may
|
||||
be written in a shorter way using a repetition value. It is written
|
||||
like this: `1/7` which is the same as a list with `1` and all
|
||||
multiples of `7` added to it. In other words, it matches `1`, `1+7`,
|
||||
`1+7+7`, `1+7+7+7` and so on.
|
||||
|
||||
|
||||
## Reading Mails twice / Duplicates
|
||||
|
||||
Since users can move around mails in their mailboxes, it can happen
|
||||
that docspell unintentionally reads a mail multiple times. If docspell
|
||||
reads a mail, it will first check if an item already exists that
|
||||
originated from this mail. It only proceeds to import it, if it cannot
|
||||
find any. If you deleted an item in the meantime, docspell would
|
||||
import the mail again.
|
||||
|
||||
This check uses the
|
||||
[`Message-ID`](https://en.wikipedia.org/wiki/Message-ID) of an e-mail.
|
||||
This is usually there and should identify a complete mail. But it
|
||||
won't catch duplicate mails, that are sent multiple times - they might
|
||||
have different `Message-ID`s. Also some mails have no such ids and are
|
||||
then imported from docspell without any checks.
|
||||
|
||||
In later versions, docspell may use the checksum of the generated eml
|
||||
file to look for duplicates, too.
|
||||
|
||||
|
||||
## How it works
|
||||
|
||||
Docspell will go through all folders and download mails in “batches”.
|
||||
This size can be set by the admin in the [configuration
|
||||
file](configure#joex) and applies to all these tasks (same for all
|
||||
users). This batch only contains the mail headers and not the complete
|
||||
mail.
|
||||
|
||||
Then each mail is downloaded completely one by one and converted into
|
||||
an [eml](https://en.wikipedia.org/wiki/Email#Filename_extensions) file
|
||||
which is then submitted to docspell. Then the usual processing
|
||||
machinery starts, just like uploading an eml file via the webapp.
|
||||
|
||||
The number of folders and the number of mails to import can be limited
|
||||
by an admin via the config file. Note that this limit applies to one
|
||||
task run only, it is meant to reduce resource allocation of one task.
|
@ -1,20 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Tools
|
||||
permalink: doc/tools
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
The `tools/` folder contains some scripts and other resources intended
|
||||
for integrating docspell.
|
||||
|
||||
- [ds.sh](ds) A script to quickly upload files from the command
|
||||
line.
|
||||
- [Consume Directory](consumedir) A script to watch a directory
|
||||
for new files and upload them to docspell.
|
||||
- [Browser Extension](browserext) An extension for firefox to
|
||||
upload files from your browser via *right-click -> upload to
|
||||
docspell*.
|
||||
- [SMTP Gateway](smtpgateway) Start a SMTP server that forwards all
|
||||
mails to docspell.
|
@ -1,84 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Browser Extension (Firefox)
|
||||
permalink: doc/tools/browserext
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
The idea is to click on a file in firefox and send it to docspell. It
|
||||
is downloaded in the context of your current page. Then handed to an
|
||||
application that pushes it to docspell. There is a browser add-on
|
||||
implementing this in `tools/webextension`. This add-on only works with
|
||||
firefox.
|
||||
|
||||
## Install
|
||||
|
||||
This is a bit complicated, since you need to install external tools
|
||||
and the web extension. Both work together.
|
||||
|
||||
### Install `ds.sh`
|
||||
|
||||
First copy the `ds.sh` tool somewhere in your `PATH`, maybe
|
||||
`/usr/local/bin` as described above.
|
||||
|
||||
|
||||
### Install the native part
|
||||
|
||||
Then install the "native" part of the web extension:
|
||||
|
||||
Copy or symlink the `native.py` script into some known location. For
|
||||
example:
|
||||
|
||||
``` bash
|
||||
ln -s ~/docspell-checkout/tools/webextension/native/native.py /usr/local/share/docspell/native.py
|
||||
```
|
||||
|
||||
Then copy the `app_manifest.json` to
|
||||
`$HOME/.mozilla/native-messaging-hosts/docspell.json`. For example:
|
||||
|
||||
``` bash
|
||||
cp ~/docspell-checkout/tools/webextension/native/app_manifest.json ~/.mozilla/native-messaging-hosts/docspell.json
|
||||
```
|
||||
|
||||
See
|
||||
[here](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/Native_manifests#Manifest_location)
|
||||
for details.
|
||||
|
||||
And you might want to modify this json file, so the path to the
|
||||
`native.py` script is correct (it must be absolute).
|
||||
|
||||
If the `ds.sh` script is in your `$PATH`, then this should
|
||||
work. Otherwise, edit the `native.py` script and change the path to
|
||||
the tool. Or create a file `$HOME/.config/docspell/ds.cmd` whose
|
||||
content is the path to the `ds.sh` script.
|
||||
|
||||
|
||||
### Install the extension
|
||||
|
||||
An extension file can be build using the `make-xpi.sh` script. But
|
||||
installing it in "standard" firefox won't work, because [Mozilla
|
||||
requires extensions to be signed by
|
||||
them](https://wiki.mozilla.org/Add-ons/Extension_Signing). This means
|
||||
creating an account and going through some process…. So here are two
|
||||
alternatives:
|
||||
|
||||
1. Open firefox and type `about:debugging` in the addressbar. Then
|
||||
click on *'Load Temporary Add-on...'* and select the
|
||||
`manifest.json` file. The extension is now installed. The downside
|
||||
is, that the extension will be removed once firefox is closed.
|
||||
2. Use Firefox ESR, which allows to install Add-ons not signed by
|
||||
Mozilla. But it has to be configured: Open firefox and type
|
||||
`about:config` in the address bar. Search for key
|
||||
`xpinstall.signatures.required` and set it to `false`. This is
|
||||
described on the last paragraph on [this
|
||||
page](https://support.mozilla.org/en-US/kb/add-on-signing-in-firefox).
|
||||
|
||||
When you right click on a file link, there should be a context menu
|
||||
entry *'Docspell Upload Helper'*. The add-on will download this file
|
||||
using the browser and then send the file path to the `native.py`
|
||||
script. This script will in turn call `ds.sh` which finally uploads it
|
||||
to your configured URLs.
|
||||
|
||||
Open the Add-ons page (`Ctrl`+`Shift`+`A`), the new add-on should be
|
||||
there.
|
@ -1,141 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Consume Directory
|
||||
permalink: doc/tools/consumedir
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
The `consumedir.sh` is a bash script that works in two modes:
|
||||
|
||||
- Go through all files in given directories (recursively, if `-r` is
|
||||
specified) and send each to docspell.
|
||||
- Watch one or more directories for new files and upload them to
|
||||
docspell.
|
||||
|
||||
It can watch or go through one or more directories. Files can be
|
||||
uploaded to multiple urls.
|
||||
|
||||
Run the script with the `-h` or `--help` option, to see a short help
|
||||
text. The help text will also show the values for any given option.
|
||||
|
||||
The script requires `curl` for uploading. It requires the
|
||||
`inotifywait` command if directories should be watched for new
|
||||
files.
|
||||
|
||||
Example for watching two directories:
|
||||
|
||||
``` bash
|
||||
./tools/consumedir.sh --path ~/Downloads --path ~/pdfs -m -dv http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
|
||||
```
|
||||
|
||||
The script by default watches the given directories. If the `-o` or
|
||||
`--once` option is used, it will instead go through these directories
|
||||
and upload all files in there.
|
||||
|
||||
Example for uploading all immediately (the same as above only with `-o`
|
||||
added):
|
||||
|
||||
``` bash
|
||||
$ consumedir.sh -o --path ~/Downloads --path ~/pdfs/ -m -dv http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
|
||||
```
|
||||
|
||||
|
||||
The URL can be any docspell url that accepts uploads without
|
||||
authentication. This is usually a [source
|
||||
url](../uploading#anonymous-upload). It is also possible to use the
|
||||
script with the [integration
|
||||
endpoint](../uploading#integration-endpoint).
|
||||
|
||||
|
||||
## Integration Endpoint
|
||||
|
||||
When given the `-i` or `--integration` option, the script changes its
|
||||
behaviour slightly to work with the [integration
|
||||
endpoint](../uploading#integration-endpoint).
|
||||
|
||||
First, if `-i` is given, it implies `-r` – so the directories are
|
||||
watched or traversed recursively. The script then assumes that there
|
||||
is a subfolder with the collective name. Files must not be placed
|
||||
directly into a folder given by `-p`, but below a sub-directory that
|
||||
matches a collective name. In order to know for which collective the
|
||||
file is, the script uses the first subfolder.
|
||||
|
||||
If the endpoint is protected, these credentials can be specified as
|
||||
arguments `--iuser` and `--iheader`, respectively. The format is for
|
||||
both `<name>:<value>`, so the username cannot contain a colon
|
||||
character (but the password can).
|
||||
|
||||
Example:
|
||||
``` bash
|
||||
$ consumedir.sh -i -iheader 'Docspell-Integration:test123' -m -p ~/Downloads/ http://localhost:7880/api/v1/open/integration/item
|
||||
```
|
||||
|
||||
The url is the integration endpoint url without the collective, as
|
||||
this is amended by the script.
|
||||
|
||||
This watches the folder `~/Downloads`. If a file is placed in this
|
||||
folder directly, say `~/Downloads/test.pdf` the upload will fail,
|
||||
because the collective cannot be determined. Create a subfolder below
|
||||
`~/Downloads` with the name of a collective, for example
|
||||
`~/Downloads/family` and place files somewhere below this `family`
|
||||
subfolder, like `~/Downloads/family/test.pdf`.
|
||||
|
||||
|
||||
## Duplicates
|
||||
|
||||
With the `-m` option, the script will not upload files that already
|
||||
exist at docspell. For this the `sha256sum` command is required.
|
||||
|
||||
So you can move and rename files in those folders without worrying
|
||||
about duplicates. This allows to keep your files organized using the
|
||||
file-system and have them mirrored into docspell as well.
|
||||
|
||||
|
||||
## Systemd
|
||||
|
||||
The script can be used with systemd to run as a service. This is an
|
||||
example unit file:
|
||||
|
||||
```
|
||||
[Unit]
|
||||
After=network.target
|
||||
Description=Docspell Consumedir
|
||||
|
||||
[Service]
|
||||
Environment="PATH=/set/a/path"
|
||||
|
||||
ExecStart=/bin/su -s /bin/bash someuser -c "consumedir.sh --path '/a/path/' -m 'http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ'"
|
||||
```
|
||||
|
||||
This unit file is just an example, it needs some fiddling. It assumes
|
||||
an existing user `someuser` that is used to run this service. The url
|
||||
`http://localhost:7880/api/v1/open/upload/...` is an anonymous upload
|
||||
url as described [here](../uploading#anonymous-upload).
|
||||
|
||||
|
||||
## Docker
|
||||
|
||||
The provided docker image runs this script to watch a single
|
||||
directory, `./docs` in current directory, for new files. If a new file
|
||||
is detected, it is pushed to docspell.
|
||||
|
||||
This utilizes the [integration
|
||||
endpoint](../uploading#integration-endpoint), which is enabled in the
|
||||
config file, to allow uploading documents for all collectives. A
|
||||
subfolder must be created for each registered collective. The docker
|
||||
containers are configured to use http-header protection for the
|
||||
integration endpoint. This requires you to provide a secret, that is
|
||||
shared between the rest-server and the `consumedir.sh` script. This
|
||||
can be done by defining an environment variable which gets picked up
|
||||
by the containers defined in `docker-compose.yml`:
|
||||
|
||||
```
|
||||
export DOCSPELL_HEADER_VALUE="my-secret"
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
|
||||
Now you can create a folder `./docs/<collective-name>` and place all
|
||||
files in there that you want to import. Once dropped in this folder
|
||||
the `consumedir` container will push it to docspell.
|
@ -1,47 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Upload CLI
|
||||
permalink: doc/tools/ds
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
A bash script is provided to quickly upload files from the command
|
||||
line. It reads a configuration file containing the URLs to upload to.
|
||||
Then each file given to the script will be uploaded to al URLs in the
|
||||
config.
|
||||
|
||||
The config file is expected in
|
||||
`$XDG_CONFIG_HOME/docspell/ds.conf`. `$XDG_CONFIG_HOME` defaults to
|
||||
`~/.config`.
|
||||
|
||||
The config file contains lines with key-value pairs, separated by a
|
||||
`=` sign. Lines starting with `#` are ignored. Example:
|
||||
|
||||
```
|
||||
# Config file
|
||||
url.1 = http://localhost:7880/api/v1/open/upload/item/5DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
|
||||
url.2 = http://localhost:7880/api/v1/open/upload/item/6DxhjkvWf9S-CkWqF3Kr892-WgoCspFWDo7-XBykwCyAUxQ
|
||||
```
|
||||
|
||||
The key must start with `url`. The urls should be [anonymous upload
|
||||
urls](./uploading#anonymous-upload).
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
- The `-c` option allows to specifiy a different config file.
|
||||
- The `-h` option shows a help overview.
|
||||
- The `-d` option deletes files after upload was successful
|
||||
- The `-e` option can be used to check for file existence in docspell.
|
||||
Instead of uploading, the script only checks whether the file is in
|
||||
docspell or not.
|
||||
|
||||
The script takes a list of files as arguments.
|
||||
|
||||
|
||||
Example:
|
||||
|
||||
``` bash
|
||||
./ds.sh ~/Downloads/*.pdf
|
||||
```
|
@ -1,195 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: SMTP Gateway with Exim
|
||||
permalink: doc/tools/smtpgateway
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
One possible use case for the [integration
|
||||
endpoint](../uploading#integration-endpoint) is a SMTP server that
|
||||
forwards all local mail to docspell. This way there is no periodic
|
||||
polling involved and documents (e-mails) get into docspell without
|
||||
delay.
|
||||
|
||||
The `tools/exim` folder contains a docker file and a sample
|
||||
`exim.conf` to help start with this setup. Note that these files
|
||||
provide a minimal setup, you might want to add tls and spam protection
|
||||
when opening it to the public.
|
||||
|
||||
|
||||
## What you need
|
||||
|
||||
You need to own a domain and add the appropriate MX records to point
|
||||
to your server. In this document, the domain `test.org` is used.
|
||||
|
||||
You need to enable the [integration
|
||||
endpoint](../uploading#integration-endpoint) in the docspell
|
||||
configuration.
|
||||
|
||||
## Exim
|
||||
|
||||
[Exim](http://exim.org/) is a popular smtp server (message transfer
|
||||
agent). It is used here only because of previous knowledge, but same
|
||||
can be achieved with other MTAs.
|
||||
|
||||
|
||||
## The Config File
|
||||
|
||||
Here is the example config file for exim:
|
||||
|
||||
```
|
||||
{% include sample-exim.conf %}
|
||||
```
|
||||
|
||||
Exim has good [documentation](https://www.exim.org/docs.html), look
|
||||
there for more info. The following is only a quick summary of the file
|
||||
above.
|
||||
|
||||
The `domainlist local_domains` should list your domain. Only mails to
|
||||
this domain are allowed, as specified in the first rule in
|
||||
`acl_check_rcpt`. So mails to `name@test.org` are ok, but
|
||||
`name@someother.org` not.
|
||||
|
||||
Another rule in `acl_check_rcpt` executes a `GET` request against the
|
||||
integration endpoint. If that fails, the recipient is wrong (or the
|
||||
endpoint disabled) and the mail is rejected right away.
|
||||
|
||||
Then the `routers` define how a mail is handled. There is only one
|
||||
router that accepts all mails (that have not been rejected by a rule
|
||||
in acls) and uses the `docspell` transport to deliver it. The
|
||||
transport specifies a command via the `pipe` driver that is run with
|
||||
the mail. The mail itself is provided via stdin. So a simple `curl`
|
||||
command can upload it to the integration endpoint. Here are some quick
|
||||
notes about the used options (see `man curl`):
|
||||
|
||||
- `--silent` and `--out /dev/null` don't print upload progress
|
||||
information and no output to stdout
|
||||
- `--fail` return non-zero if http status code is not success
|
||||
- `-F` use a multipart/form-data request (defaults to a POST request)
|
||||
- `"file=@-;filename=\"$_subject:\""` add one part with name `file`
|
||||
and take the data from stdin (`@-`). Since there is no filename, we
|
||||
use the subject of the mail. This is [supported by
|
||||
exim](http://exim.org/exim-html-current/doc/html/spec_html/ch-string_expansions.html)
|
||||
by expanding the subject mail header via `$h_subject:` (the colon is
|
||||
required).
|
||||
- `$local_part` this is expanded by exim to the recipient address,
|
||||
only the part until the `@` sign.
|
||||
- `${env{DS_HEADER}{$value} fail}` looks up an environment variable by
|
||||
key `DS_HEADER`. This is usually defined in `docker-compose.yml`.
|
||||
The value must be the "secret" header value as defined in docspell's
|
||||
configuration file.
|
||||
- `${env{DS_URL}{$value} fail}` the url to docspell. It is looked up
|
||||
from the environment with key `DS_URL`, which is usually defined in
|
||||
`docker-compose.yml`. Adding the `$local_part` at the end means that
|
||||
mails to `somename@test.org` are uploaded to the collective
|
||||
`somename`.
|
||||
|
||||
|
||||
## Install with Docker
|
||||
|
||||
Go into the `tools/exim` directory and build the docker image:
|
||||
|
||||
``` bash
|
||||
docker build -t ds-exim:latest -f exim.dockerfile .
|
||||
```
|
||||
|
||||
Then start docspell somewhere and configure the integration endpoint
|
||||
to use http-header protection; i.e. set this in the config file:
|
||||
|
||||
```
|
||||
docspell.server {
|
||||
integration-endpoint {
|
||||
enabled = true
|
||||
http-header = {
|
||||
enabled = true
|
||||
header-value = "test123"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Then edit the `docker-compose.yml` and change the environment
|
||||
variables as needed.
|
||||
|
||||
Finally start the container:
|
||||
|
||||
``` bash
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
|
||||
## Test Run
|
||||
|
||||
Now it is possible to send mails to this MTA which will be immediatly
|
||||
uploaded to docspell for the collective corresponding to the
|
||||
`$local_part` of the recipients address. Here is a quick telnet
|
||||
session (the collective is named `family`):
|
||||
|
||||
```
|
||||
fish ~> telnet localhost 25
|
||||
Trying ::1...
|
||||
Connected to localhost.
|
||||
Escape character is '^]'.
|
||||
220 test.org ESMTP Exim 4.93 Sun, 14 Jun 2020 19:03:51 +0000
|
||||
ehlo localhost
|
||||
250-test.org Hello localhost [::1]
|
||||
250-SIZE 31457280
|
||||
250-8BITMIME
|
||||
250-PIPELINING
|
||||
250-CHUNKING
|
||||
250 HELP
|
||||
mail from:<me@test.org>
|
||||
250 OK
|
||||
rcpt to:<family@test.org>
|
||||
250 Accepted
|
||||
data
|
||||
354 Enter message, ending with "." on a line by itself
|
||||
From: me@test.org
|
||||
To: family@test.org
|
||||
Subject: This is a test
|
||||
|
||||
Test,
|
||||
|
||||
this is just a test mail.
|
||||
.
|
||||
250 OK id=1jkXwf-000007-0d
|
||||
quit
|
||||
221 test.org closing connection
|
||||
Connection closed by foreign host.
|
||||
fish ~>
|
||||
```
|
||||
|
||||
The mail is processed and results in an item:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../../img/exim-mail.png">
|
||||
</div>
|
||||
|
||||
However, if a mail is to an unknown collective or not to the
|
||||
configured local domain, the server rejects it immediately:
|
||||
|
||||
``` bash
|
||||
fish ~> telnet localhost 25
|
||||
Trying ::1...
|
||||
Connected to localhost.
|
||||
Escape character is '^]'.
|
||||
220 test.org ESMTP Exim 4.93 Sun, 14 Jun 2020 19:07:04 +0000
|
||||
ehlo localhost
|
||||
250-test.org Hello localhost [::1]
|
||||
250-SIZE 31457280
|
||||
250-8BITMIME
|
||||
250-PIPELINING
|
||||
250-CHUNKING
|
||||
250 HELP
|
||||
mail from:<me@test.org>
|
||||
250 OK
|
||||
rcpt to:<family22@test.org>
|
||||
550 Recipient unknown
|
||||
rcpt to:<family@gmail.com>
|
||||
550 Administrative prohibition
|
||||
quit
|
||||
221 test.org closing connection
|
||||
Connection closed by foreign host.
|
||||
fish ~>
|
||||
```
|
@ -1,184 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Uploads
|
||||
permalink: doc/uploading
|
||||
---
|
||||
|
||||
# {{page.title}}
|
||||
|
||||
|
||||
This page describes, how files can get into docspell. Technically,
|
||||
there is just one way: via http multipart/form-data requests.
|
||||
|
||||
|
||||
## Authenticated Upload
|
||||
|
||||
From within the web application there is the "Upload Files"
|
||||
page. There you can select multiple files to upload. You can also
|
||||
specify whether these files should become one item or if every file is
|
||||
a separate item.
|
||||
|
||||
When you click "Submit" the files are uploaded and stored in the
|
||||
database. Then the job executor(s) are notified which immediately
|
||||
start processing them.
|
||||
|
||||
Go to the top-right menu and click "Processing Queue" to see the
|
||||
current state.
|
||||
|
||||
This obviously requires an authenticated user. While this is handy for
|
||||
ad-hoc uploads, it is very inconvenient for automating it by custom
|
||||
scripts. For this the next variant exists.
|
||||
|
||||
## Anonymous Upload
|
||||
|
||||
It is also possible to upload files without authentication. This
|
||||
should make tools that interact with docspell much easier to write.
|
||||
|
||||
|
||||
### Creating Anonymous Uploads
|
||||
|
||||
Go to "Collective Settings" and then to the "Source" tab. A *Source*
|
||||
identifies an endpoint where files can be uploaded
|
||||
anonymously. Creating a new source creates a long unique id which is
|
||||
part on an url that can be used to upload files. You can choose any
|
||||
time to deactivate or delete the source at which point uploading is
|
||||
not possible anymore. The idea is to give this URL away safely. You
|
||||
can delete it any time and no passwords or secrets are visible, even
|
||||
your username is not visible.
|
||||
|
||||
Example screenshot:
|
||||
|
||||
<div class="thumbnail">
|
||||
<img src="../img/sources-form.png">
|
||||
</div>
|
||||
|
||||
This example shows a source with name "test". Besides a description
|
||||
and a name that is only used for displaying purposes, a priority and a
|
||||
[folder](metadata#folders) can be specified.
|
||||
|
||||
The priority is used for the processing jobs that are submitted when
|
||||
files are uploaded via this endpoint.
|
||||
|
||||
The folder is used to place all items, that result from uploads to
|
||||
this endpoint, into this folder.
|
||||
|
||||
The source endpoint defines two urls:
|
||||
|
||||
- `/app/upload/<id>`
|
||||
- `/api/v1/open/upload/item/<id>`
|
||||
|
||||
The first points to a web page where everyone could upload files into
|
||||
your account. You could give this url to people for sending files
|
||||
directly into your docspell.
|
||||
|
||||
The second url is the API url, which accepts the requests to upload
|
||||
files (which is used by the first url).
|
||||
|
||||
For example, this url can be used to upload files with curl:
|
||||
|
||||
``` bash
|
||||
$ curl -XPOST -F file=@test.pdf http://localhost:7880/api/v1/open/upload/item/CqpFTb7UmGe-9nMVPZSmnwc-AHH6nWFh52t-M1JFQ9y7cdH
|
||||
{"success":true,"message":"Files submitted."}
|
||||
```
|
||||
|
||||
You could add more `-F file=@/path/to/your/file.pdf` to upload
|
||||
multiple files (note, the `@` is required by curl, so it knows that
|
||||
the following is a file).
|
||||
|
||||
When files are uploaded to an source endpoint, the items resulting
|
||||
from this uploads are marked with the name of the source. So you know
|
||||
which source an item originated.
|
||||
|
||||
If files are uploaded using the web applications *Upload files* page,
|
||||
the source is implicitly set to `webapp`. If you also want to let
|
||||
docspell count the files uploaded through the web interface, just
|
||||
create a source (can be inactive) with that name (`webapp`).
|
||||
|
||||
|
||||
## Integration Endpoint
|
||||
|
||||
Another option for uploading files is the special *integration
|
||||
endpoint*. This endpoint allows an admin to upload files to any
|
||||
collective, that is known by name.
|
||||
|
||||
```
|
||||
/api/v1/open/integration/item/[collective-name]
|
||||
```
|
||||
|
||||
The endpoint is behind `/api/v1/open`, so this route is not protected
|
||||
by an authentication token (see [REST Api](../api) for more
|
||||
information). However, it can be protected via settings in the
|
||||
configuration file. The idea is that this endpoint is controlled by an
|
||||
administrator and not the user of the application. The admin can
|
||||
enable this endpoint and choose between some methods to protect it.
|
||||
Then the administrator can upload files to any collective. This might
|
||||
be useful to connect other trusted applications to docspell (that run
|
||||
on the same host or network).
|
||||
|
||||
The endpoint is disabled by default, an admin must change the
|
||||
`docspell.server.integration-endpoint.enabled` flag to `true` in the
|
||||
[configuration file](configure#rest-server).
|
||||
|
||||
If queried by a `GET` request, it returns whether it is enabled and
|
||||
the collective exists.
|
||||
|
||||
It is also possible to check for existing files using their sha256
|
||||
checksum with:
|
||||
|
||||
```
|
||||
/api/v1/open/integration/checkfile/[collective-name]/[sha256-checksum]
|
||||
```
|
||||
|
||||
See the [SMTP gateway](tools/smtpgateway) or the [consumedir
|
||||
script](tools/consumedir) for examples to use this endpoint.
|
||||
|
||||
## The Request
|
||||
|
||||
This gives more details about the request for uploads. It is a http
|
||||
`multipart/form-data` request, with two possible fields:
|
||||
|
||||
- meta
|
||||
- file
|
||||
|
||||
The `file` field can appear multiple times and is required at least
|
||||
once. It is the part containing the file to upload.
|
||||
|
||||
The `meta` part is completely optional and can define additional meta
|
||||
data, that docspell uses to create items from the given files. It
|
||||
allows to transfer structured information together with the
|
||||
unstructured binary files.
|
||||
|
||||
The `meta` content must be `application/json` containing this
|
||||
structure:
|
||||
|
||||
```
|
||||
{ multiple: Bool
|
||||
, direction: Maybe String
|
||||
, folder: Maybe String
|
||||
}
|
||||
```
|
||||
|
||||
The `multiple` property is by default `true`. It means that each file
|
||||
in the upload request corresponds to a single item. An upload with 5
|
||||
files will result in 5 items created. If it is `false`, then docspell
|
||||
will create just one item, that will then contain all files.
|
||||
|
||||
Furthermore, the direction of the document (one of `incoming` or
|
||||
`outgoing`) can be given. It is optional, it can be left out or
|
||||
`null`.
|
||||
|
||||
A `folder` id can be specified. Each item created by this request will
|
||||
be placed into this folder. Errors are logged (for example, the folder
|
||||
may have been deleted before the task is executed) and the item is
|
||||
then not put into any folder.
|
||||
|
||||
This kind of request is very common and most programming languages
|
||||
have support for this. For example, here is another curl command
|
||||
uploading two files with meta data:
|
||||
|
||||
```
|
||||
curl -XPOST -F meta='{"multiple":false, "direction": "outgoing"}' \
|
||||
-F file=@letter-en-source.pdf \
|
||||
-F file=@letter-de-source.pdf \
|
||||
http://localhost:7880/api/v1/open/upload/item/CqpFTb7UmGe-9nMVPZSmnwc-AHH6nWFh52t-M1JFQ9y7cdH
|
||||
```
|
@ -1,70 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Features and Limitations
|
||||
permalink: features
|
||||
---
|
||||
|
||||
# Features
|
||||
|
||||
- Multi-account application
|
||||
- Multiple users per account (multiple users can access the same
|
||||
account)
|
||||
- Handle multiple documents as one unit
|
||||
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
|
||||
- [Full-Text Search](doc/finding#full-text-search) based on [Apache
|
||||
SOLR](https://lucene.apache.org/solr)
|
||||
- Conversion to PDF: all files are converted into a PDF file. PDFs
|
||||
with only images (as often returned from scanners) are converted
|
||||
into searchable PDF/A pdfs.
|
||||
- Non-destructive: all your uploaded files are never modified and can
|
||||
always be downloaded untouched
|
||||
- Text is analysed to find and attach meta data automatically
|
||||
- [Manage document processing](doc/processing): cancel jobs, set
|
||||
priorities
|
||||
- Everything available via a [documented](https://www.openapis.org/)
|
||||
[REST Api](api); allows to [generate
|
||||
clients](https://openapi-generator.tech/docs/generators) for
|
||||
(almost) any language
|
||||
- mobile-friendly Web-UI
|
||||
- [Create “share-urls”](doc/uploading#anonymous-upload) to upload files
|
||||
anonymously
|
||||
- [Send documents via e-mail](doc/mailitem)
|
||||
- [E-Mail notification](doc/notifydueitems) for documents with due dates
|
||||
- [Read your mailboxes](doc/scanmailbox) via IMAP to import mails into
|
||||
docspell
|
||||
- REST server and document processing are separate applications which
|
||||
can be scaled-out independently
|
||||
- Everything stored in a SQL database: PostgreSQL, MariaDB or H2
|
||||
- H2 is embedded, a "one-file-only" database, avoids installing db
|
||||
servers
|
||||
- Files supported:
|
||||
- Documents:
|
||||
- PDF
|
||||
- common MS Office (doc, docx, xls, xlsx)
|
||||
- OpenDocument (odt, ods)
|
||||
- RichText (rtf)
|
||||
- Images (jpg, png, tiff)
|
||||
- HTML
|
||||
- text/* (treated as Markdown)
|
||||
- Archives (extracted automatically, can be nested)
|
||||
- zip
|
||||
- [eml](https://en.wikipedia.org/wiki/Email#Filename_extensions)
|
||||
(e-mail files in plain text MIME)
|
||||
- [Tooling](doc/tools):
|
||||
- [Watch a folder](doc/tools/consumedir): watch folders for changes
|
||||
and send files to docspell
|
||||
- [Simple CLI for uploading files](doc/tools/ds)
|
||||
- [Firefox plugin](doc/tools/browserext): right click on a link and
|
||||
send the file to docspell
|
||||
- [SMTP Gateway](doc/tools/smtpgateway): Setup a SMTP server that
|
||||
delivers mails directly to docspell.
|
||||
- License: GPLv3
|
||||
|
||||
|
||||
# Limitations
|
||||
|
||||
These are current known limitations that may be of interest for
|
||||
considering docspell at the moment.
|
||||
|
||||
- Documents cannot be modified.
|
||||
- You can remove and add documents but there is no versioning.
|
@ -1,107 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Quickstart
|
||||
permalink: getit
|
||||
---
|
||||
|
||||
# {{ page.title }}
|
||||
|
||||
There are the following quick ways to get docspell to run on your
|
||||
machine:
|
||||
|
||||
- [Download, Unpack, Run](#without-docker) You can download
|
||||
pre-compiled binaries from the [Release
|
||||
Page](https://github.com/eikek/docspell/releases). There are `deb`
|
||||
packages and generic zip files.
|
||||
- [With Docker](#with-docker)
|
||||
- [NixOs Module](doc/nix#docspell-as-a-service-on-nixos)
|
||||
|
||||
Check the [demo videos](demo) to see the basic idea. Refer to the
|
||||
[documentation](doc) for more information on how to use docspell.
|
||||
|
||||
|
||||
## Download, Unpack, Run
|
||||
|
||||
### Prerequisite
|
||||
|
||||
Install Java (use your package manager or look
|
||||
[here](https://adoptopenjdk.net/)).
|
||||
|
||||
OCR functionality requires the following tools:
|
||||
|
||||
- [tesseract](https://github.com/tesseract-ocr/tesseract),
|
||||
- [ghostscript](http://pages.cs.wisc.edu/~ghost/) and possibly
|
||||
- [unpaper](https://github.com/Flameeyes/unpaper).
|
||||
|
||||
The last is not really required, but improves OCR.
|
||||
|
||||
PDF conversion requires the following tools:
|
||||
|
||||
- [unoconv](https://github.com/unoconv/unoconv)
|
||||
- [wkhtmltopdf](https://wkhtmltopdf.org/)
|
||||
|
||||
|
||||
### Using zip files
|
||||
|
||||
You need to download the two files:
|
||||
|
||||
- [docspell-restserver-{{site.version}}.zip](https://github.com/eikek/docspell/releases/download/v{{site.version}}/docspell-restserver-{{site.version}}.zip)
|
||||
- [docspell-joex-{{site.version}}.zip](https://github.com/eikek/docspell/releases/download/v{{site.version}}/docspell-joex-{{site.version}}.zip)
|
||||
|
||||
|
||||
1. Unzip both files:
|
||||
``` bash
|
||||
$ unzip docspell-*.zip
|
||||
```
|
||||
2. Open two terminal windows and navigate to the the directory
|
||||
containing the zip files.
|
||||
3. Start both components executing:
|
||||
``` bash
|
||||
$ ./docspell-restserver*/bin/docspell-restserver
|
||||
```
|
||||
in one terminal and
|
||||
``` bash
|
||||
$ ./docspell-joex*/bin/docspell-joex
|
||||
```
|
||||
in the other.
|
||||
4. Point your browser to: <http://localhost:7880/app>
|
||||
5. Register a new account, sign in and try it.
|
||||
|
||||
Note, that this setup doesn't include watching a directory. You can
|
||||
use the [`consumedir.sh`](doc/tools/consumedir) tool for this or use
|
||||
the docker variant below.
|
||||
|
||||
## With Docker
|
||||
|
||||
There is a [docker-compose](https://docs.docker.com/compose/) setup
|
||||
available in the `/docker` folder.
|
||||
|
||||
1. Clone the github repository
|
||||
```bash
|
||||
$ git clone https://github.com/eikek/docspell
|
||||
```
|
||||
2. Change into the `docker` directory:
|
||||
```bash
|
||||
$ cd docspell/docker
|
||||
```
|
||||
3. Run `docker-compose up`:
|
||||
|
||||
```bash
|
||||
$ export DOCSPELL_HEADER_VALUE="my-secret-123"
|
||||
$ docker-compose up
|
||||
```
|
||||
|
||||
The environment variable defines a secret that is shared between
|
||||
some containers. You can define whatever you like. Please see the
|
||||
[`consumedir.sh`](doc/tools/consumedir#docker) docs for additional
|
||||
info.
|
||||
4. Goto <http://localhost:7880>, signup and login. When signing up,
|
||||
you can choose the same name for collective and user. Then login
|
||||
with this name and the password.
|
||||
|
||||
5. (Optional) Create a folder `./docs/<collective-name>` (the name you
|
||||
chose for the collective at registration) and place files in there
|
||||
for importing them.
|
||||
|
||||
The directory contains a file `docspell.conf` that you can
|
||||
[modify](doc/configure) as needed.
|
@ -1,13 +0,0 @@
|
||||
---
|
||||
layout: homeFeatures
|
||||
features:
|
||||
- first: ["Stow documents away", "Most of the time documents (emails, postal mail) are received or created. It should be fast to stow them away, knowing that they can be found if necessary.", "uploading"]
|
||||
- second: ["Automatic Tagging", "All documents are analyzed and tagged automatically. It may not always be correct; results can be reviewed and corrected.", "metadata"]
|
||||
- third: ["Work with them", "Search for documents by their meta data or via full-text search. Send them via e-mail. Add your own tags, names etc to better match your workflow.", "finding"]
|
||||
---
|
||||
|
||||
|
||||
## License
|
||||
|
||||
This project is distributed under the
|
||||
[GPLv3](http://www.gnu.org/licenses/gpl-3.0.html)
|
@ -1,46 +0,0 @@
|
||||
#site-main {
|
||||
background: aliceblue;
|
||||
}
|
||||
|
||||
#masthead {
|
||||
background: url(../img/back-master-small.jpg);
|
||||
background-repeat: no-repeat;
|
||||
background-size: 100% 100%;
|
||||
animation: none;
|
||||
height: 26vh;
|
||||
}
|
||||
|
||||
.thumbnail {
|
||||
border: 1px solid #aaa;
|
||||
}
|
||||
|
||||
.features-image {
|
||||
height: 200px;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.docs #sidebar-wrapper .sidebar-nav .sidebar-nav-item.open>a, .docs #sidebar-wrapper .sidebar-nav .sidebar-nav-item.open button {
|
||||
background: #495680;
|
||||
}
|
||||
.docs #sidebar-wrapper .sidebar-nav .sidebar-nav-item .sub-section {
|
||||
background: #172651;
|
||||
}
|
||||
.docs #sidebar-wrapper .sidebar-nav .sidebar-nav-item .sub-section a.active {
|
||||
font-weight: bold;
|
||||
}
|
||||
.docs #sidebar-wrapper {
|
||||
background: #172651;
|
||||
}
|
||||
|
||||
.docs #sidebar-wrapper #sidebar-brand {
|
||||
background: #495680;
|
||||
|
||||
}
|
||||
|
||||
.docs h4 {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.docs .thumbnail img {
|
||||
width: 100%;
|
||||
}
|
@ -1,94 +0,0 @@
|
||||
options:
|
||||
- title: Home
|
||||
url: index.html
|
||||
|
||||
- title: Demo
|
||||
url: demo
|
||||
|
||||
- title: Quickstart
|
||||
url: getit
|
||||
|
||||
- title: Features/Limitations
|
||||
url: features
|
||||
|
||||
- title: Documentation
|
||||
url: doc
|
||||
|
||||
nested_options:
|
||||
- title: Installation
|
||||
url: doc/install
|
||||
|
||||
- title: Configuring
|
||||
url: doc/configure
|
||||
|
||||
- title: Nix/NixOS
|
||||
url: doc/nix
|
||||
|
||||
- title: Reverse Proxy
|
||||
url: doc/reverseproxy
|
||||
|
||||
- title: Meta Data
|
||||
url: doc/metadata
|
||||
|
||||
- title: Finding Items
|
||||
url: doc/finding
|
||||
|
||||
- title: Curate Items
|
||||
url: doc/curate
|
||||
|
||||
- title: Uploads
|
||||
url: doc/uploading
|
||||
|
||||
- title: Processing Queue
|
||||
url: doc/processing
|
||||
|
||||
- title: E-Mail Settings
|
||||
url: doc/emailsettings
|
||||
|
||||
- title: Send via E-Mail
|
||||
url: doc/mailitem
|
||||
|
||||
- title: Notify on due Items
|
||||
url: doc/notifydueitems
|
||||
|
||||
- title: Scan Mailboxes
|
||||
url: doc/scanmailbox
|
||||
|
||||
- title: Joex
|
||||
url: doc/joex
|
||||
|
||||
- title: Tools
|
||||
url: doc/tools
|
||||
|
||||
nested_options:
|
||||
- title: Upload CLI
|
||||
url: doc/tools/ds
|
||||
|
||||
- title: Consume Directory
|
||||
url: doc/tools/consumedir
|
||||
|
||||
- title: Browser Extension (Firefox)
|
||||
url: doc/tools/browserext
|
||||
|
||||
- title: SMTP Gateway
|
||||
url: doc/tools/smtpgateway
|
||||
|
||||
- title: Api
|
||||
url: api
|
||||
|
||||
nested_options:
|
||||
- title: REST Api Doc
|
||||
url: openapi/docspell-openapi.html
|
||||
|
||||
- title: REST OpenApi Spec
|
||||
url: openapi/docspell-openapi.yml
|
||||
|
||||
- title: Development
|
||||
url: dev
|
||||
|
||||
nested_options:
|
||||
- title: ADRs
|
||||
url: dev/adr
|
||||
|
||||
- title: Changelog
|
||||
url : changelog
|
Before Width: | Height: | Size: 339 KiB |
Before Width: | Height: | Size: 87 KiB |
Before Width: | Height: | Size: 89 KiB |
Before Width: | Height: | Size: 123 KiB |
Before Width: | Height: | Size: 124 KiB |
Before Width: | Height: | Size: 108 KiB |
Before Width: | Height: | Size: 1.2 MiB |
Before Width: | Height: | Size: 1.7 MiB |
Before Width: | Height: | Size: 93 KiB |
@ -1 +0,0 @@
|
||||
../../../../../../../artwork/logo-96.png
|
@ -1 +0,0 @@
|
||||
../../../../../../../artwork/logo-only.svg
|
@ -1 +0,0 @@
|
||||
../../../../../../../artwork/stow.svg
|
@ -1 +0,0 @@
|
||||
../../../../../../../artwork/logo-only-36.svg
|
@ -1 +0,0 @@
|
||||
light-navbar-brand.svg
|
Before Width: | Height: | Size: 162 KiB |
Before Width: | Height: | Size: 177 KiB |
Before Width: | Height: | Size: 130 KiB |
Before Width: | Height: | Size: 150 KiB |
Before Width: | Height: | Size: 71 KiB |
Before Width: | Height: | Size: 66 KiB |
Before Width: | Height: | Size: 233 KiB |
Before Width: | Height: | Size: 105 KiB |
Before Width: | Height: | Size: 228 KiB |
Before Width: | Height: | Size: 72 KiB |
Before Width: | Height: | Size: 4.8 KiB |