mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 10:28:27 +00:00
Adding extraction primitives
This commit is contained in:
43
modules/microsite/docs/dev/adr/process-files.puml
Normal file
43
modules/microsite/docs/dev/adr/process-files.puml
Normal file
@ -0,0 +1,43 @@
|
||||
@startuml
|
||||
scale 1200 width
|
||||
title: Processing Files
|
||||
skinparam monochrome true
|
||||
skinparam backgroundColor white
|
||||
skinparam rectangle {
|
||||
roundCorner<<Input>> 25
|
||||
roundCorner<<Output>> 5
|
||||
}
|
||||
rectangle Input <<Input>> {
|
||||
file "html"
|
||||
file "plaintext"
|
||||
file "image"
|
||||
file "msoffice"
|
||||
file "rtf"
|
||||
file "odf"
|
||||
file "pdf"
|
||||
}
|
||||
|
||||
node toBoth [
|
||||
PDF + TXT
|
||||
]
|
||||
node toPdf [
|
||||
PDF
|
||||
]
|
||||
node toTxt [
|
||||
TXT
|
||||
]
|
||||
|
||||
image --> toBoth:<tesseract>
|
||||
html --> toPdf:<wkhtmltopdf>
|
||||
toPdf --> toTxt:[pdfbox]
|
||||
plaintext --> html:[flexmark]
|
||||
msoffice --> toPdf:<unoconv>
|
||||
msoffice --> toTxt:[poi]
|
||||
rtf --> toTxt:[jdk]
|
||||
rtf --> toPdf:<unoconv>
|
||||
odf --> toTxt:[tika]
|
||||
odf --> toPdf:<unoconv>
|
||||
pdf --> toTxt:<tesseract>
|
||||
pdf --> toTxt:[pdfbox]
|
||||
plaintext -> toTxt:[identity]
|
||||
@enduml
|
Reference in New Issue
Block a user