Adding extraction primitives

This commit is contained in:
Eike Kettner
2020-02-16 21:37:26 +01:00
parent 851ee7ef0f
commit 8143a4edcc
46 changed files with 2731 additions and 83 deletions

View File

@ -0,0 +1,43 @@
@startuml
' Diagram of how each input file type is converted to PDF and/or plain text.
' Every edge is labelled with the name of the tool used for that step.
' NOTE(review): edge labels use two bracket styles, <...> and [...];
' presumably <...> marks external programs and [...] marks in-process
' libraries -- confirm and consider adding a legend.
scale 1200 width
title: Processing Files
' Render in black and white on a white background.
skinparam monochrome true
skinparam backgroundColor white
' Per-stereotype corner radius for rectangles.
' NOTE(review): <<Output>> is styled here, but no element in this diagram
' carries that stereotype.
skinparam rectangle {
roundCorner<<Input>> 25
roundCorner<<Output>> 5
}
' The input file types, grouped inside one rectangle tagged <<Input>>.
rectangle Input <<Input>> {
file "html"
file "plaintext"
file "image"
file "msoffice"
file "rtf"
file "odf"
file "pdf"
}
' Conversion targets: PDF and text together, PDF only, text only.
node toBoth [
PDF + TXT
]
node toPdf [
PDF
]
node toTxt [
TXT
]
' Conversion edges, source --> target:tool.
' image goes to PDF and text in a single step.
image --> toBoth:<tesseract>
' html becomes PDF; text is then extracted from the resulting PDF.
html --> toPdf:<wkhtmltopdf>
toPdf --> toTxt:[pdfbox]
' plaintext is first turned into html, which links it to the html --> toPdf edge.
plaintext --> html:[flexmark]
' Office-like formats each have one edge to PDF and a separate edge to text.
msoffice --> toPdf:<unoconv>
msoffice --> toTxt:[poi]
rtf --> toTxt:[jdk]
rtf --> toPdf:<unoconv>
odf --> toTxt:[tika]
odf --> toPdf:<unoconv>
' pdf has two alternative edges to text.
pdf --> toTxt:<tesseract>
pdf --> toTxt:[pdfbox]
' NOTE(review): this edge uses '->' where every other edge uses '-->';
' presumably this only affects arrow layout -- confirm it is intentional.
plaintext -> toTxt:[identity]
@enduml