mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 10:59:33 +00:00
Improves and reorganizes how NLP pipelines are set up. Users can now choose from many options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. Support then involves text extraction and simple regex-NER processing.
45 lines
994 B
Docker
45 lines
994 B
Docker
## JOEX-BASE

# Build arguments that select the parent image. Both default to empty, which
# leaves the FROM reference below incomplete, so they presumably have to be
# passed with --build-arg (TODO confirm against the build scripts).
ARG VERSION=
ARG REPO=

FROM ${REPO}:base-${VERSION}

# Download location of the unoconv script, pinned to the 0.9.0 tag.
ARG UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv

# JVM options baked into the image environment (1536 MiB max heap).
ENV JAVA_OPTS="-Xmx1536M"
|
|
|
|
# Single layer that installs the runtime tooling (JRE, OCR, PDF utilities,
# LibreOffice, fonts), installs ocrmypdf via pip and fetches the unoconv
# script named by the UNO_URL build argument.
#
# Build-only packages (compiler, -dev headers, pip, curl) are removed again in
# this same layer so they do not add to the image size.
#
# Notes:
# - Packages are listed one per line in alphabetical order; every continued
#   line keeps a space before the trailing backslash so adjacent names can
#   never be fused together.
# - pip runs with --no-cache-dir so its download cache is not stored in the
#   layer.
# - curl uses -f so an HTTP error aborts the build instead of saving an error
#   page as /usr/local/bin/unoconv.
# - NOTE(review): ocrmypdf is installed both through apk and through pip;
#   confirm which of the two is actually intended.
# - The python -> python3 symlink presumably exists so that scripts invoking
#   plain "python" (such as unoconv) resolve an interpreter; verify.
RUN apk add --no-cache \
      bash \
      curl \
      g++ \
      ghostscript \
      libffi-dev \
      libreoffice \
      libxml2-dev \
      libxslt-dev \
      ocrmypdf \
      openjdk11-jre \
      openssl-dev \
      pngquant \
      py3-pip \
      python3-dev \
      qpdf \
      qpdf-dev \
      tesseract-ocr \
      tesseract-ocr-data-deu \
      tesseract-ocr-data-fra \
      tesseract-ocr-data-ita \
      ttf-dejavu \
      ttf-droid \
      ttf-droid-nonlatin \
      ttf-freefont \
      ttf-liberation \
      unpaper \
      wkhtmltopdf \
      zlib-dev \
  && pip3 install --no-cache-dir --upgrade pip \
  && pip3 install --no-cache-dir ocrmypdf \
  && curl -fsSL "$UNO_URL" -o /usr/local/bin/unoconv \
  && chmod +x /usr/local/bin/unoconv \
  && apk del \
      curl \
      g++ \
      libffi-dev \
      libxml2-dev \
      libxslt-dev \
      openssl-dev \
      py3-pip \
      python3-dev \
      qpdf-dev \
      zlib-dev \
  && ln -s /usr/bin/python3 /usr/bin/python