mirror of
synced 2025-03-05 16:13:26 +00:00
- Use another external tool to convert PDF to PDF; this tool also adds the extracted text as an additional layer to the PDF.
- Although not used, the external conversion routine now checks for an existing text file named like the PDF file but with the extension `.txt`. If present, it is included in the conversion result and used as the extracted text.
- Text extraction for PDF files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice.
- Errors during conversion are not fatal; processing continues without a converted file.
48 lines
1.1 KiB
48 lines
1.1 KiB
# Image for the docspell "joex" component: a Java 11 runtime plus the external
# tools installed below (OCR, PDF and office-document conversion).
#
# The base image is pinned instead of tracking `latest`: the apk package names
# used below are tied to the Alpine release, so a moving tag makes the build
# non-reproducible.
# NOTE(review): 3.12 is assumed to be the release that was current when
# docspell 0.8.0 shipped - confirm that every package below resolves there, and
# move to a supported release once the package list has been ported.
FROM alpine:3.12

# Location of the unoconv script that is downloaded during the build
# (pinned to the 0.9.0 tag of the upstream repository).
ENV UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv

LABEL maintainer="eikek0 <eike@docspell.org>"

# docspell release to download; override with
# `--build-arg DOCSPELL_VERSION=x.y.z` to build a different release.
ARG DOCSPELL_VERSION=0.8.0

# Single layer, so that everything removed at the end never reaches the image:
#   1. runtime packages (python3 and py3-pip are listed explicitly because
#      pip3 and the unoconv script depend on them);
#   2. build-only packages, grouped in the virtual package `.build-deps` so the
#      final `apk del` removes exactly what was added here (libffi-dev included,
#      which was previously deleted without ever being installed);
#   3. ocrmypdf via pip, without keeping pip's download cache;
#   4. the unoconv script, plus a `python` alias pointing at python3;
#   5. the docspell-joex release archive, unpacked into /opt.
# curl runs with --fail so an HTTP error aborts the build instead of saving an
# error page as the script/archive. Absolute paths are used instead of `cd` so
# the image's working directory is left untouched.
# NOTE(review): ocrmypdf is still unpinned; pin it to a version verified
# against this base image.
RUN apk add --no-cache \
        bash \
        ghostscript \
        libreoffice \
        openjdk11-jre \
        pngquant \
        py3-pip \
        python3 \
        qpdf \
        tesseract-ocr \
        tesseract-ocr-data-deu \
        ttf-dejavu \
        ttf-droid \
        ttf-droid-nonlatin \
        ttf-freefont \
        ttf-liberation \
        unpaper \
        wkhtmltopdf \
    && apk add --no-cache --virtual .build-deps \
        curl \
        g++ \
        libffi-dev \
        libxml2-dev \
        libxslt-dev \
        python3-dev \
        qpdf-dev \
        unzip \
        zlib-dev \
    && pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir ocrmypdf \
    && curl -fsSL "$UNO_URL" -o /usr/local/bin/unoconv \
    && chmod +x /usr/local/bin/unoconv \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && mkdir -p /opt \
    && curl -fL -o /opt/docspell.zip "https://github.com/eikek/docspell/releases/download/v${DOCSPELL_VERSION}/docspell-joex-${DOCSPELL_VERSION}.zip" \
    && unzip /opt/docspell.zip -d /opt \
    && rm /opt/docspell.zip \
    && apk del .build-deps
# Take the start script from the build context and place it next to the
# unpacked application under /opt.
COPY ["entrypoint-joex.sh", "/opt/entrypoint.sh"]

# Exec form: the script is launched directly rather than through `sh -c`.
ENTRYPOINT ["/opt/entrypoint.sh"]