2023-05-04 10:10:41 +00:00
|
|
|
# Base image: Alpine, pinned to the 20230329 snapshot tag.
FROM alpine:20230329

# Build arguments (each can be overridden with --build-arg):
#   version   release number; it is interpolated into the default download
#             URL of the joex zip further down in this file.
#   joex_url  complete URL of the joex zip. Left empty by default, in which
#             case the download step falls back to the URL built from $version.
#   UNO_URL   location of the unoconv script that is fetched during the build.
ARG version=
ARG joex_url=
ARG UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv

# Target platform of the build, supplied automatically by BuildKit.
# NOTE(review): not referenced by any instruction visible in this file.
ARG TARGETPLATFORM
|
2021-05-30 22:07:11 +00:00
|
|
|
|
2022-11-04 18:45:21 +00:00
|
|
|
# Install every OS package the image needs, then install ocrmypdf via pip and
# fetch the unoconv script, all in a single layer.
#
# * `apk add --no-cache` fetches the package index on the fly and does not
#   store it, so no separate `apk update` is required.
# * The *-dev packages, g++ and py3-pip are only present for the pip install
#   step; they are removed again by `apk del` at the end of this same RUN so
#   they do not remain in the layer.
# * `pip3 install --no-cache-dir` keeps pip's download cache out of the layer.
# * `curl -f` makes the build fail on an HTTP error instead of saving the
#   error page as /usr/local/bin/unoconv.
# * The final `ln` makes `python` resolve to python3.
RUN apk add --no-cache \
    openjdk17-jre \
    tzdata \
    bash \
    curl \
    docker \
    ghostscript \
    tesseract-ocr \
    tesseract-ocr-data-deu \
    tesseract-ocr-data-fra \
    tesseract-ocr-data-ita \
    tesseract-ocr-data-spa \
    tesseract-ocr-data-por \
    tesseract-ocr-data-ces \
    tesseract-ocr-data-nld \
    tesseract-ocr-data-dan \
    tesseract-ocr-data-fin \
    tesseract-ocr-data-nor \
    tesseract-ocr-data-swe \
    tesseract-ocr-data-rus \
    tesseract-ocr-data-ron \
    tesseract-ocr-data-lav \
    tesseract-ocr-data-jpn \
    tesseract-ocr-data-heb \
    tesseract-ocr-data-lit \
    tesseract-ocr-data-pol \
    tesseract-ocr-data-est \
    tesseract-ocr-data-ukr \
    tesseract-ocr-data-slk \
    unpaper \
    weasyprint \
    libreoffice \
    ttf-droid \
    ttf-dejavu \
    ttf-freefont \
    ttf-liberation \
    font-noto-khmer \
    libxml2-dev \
    libxslt-dev \
    pngquant \
    zlib-dev \
    g++ \
    qpdf \
    py3-pip \
    python3-dev \
    libffi-dev \
    qpdf-dev \
    openssl-dev \
    ocrmypdf \
  && pip3 install --no-cache-dir --upgrade pip \
  && pip3 install --no-cache-dir ocrmypdf \
  && curl -fLs "$UNO_URL" -o /usr/local/bin/unoconv \
  && chmod +x /usr/local/bin/unoconv \
  && apk del libxml2-dev libxslt-dev zlib-dev g++ python3-dev py3-pip libffi-dev qpdf-dev openssl-dev \
  && ln -nfs /usr/bin/python3 /usr/bin/python
|
2021-05-30 22:07:11 +00:00
|
|
|
|
2022-11-19 17:16:54 +00:00
|
|
|
# ocrmypdf gets special treatment because it is broken quite often:
# py3-setuptools is added and the tool is invoked once, so that a non-zero
# exit from `ocrmypdf --version` aborts the image build right here.
RUN apk add --no-cache py3-setuptools \
  && ocrmypdf --version
|
|
|
|
|
2021-05-30 22:07:11 +00:00
|
|
|
# All following relative paths (download, unzip, symlink) resolve under /opt.
WORKDIR /opt

# Download and unpack the joex distribution.
#   - The archive comes from $joex_url when that build arg is non-empty,
#     otherwise from the GitHub release URL derived from $version. The
#     expansion is quoted so the URL is always passed to wget as one word.
#   - The zip is deleted after extraction, in the same layer.
#   - `docspell-joex` becomes a symlink to the extracted docspell-joex-*
#     directory, giving a version-independent path.
#   - The docspell-joex.conf shipped inside the archive is removed.
#     NOTE(review): presumably configuration is supplied at runtime instead;
#     confirm against joex-entrypoint.sh.
RUN wget "${joex_url:-https://github.com/eikek/docspell/releases/download/v$version/docspell-joex-$version.zip}" \
  && unzip docspell-joex-*.zip \
  && rm docspell-joex-*.zip \
  && ln -snf docspell-joex-* docspell-joex \
  && rm docspell-joex/conf/docspell-joex.conf
|
2021-05-30 22:07:11 +00:00
|
|
|
|
2023-03-17 10:50:48 +00:00
|
|
|
# Temporary workaround for Khmer: download the traineddata file directly
# until a tesseract-ocr-data-khm package is added to the registry.
# `set -e` stops at the first failing command, so a failed download
# fails the build.
RUN set -e; \
    wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata; \
    mv khm.traineddata /usr/share/tessdata
|
|
|
|
|
2021-08-13 14:44:56 +00:00
|
|
|
# Japanese: fetch the tessdata_fast data files (horizontal and vertical) and
# move them into /usr/share/tessdata, because they work better. See #973.
# `set -e` stops at the first failing command, so a failed download
# fails the build.
RUN set -e; \
    wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata; \
    wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata; \
    mv jpn*.traineddata /usr/share/tessdata
|
|
|
|
|
2021-05-30 22:07:11 +00:00
|
|
|
# Port documented for the joex service (the health check in this file probes
# the same port). EXPOSE is informational only; it does not publish the port.
EXPOSE 7878

# Startup script: copied from the build context and registered as the
# container entry point in exec form.
COPY joex-entrypoint.sh /opt/joex-entrypoint.sh
ENTRYPOINT ["/opt/joex-entrypoint.sh"]
|
|
|
|
|
2021-08-11 17:21:41 +00:00
|
|
|
# Container health probe: every minute (after a 30s start period) request the
# version endpoint on port 7878 without downloading the body (`--spider`).
# A probe may take at most 10s, and two consecutive failures mark the
# container unhealthy. `|| exit 1` maps any wget failure status to 1, the
# exit code Docker documents for "unhealthy".
HEALTHCHECK --interval=1m --timeout=10s --retries=2 --start-period=30s \
  CMD wget --spider http://localhost:7878/api/info/version || exit 1
|