mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	- Use another external tool to convert PDF to PDF, which also adds the extracted text as an additional layer in the PDF. - Although not used, the external conversion routine now checks for an existing text file that is named like the PDF file but with the extension `.txt`. If present, it is included in the conversion result and used as the extracted text. - Text extraction for PDF files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice. - Errors during conversion are not fatal; processing continues without a converted file.
		
			
				
	
	
		
			48 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			48 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Base image. Pinned to a release tag instead of `latest` so that rebuilding
# the image does not silently pick up a different Alpine release.
# NOTE(review): 3.12 is assumed to be the Alpine release that was current when
# docspell 0.8.0 shipped — confirm that every package in the apk list of this
# Dockerfile resolves on it before merging.
FROM alpine:3.12
 | 
						|
 | 
						|
# Download location of the unoconv script (0.9.0 tag of the upstream repo).
# It is read by the curl call in this Dockerfile's RUN instruction.
# Written in key=value form; the space-separated `ENV key value` form is legacy.
ENV UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv
 | 
						|
 | 
						|
# Image metadata: contact for the maintainer of this image.
LABEL "maintainer"="eikek0 <eike@docspell.org>"
 | 
						|
 | 
						|
# Single build layer:
#   1. install the Java runtime, the OCR / PDF / office conversion tools and
#      fonts, plus the compiler and -dev headers (presumably needed to build
#      native parts of ocrmypdf's Python dependencies — confirm);
#   2. install ocrmypdf with pip;
#   3. download the unoconv script from $UNO_URL and make it executable;
#   4. download and unpack the docspell-joex 0.8.0 release into /opt;
#   5. remove the download helpers and build-only packages again, so they do
#      not stay in the layer.
#
# python3 and py3-pip are listed explicitly: pip3 is invoked below, and the
# interpreter must survive the final `apk del python3-dev`.
# --no-cache-dir keeps pip's download cache out of the image.
# curl -f makes an HTTP error fail the build instead of saving the error page
# as the downloaded file.
RUN apk add --no-cache openjdk11-jre \
    unzip \
    bash \
    curl \
    ghostscript \
    tesseract-ocr \
    tesseract-ocr-data-deu \
    unpaper \
    wkhtmltopdf \
    libreoffice \
    ttf-droid-nonlatin \
    ttf-droid \
    ttf-dejavu \
    ttf-freefont \
    ttf-liberation \
    libxml2-dev \
    libxslt-dev \
    pngquant \
    zlib-dev \
    g++ \
    qpdf \
    python3 \
    py3-pip \
    python3-dev \
    libffi-dev \
    qpdf-dev \
  && pip3 install --no-cache-dir --upgrade pip \
  && pip3 install --no-cache-dir ocrmypdf \
  && curl -fLs "$UNO_URL" -o /usr/local/bin/unoconv \
  && chmod +x /usr/local/bin/unoconv \
  && ln -s /usr/bin/python3 /usr/bin/python \
  && mkdir -p /opt \
  && cd /opt \
  && curl -fL -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
  && unzip docspell.zip \
  && rm docspell.zip \
  && apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev
 | 
						|
 | 
						|
# Place the joex entrypoint script from the build context at /opt/entrypoint.sh.
COPY ["entrypoint-joex.sh", "/opt/entrypoint.sh"]
 | 
						|
 | 
						|
# Declares TCP port 7878 as the port this image expects to have published.
# NOTE(review): presumably the joex listen port — confirm against its config.
EXPOSE 7878/tcp
 | 
						|
 | 
						|
# Start the container through the entrypoint script copied to /opt.
# Exec (JSON-array) form: the script is executed directly, without a
# `/bin/sh -c` wrapper in front of it.
ENTRYPOINT ["/opt/entrypoint.sh"]
 |