mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	- Use another external tool to convert PDF to PDF, which also adds the extracted text as an additional layer to the PDF. - Although not used, the external conversion routine now checks for an existing text file that has the same name as the PDF file but with the extension `.txt`. If present, it is included in the conversion result and used as the extracted text. - Text extraction for PDF files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice. - Errors during conversion are not fatal; processing continues without a converted file.
		
			
				
	
	
		
			48 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
			
		
		
	
	
			48 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Docker
		
	
	
	
	
	
# Image for the Docspell joex component (release 0.8.0), built on Alpine Linux.
# NOTE(review): the base image tag is unpinned (`latest`), so package names and
# versions may drift between builds — consider pinning a specific Alpine release.
FROM alpine:latest

# Download location of the unoconv script (release 0.9.0) fetched during the build.
ENV UNO_URL=https://raw.githubusercontent.com/unoconv/unoconv/0.9.0/unoconv

LABEL maintainer="eikek0 <eike@docspell.org>"

# Single layer that:
#   1. installs the runtime tools (JRE, ghostscript, tesseract, unpaper,
#      wkhtmltopdf, libreoffice, fonts, pngquant, qpdf) together with the
#      build-time packages needed to `pip3 install ocrmypdf`,
#   2. installs ocrmypdf and the unoconv script,
#   3. downloads and unpacks the docspell-joex 0.8.0 release into /opt,
#   4. removes the download tools and build-time packages again.
# curl runs with --fail so that an HTTP error aborts the build instead of
# silently saving an error page as the unoconv script or the release archive.
# NOTE(review): pip3 is assumed to be provided by the packages installed here —
# confirm for the Alpine release in use.
RUN apk add --no-cache openjdk11-jre \
    unzip \
    bash \
    curl \
    ghostscript \
    tesseract-ocr \
    tesseract-ocr-data-deu \
    unpaper \
    wkhtmltopdf \
    libreoffice \
    ttf-droid-nonlatin \
    ttf-droid \
    ttf-dejavu \
    ttf-freefont \
    ttf-liberation \
    libxml2-dev \
    libxslt-dev \
    pngquant \
    zlib-dev \
    g++ \
    qpdf \
    python3-dev \
    libffi-dev \
    qpdf-dev \
  && pip3 install --no-cache-dir --upgrade pip \
  && pip3 install --no-cache-dir ocrmypdf \
  && curl -fsSL "$UNO_URL" -o /usr/local/bin/unoconv \
  && chmod +x /usr/local/bin/unoconv \
  && ln -s /usr/bin/python3 /usr/bin/python \
  && mkdir -p /opt \
  && cd /opt \
  && curl -fL -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
  && unzip docspell.zip \
  && rm docspell.zip \
  && apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev

# Entrypoint script taken from the build context.
COPY entrypoint-joex.sh /opt/entrypoint.sh

# Presumably the port joex listens on — confirm against the joex configuration.
EXPOSE 7878

ENTRYPOINT ["/opt/entrypoint.sh"]