mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Use ocrmypdf tool to create pdf/a during conversion
- Use another external tool to convert pdf to pdf which also adds the extracted text as another layer into the pdf - Although not used, the external conversion routine will now check for an existing text file that is named as the pdf file with extension `.txt`. If present it is included in the conversion result and will be used as the extracted text. - text extraction for pdf files happens now on the converted file, because it may already contain the text from the conversion step and thus avoids running OCR twice. - All errors during conversion are not fatal; processing continues without a converted file.
This commit is contained in:
parent
99210365ce
commit
3d49ceaab5
docker
modules
convert/src
main/scala/docspell/convert
test/scala/docspell/convert
joex/src/main
microsite/docs
store/src/main/scala/docspell/store/records
nix
@ -19,6 +19,17 @@ RUN apk add --no-cache openjdk11-jre \
|
|||||||
ttf-dejavu \
|
ttf-dejavu \
|
||||||
ttf-freefont \
|
ttf-freefont \
|
||||||
ttf-liberation \
|
ttf-liberation \
|
||||||
|
libxml2-dev \
|
||||||
|
libxslt-dev \
|
||||||
|
pngquant \
|
||||||
|
zlib-dev \
|
||||||
|
g++ \
|
||||||
|
qpdf \
|
||||||
|
python3-dev \
|
||||||
|
libffi-dev\
|
||||||
|
qpdf-dev \
|
||||||
|
&& pip3 install --upgrade pip \
|
||||||
|
&& pip3 install ocrmypdf \
|
||||||
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
|
&& curl -Ls $UNO_URL -o /usr/local/bin/unoconv \
|
||||||
&& chmod +x /usr/local/bin/unoconv \
|
&& chmod +x /usr/local/bin/unoconv \
|
||||||
&& ln -s /usr/bin/python3 /usr/bin/python \
|
&& ln -s /usr/bin/python3 /usr/bin/python \
|
||||||
@ -27,7 +38,7 @@ RUN apk add --no-cache openjdk11-jre \
|
|||||||
&& curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
|
&& curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \
|
||||||
&& unzip docspell.zip \
|
&& unzip docspell.zip \
|
||||||
&& rm docspell.zip \
|
&& rm docspell.zip \
|
||||||
&& apk del curl unzip
|
&& apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev
|
||||||
|
|
||||||
COPY entrypoint-joex.sh /opt/entrypoint.sh
|
COPY entrypoint-joex.sh /opt/entrypoint.sh
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ import fs2._
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.convert.ConversionResult.Handler
|
import docspell.convert.ConversionResult.Handler
|
||||||
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
|
import docspell.convert.extern._
|
||||||
import docspell.convert.flexmark.Markdown
|
import docspell.convert.flexmark.Markdown
|
||||||
import docspell.files.{ImageSize, TikaMimetype}
|
import docspell.files.{ImageSize, TikaMimetype}
|
||||||
|
|
||||||
@ -35,7 +35,8 @@ object Conversion {
|
|||||||
): F[A] =
|
): F[A] =
|
||||||
TikaMimetype.resolve(dataType, in).flatMap {
|
TikaMimetype.resolve(dataType, in).flatMap {
|
||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
handler.run(ConversionResult.successPdf(in))
|
OcrMyPdf
|
||||||
|
.toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
|
|
||||||
case MimeType.HtmlMatch(mt) =>
|
case MimeType.HtmlMatch(mt) =>
|
||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import docspell.convert.extern.OcrMyPdfConfig
|
||||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
|
||||||
@ -9,5 +10,6 @@ case class ConvertConfig(
|
|||||||
markdown: MarkdownConfig,
|
markdown: MarkdownConfig,
|
||||||
wkhtmlpdf: WkHtmlPdfConfig,
|
wkhtmlpdf: WkHtmlPdfConfig,
|
||||||
tesseract: TesseractConfig,
|
tesseract: TesseractConfig,
|
||||||
unoconv: UnoconvConfig
|
unoconv: UnoconvConfig,
|
||||||
|
ocrmypdf: OcrMyPdfConfig
|
||||||
)
|
)
|
||||||
|
@ -41,7 +41,7 @@ private[extern] object ExternConv {
|
|||||||
|
|
||||||
in.through(createInput).flatMap { _ =>
|
in.through(createInput).flatMap { _ =>
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](
|
.exec[F](
|
||||||
sysCfg,
|
sysCfg,
|
||||||
blocker,
|
blocker,
|
||||||
logger,
|
logger,
|
||||||
@ -65,11 +65,20 @@ private[extern] object ExternConv {
|
|||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||||
File.existsNonEmpty[F](out).flatMap {
|
File.existsNonEmpty[F](out).flatMap {
|
||||||
case true =>
|
case true if result.rc == 0 =>
|
||||||
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
val outTxt = out.resolveSibling(out.getFileName.toString + ".txt")
|
||||||
else
|
File.existsNonEmpty[F](outTxt).flatMap {
|
||||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
case true =>
|
||||||
|
successPdfTxt(
|
||||||
|
File.readAll(out, blocker, chunkSize),
|
||||||
|
File.readText(outTxt, blocker)
|
||||||
|
).pure[F]
|
||||||
|
case false =>
|
||||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||||
|
}
|
||||||
|
case true if result.rc != 0 =>
|
||||||
|
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||||
|
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||||
|
|
||||||
case false =>
|
case false =>
|
||||||
ConversionResult
|
ConversionResult
|
||||||
|
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
37
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala
vendored
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
|
||||||
|
object OcrMyPdf {
|
||||||
|
|
||||||
|
def toPDF[F[_]: Sync: ContextShift, A](
|
||||||
|
cfg: OcrMyPdfConfig,
|
||||||
|
lang: Language,
|
||||||
|
chunkSize: Int,
|
||||||
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||||
|
if (cfg.enabled) {
|
||||||
|
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||||
|
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||||
|
|
||||||
|
ExternConv.toPDF[F, A](
|
||||||
|
"ocrmypdf",
|
||||||
|
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||||
|
cfg.workingDir,
|
||||||
|
false,
|
||||||
|
blocker,
|
||||||
|
logger,
|
||||||
|
reader
|
||||||
|
)(in, handler)
|
||||||
|
} else
|
||||||
|
handler(ConversionResult.unsupportedFormat(MimeType.pdf))
|
||||||
|
|
||||||
|
}
|
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
11
modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import docspell.common.SystemCommand
|
||||||
|
|
||||||
|
case class OcrMyPdfConfig(
|
||||||
|
enabled: Boolean,
|
||||||
|
command: SystemCommand.Config,
|
||||||
|
workingDir: Path
|
||||||
|
)
|
@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
|||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
import docspell.files.{ExampleFiles, TestFiles}
|
import docspell.files.{ExampleFiles, TestFiles}
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
|
import docspell.convert.extern.OcrMyPdfConfig
|
||||||
|
|
||||||
object ConversionTest extends SimpleTestSuite with FileChecks {
|
object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||||
val blocker = TestFiles.blocker
|
val blocker = TestFiles.blocker
|
||||||
@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
|
|||||||
Duration.seconds(20)
|
Duration.seconds(20)
|
||||||
),
|
),
|
||||||
target
|
target
|
||||||
|
),
|
||||||
|
OcrMyPdfConfig(
|
||||||
|
true,
|
||||||
|
SystemCommand.Config(
|
||||||
|
"ocrmypdf",
|
||||||
|
Seq(
|
||||||
|
"-l",
|
||||||
|
"{{lang}}",
|
||||||
|
"--skip-text",
|
||||||
|
"--deskew",
|
||||||
|
"-j",
|
||||||
|
"1",
|
||||||
|
"{{infile}}",
|
||||||
|
"{{outfile}}"
|
||||||
|
),
|
||||||
|
Duration.seconds(20)
|
||||||
|
),
|
||||||
|
target
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -339,6 +339,39 @@ docspell.joex {
|
|||||||
}
|
}
|
||||||
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# The tool ocrmypdf can be used to convert pdf files to pdf files
|
||||||
|
# in order to add extracted text as a separate layer. This makes
|
||||||
|
# image-only pdfs searchable and you can select and copy/paste the
|
||||||
|
# text. It also converts pdfs into pdf/a type pdfs, which are best
|
||||||
|
# suited for archiving. So it makes sense to use this even for
|
||||||
|
# text-only pdfs.
|
||||||
|
#
|
||||||
|
# It is recommended to install ocrmypdf, but it is optional.
|
||||||
|
# If it is enabled but fails, the error is not fatal and the
|
||||||
|
# processing will continue using the original pdf for extracting
|
||||||
|
# text. You can also disable it to remove the errors from the
|
||||||
|
# processing logs.
|
||||||
|
#
|
||||||
|
# The `--skip-text` option is necessary to not fail on "text" pdfs
|
||||||
|
# (where ocr is not necessary). In this case, the pdf will be
|
||||||
|
# converted to PDF/A.
|
||||||
|
ocrmypdf = {
|
||||||
|
enabled = true
|
||||||
|
command = {
|
||||||
|
program = "ocrmypdf"
|
||||||
|
args = [
|
||||||
|
"-l", "{{lang}}",
|
||||||
|
"--skip-text",
|
||||||
|
"--deskew",
|
||||||
|
"-j", "1",
|
||||||
|
"{{infile}}",
|
||||||
|
"{{outfile}}"
|
||||||
|
]
|
||||||
|
timeout = "5 minutes"
|
||||||
|
}
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# General config for processing documents
|
# General config for processing documents
|
||||||
|
@ -64,10 +64,6 @@ object ConvertPdf {
|
|||||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||||
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
|
Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv =>
|
||||||
mime.toLocal match {
|
mime.toLocal match {
|
||||||
case MimeType.PdfMatch(_) =>
|
|
||||||
ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *>
|
|
||||||
(ra, None: Option[RAttachmentMeta]).pure[F]
|
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
val data = ctx.store.bitpeace
|
val data = ctx.store.bitpeace
|
||||||
.get(ra.fileId.id)
|
.get(ra.fileId.id)
|
||||||
|
@ -85,9 +85,10 @@ object TextExtraction {
|
|||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
)(ra: RAttachment): F[RAttachmentMeta] =
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||||
dst <- Duration.stopTime[F]
|
dst <- Duration.stopTime[F]
|
||||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
|
fids <- filesToExtract(ctx)(item, ra)
|
||||||
|
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||||
meta = item.changeMeta(
|
meta = item.changeMeta(
|
||||||
ra.id,
|
ra.id,
|
||||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
||||||
@ -151,11 +152,24 @@ object TextExtraction {
|
|||||||
|
|
||||||
/** Returns the fileIds to extract text from. First, the source file
|
/** Returns the fileIds to extract text from. First, the source file
|
||||||
* is tried. If that fails, the converted file is tried.
|
* is tried. If that fails, the converted file is tried.
|
||||||
|
*
|
||||||
|
* If the source file is a PDF, then use the converted file. This
|
||||||
|
* may then already contain the text if ocrmypdf is enabled. If it
|
||||||
|
* is disabled, both files are the same.
|
||||||
*/
|
*/
|
||||||
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
|
private def filesToExtract[F[_]: Sync](ctx: Context[F, _])(
|
||||||
|
item: ItemData,
|
||||||
|
ra: RAttachment
|
||||||
|
): F[List[Ident]] =
|
||||||
item.originFile.get(ra.id) match {
|
item.originFile.get(ra.id) match {
|
||||||
case Some(sid) => List(sid, ra.fileId).distinct
|
case Some(sid) =>
|
||||||
case None => List(ra.fileId)
|
ctx.store.transact(RFileMeta.findMime(sid)).map {
|
||||||
|
case Some(MimeType.PdfMatch(_)) =>
|
||||||
|
List(ra.fileId)
|
||||||
|
case _ =>
|
||||||
|
List(sid, ra.fileId).distinct
|
||||||
|
}
|
||||||
|
case None => List(ra.fileId).pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
private def stripAttachmentName(ra: RAttachment): String =
|
private def stripAttachmentName(ra: RAttachment): String =
|
||||||
|
@ -23,3 +23,4 @@ Some early information about certain details can be found in a few
|
|||||||
- [0012 Periodic Tasks](adr/0012_periodic_tasks)
|
- [0012 Periodic Tasks](adr/0012_periodic_tasks)
|
||||||
- [0013 Archive Files](adr/0013_archive_files)
|
- [0013 Archive Files](adr/0013_archive_files)
|
||||||
- [0014 Full-Text Search](adr/0014_fulltext_search_engine)
|
- [0014 Full-Text Search](adr/0014_fulltext_search_engine)
|
||||||
|
- [0015 Convert PDF files](adr/0015_convert_pdf_files)
|
||||||
|
67
modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
Normal file
67
modules/microsite/docs/dev/adr/0015_convert_pdf_files.md
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
---
|
||||||
|
layout: docs
|
||||||
|
title: Convert PDF Files
|
||||||
|
permalink: dev/adr/0015_convert_pdf_files
|
||||||
|
---
|
||||||
|
|
||||||
|
# {{ page.title }}
|
||||||
|
|
||||||
|
## Context and Problem Statement
|
||||||
|
|
||||||
|
Some PDFs contain only images (when coming from a scanner) and
|
||||||
|
therefore one is not able to click into the pdf and select text for
|
||||||
|
copy&paste. Also it is not searchable in a PDF viewer. These are
|
||||||
|
really shortcomings that can be fixed, especially when there is
|
||||||
|
already OCR built in.
|
||||||
|
|
||||||
|
For images, this works already as tesseract is used to create the PDF
|
||||||
|
files. Tesseract creates the files with an additional text layer
|
||||||
|
containing the OCRed text.
|
||||||
|
|
||||||
|
## Considered Options
|
||||||
|
|
||||||
|
* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an
|
||||||
|
OCR text layer to scanned PDF files, allowing them to be searched
|
||||||
|
|
||||||
|
|
||||||
|
### ocrmypdf
|
||||||
|
|
||||||
|
This is a very nice python tool, that uses tesseract to do OCR on each
|
||||||
|
page and add the extracted text as a pdf text layer to the page.
|
||||||
|
Additionally it creates PDF/A type pdfs, which are great for
|
||||||
|
archiving. This fixes exactly the things stated above.
|
||||||
|
|
||||||
|
#### Integration
|
||||||
|
|
||||||
|
Docspell already has this built in for images. When converting images
|
||||||
|
to a PDF (which is done early in processing), the process creates a
|
||||||
|
text and a PDF file. Docspell then sets the text in this step and the
|
||||||
|
text extraction step skips doing its work, if there is already text
|
||||||
|
available.
|
||||||
|
|
||||||
|
It would be possible to use the `--sidecar` option with ocrmypdf to
|
||||||
|
create a text file of the extracted text with one run, too (exactly
|
||||||
|
like it works for tesseract). But for "text" pdfs, ocrmypdf writes
|
||||||
|
some info-message into this text file:
|
||||||
|
|
||||||
|
```
|
||||||
|
[OCR skipped on page 1][OCR skipped on page 2]
|
||||||
|
```
|
||||||
|
|
||||||
|
Docspell cannot reliably tell whether this is extracted text or not.
|
||||||
|
It would be required to load the pdf and check its contents. This is a
|
||||||
|
bit of bad luck, because everything would just work already. So it
|
||||||
|
requires a (small) change in the text-extraction step. By default,
|
||||||
|
text extraction happens on the source file. For PDFs, text extraction
|
||||||
|
should now be run on the converted file, to avoid running OCR twice.
|
||||||
|
|
||||||
|
The converted pdf file is either a text-pdf in the first place,
|
||||||
|
where ocrmypdf would only convert it to a PDF/A file; or it may be a
|
||||||
|
converted file containing the OCR-ed text as a pdf layer. If ocrmypdf
|
||||||
|
is disabled, the converted file and the source file are the same for
|
||||||
|
PDFs.
|
||||||
|
|
||||||
|
## Decision Outcome
|
||||||
|
|
||||||
|
Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is
|
||||||
|
distributed under the GPL-3 license.
|
@ -77,6 +77,10 @@ component.
|
|||||||
office documents into PDF files. It uses libreoffice/openoffice.
|
office documents into PDF files. It uses libreoffice/openoffice.
|
||||||
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
|
- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
|
||||||
PDF files.
|
PDF files.
|
||||||
|
- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally
|
||||||
|
used to convert PDF to PDF files. It adds an OCR layer to scanned
|
||||||
|
PDF files to make them searchable. It also creates PDF/A files from
|
||||||
|
the input pdf.
|
||||||
|
|
||||||
The performance of `unoconv` can be improved by starting `unoconv -l`
|
The performance of `unoconv` can be improved by starting `unoconv -l`
|
||||||
in a separate process. This runs a libreoffice/openoffice listener
|
in a separate process. This runs a libreoffice/openoffice listener
|
||||||
@ -87,7 +91,7 @@ therefore avoids starting one each time `unoconv` is called.
|
|||||||
On Debian this should install all joex requirements:
|
On Debian this should install all joex requirements:
|
||||||
|
|
||||||
``` bash
|
``` bash
|
||||||
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf
|
sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,7 +13,9 @@ permalink: features
|
|||||||
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
|
- OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
|
||||||
- [Full-Text Search](doc/finding#full-text-search) based on [Apache
|
- [Full-Text Search](doc/finding#full-text-search) based on [Apache
|
||||||
SOLR](https://lucene.apache.org/solr)
|
SOLR](https://lucene.apache.org/solr)
|
||||||
- Conversion to PDF: all files are converted into a PDF file
|
- Conversion to PDF: all files are converted into a PDF file. PDFs
|
||||||
|
with only images (as often returned from scanners) are converted
|
||||||
|
into searchable PDF/A pdfs.
|
||||||
- Non-destructive: all your uploaded files are never modified and can
|
- Non-destructive: all your uploaded files are never modified and can
|
||||||
always be downloaded untouched
|
always be downloaded untouched
|
||||||
- Text is analysed to find and attach meta data automatically
|
- Text is analysed to find and attach meta data automatically
|
||||||
|
@ -3,8 +3,10 @@ package docspell.store.records
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.store.impl.Implicits._
|
import docspell.store.impl.Implicits._
|
||||||
import docspell.store.impl._
|
import docspell.store.impl._
|
||||||
|
import docspell.store.syntax.MimeTypes._
|
||||||
|
|
||||||
import bitpeace.FileMeta
|
import bitpeace.FileMeta
|
||||||
|
import bitpeace.Mimetype
|
||||||
import doobie._
|
import doobie._
|
||||||
import doobie.implicits._
|
import doobie.implicits._
|
||||||
|
|
||||||
@ -30,4 +32,13 @@ object RFileMeta {
|
|||||||
|
|
||||||
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
|
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
|
||||||
|
import bitpeace.sql._
|
||||||
|
|
||||||
|
selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid))
|
||||||
|
.query[Mimetype]
|
||||||
|
.option
|
||||||
|
.map(_.map(_.toLocal))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,6 +131,23 @@ let
|
|||||||
};
|
};
|
||||||
working-dir = "/tmp/docspell-convert";
|
working-dir = "/tmp/docspell-convert";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ocrmypdf = {
|
||||||
|
enabled = true;
|
||||||
|
command = {
|
||||||
|
program = "${pkgs.ocrmypdf}/bin/ocrmypdf";
|
||||||
|
args = [
|
||||||
|
"-l" "{{lang}}"
|
||||||
|
"--skip-text"
|
||||||
|
"--deskew"
|
||||||
|
"-j" "1"
|
||||||
|
"{{infile}}"
|
||||||
|
"{{outfile}}"
|
||||||
|
];
|
||||||
|
timeout = "5 minutes";
|
||||||
|
};
|
||||||
|
working-dir = "/tmp/docspell-convert";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
files = {
|
files = {
|
||||||
chunk-size = 524288;
|
chunk-size = 524288;
|
||||||
@ -860,6 +877,66 @@ in {
|
|||||||
process.
|
process.
|
||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
ocrmypdf = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
enabled = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.convert.ocrmypdf.enabled;
|
||||||
|
description = "Whether to use ocrmypdf to convert pdf to pdf/a.";
|
||||||
|
};
|
||||||
|
working-dir = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.convert.ocrmypdf.working-dir;
|
||||||
|
description = "Directory where the conversion processes can put their temp files";
|
||||||
|
};
|
||||||
|
command = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
program = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.convert.ocrmypdf.command.program;
|
||||||
|
description = "The path to the executable.";
|
||||||
|
};
|
||||||
|
args = mkOption {
|
||||||
|
type = types.listOf types.str;
|
||||||
|
default = defaults.convert.ocrmypdf.command.args;
|
||||||
|
description = "The arguments to the program";
|
||||||
|
};
|
||||||
|
timeout = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.convert.ocrmypdf.command.timeout;
|
||||||
|
description = "The timeout when executing the command";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.convert.ocrmypdf.command;
|
||||||
|
description = "The system command";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
# Fall back to the `ocrmypdf` attribute of the defaults set (was misspelt `orcmypdf`).
default = defaults.convert.ocrmypdf;
|
||||||
|
description = ''
|
||||||
|
The tool ocrmypdf can be used to convert pdf files to pdf files
|
||||||
|
in order to add extracted text as a separate layer. This makes
|
||||||
|
image-only pdfs searchable and you can select and copy/paste the
|
||||||
|
text. It also converts pdfs into pdf/a type pdfs, which are best
|
||||||
|
suited for archiving. So it makes sense to use this even for
|
||||||
|
text-only pdfs.
|
||||||
|
|
||||||
|
It is recommended to install ocrmypdf, but it is optional.
|
||||||
|
If it is enabled but fails, the error is not fatal and the
|
||||||
|
processing will continue using the original pdf for extracting
|
||||||
|
text. You can also disable it to remove the errors from the
|
||||||
|
processing logs.
|
||||||
|
|
||||||
|
The `--skip-text` option is necessary to not fail on "text" pdfs
|
||||||
|
(where ocr is not necessary). In this case, the pdf will be
|
||||||
|
converted to PDF/A.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
default = defaults.convert;
|
default = defaults.convert;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user