diff --git a/docker/joex.dockerfile b/docker/joex.dockerfile index 636926bb..2ef463c1 100644 --- a/docker/joex.dockerfile +++ b/docker/joex.dockerfile @@ -19,6 +19,17 @@ RUN apk add --no-cache openjdk11-jre \ ttf-dejavu \ ttf-freefont \ ttf-liberation \ + libxml2-dev \ + libxslt-dev \ + pngquant \ + zlib-dev \ + g++ \ + qpdf \ + python3-dev \ + libffi-dev\ + qpdf-dev \ + && pip3 install --upgrade pip \ + && pip3 install ocrmypdf \ && curl -Ls $UNO_URL -o /usr/local/bin/unoconv \ && chmod +x /usr/local/bin/unoconv \ && ln -s /usr/bin/python3 /usr/bin/python \ @@ -27,7 +38,7 @@ RUN apk add --no-cache openjdk11-jre \ && curl -L -o docspell.zip https://github.com/eikek/docspell/releases/download/v0.8.0/docspell-joex-0.8.0.zip \ && unzip docspell.zip \ && rm docspell.zip \ - && apk del curl unzip + && apk del curl unzip libxml2-dev libxslt-dev zlib-dev g++ python3-dev libffi-dev qpdf-dev COPY entrypoint-joex.sh /opt/entrypoint.sh diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index 518340f3..589e9db7 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -8,7 +8,7 @@ import fs2._ import docspell.common._ import docspell.convert.ConversionResult.Handler -import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf} +import docspell.convert.extern._ import docspell.convert.flexmark.Markdown import docspell.files.{ImageSize, TikaMimetype} @@ -35,7 +35,8 @@ object Conversion { ): F[A] = TikaMimetype.resolve(dataType, in).flatMap { case MimeType.PdfMatch(_) => - handler.run(ConversionResult.successPdf(in)) + OcrMyPdf + .toPDF(cfg.ocrmypdf, lang, cfg.chunkSize, blocker, logger)(in, handler) case MimeType.HtmlMatch(mt) => val cs = mt.charsetOrUtf8 diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala 
b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala index b268190c..f51791c0 100644 --- a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -1,5 +1,6 @@ package docspell.convert +import docspell.convert.extern.OcrMyPdfConfig import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} import docspell.convert.flexmark.MarkdownConfig @@ -9,5 +10,6 @@ case class ConvertConfig( markdown: MarkdownConfig, wkhtmlpdf: WkHtmlPdfConfig, tesseract: TesseractConfig, - unoconv: UnoconvConfig + unoconv: UnoconvConfig, + ocrmypdf: OcrMyPdfConfig ) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala index 677affdf..dcb02206 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -41,7 +41,7 @@ private[extern] object ExternConv { in.through(createInput).flatMap { _ => SystemCommand - .execSuccess[F]( + .exec[F]( sysCfg, blocker, logger, @@ -65,11 +65,20 @@ private[extern] object ExternConv { logger: Logger[F] )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = File.existsNonEmpty[F](out).flatMap { - case true => - if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F] - else - logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + case true if result.rc == 0 => + val outTxt = out.resolveSibling(out.getFileName.toString + ".txt") + File.existsNonEmpty[F](outTxt).flatMap { + case true => + successPdfTxt( + File.readAll(out, blocker, chunkSize), + File.readText(outTxt, blocker) + ).pure[F] + case false => successPdf(File.readAll(out, blocker, chunkSize)).pure[F] + } + case true if result.rc != 0 => + logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> 
+ successPdf(File.readAll(out, blocker, chunkSize)).pure[F] case false => ConversionResult diff --git a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala new file mode 100644 index 00000000..c57170d8 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdf.scala @@ -0,0 +1,37 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.effect._ +import fs2.Stream + +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler + +object OcrMyPdf { + + def toPDF[F[_]: Sync: ContextShift, A]( + cfg: OcrMyPdfConfig, + lang: Language, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = + if (cfg.enabled) { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](blocker, chunkSize, logger) + + ExternConv.toPDF[F, A]( + "ocrmypdf", + cfg.command.replace(Map("{{lang}}" -> lang.iso3)), + cfg.workingDir, + false, + blocker, + logger, + reader + )(in, handler) + } else + handler(ConversionResult.unsupportedFormat(MimeType.pdf)) + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala new file mode 100644 index 00000000..218e52ad --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/OcrMyPdfConfig.scala @@ -0,0 +1,11 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class OcrMyPdfConfig( + enabled: Boolean, + command: SystemCommand.Config, + workingDir: Path +) diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index e018ec8e..ab2bbc1a 100644 --- 
a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -12,6 +12,7 @@ import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} import docspell.convert.flexmark.MarkdownConfig import docspell.files.{ExampleFiles, TestFiles} import minitest.SimpleTestSuite +import docspell.convert.extern.OcrMyPdfConfig object ConversionTest extends SimpleTestSuite with FileChecks { val blocker = TestFiles.blocker @@ -47,6 +48,24 @@ object ConversionTest extends SimpleTestSuite with FileChecks { Duration.seconds(20) ), target + ), + OcrMyPdfConfig( + true, + SystemCommand.Config( + "ocrmypdf", + Seq( + "-l", + "{{lang}}", + "--skip-text", + "--deskew", + "-j", + "1", + "{{infile}}", + "{{outfile}}" + ), + Duration.seconds(20) + ), + target ) ) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index f9d51cae..059e6d05 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -339,6 +339,39 @@ docspell.joex { } working-dir = ${java.io.tmpdir}"/docspell-convert" } + + # The tool ocrmypdf can be used to convert pdf files to pdf files + # in order to add extracted text as a separate layer. This makes + # image-only pdfs searchable and you can select and copy/paste the + # text. It also converts pdfs into pdf/a type pdfs, which are best + # suited for archiving. So it makes sense to use this even for + # text-only pdfs. + # + # It is recommended to install ocrmypdf, but it also is optional. + # If it is enabled but fails, the error is not fatal and the + # processing will continue using the original pdf for extracting + # text. You can also disable it to remove the errors from the + # processing logs. + # + # The `--skip-text` option is necessary to not fail on "text" pdfs + # (where ocr is not necessary). In this case, the pdf will be + # converted to PDF/A.
+ ocrmypdf = { + enabled = true + command = { + program = "ocrmypdf" + args = [ + "-l", "{{lang}}", + "--skip-text", + "--deskew", + "-j", "1", + "{{infile}}", + "{{outfile}}" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } } # General config for processing documents diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index b571a306..ba75ec3a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -64,10 +64,6 @@ object ConvertPdf { )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = Conversion.create[F](cfg, sanitizeHtml, ctx.blocker, ctx.logger).use { conv => mime.toLocal match { - case MimeType.PdfMatch(_) => - ctx.logger.debug(s"Not going to convert a PDF file ${ra.name} into a PDF.") *> - (ra, None: Option[RAttachmentMeta]).pure[F] - case mt => val data = ctx.store.bitpeace .get(ra.fileId.id) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 912507a5..384741e2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -85,9 +85,10 @@ object TextExtraction { item: ItemData )(ra: RAttachment): F[RAttachmentMeta] = for { - _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") - dst <- Duration.stopTime[F] - txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra)) + _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") + dst <- Duration.stopTime[F] + fids <- filesToExtract(ctx)(item, ra) + txt <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, rm => 
rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)) @@ -151,11 +152,24 @@ object TextExtraction { /** Returns the fileIds to extract text from. First, the source file * is tried. If that fails, the converted file is tried. + * + * If the source file is a PDF, then use the converted file. This + * may then already contain the text if ocrmypdf is enabled. If it + * is disabled, both files are the same. */ - private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] = + private def filesToExtract[F[_]: Sync](ctx: Context[F, _])( + item: ItemData, + ra: RAttachment + ): F[List[Ident]] = item.originFile.get(ra.id) match { - case Some(sid) => List(sid, ra.fileId).distinct - case None => List(ra.fileId) + case Some(sid) => + ctx.store.transact(RFileMeta.findMime(sid)).map { + case Some(MimeType.PdfMatch(_)) => + List(ra.fileId) + case _ => + List(sid, ra.fileId).distinct + } + case None => List(ra.fileId).pure[F] } private def stripAttachmentName(ra: RAttachment): String = diff --git a/modules/microsite/docs/dev/adr.md b/modules/microsite/docs/dev/adr.md index 67872229..8410f065 100644 --- a/modules/microsite/docs/dev/adr.md +++ b/modules/microsite/docs/dev/adr.md @@ -23,3 +23,4 @@ Some early information about certain details can be found in a few - [0012 Periodic Tasks](adr/0012_periodic_tasks) - [0013 Archive Files](adr/0013_archive_files) - [0014 Full-Text Search](adr/0014_fulltext_search_engine) +- [0015 Convert PDF files](adr/0015_convert_pdf_files) diff --git a/modules/microsite/docs/dev/adr/0015_convert_pdf_files.md b/modules/microsite/docs/dev/adr/0015_convert_pdf_files.md new file mode 100644 index 00000000..b2f3ec02 --- /dev/null +++ b/modules/microsite/docs/dev/adr/0015_convert_pdf_files.md @@ -0,0 +1,67 @@ +--- +layout: docs +title: Convert PDF Files +permalink: dev/adr/0015_convert_pdf_files +--- + +# {{ page.title }} + +## Context and Problem Statement + +Some PDFs contain only images (when coming from a scanner) and +therefore one is not 
able to click into the pdf and select text for +copy&paste. Also it is not searchable in a PDF viewer. These are +really shortcomings that can be fixed, especially when there is +already OCR built in. + +For images, this works already as tesseract is used to create the PDF +files. Tesseract creates the files with an additional text layer +containing the OCRed text. + +## Considered Options + +* [ocrmypdf](https://github.com/jbarlow83/OCRmyPDF) OCRmyPDF adds an + OCR text layer to scanned PDF files, allowing them to be searched + + +### ocrmypdf + +This is a very nice python tool, that uses tesseract to do OCR on each +page and add the extracted text as a pdf text layer to the page. +Additionally it creates PDF/A type pdfs, which are great for +archiving. This fixes exactly the things stated above. + +#### Integration + +Docspell already has this built in for images. When converting images +to a PDF (which is done early in processing), the process creates a +text and a PDF file. Docspell then sets the text in this step and the +text extraction step skips doing its work, if there is already text +available. + +It would be possible to use the `--sidecar` option with ocrmypdf to +create a text file of the extracted text with one run, too (exactly +like it works for tesseract). But for "text" pdfs, ocrmypdf writes +some info-message into this text file: + +``` +[OCR skipped on page 1][OCR skipped on page 2] +``` + +Docspell cannot reliably tell whether this is extracted text or not. +It would be required to load the pdf and check its contents. This is a +bit of bad luck, because everything would just work already. So it +requires a (small) change in the text-extraction step. By default, +text extraction happens on the source file. For PDFs, text extraction +should now be run on the converted file, to avoid running OCR twice.
+The converted pdf file is either a text-pdf in the first place, +where ocrmypdf would only convert it to a PDF/A file; or it may be a +converted file containing the OCR-ed text as a pdf layer. If ocrmypdf +is disabled, the converted file and the source file are the same for +PDFs. + +## Decision Outcome + +Add ocrmypdf as an optional conversion from PDF to PDF. Ocrmypdf is +distributed under the GPL-3 license. diff --git a/modules/microsite/docs/doc/install.md b/modules/microsite/docs/doc/install.md index ae84ae5b..90236233 100644 --- a/modules/microsite/docs/doc/install.md +++ b/modules/microsite/docs/doc/install.md @@ -77,6 +77,10 @@ component. office documents into PDF files. It uses libreoffice/openoffice. - [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into PDF files. +- [OCRmyPDF](https://github.com/jbarlow83/OCRmyPDF) can be optionally + used to convert PDF to PDF files. It adds an OCR layer to scanned + PDF files to make them searchable. It also creates PDF/A files from + the input pdf. The performance of `unoconv` can be improved by starting `unoconv -l` in a separate process. This runs a libreoffice/openoffice listener @@ -87,7 +91,7 @@ therefore avoids starting one each time `unoconv` is called.
On Debian this should install all joex requirements: ``` bash -sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf +sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ocrmypdf ``` diff --git a/modules/microsite/docs/features.md b/modules/microsite/docs/features.md index 0adbe905..85af705a 100644 --- a/modules/microsite/docs/features.md +++ b/modules/microsite/docs/features.md @@ -13,7 +13,9 @@ permalink: features - OCR using [tesseract](https://github.com/tesseract-ocr/tesseract) - [Full-Text Search](doc/finding#full-text-search) based on [Apache SOLR](https://lucene.apache.org/solr) -- Conversion to PDF: all files are converted into a PDF file +- Conversion to PDF: all files are converted into a PDF file. PDFs + with only images (as often returned from scanners) are converted + into searchable PDF/A pdfs. - Non-destructive: all your uploaded files are never modified and can always be downloaded untouched - Text is analysed to find and attach meta data automatically diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala index dc04054d..076bfd68 100644 --- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala @@ -3,8 +3,10 @@ package docspell.store.records import docspell.common._ import docspell.store.impl.Implicits._ import docspell.store.impl._ +import docspell.store.syntax.MimeTypes._ import bitpeace.FileMeta +import bitpeace.Mimetype import doobie._ import doobie.implicits._ @@ -30,4 +32,13 @@ object RFileMeta { selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option } + + def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = { + import bitpeace.sql._ + + selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid)) + .query[Mimetype] + 
.option + .map(_.map(_.toLocal)) + } } diff --git a/nix/module-joex.nix b/nix/module-joex.nix index d92afc02..6e16581f 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -131,6 +131,23 @@ let }; working-dir = "/tmp/docspell-convert"; }; + + ocrmypdf = { + enabled = true; + command = { + program = "${pkgs.ocrmypdf}/bin/ocrmypdf"; + args = [ + "-l" "{{lang}}" + "--skip-text" + "--deskew" + "-j" "1" + "{{infile}}" + "{{outfile}}" + ]; + timeout = "5 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; }; files = { chunk-size = 524288; @@ -860,6 +877,66 @@ in { process. ''; }; + + ocrmypdf = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.convert.ocrmypdf.enabled; + description = "Whether to use ocrmypdf to convert pdf to pdf/a."; + }; + working-dir = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.ocrmypdf.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.ocrmypdf.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.ocrmypdf; + description = '' + The tool ocrmypdf can be used to convert pdf files to pdf files + in order to add extracted text as a separate layer. This makes + image-only pdfs searchable and you can select and copy/paste the + text.
It also converts pdfs into pdf/a type pdfs, which are best + suited for archiving. So it makes sense to use this even for + text-only pdfs. + + It is recommended to install ocrmypdf, but it also is optional. + If it is enabled but fails, the error is not fatal and the + processing will continue using the original pdf for extracting + text. You can also disable it to remove the errors from the + processing logs. + + The `--skip-text` option is necessary to not fail on "text" pdfs + (where ocr is not necessary). In this case, the pdf will be + converted to PDF/A. + ''; + }; + }; }); default = defaults.convert;