diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala index 233cfa96..1a6b966d 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -17,11 +17,11 @@ object Tesseract { blocker: Blocker, logger: Logger[F] )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { - val outBase = cfg.cmd.args.tail.headOption.getOrElse("out") + val outBase = cfg.command.args.tail.headOption.getOrElse("out") val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger) - ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler) + ExternConv.toPDF[F, A]("tesseract", cfg.command.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler) } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala index f5bef831..51f25c23 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala @@ -4,4 +4,4 @@ import java.nio.file.Path import docspell.common.SystemCommand -case class TesseractConfig (cmd: SystemCommand.Config, workingDir: Path) +case class TesseractConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala index a6bb5b04..0f362428 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -19,7 +19,7 @@ 
object Unoconv { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResult[F](blocker, chunkSize, logger) - ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler) + ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(in, handler) } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala index da4af43c..70fd7975 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala @@ -4,4 +4,4 @@ import java.nio.file.Path import docspell.common.SystemCommand -case class UnoconvConfig (cmd: SystemCommand.Config, workingDir: Path) +case class UnoconvConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala index 0c5657c1..7b70a78f 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -19,7 +19,7 @@ object WkHtmlPdf { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResult[F](blocker, chunkSize, logger) - ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler) + ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(in, handler) } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala index 11d8aa1c..3be06951 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala +++ 
b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala @@ -4,4 +4,4 @@ import java.nio.file.Path import docspell.common.SystemCommand -case class WkHtmlPdfConfig (cmd: SystemCommand.Config, workingDir: Path) +case class WkHtmlPdfConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index 294ce4db..3c6eebc5 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -154,7 +154,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks { }) def commandsExist: Boolean = - commandExists(convertConfig.unoconv.cmd.program) && - commandExists(convertConfig.wkhtmlpdf.cmd.program) && - commandExists(convertConfig.tesseract.cmd.program) + commandExists(convertConfig.unoconv.command.program) && + commandExists(convertConfig.wkhtmlpdf.command.program) && + commandExists(convertConfig.tesseract.command.program) } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 12f55c9f..9712f54d 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -173,7 +173,7 @@ docspell.joex { # To convert HTML files into PDF files, the external tool # wkhtmltopdf is used. wkhtmlpdf { - cmd = { + command = { program = "wkhtmltopdf" args = [ "-s", @@ -191,7 +191,7 @@ docspell.joex { # To convert image files to PDF files, tesseract is used. This # also extracts the text in one go. tesseract = { - cmd = { + command = { program = "tesseract" args = [ "{{infile}}", @@ -215,7 +215,7 @@ docspell.joex { # a libreoffice listener by running `unoconv -l` in a separate # process. 
unoconv = { - cmd = { + command = { program = "unoconv" args = [ "-f", diff --git a/modules/microsite/docs/doc.md b/modules/microsite/docs/doc.md index 34f59942..b7bfd088 100644 --- a/modules/microsite/docs/doc.md +++ b/modules/microsite/docs/doc.md @@ -72,7 +72,7 @@ multiple files, which are called **attachments**. And an item has - a **direction**: one of "incoming" or "outgoing" - a **name**: some item name, defaults to the file name of the attachments -- some **notes**: arbitraty descriptive text. You can use markdown +- some **notes**: arbitrary descriptive text. You can use markdown here, which is appropriately formatted in the web application. ### Collective diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 516947fa..9c39651f 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -30,30 +30,74 @@ let wakeup-period = "30 minutes"; }; extraction = { - page-range = { - begin = 10; + pdf = { + min-text-len = 10; }; - ghostscript = { - working-dir = "/tmp/docspell-extraction"; - command = { - program = "${pkgs.ghostscript}/bin/gs"; - args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ]; - timeout = "5 minutes"; + + ocr = { + max-image-size = 14000000; + page-range = { + begin = 10; + }; + ghostscript = { + working-dir = "/tmp/docspell-extraction"; + command = { + program = "${pkgs.ghostscript}/bin/gs"; + args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ]; + timeout = "5 minutes"; + }; + }; + unpaper = { + command = { + program = "${pkgs.unpaper}/bin/unpaper"; + args = [ "{{infile}}" "{{outfile}}" ]; + timeout = "5 minutes"; + }; + }; + tesseract = { + command= { + program = "${pkgs.tesseract4}/bin/tesseract"; + args = ["{{file}}" "stdout" "-l" "{{lang}}" ]; + timeout = "5 minutes"; + }; }; }; - unpaper = { - command = { - program = "${pkgs.unpaper}/bin/unpaper"; - args = [ "{{infile}}" "{{outfile}}" ]; - timeout = "5 minutes"; - }; + }; + 
convert = { + chunk-size = 524288; + max-image-size = 14000000; + + markdown = { + internal-css = '' + body { padding: 2em 5em; } + ''; }; + + wkhtmlpdf = { + command = { + program = "${pkgs.wkhtmltopdf}/bin/wkhtmltopdf"; + args = ["-s" "A4" "--encoding" "UTF-8" "-" "{{outfile}}"]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + tesseract = { - command= { + command = { program = "${pkgs.tesseract4}/bin/tesseract"; - args = ["{{file}}" "stdout" "-l" "{{lang}}" ]; + args = ["{{infile}}" "out" "-l" "{{lang}}" "pdf" "txt"]; timeout = "5 minutes"; }; + working-dir = "/tmp/docspell-convert"; + }; + + unoconv = { + command = { + program = "${pkgs.unoconv}/bin/unoconv"; + args = ["-f" "pdf" "-o" "{{outfile}}" "{{infile}}"]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; }; }; }; @@ -199,128 +243,164 @@ in { extraction = mkOption { type = types.submodule({ options = { - page-range = mkOption { + pdf = mkOption { type = types.submodule({ options = { - begin = mkOption { + min-text-len = mkOption { type = types.int; - default = defaults.extraction.page-range.begin; - description = "Specifies the first N pages of a file to process."; + default = defaults.extraction.pdf.min-text-len; + description = '' + For PDF files it is first tried to read the text parts of the + PDF. But PDFs can be complex documents and they may contain text + and images. If the returned text is shorter than the value + below, OCR is run afterwards. Then both extracted texts are + compared and the longer will be used. + ''; }; }; }); - default = defaults.extraction.page-range; - description = '' - Defines what pages to process. If a PDF with 600 pages is - submitted, it is probably not necessary to scan through all of - them. This would take a long time and occupy resources for no - value. The first few pages should suffice. The default is first - 10 pages. 
+ default = defaults.extraction.pdf; + description = "Settings for PDF extraction"; + }; + ocr = mkOption { + type = types.submodule({ + options = { + max-image-size = mkOption { + type = types.int; + default = defaults.extraction.ocr.max-image-size; + description = '' + Images greater than this size are skipped. Note that every + image is loaded completely into memory for doing OCR. + ''; + }; + page-range = mkOption { + type = types.submodule({ + options = { + begin = mkOption { + type = types.int; + default = defaults.extraction.ocr.page-range.begin; + description = "Specifies the first N pages of a file to process."; + }; + }; + }); + default = defaults.extraction.ocr.page-range; + description = '' + Defines what pages to process. If a PDF with 600 pages is + submitted, it is probably not necessary to scan through all of + them. This would take a long time and occupy resources for no + value. The first few pages should suffice. The default is first + 10 pages. - If you want all pages being processed, set this number to -1. + If you want all pages being processed, set this number to -1. - Note: if you change the ghostscript command below, be aware that - this setting (if not -1) will add another parameter to the - beginning of the command. - ''; - }; - ghostscript = mkOption { - type = types.submodule({ - options = { - working-dir = mkOption { - type = types.str; - default = defaults.extraction.ghostscript.working-dir; - description = "Directory where the extraction processes can put their temp files"; + Note: if you change the ghostscript command below, be aware that + this setting (if not -1) will add another parameter to the + beginning of the command. 
+ ''; }; + ghostscript = mkOption { type = types.submodule({ options = { - program = mkOption { + working-dir = mkOption { type = types.str; - default = defaults.extraction.ghostscript.command.program; - description = "The path to the executable."; + default = defaults.extraction.ocr.ghostscript.working-dir; + description = "Directory where the extraction processes can put their temp files"; }; - args = mkOption { - type = types.listOf types.str; - default = defaults.extraction.ghostscript.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.ghostscript.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ocr.ghostscript.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.ghostscript.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.ghostscript.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.ghostscript.command; + description = "The system command"; }; }; }); - default = defaults.extraction.ghostscript.command; - description = "The system command"; + default = defaults.extraction.ocr.ghostscript; + description = "The ghostscript command."; }; - }; - }); - default = defaults.extraction.ghostscript; - description = "The ghostscript command."; - }; - unpaper = mkOption { - type = types.submodule({ - options = { - command = mkOption { + unpaper = mkOption { type = types.submodule({ options = { - program = mkOption { - type = types.str; - default = defaults.extraction.unpaper.command.program; - description = "The path to the executable."; - }; - args 
= mkOption { - type = types.listOf types.str; - default = defaults.extraction.unpaper.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.unpaper.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ocr.unpaper.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.unpaper.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.unpaper.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.unpaper.command; + description = "The system command"; }; }; }); - default = defaults.extraction.unpaper.command; - description = "The system command"; + default = defaults.extraction.ocr.unpaper; + description = "The unpaper command."; }; - }; - }); - default = defaults.extraction.unpaper; - description = "The unpaper command."; - }; - tesseract = mkOption { - type = types.submodule({ - options = { - command = mkOption { + tesseract = mkOption { type = types.submodule({ options = { - program = mkOption { - type = types.str; - default = defaults.extraction.tesseract.command.program; - description = "The path to the executable."; - }; - args = mkOption { - type = types.listOf types.str; - default = defaults.extraction.tesseract.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.tesseract.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = 
defaults.extraction.ocr.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.tesseract.command; + description = "The system command"; }; }; }); - default = defaults.extraction.tesseract.command; - description = "The system command"; + default = defaults.extraction.ocr.tesseract; + description = "The tesseract command."; }; + }; }); - default = defaults.extraction.tesseract; - description = "The tesseract command."; + default = defaults.extraction.ocr; + description = ""; }; }; }); @@ -336,6 +416,182 @@ in { below. ''; }; + + convert = mkOption { + type = types.submodule({ + options = { + chunk-size = mkOption { + type = types.int; + default = defaults.convert.chunk-size; + description = '' + The chunk size used when storing files. This should be the same + as used with the rest server. + ''; + }; + max-image-size = mkOption { + type = types.int; + default = defaults.convert.max-image-size; + description = '' + When reading images, this is the maximum size. Images that are + larger are not processed. + ''; + }; + markdown = mkOption { + type = types.submodule({ + options = { + internal-css = mkOption { + type = types.str; + default = defaults.convert.markdown.internal-css; + description = '' + The CSS that is used to style the resulting HTML. + ''; + }; + }; + }); + default = defaults.convert.markdown; + description = '' + Settings when processing markdown files (and other text files) + to HTML. + + In order to support text formats, text files are first converted + to HTML using a markdown processor. The resulting HTML is then + converted to a PDF file. 
+ ''; + }; + wkhtmlpdf = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.wkhtmlpdf.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf; + description = '' + To convert HTML files into PDF files, the external tool + wkhtmltopdf is used. 
+ ''; + }; + tesseract = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.tesseract.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.tesseract.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.tesseract; + description = '' + To convert image files to PDF files, tesseract is used. This + also extracts the text in one go. 
+ ''; + }; + unoconv = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.unoconv.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.unoconv.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.unoconv.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.unoconv; + description = '' + To convert "office" files to PDF files, the external tool + unoconv is used. Unoconv uses libreoffice/openoffice for + converting. So it supports all formats that are possible to read + with libreoffice/openoffice. + + Note: to greatly improve performance, it is recommended to start + a libreoffice listener by running `unoconv -l` in a separate + process. + ''; + }; + }; + }); + default = defaults.convert; + description = '' + Configuration for converting files into PDFs. + + Most of it is delegated to external tools, which can be configured + below. They must be in the PATH environment or specify the full + path below via the `program` key. + ''; + }; }; }; diff --git a/nix/module-server.nix b/nix/module-server.nix index 7a160499..0ac4922b 100644 --- a/nix/module-server.nix +++ b/nix/module-server.nix @@ -34,7 +34,7 @@ let }; files = { chunk-size = 524288; - valid-mime-types = ["application/pdf"]; + valid-mime-types = []; }; }; };