Adopt nix modules to new config

2025-09-15 21:46:53 +00:00 · 2020-02-22 12:40:56 +01:00
parent 74a037887d
commit ec419c7bfd
11 changed files with 378 additions and 122 deletions
--- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala
@@ -17,11 +17,11 @@ object Tesseract {
      blocker: Blocker,
      logger: Logger[F]
  )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
-    val outBase = cfg.cmd.args.tail.headOption.getOrElse("out")
+    val outBase = cfg.command.args.tail.headOption.getOrElse("out")
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
-    ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
+    ExternConv.toPDF[F, A]("tesseract", cfg.command.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
  }
 }
--- a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
@@ -4,4 +4,4 @@ import java.nio.file.Path
 import docspell.common.SystemCommand
-case class TesseractConfig (cmd: SystemCommand.Config, workingDir: Path)
+case class TesseractConfig (command: SystemCommand.Config, workingDir: Path)
--- a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala
@@ -19,7 +19,7 @@ object Unoconv {
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](blocker, chunkSize, logger)
-    ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
+    ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(in, handler)
  }
 }
--- a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala
@@ -4,4 +4,4 @@ import java.nio.file.Path
 import docspell.common.SystemCommand
-case class UnoconvConfig (cmd: SystemCommand.Config, workingDir: Path)
+case class UnoconvConfig (command: SystemCommand.Config, workingDir: Path)
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala
@@ -19,7 +19,7 @@ object WkHtmlPdf {
    val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
      ExternConv.readResult[F](blocker, chunkSize, logger)
-    ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler)
+    ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(in, handler)
  }
 }
--- a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala
+++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala
@@ -4,4 +4,4 @@ import java.nio.file.Path
 import docspell.common.SystemCommand
-case class WkHtmlPdfConfig (cmd: SystemCommand.Config, workingDir: Path)
+case class WkHtmlPdfConfig (command: SystemCommand.Config, workingDir: Path)
--- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
+++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala
@@ -154,7 +154,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
      })
  def commandsExist: Boolean =
-    commandExists(convertConfig.unoconv.cmd.program) &&
+    commandExists(convertConfig.unoconv.command.program) &&
-      commandExists(convertConfig.wkhtmlpdf.cmd.program) &&
+      commandExists(convertConfig.wkhtmlpdf.command.program) &&
-      commandExists(convertConfig.tesseract.cmd.program)
+      commandExists(convertConfig.tesseract.command.program)
 }
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -173,7 +173,7 @@ docspell.joex {
    # To convert HTML files into PDF files, the external tool
    # wkhtmltopdf is used.
    wkhtmlpdf {
-      cmd = {
+      command = {
        program = "wkhtmltopdf"
        args = [
          "-s",
@@ -191,7 +191,7 @@ docspell.joex {
    # To convert image files to PDF files, tesseract is used. This
    # also extracts the text in one go.
    tesseract = {
-      cmd = {
+      command = {
        program = "tesseract"
        args = [
          "{{infile}}",
@@ -215,7 +215,7 @@ docspell.joex {
    # a libreoffice listener by running `unoconv -l` in a separate
    # process.
    unoconv = {
-      cmd = {
+      command = {
        program = "unoconv"
        args = [
          "-f",
--- a/modules/microsite/docs/doc.md
+++ b/modules/microsite/docs/doc.md
@@ -72,7 +72,7 @@ multiple files, which are called **attachments**. And an item has
 - a **direction**: one of "incoming" or "outgoing"
 - a **name**: some item name, defaults to the file name of the
  attachments
- some **notes**: arbitraty descriptive text. You can use markdown
+- some **notes**: arbitrary descriptive text. You can use markdown
  here, which is appropriately formatted in the web application.
 ### Collective
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -30,30 +30,74 @@ let
      wakeup-period = "30 minutes";
    };
    extraction = {
-      page-range = {
+      pdf = {
-        begin = 10;
+        min-text-len = 10;
      };
-      ghostscript =  {
+
-        working-dir = "/tmp/docspell-extraction";
+      ocr = {
-        command = {
+        max-image-size = 14000000;
-          program = "${pkgs.ghostscript}/bin/gs";
+        page-range = {
-          args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ];
+          begin = 10;
-          timeout = "5 minutes";
+        };
        ghostscript =  {
          working-dir = "/tmp/docspell-extraction";
          command = {
            program = "${pkgs.ghostscript}/bin/gs";
            args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ];
            timeout = "5 minutes";
          };
        };
        unpaper = {
          command = {
            program = "${pkgs.unpaper}/bin/unpaper";
            args = [ "{{infile}}" "{{outfile}}" ];
            timeout = "5 minutes";
          };
        };
        tesseract = {
          command= {
            program = "${pkgs.tesseract4}/bin/tesseract";
            args = ["{{file}}" "stdout" "-l" "{{lang}}" ];
            timeout = "5 minutes";
          };
        };
      };
-      unpaper = {
+    };
-        command = {
+    convert = {
-          program = "${pkgs.unpaper}/bin/unpaper";
+      chunk-size = 524288;
-          args = [ "{{infile}}" "{{outfile}}" ];
+      max-image-size = 14000000;
-          timeout = "5 minutes";
+
-        };
+      markdown = {
        internal-css = ''
            body { padding: 2em 5em; }
          '';
      };
      wkhtmlpdf = {
        command = {
          program = "${pkgs.wkhtmltopdf}/bin/wkhtmltopdf";
          args = ["-s" "A4" "--encoding" "UTF-8" "-" "{{outfile}}"];
          timeout = "2 minutes";
        };
        working-dir = "/tmp/docspell-convert";
      };
      tesseract = {
-        command= {
+        command = {
          program = "${pkgs.tesseract4}/bin/tesseract";
-          args = ["{{file}}" "stdout" "-l" "{{lang}}" ];
+          args = ["{{infile}}" "out" "-l" "{{lang}}" "pdf" "txt"];
          timeout = "5 minutes";
        };
        working-dir = "/tmp/docspell-convert";
      };
      unoconv = {
        command = {
          program = "${pkgs.unoconv}/bin/unoconv";
          args = ["-f" "pdf" "-o" "{{outfile}}" "{{infile}}"];
          timeout = "2 minutes";
        };
        working-dir = "/tmp/docspell-convert";
      };
    };
  };
@@ -199,128 +243,164 @@ in {
      extraction = mkOption {
        type = types.submodule({
          options = {
-            page-range = mkOption {
+            pdf = mkOption {
              type = types.submodule({
                options = {
-                  begin = mkOption {
+                  min-text-len = mkOption {
                    type = types.int;
-                    default = defaults.extraction.page-range.begin;
+                    default = defaults.extraction.pdf.min-text-len;
-                    description = "Specifies the first N pages of a file to process.";
+                    description = ''
                      For PDF files it is first tried to read the text parts of the
                      PDF. But PDFs can be complex documents and they may contain text
                      and images. If the returned text is shorter than the value
                      below, OCR is run afterwards. Then both extracted texts are
                      compared and the longer will be used.
                    '';
                  };
                };
              });
-              default = defaults.extraction.page-range;
+              default = defaults.extraction.pdf;
-              description = ''
+              description = "Settings for PDF extraction";
-                Defines what pages to process. If a PDF with 600 pages is
+            };
-                submitted, it is probably not necessary to scan through all of
+            ocr = mkOption {
-                them. This would take a long time and occupy resources for no
+              type = types.submodule({
-                value. The first few pages should suffice. The default is first
+                options = {
-                10 pages.
+                  max-image-size = mkOption {
                    type = types.int;
                    default = defaults.extraction.ocr.max-image-size;
                    description = ''
                      Images greater than this size are skipped. Note that every
                      image is loaded completely into memory for doing OCR.
                    '';
                  };
                  page-range = mkOption {
                    type = types.submodule({
                      options = {
                        # Number of leading pages of a file that are processed.
                        # The defaults set nests `page-range` under
                        # `extraction.ocr`, so the default is read from that path.
                        begin = mkOption {
                          type = types.int;
                          default = defaults.extraction.ocr.page-range.begin;
                          description = "Specifies the first N pages of a file to process.";
                        };
                      };
                    });
                    default = defaults.extraction.page-range;
                    description = ''
                      Defines what pages to process. If a PDF with 600 pages is
                      submitted, it is probably not necessary to scan through all of
                      them. This would take a long time and occupy resources for no
                      value. The first few pages should suffice. The default is first
                      10 pages.
-                If you want all pages being processed, set this number to -1.
+                      If you want all pages being processed, set this number to -1.
-                Note: if you change the ghostscript command below, be aware that
+                      Note: if you change the ghostscript command below, be aware that
-                this setting (if not -1) will add another parameter to the
+                      this setting (if not -1) will add another parameter to the
-                beginning of the command.
+                      beginning of the command.
-              '';
+                    '';
            };
            ghostscript = mkOption {
              type = types.submodule({
                options = {
                  working-dir = mkOption {
                    type = types.str;
                    default = defaults.extraction.ghostscript.working-dir;
                    description = "Directory where the extraction processes can put their temp files";
                  };
-                  command = mkOption {
+                  ghostscript = mkOption {
                    type = types.submodule({
                      options = {
-                        program = mkOption {
+                        working-dir = mkOption {
                          type = types.str;
-                          default = defaults.extraction.ghostscript.command.program;
+                          default = defaults.extraction.ghostscript.working-dir;
-                          description = "The path to the executable.";
+                          description = "Directory where the extraction processes can put their temp files";
                        };
-                        args = mkOption {
+                        command = mkOption {
-                          type = types.listOf types.str;
+                          type = types.submodule({
-                          default = defaults.extraction.ghostscript.command.args;
+                            options = {
-                          description = "The arguments to the program";
+                              program = mkOption {
-                        };
+                                type = types.str;
-                        timeout = mkOption {
+                                default = defaults.extraction.ghostscript.command.program;
-                          type = types.str;
+                                description = "The path to the executable.";
-                          default = defaults.extraction.ghostscript.command.timeout;
+                              };
-                          description = "The timeout when executing the command";
+                              args = mkOption {
                                type = types.listOf types.str;
                                default = defaults.extraction.ghostscript.command.args;
                                description = "The arguments to the program";
                              };
                              timeout = mkOption {
                                type = types.str;
                                default = defaults.extraction.ghostscript.command.timeout;
                                description = "The timeout when executing the command";
                              };
                            };
                          });
                          default = defaults.extraction.ghostscript.command;
                          description = "The system command";
                        };
                      };
                    });
-                    default = defaults.extraction.ghostscript.command;
+                    default = defaults.extraction.ghostscript;
-                    description = "The system command";
+                    description = "The ghostscript command.";
                  };
-                };
+                  unpaper = mkOption {
              });
              default = defaults.extraction.ghostscript;
              description = "The ghostscript command.";
            };
            unpaper = mkOption {
              type = types.submodule({
                options = {
                  command = mkOption {
                    type = types.submodule({
                      options = {
-                        program = mkOption {
+                        command = mkOption {
-                          type = types.str;
+                          type = types.submodule({
-                          default = defaults.extraction.unpaper.command.program;
+                            options = {
-                          description = "The path to the executable.";
+                              program = mkOption {
-                        };
+                                type = types.str;
-                        args = mkOption {
+                                default = defaults.extraction.unpaper.command.program;
-                          type = types.listOf types.str;
+                                description = "The path to the executable.";
-                          default = defaults.extraction.unpaper.command.args;
+                              };
-                          description = "The arguments to the program";
+                              args = mkOption {
-                        };
+                                type = types.listOf types.str;
-                        timeout = mkOption {
+                                default = defaults.extraction.unpaper.command.args;
-                          type = types.str;
+                                description = "The arguments to the program";
-                          default = defaults.extraction.unpaper.command.timeout;
+                              };
-                          description = "The timeout when executing the command";
+                              timeout = mkOption {
                                type = types.str;
                                default = defaults.extraction.unpaper.command.timeout;
                                description = "The timeout when executing the command";
                              };
                            };
                          });
                          default = defaults.extraction.unpaper.command;
                          description = "The system command";
                        };
                      };
                    });
-                    default = defaults.extraction.unpaper.command;
+                    default = defaults.extraction.unpaper;
-                    description = "The system command";
+                    description = "The unpaper command.";
                  };
-                };
+                  tesseract = mkOption {
              });
              default = defaults.extraction.unpaper;
              description = "The unpaper command.";
            };
            tesseract = mkOption {
              type = types.submodule({
                options = {
                  command = mkOption {
                    type = types.submodule({
                      options = {
-                        program = mkOption {
+                        command = mkOption {
-                          type = types.str;
+                          type = types.submodule({
-                          default = defaults.extraction.tesseract.command.program;
+                            options = {
-                          description = "The path to the executable.";
+                              program = mkOption {
-                        };
+                                type = types.str;
-                        args = mkOption {
+                                default = defaults.extraction.tesseract.command.program;
-                          type = types.listOf types.str;
+                                description = "The path to the executable.";
-                          default = defaults.extraction.tesseract.command.args;
+                              };
-                          description = "The arguments to the program";
+                              args = mkOption {
-                        };
+                                type = types.listOf types.str;
-                        timeout = mkOption {
+                                default = defaults.extraction.tesseract.command.args;
-                          type = types.str;
+                                description = "The arguments to the program";
-                          default = defaults.extraction.tesseract.command.timeout;
+                              };
-                          description = "The timeout when executing the command";
+                              timeout = mkOption {
                                type = types.str;
                                default = defaults.extraction.tesseract.command.timeout;
                                description = "The timeout when executing the command";
                              };
                            };
                          });
                          default = defaults.extraction.tesseract.command;
                          description = "The system command";
                        };
                      };
                    });
-                    default = defaults.extraction.tesseract.command;
+                    default = defaults.extraction.tesseract;
-                    description = "The system command";
+                    description = "The tesseract command.";
                  };
                };
              });
-              default = defaults.extraction.tesseract;
+              default = defaults.extraction.ocr;
-              description = "The tesseract command.";
+              description = "";
            };
          };
        });
@@ -336,6 +416,182 @@ in {
          below.
        '';
      };
      convert = mkOption {
        type = types.submodule({
          options = {
            chunk-size = mkOption {
              type = types.int;
              default = defaults.convert.chunk-size;
              description = ''
                The chunk size used when storing files. This should be the same
                as used with the rest server.
              '';
            };
            max-image-size = mkOption {
              type = types.int;
              default = defaults.convert.max-image-size;
              description = ''
                When reading images, this is the maximum size. Images that are
                larger are not processed.
              '';
            };
            markdown = mkOption {
              type = types.submodule({
                options = {
                  internal-css = mkOption {
                    type = types.str;
                    default = defaults.convert.markdown.internal-css;
                    description = ''
                      The CSS that is used to style the resulting HTML.
                    '';
                  };
                };
              });
              default = defaults.convert.markdown;
              description = ''
                Settings when processing markdown files (and other text files)
                to HTML.
                In order to support text formats, text files are first converted
                to HTML using a markdown processor. The resulting HTML is then
                converted to a PDF file.
              '';
            };
            # Option declaration for the wkhtmltopdf converter: a working
            # directory plus the system command (program, args, timeout).
            # All defaults are taken from `defaults.convert.wkhtmlpdf`.
            wkhtmlpdf = mkOption {
              type = types.submodule({
                options = {
                  working-dir = mkOption {
                    type = types.str;
                    # The attribute is spelled `wkhtmlpdf` in the defaults set;
                    # the former `wktmlpdf` spelling named a missing attribute.
                    default = defaults.convert.wkhtmlpdf.working-dir;
                    description = "Directory where the conversion processes can put their temp files";
                  };
                  command = mkOption {
                    type = types.submodule({
                      options = {
                        program = mkOption {
                          type = types.str;
                          default = defaults.convert.wkhtmlpdf.command.program;
                          description = "The path to the executable.";
                        };
                        args = mkOption {
                          type = types.listOf types.str;
                          default = defaults.convert.wkhtmlpdf.command.args;
                          description = "The arguments to the program";
                        };
                        # Timeout is declared as a string (the defaults use
                        # values such as "2 minutes").
                        timeout = mkOption {
                          type = types.str;
                          default = defaults.convert.wkhtmlpdf.command.timeout;
                          description = "The timeout when executing the command";
                        };
                      };
                    });
                    default = defaults.convert.wkhtmlpdf.command;
                    description = "The system command";
                  };
                };
              });
              default = defaults.convert.wkhtmlpdf;
              description = ''
                To convert HTML files into PDF files, the external tool
                wkhtmltopdf is used.
              '';
            };
            tesseract = mkOption {
              type = types.submodule({
                options = {
                  working-dir = mkOption {
                    type = types.str;
                    default = defaults.convert.tesseract.working-dir;
                    description = "Directory where the conversion processes can put their temp files";
                  };
                  command = mkOption {
                    type = types.submodule({
                      options = {
                        program = mkOption {
                          type = types.str;
                          default = defaults.convert.tesseract.command.program;
                          description = "The path to the executable.";
                        };
                        args = mkOption {
                          type = types.listOf types.str;
                          default = defaults.convert.tesseract.command.args;
                          description = "The arguments to the program";
                        };
                        timeout = mkOption {
                          type = types.str;
                          default = defaults.convert.tesseract.command.timeout;
                          description = "The timeout when executing the command";
                        };
                      };
                    });
                    default = defaults.convert.tesseract.command;
                    description = "The system command";
                  };
                };
              });
              default = defaults.convert.tesseract;
              description = ''
                To convert image files to PDF files, tesseract is used. This
                also extracts the text in one go.
              '';
            };
            unoconv = mkOption {
              type = types.submodule({
                options = {
                  working-dir = mkOption {
                    type = types.str;
                    default = defaults.convert.unoconv.working-dir;
                    description = "Directory where the conversion processes can put their temp files";
                  };
                  command = mkOption {
                    type = types.submodule({
                      options = {
                        program = mkOption {
                          type = types.str;
                          default = defaults.convert.unoconv.command.program;
                          description = "The path to the executable.";
                        };
                        args = mkOption {
                          type = types.listOf types.str;
                          default = defaults.convert.unoconv.command.args;
                          description = "The arguments to the program";
                        };
                        timeout = mkOption {
                          type = types.str;
                          default = defaults.convert.unoconv.command.timeout;
                          description = "The timeout when executing the command";
                        };
                      };
                    });
                    default = defaults.convert.unoconv.command;
                    description = "The system command";
                  };
                };
              });
              default = defaults.convert.unoconv;
              description = ''
                To convert "office" files to PDF files, the external tool
                unoconv is used. Unoconv uses libreoffice/openoffice for
                converting. So it supports all formats that are possible to read
                with libreoffice/openoffice.
                Note: to greatly improve performance, it is recommended to start
                a libreoffice listener by running `unoconv -l` in a separate
                process.
              '';
            };
          };
        });
        default = defaults.convert;
        description = ''
          Configuration for converting files into PDFs.
          Most of it is delegated to external tools, which can be configured
          below. They must be in the PATH environment or specify the full
          path below via the `program` key.
        '';
      };
    };
  };
--- a/nix/module-server.nix
+++ b/nix/module-server.nix
@@ -34,7 +34,7 @@ let
      };
      files = {
        chunk-size = 524288;
-        valid-mime-types = ["application/pdf"];
+        valid-mime-types = [];
      };
    };
  };
`@@ -4,4 +4,4 @@ import java.nio.file.Path`

	`import docspell.common.SystemCommand`	`import docspell.common.SystemCommand`

	`case class TesseractConfig (cmd: SystemCommand.Config, workingDir: Path)`	`case class TesseractConfig (command: SystemCommand.Config, workingDir: Path)`