diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index dfda4652..f010fbd0 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -53,6 +53,9 @@ object Duration { def days(n: Long): Duration = apply(JDur.ofDays(n)) + def years(n: Long): Duration = + days(n * 365) + def nanos(n: Long): Duration = Duration(n) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 16bc791a..f9d51cae 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -68,7 +68,7 @@ docspell.joex { # How often a failed job should be retried until it enters failed # state. If a job fails, it becomes "stuck" and will be retried # after a delay. - retries = 5 + retries = 2 # The delay until the next try is performed for a failed job. This # delay is increased exponentially with the number of retries. @@ -341,6 +341,13 @@ docspell.joex { } } + # General config for processing documents + processing { + # Restricts proposals for due dates. Only dates earlier than this + # number of years in the future are considered. + max-due-date-years = 10 + } + # The same section is also present in the rest-server config. It is # used when submitting files into the job queue for processing. # diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index c9c54528..095bfdc3 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -25,7 +25,8 @@ case class Config( sendMail: MailSendConfig, files: Files, mailDebug: Boolean, - fullTextSearch: Config.FullTextSearch + fullTextSearch: Config.FullTextSearch, + processing: Config.Processing ) object Config { @@ -47,4 +48,6 @@ object Config { final case class Migration(indexAllChunk: Int) } + + case class Processing(maxDueDateYears: Int) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index d9d629b8..aa06ee39 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -8,6 +8,7 @@ import cats.effect.Sync import docspell.analysis.contact._ import docspell.common.MetaProposal.Candidate import docspell.common._ +import docspell.joex.Config import docspell.joex.scheduler.{Context, Task} import docspell.store.records._ @@ -16,33 +17,42 @@ import docspell.store.records._ */ object FindProposal { - def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = + def apply[F[_]: Sync]( + cfg: Config.Processing + )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels))) ctx.logger.info("Starting find-proposal") *> rmas .traverse(rm => - processAttachment(rm, data.findDates(rm), ctx) + processAttachment(cfg, rm, data.findDates(rm), ctx) .map(ml => rm.copy(proposals = ml)) ) .map(rmv => data.copy(metas = rmv)) } def processAttachment[F[_]: Sync]( + cfg: Config.Processing, rm: RAttachmentMeta, rd: Vector[NerDateLabel], ctx: Context[F, ProcessItemArgs] ): F[MetaProposalList] = { val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx)) - List(finder.find(rm.nerlabels), makeDateProposal(rd)) + List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd)) .traverse(identity) .map(MetaProposalList.flatten) } - def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] = + def makeDateProposal[F[_]: Sync]( + cfg: Config.Processing, + dates: Vector[NerDateLabel] + ): F[MetaProposalList] = Timestamp.current[F].map { now => - val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date)) + val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong)) + val latestFirst = dates + .filter(_.date.isBefore(maxFuture.toUtcDate)) + .sortWith((l1, l2) => l1.date.isAfter(l2.date)) val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index de5de412..1de74072 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -2,7 +2,6 @@ package docspell.joex.process import cats.effect._ import docspell.common.ProcessItemArgs -import docspell.analysis.TextAnalysisConfig import docspell.joex.scheduler.Task import docspell.joex.Config import docspell.ftsclient.FtsClient @@ -19,16 +18,16 @@ object ProcessItem { .flatMap(Task.setProgress(40)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(60)) - .flatMap(analysisOnly[F](cfg.textAnalysis)) + .flatMap(analysisOnly[F](cfg)) .flatMap(Task.setProgress(80)) .flatMap(LinkProposal[F]) .flatMap(Task.setProgress(99)) def analysisOnly[F[_]: Sync]( - cfg: TextAnalysisConfig + cfg: Config )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](cfg)(item) - .flatMap(FindProposal[F]) + TextAnalysis[F](cfg.textAnalysis)(item) + .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.7.1__fix_item_date.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.7.1__fix_item_date.sql new file mode 100644 index 00000000..77c8b1f4 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.7.1__fix_item_date.sql @@ -0,0 +1,5 @@ +ALTER TABLE `item` +MODIFY `itemdate` DATETIME NULL; + +ALTER TABLE `item` +MODIFY `duedate` DATETIME NULL; diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 3b1d6f1e..c4ebe8fa 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -35,7 +35,7 @@ let scheduler = { pool-size = 2; counting-scheme = "4,1"; - retries = 5; + retries = 2; retry-delay = "1 minute"; log-buffer-size = 500; wakeup-period = "30 minutes"; @@ -92,6 +92,9 @@ let text-analysis = { max-length = 10000; }; + processing = { + max-due-date-years = 10; + }; convert = { chunk-size = 524288; max-image-size = 14000000; @@ -133,6 +136,19 @@ let chunk-size = 524288; valid-mime-types = []; }; + full-text-search = { + enabled = false; + solr = { + url = "http://localhost:8983/solr/docspell"; + commit-within = 1000; + log-verbose = false; + def-type = "lucene"; + q-op = "OR"; + }; + migration = { + index-all-chunk = 10; + }; + }; }; in { @@ -653,6 +669,23 @@ in { description = "Settings for text analysis"; }; + processing = mkOption { + type = types.submodule({ + options = { + max-due-date-years = mkOption { + type = types.int; + default = defaults.processing.max-due-date-years; + description = '' + Restricts proposals for due dates. Only dates earlier than this + number of years in the future are considered. + ''; + }; + }; + }); + default = defaults.processing; + description = "General config for processing documents"; + }; + convert = mkOption { type = types.submodule({ options = { @@ -860,6 +893,79 @@ in { default = defaults.files; description= "Settings for how files are stored."; }; + full-text-search = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.full-text-search.enabled; + description = '' + The full-text search feature can be disabled. It requires an + additional index server which needs additional memory and disk + space. It can be enabled later any time. + + Currently the SOLR search platform is supported. + ''; + }; + solr = mkOption { + type = types.submodule({ + options = { + url = mkOption { + type = types.str; + default = defaults.full-text-search.solr.url; + description = "The URL to solr"; + }; + commit-within = mkOption { + type = types.int; + default = defaults.full-text-search.solr.commit-within; + description = "Used to tell solr when to commit the data"; + }; + log-verbose = mkOption { + type = types.bool; + default = defaults.full-text-search.solr.log-verbose; + description = "If true, logs request and response bodies"; + }; + def-type = mkOption { + type = types.str; + default = defaults.full-text-search.solr.def-type; + description = '' + The defType parameter to lucene that defines the parser to + use. You might want to try "edismax" or look here: + https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing + ''; + }; + q-op = mkOption { + type = types.str; + default = defaults.full-text-search.solr.q-op; + description = "The default combiner for tokens. One of {AND, OR}."; + }; + }; + }); + default = defaults.full-text-search.solr; + description = "Configuration for the SOLR backend."; + }; + migration = mkOption { + type = types.submodule({ + options = { + index-all-chunk = mkOption { + type = types.int; + default = defaults.full-text-search.migration.index-all-chunk; + description = '' + Chunk size to use when indexing data from the database. This + many attachments are loaded into memory and pushed to the + full-text index. + ''; + }; + }; + }); + default = defaults.full-text-search.migration; + description = "Settings for running the index migration tasks"; + }; + }; + }); + default = defaults.full-text-search; + description = "Configuration for full-text search."; + }; }; }; diff --git a/nix/module-server.nix b/nix/module-server.nix index 6c40a7ae..e713b508 100644 --- a/nix/module-server.nix +++ b/nix/module-server.nix @@ -37,6 +37,17 @@ let header-value = "some-secret"; }; }; + full-text-search = { + enabled = false; + solr = { + url = "http://localhost:8983/solr/docspell"; + commit-within = 1000; + log-verbose = false; + def-type = "lucene"; + q-op = "OR"; + }; + recreate-key = ""; + }; auth = { server-secret = "hex:caffee"; session-valid = "5 minutes"; @@ -271,6 +282,75 @@ in { ''; }; + full-text-search = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.full-text-search.enabled; + description = '' + The full-text search feature can be disabled. It requires an + additional index server which needs additional memory and disk + space. It can be enabled later any time. + + Currently the SOLR search platform is supported. + ''; + }; + solr = mkOption { + type = types.submodule({ + options = { + url = mkOption { + type = types.str; + default = defaults.full-text-search.solr.url; + description = "The URL to solr"; + }; + commit-within = mkOption { + type = types.int; + default = defaults.full-text-search.solr.commit-within; + description = "Used to tell solr when to commit the data"; + }; + log-verbose = mkOption { + type = types.bool; + default = defaults.full-text-search.solr.log-verbose; + description = "If true, logs request and response bodies"; + }; + def-type = mkOption { + type = types.str; + default = defaults.full-text-search.solr.def-type; + description = '' + The defType parameter to lucene that defines the parser to + use. You might want to try "edismax" or look here: + https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing + ''; + }; + q-op = mkOption { + type = types.str; + default = defaults.full-text-search.solr.q-op; + description = "The default combiner for tokens. One of {AND, OR}."; + }; + }; + }); + default = defaults.full-text-search.solr; + description = "Configuration for the SOLR backend."; + }; + recreate-key = mkOption { + type = types.str; + default = defaults.full-text-search.recreate-key; + description = '' + When re-creating the complete index via a REST call, this key + is required. If left empty (the default), recreating the index + is disabled. + + Example curl command: + curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123 + ''; + }; + }; + }); + default = defaults.full-text-search; + description = "Configuration for full-text search."; + }; + backend = mkOption { type = types.submodule({ options = {