From d79ae6233add35c63803d3c564165c53fe22f638 Mon Sep 17 00:00:00 2001
From: Eike Kettner
Date: Fri, 26 Jun 2020 01:27:11 +0200
Subject: [PATCH] Restrict proposals for due date

Avoid dates too far in the future.
---
Review notes (free-form text; not part of the commit message):

 * What the change does: a new joex setting `processing.max-due-date-years`
   (default 10, in reference.conf and the Nix module) is passed as
   `Config.Processing` through ProcessItem.analysisOnly and FindProposal
   into makeDateProposal. There, every detected date that is not before
   "now + max-due-date-years" is dropped before the dates are sorted and
   split into past and future candidates.
 * `Duration.years(n)` is defined as `days(n * 365)`, so the cut-off is an
   approximation that ignores leap days.
 * Amended during review (added lines only, hunk sizes unchanged):
     - `Config.Processing` is declared `final`, like `Migration` next to it.
     - a negative `max-due-date-years` is treated as 0; otherwise the
       cut-off would lie in the past and past dates would be dropped too.
     - a trailing comment documents the 365-day approximation.
   The blob ids on the "index" lines and the commit id in the first line
   still describe the change as originally committed.
 * Line breaks and indentation were restored from the hunk headers and the
   diffstat; verify with `git apply --check` against the target tree.

 .../main/scala/docspell/common/Duration.scala |  3 +++
 .../joex/src/main/resources/reference.conf    |  7 +++++++
 .../src/main/scala/docspell/joex/Config.scala |  5 ++++-
 .../docspell/joex/process/FindProposal.scala  | 20 ++++++++++++++-----
 .../docspell/joex/process/ProcessItem.scala   |  9 ++++-----
 nix/module-joex.nix                           | 20 +++++++++++++++++++
 6 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala
index dfda4652..f010fbd0 100644
--- a/modules/common/src/main/scala/docspell/common/Duration.scala
+++ b/modules/common/src/main/scala/docspell/common/Duration.scala
@@ -53,6 +53,9 @@ object Duration {
   def days(n: Long): Duration =
     apply(JDur.ofDays(n))
 
+  def years(n: Long): Duration =
+    days(n * 365) // approximation: a year counts as 365 days, leap days are ignored
+
   def nanos(n: Long): Duration =
     Duration(n)
 
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index bd2b19f9..f9d51cae 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -341,6 +341,13 @@ docspell.joex {
     }
   }
 
+  # General config for processing documents
+  processing {
+    # Restricts proposals for due dates. Only dates earlier than this
+    # number of years in the future are considered.
+    max-due-date-years = 10
+  }
+
   # The same section is also present in the rest-server config. It is
   # used when submitting files into the job queue for processing.
   #
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index c9c54528..095bfdc3 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -25,7 +25,8 @@ case class Config(
     sendMail: MailSendConfig,
     files: Files,
     mailDebug: Boolean,
-    fullTextSearch: Config.FullTextSearch
+    fullTextSearch: Config.FullTextSearch,
+    processing: Config.Processing
 )
 
 object Config {
@@ -47,4 +48,6 @@ object Config {
 
     final case class Migration(indexAllChunk: Int)
   }
+
+  final case class Processing(maxDueDateYears: Int)
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
index d9d629b8..aa06ee39 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
@@ -8,6 +8,7 @@ import cats.effect.Sync
 import docspell.analysis.contact._
 import docspell.common.MetaProposal.Candidate
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.records._
 
@@ -16,33 +17,42 @@ import docspell.store.records._
   */
 object FindProposal {
 
-  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  def apply[F[_]: Sync](
+      cfg: Config.Processing
+  )(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
 
       ctx.logger.info("Starting find-proposal") *>
         rmas
           .traverse(rm =>
-            processAttachment(rm, data.findDates(rm), ctx)
+            processAttachment(cfg, rm, data.findDates(rm), ctx)
               .map(ml => rm.copy(proposals = ml))
           )
           .map(rmv => data.copy(metas = rmv))
     }
 
   def processAttachment[F[_]: Sync](
+      cfg: Config.Processing,
       rm: RAttachmentMeta,
       rd: Vector[NerDateLabel],
       ctx: Context[F, ProcessItemArgs]
   ): F[MetaProposalList] = {
     val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
-    List(finder.find(rm.nerlabels), makeDateProposal(rd))
+    List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
       .traverse(identity)
       .map(MetaProposalList.flatten)
   }
 
-  def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] =
+  def makeDateProposal[F[_]: Sync](
+      cfg: Config.Processing,
+      dates: Vector[NerDateLabel]
+  ): F[MetaProposalList] =
     Timestamp.current[F].map { now =>
-      val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date))
+      val maxFuture = now.plus(Duration.years(math.max(0, cfg.maxDueDateYears).toLong))
+      val latestFirst = dates
+        .filter(_.date.isBefore(maxFuture.toUtcDate))
+        .sortWith((l1, l2) => l1.date.isAfter(l2.date))
       val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
       val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))
 
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index de5de412..1de74072 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -2,7 +2,6 @@ package docspell.joex.process
 
 import cats.effect._
 import docspell.common.ProcessItemArgs
-import docspell.analysis.TextAnalysisConfig
 import docspell.joex.scheduler.Task
 import docspell.joex.Config
 import docspell.ftsclient.FtsClient
@@ -19,16 +18,16 @@ object ProcessItem {
       .flatMap(Task.setProgress(40))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(60))
-      .flatMap(analysisOnly[F](cfg.textAnalysis))
+      .flatMap(analysisOnly[F](cfg))
       .flatMap(Task.setProgress(80))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))
 
   def analysisOnly[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      cfg: Config
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg)(item)
-      .flatMap(FindProposal[F])
+    TextAnalysis[F](cfg.textAnalysis)(item)
+      .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
 
diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index 3f22ad7f..c4ebe8fa 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -92,6 +92,9 @@ let
     text-analysis = {
       max-length = 10000;
     };
+    processing = {
+      max-due-date-years = 10;
+    };
     convert = {
       chunk-size = 524288;
       max-image-size = 14000000;
@@ -666,6 +669,23 @@ in {
         description = "Settings for text analysis";
       };
 
+      processing = mkOption {
+        type = types.submodule({
+          options = {
+            max-due-date-years = mkOption {
+              type = types.int;
+              default = defaults.processing.max-due-date-years;
+              description = ''
+                Restricts proposals for due dates. Only dates earlier than this
+                number of years in the future are considered.
+              '';
+            };
+          };
+        });
+        default = defaults.processing;
+        description = "General config for processing documents";
+      };
+
       convert = mkOption {
         type = types.submodule({
           options = {