Restrict proposals for due date

Avoid dates too far in the future.
This commit is contained in:
Eike Kettner 2020-06-26 01:27:11 +02:00
parent 91da3b149e
commit d79ae6233a
6 changed files with 53 additions and 11 deletions

View File

@ -53,6 +53,9 @@ object Duration {
def days(n: Long): Duration = def days(n: Long): Duration =
apply(JDur.ofDays(n)) apply(JDur.ofDays(n))
/** Creates a duration of `n` years, where every year is counted as
  * exactly 365 days (leap days are not taken into account).
  */
def years(n: Long): Duration = {
  val daysPerYear = 365L
  days(n * daysPerYear)
}
def nanos(n: Long): Duration = def nanos(n: Long): Duration =
Duration(n) Duration(n)

View File

@ -341,6 +341,13 @@ docspell.joex {
} }
} }
# General config for processing documents
processing {
# Restricts proposals for due dates. Dates that lie this many years
# or more in the future are not proposed (a year counts as 365 days).
max-due-date-years = 10
}
# The same section is also present in the rest-server config. It is # The same section is also present in the rest-server config. It is
# used when submitting files into the job queue for processing. # used when submitting files into the job queue for processing.
# #

View File

@ -25,7 +25,8 @@ case class Config(
sendMail: MailSendConfig, sendMail: MailSendConfig,
files: Files, files: Files,
mailDebug: Boolean, mailDebug: Boolean,
fullTextSearch: Config.FullTextSearch fullTextSearch: Config.FullTextSearch,
processing: Config.Processing
) )
object Config { object Config {
@ -47,4 +48,6 @@ object Config {
final case class Migration(indexAllChunk: Int) final case class Migration(indexAllChunk: Int)
} }
/** General settings for processing documents.
  *
  * @param maxDueDateYears number of years, counted from now, beyond which
  *                        a found date is no longer proposed as a due date
  */
final case class Processing(maxDueDateYears: Int)
} }

View File

@ -8,6 +8,7 @@ import cats.effect.Sync
import docspell.analysis.contact._ import docspell.analysis.contact._
import docspell.common.MetaProposal.Candidate import docspell.common.MetaProposal.Candidate
import docspell.common._ import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler.{Context, Task} import docspell.joex.scheduler.{Context, Task}
import docspell.store.records._ import docspell.store.records._
@ -16,33 +17,42 @@ import docspell.store.records._
*/ */
object FindProposal { object FindProposal {
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = def apply[F[_]: Sync](
cfg: Config.Processing
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx => Task { ctx =>
val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels))) val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
ctx.logger.info("Starting find-proposal") *> ctx.logger.info("Starting find-proposal") *>
rmas rmas
.traverse(rm => .traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx) processAttachment(cfg, rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml)) .map(ml => rm.copy(proposals = ml))
) )
.map(rmv => data.copy(metas = rmv)) .map(rmv => data.copy(metas = rmv))
} }
def processAttachment[F[_]: Sync]( def processAttachment[F[_]: Sync](
cfg: Config.Processing,
rm: RAttachmentMeta, rm: RAttachmentMeta,
rd: Vector[NerDateLabel], rd: Vector[NerDateLabel],
ctx: Context[F, ProcessItemArgs] ctx: Context[F, ProcessItemArgs]
): F[MetaProposalList] = { ): F[MetaProposalList] = {
val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx)) val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
List(finder.find(rm.nerlabels), makeDateProposal(rd)) List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
.traverse(identity) .traverse(identity)
.map(MetaProposalList.flatten) .map(MetaProposalList.flatten)
} }
def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] = def makeDateProposal[F[_]: Sync](
cfg: Config.Processing,
dates: Vector[NerDateLabel]
): F[MetaProposalList] =
Timestamp.current[F].map { now => Timestamp.current[F].map { now =>
val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date)) val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong))
val latestFirst = dates
.filter(_.date.isBefore(maxFuture.toUtcDate))
.sortWith((l1, l2) => l1.date.isAfter(l2.date))
val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate)) val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))

View File

@ -2,7 +2,6 @@ package docspell.joex.process
import cats.effect._ import cats.effect._
import docspell.common.ProcessItemArgs import docspell.common.ProcessItemArgs
import docspell.analysis.TextAnalysisConfig
import docspell.joex.scheduler.Task import docspell.joex.scheduler.Task
import docspell.joex.Config import docspell.joex.Config
import docspell.ftsclient.FtsClient import docspell.ftsclient.FtsClient
@ -19,16 +18,16 @@ object ProcessItem {
.flatMap(Task.setProgress(40)) .flatMap(Task.setProgress(40))
.flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(Task.setProgress(60)) .flatMap(Task.setProgress(60))
.flatMap(analysisOnly[F](cfg.textAnalysis)) .flatMap(analysisOnly[F](cfg))
.flatMap(Task.setProgress(80)) .flatMap(Task.setProgress(80))
.flatMap(LinkProposal[F]) .flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99)) .flatMap(Task.setProgress(99))
def analysisOnly[F[_]: Sync]( def analysisOnly[F[_]: Sync](
cfg: TextAnalysisConfig cfg: Config
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] = )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](cfg)(item) TextAnalysis[F](cfg.textAnalysis)(item)
.flatMap(FindProposal[F]) .flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F]) .flatMap(EvalProposals[F])
.flatMap(SaveProposals[F]) .flatMap(SaveProposals[F])

View File

@ -92,6 +92,9 @@ let
text-analysis = { text-analysis = {
max-length = 10000; max-length = 10000;
}; };
# Default values for the `processing` option group.
processing = {
max-due-date-years = 10;
};
convert = { convert = {
chunk-size = 524288; chunk-size = 524288;
max-image-size = 14000000; max-image-size = 14000000;
@ -666,6 +669,23 @@ in {
description = "Settings for text analysis"; description = "Settings for text analysis";
}; };
# Declares the `processing` option group; values fall back to
# `defaults.processing` when not set.
processing = mkOption {
  description = "General config for processing documents";
  default = defaults.processing;
  type = types.submodule({
    options = {
      max-due-date-years = mkOption {
        description = ''
          Restricts proposals for due dates. Only dates earlier than this
          number of years in the future are considered.
        '';
        default = defaults.processing.max-due-date-years;
        type = types.int;
      };
    };
  });
};
convert = mkOption { convert = mkOption {
type = types.submodule({ type = types.submodule({
options = { options = {