Merge pull request #161 from eikek/date-columns

Date columns
This commit is contained in:
mergify[bot] 2020-06-26 15:34:40 +00:00 committed by GitHub
commit 1a2893358e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 226 additions and 13 deletions

View File

@@ -53,6 +53,9 @@ object Duration {
   def days(n: Long): Duration =
     apply(JDur.ofDays(n))

+  def years(n: Long): Duration =
+    days(n * 365)
+
   def nanos(n: Long): Duration =
     Duration(n)

View File

@@ -68,7 +68,7 @@ docspell.joex {
   # How often a failed job should be retried until it enters failed
   # state. If a job fails, it becomes "stuck" and will be retried
   # after a delay.
-  retries = 5
+  retries = 2

   # The delay until the next try is performed for a failed job. This
   # delay is increased exponentially with the number of retries.
@@ -341,6 +341,13 @@ docspell.joex {
     }
   }

+  # General config for processing documents
+  processing {
+    # Restricts proposals for due dates. Only dates earlier than this
+    # number of years in the future are considered.
+    max-due-date-years = 10
+  }
+
   # The same section is also present in the rest-server config. It is
   # used when submitting files into the job queue for processing.
   #

View File

@@ -25,7 +25,8 @@ case class Config(
     sendMail: MailSendConfig,
     files: Files,
     mailDebug: Boolean,
-    fullTextSearch: Config.FullTextSearch
+    fullTextSearch: Config.FullTextSearch,
+    processing: Config.Processing
 )

 object Config {
@@ -47,4 +48,6 @@ object Config {
     final case class Migration(indexAllChunk: Int)
   }
+
+  case class Processing(maxDueDateYears: Int)
 }

View File

@@ -8,6 +8,7 @@ import cats.effect.Sync
 import docspell.analysis.contact._
 import docspell.common.MetaProposal.Candidate
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.records._

@@ -16,33 +17,42 @@ import docspell.store.records._
   */
 object FindProposal {

-  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  def apply[F[_]: Sync](
+      cfg: Config.Processing
+  )(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))

       ctx.logger.info("Starting find-proposal") *>
         rmas
           .traverse(rm =>
-            processAttachment(rm, data.findDates(rm), ctx)
+            processAttachment(cfg, rm, data.findDates(rm), ctx)
               .map(ml => rm.copy(proposals = ml))
           )
           .map(rmv => data.copy(metas = rmv))
     }

   def processAttachment[F[_]: Sync](
+      cfg: Config.Processing,
       rm: RAttachmentMeta,
       rd: Vector[NerDateLabel],
       ctx: Context[F, ProcessItemArgs]
   ): F[MetaProposalList] = {
     val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
-    List(finder.find(rm.nerlabels), makeDateProposal(rd))
+    List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
       .traverse(identity)
       .map(MetaProposalList.flatten)
   }

-  def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] =
+  def makeDateProposal[F[_]: Sync](
+      cfg: Config.Processing,
+      dates: Vector[NerDateLabel]
+  ): F[MetaProposalList] =
     Timestamp.current[F].map { now =>
-      val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date))
+      val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong))
+      val latestFirst = dates
+        .filter(_.date.isBefore(maxFuture.toUtcDate))
+        .sortWith((l1, l2) => l1.date.isAfter(l2.date))
       val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
       val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))

View File

@@ -2,7 +2,6 @@ package docspell.joex.process
 import cats.effect._
 import docspell.common.ProcessItemArgs
-import docspell.analysis.TextAnalysisConfig
 import docspell.joex.scheduler.Task
 import docspell.joex.Config
 import docspell.ftsclient.FtsClient
@@ -19,16 +18,16 @@ object ProcessItem {
       .flatMap(Task.setProgress(40))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(60))
-      .flatMap(analysisOnly[F](cfg.textAnalysis))
+      .flatMap(analysisOnly[F](cfg))
       .flatMap(Task.setProgress(80))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))

   def analysisOnly[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      cfg: Config
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg)(item)
-      .flatMap(FindProposal[F])
+    TextAnalysis[F](cfg.textAnalysis)(item)
+      .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])

View File

@@ -0,0 +1,5 @@
+ALTER TABLE `item`
+MODIFY `itemdate` DATETIME NULL;
+
+ALTER TABLE `item`
+MODIFY `duedate` DATETIME NULL;

View File

@@ -35,7 +35,7 @@ let
       scheduler = {
         pool-size = 2;
         counting-scheme = "4,1";
-        retries = 5;
+        retries = 2;
         retry-delay = "1 minute";
         log-buffer-size = 500;
         wakeup-period = "30 minutes";
@@ -92,6 +92,9 @@ let
       text-analysis = {
         max-length = 10000;
       };
+      processing = {
+        max-due-date-years = 10;
+      };
       convert = {
         chunk-size = 524288;
         max-image-size = 14000000;
@@ -133,6 +136,19 @@ let
         chunk-size = 524288;
         valid-mime-types = [];
       };
full-text-search = {
enabled = false;
solr = {
url = "http://localhost:8983/solr/docspell";
commit-within = 1000;
log-verbose = false;
def-type = "lucene";
q-op = "OR";
};
migration = {
index-all-chunk = 10;
};
};
     };
   };

 in {

@@ -653,6 +669,23 @@ in {
         description = "Settings for text analysis";
       };

processing = mkOption {
type = types.submodule({
options = {
max-due-date-years = mkOption {
type = types.int;
default = defaults.processing.max-due-date-years;
description = ''
Restricts proposals for due dates. Only dates earlier than this
number of years in the future are considered.
'';
};
};
});
default = defaults.processing;
description = "General config for processing documents";
};
       convert = mkOption {
         type = types.submodule({
           options = {
@@ -860,6 +893,79 @@ in {
         default = defaults.files;
         description= "Settings for how files are stored.";
       };
full-text-search = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.full-text-search.enabled;
description = ''
The full-text search feature can be disabled. It requires an
additional index server which needs additional memory and disk
space. It can be enabled later any time.
Currently the SOLR search platform is supported.
'';
};
solr = mkOption {
type = types.submodule({
options = {
url = mkOption {
type = types.str;
default = defaults.full-text-search.solr.url;
description = "The URL to solr";
};
commit-within = mkOption {
type = types.int;
default = defaults.full-text-search.solr.commit-within;
description = "Used to tell solr when to commit the data";
};
log-verbose = mkOption {
type = types.bool;
default = defaults.full-text-search.solr.log-verbose;
description = "If true, logs request and response bodies";
};
def-type = mkOption {
type = types.str;
default = defaults.full-text-search.solr.def-type;
description = ''
The defType parameter to lucene that defines the parser to
use. You might want to try "edismax" or look here:
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
'';
};
q-op = mkOption {
type = types.str;
default = defaults.full-text-search.solr.q-op;
description = "The default combiner for tokens. One of {AND, OR}.";
};
};
});
default = defaults.full-text-search.solr;
description = "Configuration for the SOLR backend.";
};
migration = mkOption {
type = types.submodule({
options = {
index-all-chunk = mkOption {
type = types.int;
default = defaults.full-text-search.migration.index-all-chunk;
description = ''
Chunk size to use when indexing data from the database. This
many attachments are loaded into memory and pushed to the
full-text index.
'';
};
};
});
default = defaults.full-text-search.migration;
description = "Settings for running the index migration tasks";
};
};
});
default = defaults.full-text-search;
description = "Configuration for full-text search.";
};
     };
   };

View File

@@ -37,6 +37,17 @@ let
         header-value = "some-secret";
       };
     };
full-text-search = {
enabled = false;
solr = {
url = "http://localhost:8983/solr/docspell";
commit-within = 1000;
log-verbose = false;
def-type = "lucene";
q-op = "OR";
};
recreate-key = "";
};
       auth = {
         server-secret = "hex:caffee";
         session-valid = "5 minutes";
@@ -271,6 +282,75 @@ in {
         '';
       };
full-text-search = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.full-text-search.enabled;
description = ''
The full-text search feature can be disabled. It requires an
additional index server which needs additional memory and disk
space. It can be enabled later any time.
Currently the SOLR search platform is supported.
'';
};
solr = mkOption {
type = types.submodule({
options = {
url = mkOption {
type = types.str;
default = defaults.full-text-search.solr.url;
description = "The URL to solr";
};
commit-within = mkOption {
type = types.int;
default = defaults.full-text-search.solr.commit-within;
description = "Used to tell solr when to commit the data";
};
log-verbose = mkOption {
type = types.bool;
default = defaults.full-text-search.solr.log-verbose;
description = "If true, logs request and response bodies";
};
def-type = mkOption {
type = types.str;
default = defaults.full-text-search.solr.def-type;
description = ''
The defType parameter to lucene that defines the parser to
use. You might want to try "edismax" or look here:
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
'';
};
q-op = mkOption {
type = types.str;
default = defaults.full-text-search.solr.q-op;
description = "The default combiner for tokens. One of {AND, OR}.";
};
};
});
default = defaults.full-text-search.solr;
description = "Configuration for the SOLR backend.";
};
recreate-key = mkOption {
type = types.str;
default = defaults.full-text-search.recreate-key;
description = ''
When re-creating the complete index via a REST call, this key
is required. If left empty (the default), recreating the index
is disabled.
Example curl command:
curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
'';
};
};
});
default = defaults.full-text-search;
description = "Configuration for full-text search.";
};
       backend = mkOption {
         type = types.submodule({
           options = {