Merge pull request #161 from eikek/date-columns

Date columns
This commit is contained in:
mergify[bot] 2020-06-26 15:34:40 +00:00 committed by GitHub
commit 1a2893358e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 226 additions and 13 deletions

View File

@ -53,6 +53,9 @@ object Duration {
// Builds a Duration of `n` calendar days via java.time.Duration.ofDays.
def days(n: Long): Duration =
apply(JDur.ofDays(n))
// Approximates `n` years as n * 365 days — leap years are ignored, so this
// drifts by one day per leap year; fine for coarse bounds like config limits.
def years(n: Long): Duration =
days(n * 365)
// Builds a Duration directly from a nanosecond count (Duration's raw unit here).
def nanos(n: Long): Duration =
Duration(n)

View File

@ -68,7 +68,7 @@ docspell.joex {
# How often a failed job should be retried before it enters the final
# "failed" state. If a job fails, it becomes "stuck" and will be retried
# after a delay.
retries = 5
retries = 2
# The delay until the next try is performed for a failed job. This
# delay is increased exponentially with the number of retries.
@ -341,6 +341,13 @@ docspell.joex {
}
}
# General config for processing documents
processing {
# Restricts proposals for due dates. Only dates earlier than this
# number of years in the future are considered.
max-due-date-years = 10
}
# The same section is also present in the rest-server config. It is
# used when submitting files into the job queue for processing.
#

View File

@ -25,7 +25,8 @@ case class Config(
sendMail: MailSendConfig,
files: Files,
mailDebug: Boolean,
fullTextSearch: Config.FullTextSearch
fullTextSearch: Config.FullTextSearch,
processing: Config.Processing
)
object Config {
@ -47,4 +48,6 @@ object Config {
final case class Migration(indexAllChunk: Int)
}
case class Processing(maxDueDateYears: Int)
}

View File

@ -8,6 +8,7 @@ import cats.effect.Sync
import docspell.analysis.contact._
import docspell.common.MetaProposal.Candidate
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records._
@ -16,33 +17,42 @@ import docspell.store.records._
*/
object FindProposal {
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
def apply[F[_]: Sync](
cfg: Config.Processing
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
ctx.logger.info("Starting find-proposal") *>
rmas
.traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx)
processAttachment(cfg, rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml))
)
.map(rmv => data.copy(metas = rmv))
}
def processAttachment[F[_]: Sync](
cfg: Config.Processing,
rm: RAttachmentMeta,
rd: Vector[NerDateLabel],
ctx: Context[F, ProcessItemArgs]
): F[MetaProposalList] = {
val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
List(finder.find(rm.nerlabels), makeDateProposal(rd))
List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
.traverse(identity)
.map(MetaProposalList.flatten)
}
def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] =
def makeDateProposal[F[_]: Sync](
cfg: Config.Processing,
dates: Vector[NerDateLabel]
): F[MetaProposalList] =
Timestamp.current[F].map { now =>
val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date))
val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong))
val latestFirst = dates
.filter(_.date.isBefore(maxFuture.toUtcDate))
.sortWith((l1, l2) => l1.date.isAfter(l2.date))
val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))

View File

@ -2,7 +2,6 @@ package docspell.joex.process
import cats.effect._
import docspell.common.ProcessItemArgs
import docspell.analysis.TextAnalysisConfig
import docspell.joex.scheduler.Task
import docspell.joex.Config
import docspell.ftsclient.FtsClient
@ -19,16 +18,16 @@ object ProcessItem {
.flatMap(Task.setProgress(40))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(Task.setProgress(60))
.flatMap(analysisOnly[F](cfg.textAnalysis))
.flatMap(analysisOnly[F](cfg))
.flatMap(Task.setProgress(80))
.flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99))
def analysisOnly[F[_]: Sync](
cfg: TextAnalysisConfig
cfg: Config
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](cfg)(item)
.flatMap(FindProposal[F])
TextAnalysis[F](cfg.textAnalysis)(item)
.flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])

View File

@ -0,0 +1,5 @@
-- Relax the item date columns to be optional: `itemdate` and `duedate`
-- become NULLable DATETIME so items without a (due) date can be stored.
-- MySQL/MariaDB syntax (backtick identifiers, MODIFY clause).
ALTER TABLE `item`
MODIFY `itemdate` DATETIME NULL;
ALTER TABLE `item`
MODIFY `duedate` DATETIME NULL;

View File

@ -35,7 +35,7 @@ let
scheduler = {
pool-size = 2;
counting-scheme = "4,1";
retries = 5;
retries = 2;
retry-delay = "1 minute";
log-buffer-size = 500;
wakeup-period = "30 minutes";
@ -92,6 +92,9 @@ let
text-analysis = {
max-length = 10000;
};
processing = {
max-due-date-years = 10;
};
convert = {
chunk-size = 524288;
max-image-size = 14000000;
@ -133,6 +136,19 @@ let
chunk-size = 524288;
valid-mime-types = [];
};
full-text-search = {
enabled = false;
solr = {
url = "http://localhost:8983/solr/docspell";
commit-within = 1000;
log-verbose = false;
def-type = "lucene";
q-op = "OR";
};
migration = {
index-all-chunk = 10;
};
};
};
in {
@ -653,6 +669,23 @@ in {
description = "Settings for text analysis";
};
processing = mkOption {
type = types.submodule({
options = {
max-due-date-years = mkOption {
type = types.int;
default = defaults.processing.max-due-date-years;
description = ''
Restricts proposals for due dates. Only dates earlier than this
number of years in the future are considered.
'';
};
};
});
default = defaults.processing;
description = "General config for processing documents";
};
convert = mkOption {
type = types.submodule({
options = {
@ -860,6 +893,79 @@ in {
default = defaults.files;
description= "Settings for how files are stored.";
};
full-text-search = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.full-text-search.enabled;
description = ''
The full-text search feature can be disabled. It requires an
additional index server, which needs additional memory and disk
space. It can be enabled at any later time.
Currently the SOLR search platform is supported.
'';
};
solr = mkOption {
type = types.submodule({
options = {
url = mkOption {
type = types.str;
default = defaults.full-text-search.solr.url;
description = "The URL to solr";
};
commit-within = mkOption {
type = types.int;
default = defaults.full-text-search.solr.commit-within;
description = "Used to tell solr when to commit the data";
};
log-verbose = mkOption {
type = types.bool;
default = defaults.full-text-search.solr.log-verbose;
description = "If true, logs request and response bodies";
};
def-type = mkOption {
type = types.str;
default = defaults.full-text-search.solr.def-type;
description = ''
The defType parameter to lucene that defines the parser to
use. You might want to try "edismax" or look here:
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
'';
};
q-op = mkOption {
type = types.str;
default = defaults.full-text-search.solr.q-op;
description = "The default combiner for tokens. One of {AND, OR}.";
};
};
});
default = defaults.full-text-search.solr;
description = "Configuration for the SOLR backend.";
};
migration = mkOption {
type = types.submodule({
options = {
index-all-chunk = mkOption {
type = types.int;
default = defaults.full-text-search.migration.index-all-chunk;
description = ''
Chunk size to use when indexing data from the database. This
many attachments are loaded into memory and pushed to the
full-text index.
'';
};
};
});
default = defaults.full-text-search.migration;
description = "Settings for running the index migration tasks";
};
};
});
default = defaults.full-text-search;
description = "Configuration for full-text search.";
};
};
};

View File

@ -37,6 +37,17 @@ let
header-value = "some-secret";
};
};
full-text-search = {
enabled = false;
solr = {
url = "http://localhost:8983/solr/docspell";
commit-within = 1000;
log-verbose = false;
def-type = "lucene";
q-op = "OR";
};
recreate-key = "";
};
auth = {
server-secret = "hex:caffee";
session-valid = "5 minutes";
@ -271,6 +282,75 @@ in {
'';
};
full-text-search = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.full-text-search.enabled;
description = ''
The full-text search feature can be disabled. It requires an
additional index server, which needs additional memory and disk
space. It can be enabled at any later time.
Currently the SOLR search platform is supported.
'';
};
solr = mkOption {
type = types.submodule({
options = {
url = mkOption {
type = types.str;
default = defaults.full-text-search.solr.url;
description = "The URL to solr";
};
commit-within = mkOption {
type = types.int;
default = defaults.full-text-search.solr.commit-within;
description = "Used to tell solr when to commit the data";
};
log-verbose = mkOption {
type = types.bool;
default = defaults.full-text-search.solr.log-verbose;
description = "If true, logs request and response bodies";
};
def-type = mkOption {
type = types.str;
default = defaults.full-text-search.solr.def-type;
description = ''
The defType parameter to lucene that defines the parser to
use. You might want to try "edismax" or look here:
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
'';
};
q-op = mkOption {
type = types.str;
default = defaults.full-text-search.solr.q-op;
description = "The default combiner for tokens. One of {AND, OR}.";
};
};
});
default = defaults.full-text-search.solr;
description = "Configuration for the SOLR backend.";
};
recreate-key = mkOption {
type = types.str;
default = defaults.full-text-search.recreate-key;
description = ''
When re-creating the complete index via a REST call, this key
is required. If left empty (the default), recreating the index
is disabled.
Example curl command:
curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
'';
};
};
});
default = defaults.full-text-search;
description = "Configuration for full-text search.";
};
backend = mkOption {
type = types.submodule({
options = {