mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 02:49:32 +00:00
commit
1a2893358e
@ -53,6 +53,9 @@ object Duration {
|
||||
/** Creates a Duration spanning `n` days, built from `JDur.ofDays(n)`.
  *
  * NOTE(review): `JDur` is presumably an alias of `java.time.Duration` —
  * confirm at the import site, which is not part of this hunk.
  */
def days(n: Long): Duration =
|
||||
apply(JDur.ofDays(n))
|
||||
|
||||
/** Creates a Duration spanning `n` years by delegating to `days`.
  *
  * A year is counted as a fixed 365 days, so leap days are not accounted
  * for; the result is an approximation and not a calendar-exact span.
  */
def years(n: Long): Duration =
|
||||
days(n * 365)
|
||||
|
||||
/** Creates a Duration directly from the raw value `n` via `Duration(n)`.
  *
  * NOTE(review): presumably the wrapped value is a nanosecond count, as
  * the method name suggests — confirm against the Duration class.
  */
def nanos(n: Long): Duration =
|
||||
Duration(n)
|
||||
|
||||
|
@ -68,7 +68,7 @@ docspell.joex {
|
||||
# How often a failed job should be retried until it enters failed
|
||||
# state. If a job fails, it becomes "stuck" and will be retried
|
||||
# after a delay.
|
||||
retries = 5
|
||||
retries = 2
|
||||
|
||||
# The delay until the next try is performed for a failed job. This
|
||||
# delay is increased exponentially with the number of retries.
|
||||
@ -341,6 +341,13 @@ docspell.joex {
|
||||
}
|
||||
}
|
||||
|
||||
# General config for processing documents
|
||||
processing {
|
||||
# Restricts proposals for due dates. Only dates earlier than this
|
||||
# number of years in the future are considered.
|
||||
max-due-date-years = 10
|
||||
}
|
||||
|
||||
# The same section is also present in the rest-server config. It is
|
||||
# used when submitting files into the job queue for processing.
|
||||
#
|
||||
|
@ -25,7 +25,8 @@ case class Config(
|
||||
sendMail: MailSendConfig,
|
||||
files: Files,
|
||||
mailDebug: Boolean,
|
||||
fullTextSearch: Config.FullTextSearch
|
||||
fullTextSearch: Config.FullTextSearch,
|
||||
processing: Config.Processing
|
||||
)
|
||||
|
||||
object Config {
|
||||
@ -47,4 +48,6 @@ object Config {
|
||||
|
||||
final case class Migration(indexAllChunk: Int)
|
||||
}
|
||||
|
||||
/** General settings for processing documents.
  *
  * @param maxDueDateYears restricts proposals for due dates: only dates
  *   earlier than this number of years in the future are considered
  */
final case class Processing(maxDueDateYears: Int)
|
||||
}
|
||||
|
@ -8,6 +8,7 @@ import cats.effect.Sync
|
||||
import docspell.analysis.contact._
|
||||
import docspell.common.MetaProposal.Candidate
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.records._
|
||||
|
||||
@ -16,33 +17,42 @@ import docspell.store.records._
|
||||
*/
|
||||
object FindProposal {
|
||||
|
||||
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
def apply[F[_]: Sync](
|
||||
cfg: Config.Processing
|
||||
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
|
||||
|
||||
ctx.logger.info("Starting find-proposal") *>
|
||||
rmas
|
||||
.traverse(rm =>
|
||||
processAttachment(rm, data.findDates(rm), ctx)
|
||||
processAttachment(cfg, rm, data.findDates(rm), ctx)
|
||||
.map(ml => rm.copy(proposals = ml))
|
||||
)
|
||||
.map(rmv => data.copy(metas = rmv))
|
||||
}
|
||||
|
||||
def processAttachment[F[_]: Sync](
|
||||
cfg: Config.Processing,
|
||||
rm: RAttachmentMeta,
|
||||
rd: Vector[NerDateLabel],
|
||||
ctx: Context[F, ProcessItemArgs]
|
||||
): F[MetaProposalList] = {
|
||||
val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
|
||||
List(finder.find(rm.nerlabels), makeDateProposal(rd))
|
||||
List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
|
||||
.traverse(identity)
|
||||
.map(MetaProposalList.flatten)
|
||||
}
|
||||
|
||||
def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] =
|
||||
def makeDateProposal[F[_]: Sync](
|
||||
cfg: Config.Processing,
|
||||
dates: Vector[NerDateLabel]
|
||||
): F[MetaProposalList] =
|
||||
Timestamp.current[F].map { now =>
|
||||
val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date))
|
||||
val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong))
|
||||
val latestFirst = dates
|
||||
.filter(_.date.isBefore(maxFuture.toUtcDate))
|
||||
.sortWith((l1, l2) => l1.date.isAfter(l2.date))
|
||||
val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
|
||||
val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))
|
||||
|
||||
|
@ -2,7 +2,6 @@ package docspell.joex.process
|
||||
|
||||
import cats.effect._
|
||||
import docspell.common.ProcessItemArgs
|
||||
import docspell.analysis.TextAnalysisConfig
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.joex.Config
|
||||
import docspell.ftsclient.FtsClient
|
||||
@ -19,16 +18,16 @@ object ProcessItem {
|
||||
.flatMap(Task.setProgress(40))
|
||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||
.flatMap(Task.setProgress(60))
|
||||
.flatMap(analysisOnly[F](cfg.textAnalysis))
|
||||
.flatMap(analysisOnly[F](cfg))
|
||||
.flatMap(Task.setProgress(80))
|
||||
.flatMap(LinkProposal[F])
|
||||
.flatMap(Task.setProgress(99))
|
||||
|
||||
def analysisOnly[F[_]: Sync](
|
||||
cfg: TextAnalysisConfig
|
||||
cfg: Config
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
TextAnalysis[F](cfg)(item)
|
||||
.flatMap(FindProposal[F])
|
||||
TextAnalysis[F](cfg.textAnalysis)(item)
|
||||
.flatMap(FindProposal[F](cfg.processing))
|
||||
.flatMap(EvalProposals[F])
|
||||
.flatMap(SaveProposals[F])
|
||||
|
||||
|
@ -0,0 +1,5 @@
|
||||
ALTER TABLE `item`
|
||||
MODIFY `itemdate` DATETIME NULL;
|
||||
|
||||
ALTER TABLE `item`
|
||||
MODIFY `duedate` DATETIME NULL;
|
@ -35,7 +35,7 @@ let
|
||||
scheduler = {
|
||||
pool-size = 2;
|
||||
counting-scheme = "4,1";
|
||||
retries = 5;
|
||||
retries = 2;
|
||||
retry-delay = "1 minute";
|
||||
log-buffer-size = 500;
|
||||
wakeup-period = "30 minutes";
|
||||
@ -92,6 +92,9 @@ let
|
||||
text-analysis = {
|
||||
max-length = 10000;
|
||||
};
|
||||
processing = {
|
||||
max-due-date-years = 10;
|
||||
};
|
||||
convert = {
|
||||
chunk-size = 524288;
|
||||
max-image-size = 14000000;
|
||||
@ -133,6 +136,19 @@ let
|
||||
chunk-size = 524288;
|
||||
valid-mime-types = [];
|
||||
};
|
||||
full-text-search = {
|
||||
enabled = false;
|
||||
solr = {
|
||||
url = "http://localhost:8983/solr/docspell";
|
||||
commit-within = 1000;
|
||||
log-verbose = false;
|
||||
def-type = "lucene";
|
||||
q-op = "OR";
|
||||
};
|
||||
migration = {
|
||||
index-all-chunk = 10;
|
||||
};
|
||||
};
|
||||
};
|
||||
in {
|
||||
|
||||
@ -653,6 +669,23 @@ in {
|
||||
description = "Settings for text analysis";
|
||||
};
|
||||
|
||||
processing = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
max-due-date-years = mkOption {
|
||||
type = types.int;
|
||||
default = defaults.processing.max-due-date-years;
|
||||
description = ''
|
||||
Restricts proposals for due dates. Only dates earlier than this
|
||||
number of years in the future are considered.
|
||||
'';
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.processing;
|
||||
description = "General config for processing documents";
|
||||
};
|
||||
|
||||
convert = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
@ -860,6 +893,79 @@ in {
|
||||
default = defaults.files;
|
||||
description= "Settings for how files are stored.";
|
||||
};
|
||||
full-text-search = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
enabled = mkOption {
|
||||
type = types.bool;
|
||||
default = defaults.full-text-search.enabled;
|
||||
description = ''
|
||||
The full-text search feature can be disabled. It requires an
|
||||
additional index server which needs additional memory and disk
|
||||
space. It can be enabled later any time.
|
||||
|
||||
Currently the SOLR search platform is supported.
|
||||
'';
|
||||
};
|
||||
solr = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
url = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.url;
|
||||
description = "The URL to solr";
|
||||
};
|
||||
commit-within = mkOption {
|
||||
type = types.int;
|
||||
default = defaults.full-text-search.solr.commit-within;
|
||||
description = "Used to tell solr when to commit the data";
|
||||
};
|
||||
log-verbose = mkOption {
|
||||
type = types.bool;
|
||||
default = defaults.full-text-search.solr.log-verbose;
|
||||
description = "If true, logs request and response bodies";
|
||||
};
|
||||
def-type = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.def-type;
|
||||
description = ''
|
||||
The defType parameter to lucene that defines the parser to
|
||||
use. You might want to try "edismax" or look here:
|
||||
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
|
||||
'';
|
||||
};
|
||||
q-op = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.q-op;
|
||||
description = "The default combiner for tokens. One of {AND, OR}.";
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.full-text-search.solr;
|
||||
description = "Configuration for the SOLR backend.";
|
||||
};
|
||||
migration = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
index-all-chunk = mkOption {
|
||||
type = types.int;
|
||||
default = defaults.full-text-search.migration.index-all-chunk;
|
||||
description = ''
|
||||
Chunk size to use when indexing data from the database. This
|
||||
many attachments are loaded into memory and pushed to the
|
||||
full-text index.
|
||||
'';
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.full-text-search.migration;
|
||||
description = "Settings for running the index migration tasks";
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.full-text-search;
|
||||
description = "Configuration for full-text search.";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -37,6 +37,17 @@ let
|
||||
header-value = "some-secret";
|
||||
};
|
||||
};
|
||||
full-text-search = {
|
||||
enabled = false;
|
||||
solr = {
|
||||
url = "http://localhost:8983/solr/docspell";
|
||||
commit-within = 1000;
|
||||
log-verbose = false;
|
||||
def-type = "lucene";
|
||||
q-op = "OR";
|
||||
};
|
||||
recreate-key = "";
|
||||
};
|
||||
auth = {
|
||||
server-secret = "hex:caffee";
|
||||
session-valid = "5 minutes";
|
||||
@ -271,6 +282,75 @@ in {
|
||||
'';
|
||||
};
|
||||
|
||||
full-text-search = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
enabled = mkOption {
|
||||
type = types.bool;
|
||||
default = defaults.full-text-search.enabled;
|
||||
description = ''
|
||||
The full-text search feature can be disabled. It requires an
|
||||
additional index server which needs additional memory and disk
|
||||
space. It can be enabled later any time.
|
||||
|
||||
Currently the SOLR search platform is supported.
|
||||
'';
|
||||
};
|
||||
solr = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
url = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.url;
|
||||
description = "The URL to solr";
|
||||
};
|
||||
commit-within = mkOption {
|
||||
type = types.int;
|
||||
default = defaults.full-text-search.solr.commit-within;
|
||||
description = "Used to tell solr when to commit the data";
|
||||
};
|
||||
log-verbose = mkOption {
|
||||
type = types.bool;
|
||||
default = defaults.full-text-search.solr.log-verbose;
|
||||
description = "If true, logs request and response bodies";
|
||||
};
|
||||
def-type = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.def-type;
|
||||
description = ''
|
||||
The defType parameter to lucene that defines the parser to
|
||||
use. You might want to try "edismax" or look here:
|
||||
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
|
||||
'';
|
||||
};
|
||||
q-op = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.solr.q-op;
|
||||
description = "The default combiner for tokens. One of {AND, OR}.";
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.full-text-search.solr;
|
||||
description = "Configuration for the SOLR backend.";
|
||||
};
|
||||
recreate-key = mkOption {
|
||||
type = types.str;
|
||||
default = defaults.full-text-search.recreate-key;
|
||||
description = ''
|
||||
When re-creating the complete index via a REST call, this key
|
||||
is required. If left empty (the default), recreating the index
|
||||
is disabled.
|
||||
|
||||
Example curl command:
|
||||
curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
|
||||
'';
|
||||
};
|
||||
};
|
||||
});
|
||||
default = defaults.full-text-search;
|
||||
description = "Configuration for full-text search.";
|
||||
};
|
||||
|
||||
backend = mkOption {
|
||||
type = types.submodule({
|
||||
options = {
|
||||
|
Loading…
x
Reference in New Issue
Block a user