mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-05 19:09:32 +00:00
commit
1a2893358e
@ -53,6 +53,9 @@ object Duration {
|
|||||||
def days(n: Long): Duration =
|
def days(n: Long): Duration =
|
||||||
apply(JDur.ofDays(n))
|
apply(JDur.ofDays(n))
|
||||||
|
|
||||||
|
def years(n: Long): Duration =
|
||||||
|
days(n * 365)
|
||||||
|
|
||||||
def nanos(n: Long): Duration =
|
def nanos(n: Long): Duration =
|
||||||
Duration(n)
|
Duration(n)
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ docspell.joex {
|
|||||||
# How often a failed job should be retried until it enters failed
|
# How often a failed job should be retried until it enters failed
|
||||||
# state. If a job fails, it becomes "stuck" and will be retried
|
# state. If a job fails, it becomes "stuck" and will be retried
|
||||||
# after a delay.
|
# after a delay.
|
||||||
retries = 5
|
retries = 2
|
||||||
|
|
||||||
# The delay until the next try is performed for a failed job. This
|
# The delay until the next try is performed for a failed job. This
|
||||||
# delay is increased exponentially with the number of retries.
|
# delay is increased exponentially with the number of retries.
|
||||||
@ -341,6 +341,13 @@ docspell.joex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# General config for processing documents
|
||||||
|
processing {
|
||||||
|
# Restricts proposals for due dates. Only dates earlier than this
|
||||||
|
# number of years in the future are considered.
|
||||||
|
max-due-date-years = 10
|
||||||
|
}
|
||||||
|
|
||||||
# The same section is also present in the rest-server config. It is
|
# The same section is also present in the rest-server config. It is
|
||||||
# used when submitting files into the job queue for processing.
|
# used when submitting files into the job queue for processing.
|
||||||
#
|
#
|
||||||
|
@ -25,7 +25,8 @@ case class Config(
|
|||||||
sendMail: MailSendConfig,
|
sendMail: MailSendConfig,
|
||||||
files: Files,
|
files: Files,
|
||||||
mailDebug: Boolean,
|
mailDebug: Boolean,
|
||||||
fullTextSearch: Config.FullTextSearch
|
fullTextSearch: Config.FullTextSearch,
|
||||||
|
processing: Config.Processing
|
||||||
)
|
)
|
||||||
|
|
||||||
object Config {
|
object Config {
|
||||||
@ -47,4 +48,6 @@ object Config {
|
|||||||
|
|
||||||
final case class Migration(indexAllChunk: Int)
|
final case class Migration(indexAllChunk: Int)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case class Processing(maxDueDateYears: Int)
|
||||||
}
|
}
|
||||||
|
@ -8,6 +8,7 @@ import cats.effect.Sync
|
|||||||
import docspell.analysis.contact._
|
import docspell.analysis.contact._
|
||||||
import docspell.common.MetaProposal.Candidate
|
import docspell.common.MetaProposal.Candidate
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.joex.Config
|
||||||
import docspell.joex.scheduler.{Context, Task}
|
import docspell.joex.scheduler.{Context, Task}
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
|
|
||||||
@ -16,33 +17,42 @@ import docspell.store.records._
|
|||||||
*/
|
*/
|
||||||
object FindProposal {
|
object FindProposal {
|
||||||
|
|
||||||
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
def apply[F[_]: Sync](
|
||||||
|
cfg: Config.Processing
|
||||||
|
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
|
val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels)))
|
||||||
|
|
||||||
ctx.logger.info("Starting find-proposal") *>
|
ctx.logger.info("Starting find-proposal") *>
|
||||||
rmas
|
rmas
|
||||||
.traverse(rm =>
|
.traverse(rm =>
|
||||||
processAttachment(rm, data.findDates(rm), ctx)
|
processAttachment(cfg, rm, data.findDates(rm), ctx)
|
||||||
.map(ml => rm.copy(proposals = ml))
|
.map(ml => rm.copy(proposals = ml))
|
||||||
)
|
)
|
||||||
.map(rmv => data.copy(metas = rmv))
|
.map(rmv => data.copy(metas = rmv))
|
||||||
}
|
}
|
||||||
|
|
||||||
def processAttachment[F[_]: Sync](
|
def processAttachment[F[_]: Sync](
|
||||||
|
cfg: Config.Processing,
|
||||||
rm: RAttachmentMeta,
|
rm: RAttachmentMeta,
|
||||||
rd: Vector[NerDateLabel],
|
rd: Vector[NerDateLabel],
|
||||||
ctx: Context[F, ProcessItemArgs]
|
ctx: Context[F, ProcessItemArgs]
|
||||||
): F[MetaProposalList] = {
|
): F[MetaProposalList] = {
|
||||||
val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
|
val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx))
|
||||||
List(finder.find(rm.nerlabels), makeDateProposal(rd))
|
List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd))
|
||||||
.traverse(identity)
|
.traverse(identity)
|
||||||
.map(MetaProposalList.flatten)
|
.map(MetaProposalList.flatten)
|
||||||
}
|
}
|
||||||
|
|
||||||
def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] =
|
def makeDateProposal[F[_]: Sync](
|
||||||
|
cfg: Config.Processing,
|
||||||
|
dates: Vector[NerDateLabel]
|
||||||
|
): F[MetaProposalList] =
|
||||||
Timestamp.current[F].map { now =>
|
Timestamp.current[F].map { now =>
|
||||||
val latestFirst = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date))
|
val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong))
|
||||||
|
val latestFirst = dates
|
||||||
|
.filter(_.date.isBefore(maxFuture.toUtcDate))
|
||||||
|
.sortWith((l1, l2) => l1.date.isAfter(l2.date))
|
||||||
val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
|
val nowDate = now.value.atZone(ZoneId.of("GMT")).toLocalDate
|
||||||
val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))
|
val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate))
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ package docspell.joex.process
|
|||||||
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import docspell.common.ProcessItemArgs
|
import docspell.common.ProcessItemArgs
|
||||||
import docspell.analysis.TextAnalysisConfig
|
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.ftsclient.FtsClient
|
import docspell.ftsclient.FtsClient
|
||||||
@ -19,16 +18,16 @@ object ProcessItem {
|
|||||||
.flatMap(Task.setProgress(40))
|
.flatMap(Task.setProgress(40))
|
||||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||||
.flatMap(Task.setProgress(60))
|
.flatMap(Task.setProgress(60))
|
||||||
.flatMap(analysisOnly[F](cfg.textAnalysis))
|
.flatMap(analysisOnly[F](cfg))
|
||||||
.flatMap(Task.setProgress(80))
|
.flatMap(Task.setProgress(80))
|
||||||
.flatMap(LinkProposal[F])
|
.flatMap(LinkProposal[F])
|
||||||
.flatMap(Task.setProgress(99))
|
.flatMap(Task.setProgress(99))
|
||||||
|
|
||||||
def analysisOnly[F[_]: Sync](
|
def analysisOnly[F[_]: Sync](
|
||||||
cfg: TextAnalysisConfig
|
cfg: Config
|
||||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
TextAnalysis[F](cfg)(item)
|
TextAnalysis[F](cfg.textAnalysis)(item)
|
||||||
.flatMap(FindProposal[F])
|
.flatMap(FindProposal[F](cfg.processing))
|
||||||
.flatMap(EvalProposals[F])
|
.flatMap(EvalProposals[F])
|
||||||
.flatMap(SaveProposals[F])
|
.flatMap(SaveProposals[F])
|
||||||
|
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
ALTER TABLE `item`
|
||||||
|
MODIFY `itemdate` DATETIME NULL;
|
||||||
|
|
||||||
|
ALTER TABLE `item`
|
||||||
|
MODIFY `duedate` DATETIME NULL;
|
@ -35,7 +35,7 @@ let
|
|||||||
scheduler = {
|
scheduler = {
|
||||||
pool-size = 2;
|
pool-size = 2;
|
||||||
counting-scheme = "4,1";
|
counting-scheme = "4,1";
|
||||||
retries = 5;
|
retries = 2;
|
||||||
retry-delay = "1 minute";
|
retry-delay = "1 minute";
|
||||||
log-buffer-size = 500;
|
log-buffer-size = 500;
|
||||||
wakeup-period = "30 minutes";
|
wakeup-period = "30 minutes";
|
||||||
@ -92,6 +92,9 @@ let
|
|||||||
text-analysis = {
|
text-analysis = {
|
||||||
max-length = 10000;
|
max-length = 10000;
|
||||||
};
|
};
|
||||||
|
processing = {
|
||||||
|
max-due-date-years = 10;
|
||||||
|
};
|
||||||
convert = {
|
convert = {
|
||||||
chunk-size = 524288;
|
chunk-size = 524288;
|
||||||
max-image-size = 14000000;
|
max-image-size = 14000000;
|
||||||
@ -133,6 +136,19 @@ let
|
|||||||
chunk-size = 524288;
|
chunk-size = 524288;
|
||||||
valid-mime-types = [];
|
valid-mime-types = [];
|
||||||
};
|
};
|
||||||
|
full-text-search = {
|
||||||
|
enabled = false;
|
||||||
|
solr = {
|
||||||
|
url = "http://localhost:8983/solr/docspell";
|
||||||
|
commit-within = 1000;
|
||||||
|
log-verbose = false;
|
||||||
|
def-type = "lucene";
|
||||||
|
q-op = "OR";
|
||||||
|
};
|
||||||
|
migration = {
|
||||||
|
index-all-chunk = 10;
|
||||||
|
};
|
||||||
|
};
|
||||||
};
|
};
|
||||||
in {
|
in {
|
||||||
|
|
||||||
@ -653,6 +669,23 @@ in {
|
|||||||
description = "Settings for text analysis";
|
description = "Settings for text analysis";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
processing = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
max-due-date-years = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = defaults.processing.max-due-date-years;
|
||||||
|
description = ''
|
||||||
|
Restricts proposals for due dates. Only dates earlier than this
|
||||||
|
number of years in the future are considered.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.processing;
|
||||||
|
description = "General config for processing documents";
|
||||||
|
};
|
||||||
|
|
||||||
convert = mkOption {
|
convert = mkOption {
|
||||||
type = types.submodule({
|
type = types.submodule({
|
||||||
options = {
|
options = {
|
||||||
@ -860,6 +893,79 @@ in {
|
|||||||
default = defaults.files;
|
default = defaults.files;
|
||||||
description= "Settings for how files are stored.";
|
description= "Settings for how files are stored.";
|
||||||
};
|
};
|
||||||
|
full-text-search = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
enabled = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.full-text-search.enabled;
|
||||||
|
description = ''
|
||||||
|
The full-text search feature can be disabled. It requires an
|
||||||
|
additional index server which needs additional memory and disk
|
||||||
|
space. It can be enabled later any time.
|
||||||
|
|
||||||
|
Currently the SOLR search platform is supported.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
solr = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
url = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.url;
|
||||||
|
description = "The URL to solr";
|
||||||
|
};
|
||||||
|
commit-within = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = defaults.full-text-search.solr.commit-within;
|
||||||
|
description = "Used to tell solr when to commit the data";
|
||||||
|
};
|
||||||
|
log-verbose = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.full-text-search.solr.log-verbose;
|
||||||
|
description = "If true, logs request and response bodies";
|
||||||
|
};
|
||||||
|
def-type = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.def-type;
|
||||||
|
description = ''
|
||||||
|
The defType parameter to lucene that defines the parser to
|
||||||
|
use. You might want to try "edismax" or look here:
|
||||||
|
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
q-op = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.q-op;
|
||||||
|
description = "The default combiner for tokens. One of {AND, OR}.";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.full-text-search.solr;
|
||||||
|
description = "Configuration for the SOLR backend.";
|
||||||
|
};
|
||||||
|
migration = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
index-all-chunk = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = defaults.full-text-search.migration.index-all-chunk;
|
||||||
|
description = ''
|
||||||
|
Chunk size to use when indexing data from the database. This
|
||||||
|
many attachments are loaded into memory and pushed to the
|
||||||
|
full-text index.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.full-text-search.migration;
|
||||||
|
description = "Settings for running the index migration tasks";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.full-text-search;
|
||||||
|
description = "Configuration for full-text search.";
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -37,6 +37,17 @@ let
|
|||||||
header-value = "some-secret";
|
header-value = "some-secret";
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
full-text-search = {
|
||||||
|
enabled = false;
|
||||||
|
solr = {
|
||||||
|
url = "http://localhost:8983/solr/docspell";
|
||||||
|
commit-within = 1000;
|
||||||
|
log-verbose = false;
|
||||||
|
def-type = "lucene";
|
||||||
|
q-op = "OR";
|
||||||
|
};
|
||||||
|
recreate-key = "";
|
||||||
|
};
|
||||||
auth = {
|
auth = {
|
||||||
server-secret = "hex:caffee";
|
server-secret = "hex:caffee";
|
||||||
session-valid = "5 minutes";
|
session-valid = "5 minutes";
|
||||||
@ -271,6 +282,75 @@ in {
|
|||||||
'';
|
'';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
full-text-search = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
enabled = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.full-text-search.enabled;
|
||||||
|
description = ''
|
||||||
|
The full-text search feature can be disabled. It requires an
|
||||||
|
additional index server which needs additional memory and disk
|
||||||
|
space. It can be enabled later any time.
|
||||||
|
|
||||||
|
Currently the SOLR search platform is supported.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
solr = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
url = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.url;
|
||||||
|
description = "The URL to solr";
|
||||||
|
};
|
||||||
|
commit-within = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = defaults.full-text-search.solr.commit-within;
|
||||||
|
description = "Used to tell solr when to commit the data";
|
||||||
|
};
|
||||||
|
log-verbose = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.full-text-search.solr.log-verbose;
|
||||||
|
description = "If true, logs request and response bodies";
|
||||||
|
};
|
||||||
|
def-type = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.def-type;
|
||||||
|
description = ''
|
||||||
|
The defType parameter to lucene that defines the parser to
|
||||||
|
use. You might want to try "edismax" or look here:
|
||||||
|
https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
q-op = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.solr.q-op;
|
||||||
|
description = "The default combiner for tokens. One of {AND, OR}.";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.full-text-search.solr;
|
||||||
|
description = "Configuration for the SOLR backend.";
|
||||||
|
};
|
||||||
|
recreate-key = mkOption {
|
||||||
|
type = types.str;
|
||||||
|
default = defaults.full-text-search.recreate-key;
|
||||||
|
description = ''
|
||||||
|
When re-creating the complete index via a REST call, this key
|
||||||
|
is required. If left empty (the default), recreating the index
|
||||||
|
is disabled.
|
||||||
|
|
||||||
|
Example curl command:
|
||||||
|
curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.full-text-search;
|
||||||
|
description = "Configuration for full-text search.";
|
||||||
|
};
|
||||||
|
|
||||||
backend = mkOption {
|
backend = mkOption {
|
||||||
type = types.submodule({
|
type = types.submodule({
|
||||||
options = {
|
options = {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user