mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Update joex nixos module
This commit is contained in:
parent
145c308461
commit
afbe9554b6
@ -95,6 +95,21 @@ let
|
|||||||
enabled = true;
|
enabled = true;
|
||||||
file-cache-time = "1 minute";
|
file-cache-time = "1 minute";
|
||||||
};
|
};
|
||||||
|
classification = {
|
||||||
|
enabled = true;
|
||||||
|
item-count = 0;
|
||||||
|
classifiers = [
|
||||||
|
{ "useSplitWords" = "true";
|
||||||
|
"splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
|
||||||
|
"splitWordsIgnoreRegexp" = ''\s+'';
|
||||||
|
"useSplitPrefixSuffixNGrams" = "true";
|
||||||
|
"maxNGramLeng" = "4";
|
||||||
|
"minNGramLeng" = "1";
|
||||||
|
"splitWordShape" = "chris4";
|
||||||
|
"intern" = "true";
|
||||||
|
}
|
||||||
|
];
|
||||||
|
};
|
||||||
working-dir = "/tmp/docspell-analysis";
|
working-dir = "/tmp/docspell-analysis";
|
||||||
};
|
};
|
||||||
processing = {
|
processing = {
|
||||||
@ -736,6 +751,59 @@ in {
|
|||||||
default = defaults.text-analysis.regex-ner;
|
default = defaults.text-analysis.regex-ner;
|
||||||
description = "";
|
description = "";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
classification = mkOption {
|
||||||
|
type = types.submodule({
|
||||||
|
options = {
|
||||||
|
enabled = mkOption {
|
||||||
|
type = types.bool;
|
||||||
|
default = defaults.text-analysis.classification.enabled;
|
||||||
|
description = ''
|
||||||
|
Whether to enable classification globally. Each collective can
|
||||||
|
decide to disable it. If it is disabled here, no collective
|
||||||
|
can use classification.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
item-count = mkOption {
|
||||||
|
type = types.int;
|
||||||
|
default = defaults.text-analysis.classification.item-count;
|
||||||
|
description = ''
|
||||||
|
If concerned with memory consumption, this restricts the
|
||||||
|
number of items to consider. More are better for training. A
|
||||||
|
negative value or zero means to train on all items.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
classifiers = mkOption {
|
||||||
|
type = types.listOf types.attrs;
|
||||||
|
default = defaults.text-analysis.classification.classifiers;
|
||||||
|
description = ''
|
||||||
|
These settings are used to configure the classifier. If
|
||||||
|
multiple are given, they are all tried and the "best" is
|
||||||
|
chosen at the end. See
|
||||||
|
https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
|
||||||
|
for more info about these settings. The settings here yielded
|
||||||
|
good results with *my* dataset.
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
});
|
||||||
|
default = defaults.text-analysis.classification;
|
||||||
|
description = ''
|
||||||
|
Settings for doing document classification.
|
||||||
|
|
||||||
|
This works by learning from existing documents. A collective can
|
||||||
|
specify a tag category and the system will try to predict a tag
|
||||||
|
from this category for new incoming documents.
|
||||||
|
|
||||||
|
This requires a statistical model that is computed from all
|
||||||
|
existing documents. This process is run periodically as
|
||||||
|
configured by the collective. It may require a lot of memory,
|
||||||
|
depending on the amount of data.
|
||||||
|
|
||||||
|
It utilises this NLP library: https://nlp.stanford.edu/.
|
||||||
|
'';
|
||||||
|
};
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
default = defaults.text-analysis;
|
default = defaults.text-analysis;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user