mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Add classifier settings
This commit is contained in:
@ -271,6 +271,50 @@ docspell.joex {
|
||||
# file will be kept until a check for a state change is done.
|
||||
file-cache-time = "1 minute"
|
||||
}
|
||||
|
||||
# Settings for doing document classification.
|
||||
#
|
||||
# This works by learning from existing documents. A collective can
|
||||
# specify a tag category and the system will try to predict a tag
|
||||
# from this category for new incoming documents.
|
||||
#
|
||||
# This requires a statistical model that is computed from all
|
||||
# existing documents. This process is run periodically as
|
||||
# configured by the collective. It may require a lot of memory,
|
||||
# depending on the amount of data.
|
||||
#
|
||||
# It utilises this NLP library: https://nlp.stanford.edu/.
|
||||
classification {
|
||||
# Whether to enable classification globally. Each collective can
|
||||
# decide to disable it. If it is disabled here, no collective
|
||||
# can use classification.
|
||||
enabled = true
|
||||
|
||||
# If concerned with memory consumption, this restricts the
|
||||
# number of items to consider. More are better for training. A
|
||||
# negative value or zero means to train on all items.
|
||||
item-count = 0
|
||||
|
||||
# These settings are used to configure the classifier. If
|
||||
# multiple are given, they are all tried and the "best" is
|
||||
# chosen at the end. See
|
||||
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
|
||||
# for more info about these settings. The settings are almost
|
||||
# identical to them, as they yielded best results with *my*
|
||||
# dataset.
|
||||
#
|
||||
# Enclose regexps in triple quotes.
|
||||
classifiers = [
|
||||
{ "useSplitWords" = "true"
|
||||
"splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
|
||||
"splitWordsIgnoreRegexp" = """\s+"""
|
||||
"useSplitPrefixSuffixNGrams" = "true"
|
||||
"maxNGramLeng" = "4"
|
||||
"minNGramLeng" = "1"
|
||||
"splitWordShape" = "chris4"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
# Configuration for converting files into PDFs.
|
||||
|
@ -57,7 +57,8 @@ object Config {
|
||||
case class TextAnalysis(
|
||||
maxLength: Int,
|
||||
workingDir: Path,
|
||||
regexNer: RegexNer
|
||||
regexNer: RegexNer,
|
||||
classification: Classification
|
||||
) {
|
||||
|
||||
def textAnalysisConfig: TextAnalysisConfig =
|
||||
@ -68,4 +69,10 @@ object Config {
|
||||
}
|
||||
|
||||
/** Settings for regex-based named-entity recognition.
  *
  * @param enabled whether regex NER is enabled; when false no NER file is produced
  * @param fileCacheTime how long a generated NER file is kept before a state-change
  *                      check is done (see the `regex-ner.file-cache-time` config key)
  */
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
|
||||
|
||||
/** Settings for document classification.
  *
  * Classification learns from existing documents to predict a tag for new
  * incoming documents. Mirrors the `classification` block of the joex config.
  *
  * @param enabled     global switch; when false, no collective can use classification
  * @param itemCount   maximum number of items to consider for training; a value
  *                    of zero or less means to train on all items
  * @param classifiers raw key/value property sets passed to the Stanford NLP
  *                    classifier; when several are given, each is tried and the
  *                    best-performing one is chosen
  */
case class Classification(
    enabled: Boolean,
    itemCount: Int,
    classifiers: List[Map[String, String]]
)
|
||||
}
|
||||
|
Reference in New Issue
Block a user