Add classifier settings

This commit is contained in:
Eike Kettner
2020-08-28 22:17:49 +02:00
parent 53fdb100ab
commit 8c4f2e702b
17 changed files with 649 additions and 56 deletions

View File

@ -271,6 +271,50 @@ docspell.joex {
# file will be kept until a check for a state change is done.
file-cache-time = "1 minute"
}
# Settings for doing document classification.
#
# This works by learning from existing documents. A collective can
# specify a tag category and the system will try to predict a tag
# from this category for new incoming documents.
#
# This requires a statistical model that is computed from all
# existing documents. This process is run periodically as
# configured by the collective. It may require a lot of memory,
# depending on the amount of data.
#
# It utilises this NLP library: https://nlp.stanford.edu/.
classification {
# Whether to enable classification globally. Each collective can
# decide to disable it. If it is disabled here, no collective
# can use classification.
enabled = true
# If concerned with memory consumption, this restricts the
# number of items to consider. More are better for training. A
# negative value or zero means to train on all items.
item-count = 0
# These settings are used to configure the classifier. If
# multiple are given, they are all tried and the "best" is
# chosen at the end. See
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
# for more info about these settings. The settings are almost
# identical to them, as they yielded best results with *my*
# dataset.
#
# Enclose regexps in triple quotes.
classifiers = [
{ "useSplitWords" = "true"
"splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
"splitWordsIgnoreRegexp" = """\s+"""
"useSplitPrefixSuffixNGrams" = "true"
"maxNGramLeng" = "4"
"minNGramLeng" = "1"
"splitWordShape" = "chris4"
}
]
}
}
# Configuration for converting files into PDFs.

View File

@ -57,7 +57,8 @@ object Config {
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
regexNer: RegexNer
regexNer: RegexNer,
classification: Classification
) {
def textAnalysisConfig: TextAnalysisConfig =
@ -68,4 +69,10 @@ object Config {
}
/** Settings for the regex-based NER feature (presumably backing the
  * `regex-ner` config section -- TODO confirm against the config reader).
  *
  * @param enabled whether the feature is switched on
  * @param fileCacheTime how long a file is kept before it is checked for
  *   a state change (presumably maps to `file-cache-time` -- verify)
  */
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
/** Settings for document classification (presumably read from the
  * `classification` section of the joex config -- TODO confirm mapping).
  *
  * @param enabled global switch; per the config commentary, when disabled
  *   no collective can use classification
  * @param itemCount restricts the number of items considered for training;
  *   per the config commentary, zero or a negative value means all items
  * @param classifiers one map of classifier settings per entry; per the
  *   config commentary, when several are given all are tried and the
  *   "best" one is chosen
  */
case class Classification(
enabled: Boolean,
itemCount: Int,
classifiers: List[Map[String, String]]
)
}