mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Add classifier settings
This commit is contained in:
@ -271,6 +271,50 @@ docspell.joex {
|
||||
# file will be kept until a check for a state change is done.
|
||||
file-cache-time = "1 minute"
|
||||
}
|
||||
|
||||
# Settings for doing document classification.
|
||||
#
|
||||
# This works by learning from existing documents. A collective can
|
||||
# specify a tag category and the system will try to predict a tag
|
||||
# from this category for new incoming documents.
|
||||
#
|
||||
# This requires a statistical model that is computed from all
|
||||
# existing documents. This process is run periodically as
|
||||
# configured by the collective. It may require a lot of memory,
|
||||
# depending on the amount of data.
|
||||
#
|
||||
# It utilises this NLP library: https://nlp.stanford.edu/.
|
||||
classification {
|
||||
# Whether to enable classification globally. Each collective can
|
||||
# decide to disable it. If it is disabled here, no collective
|
||||
# can use classification.
|
||||
enabled = true
|
||||
|
||||
# If concerned with memory consumption, this restricts the
|
||||
# number of items to consider. More are better for training. A
|
||||
# negative value or zero means to train on all items.
|
||||
item-count = 0
|
||||
|
||||
# These settings are used to configure the classifier. If
|
||||
# multiple are given, they are all tried and the "best" is
|
||||
# chosen at the end. See
|
||||
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
|
||||
# for more info about these settings. The settings are almost
|
||||
# identical to them, as they yielded best results with *my*
|
||||
# dataset.
|
||||
#
|
||||
# Enclose regexps in triple quotes.
|
||||
classifiers = [
|
||||
{ "useSplitWords" = "true"
|
||||
"splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
|
||||
"splitWordsIgnoreRegexp" = """\s+"""
|
||||
"useSplitPrefixSuffixNGrams" = "true"
|
||||
"maxNGramLeng" = "4"
|
||||
"minNGramLeng" = "1"
|
||||
"splitWordShape" = "chris4"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
# Configuration for converting files into PDFs.
|
||||
|
Reference in New Issue
Block a user