Choose nlp mode in config file

This commit is contained in:
Eike Kettner 2021-01-14 00:55:19 +01:00
parent 54a09861c4
commit aa937797be
6 changed files with 95 additions and 19 deletions

View File

@ -1,14 +1,17 @@
package docspell.analysis
import cats.Applicative
import cats.effect._
import cats.implicits._
import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
import docspell.analysis.contact.Contact
import docspell.analysis.date.DateFind
import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
import docspell.analysis.nlp._
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
trait TextAnalyser[F[_]] {
def annotate(
@ -33,8 +36,8 @@ object TextAnalyser {
blocker: Blocker
): Resource[F, TextAnalyser[F]] =
Resource
.liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval))
.map(cache =>
.liftF(Nlp(cfg.nlpConfig))
.map(stanfordNer =>
new TextAnalyser[F] {
def annotate(
logger: Logger[F],
@ -44,7 +47,7 @@ object TextAnalyser {
): F[TextAnalyser.Result] =
for {
input <- textLimit(logger, text)
tags0 <- stanfordNer(cacheKey, settings, input)
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
tags1 <- contactNer(input)
dates <- dateNer(settings.lang, input)
list = tags0 ++ tags1
@ -62,10 +65,6 @@ object TextAnalyser {
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
: F[Vector[NerLabel]] =
StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {
Contact.annotate(text)
@ -78,4 +77,31 @@ object TextAnalyser {
}
)
private object Nlp {

  /** Bundles what a single NER run needs: the cache key, the NER
    * settings and the text to annotate.
    */
  final case class Input(key: Ident, settings: StanfordNerSettings, text: String)

  /** Creates the annotate function for the configured `NlpMode`.
    *
    * For `Full` and `Basic` a pipeline cache is created with
    * `cfg.clearInterval` and the matching annotator is returned. For
    * `Disabled` no cache is created and the returned function yields
    * an empty label vector for every input.
    */
  def apply[F[_]: Concurrent: Timer: BracketThrow](
      cfg: TextAnalysisConfig.NlpConfig
  ): F[Input => F[Vector[NerLabel]]] =
    cfg.mode match {
      case NlpMode.Disabled =>
        val noLabels: Input => F[Vector[NerLabel]] =
          _ => Vector.empty[NerLabel].pure[F]
        noLabels.pure[F]

      case NlpMode.Basic =>
        PipelineCache.basic(cfg.clearInterval).map(pipelines => basic(pipelines))

      case NlpMode.Full =>
        PipelineCache.full(cfg.clearInterval).map(pipelines => full(pipelines))
    }

  /** Annotates the input text via `StanfordNerAnnotator`, using the
    * given cache and the id of the input key.
    */
  def full[F[_]: BracketThrow](
      cache: PipelineCache[F, StanfordCoreNLP]
  )(input: Input): F[Vector[NerLabel]] = {
    val Input(key, settings, text) = input
    StanfordNerAnnotator.nerAnnotate(key.id, cache)(settings, text)
  }

  /** Annotates the input text via `BasicCRFAnnotator`, using the
    * given cache and the id of the input key.
    */
  def basic[F[_]: BracketThrow](
      cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
  )(input: Input): F[Vector[NerLabel]] = {
    val Input(key, settings, text) = input
    BasicCRFAnnotator.nerAnnotate(key.id, cache)(settings, text)
  }
}
}

View File

@ -1,10 +1,16 @@
package docspell.analysis
import docspell.analysis.TextAnalysisConfig.NlpConfig
import docspell.analysis.classifier.TextClassifierConfig
import docspell.common._
/** Settings for the text analysis.
  *
  * `maxLength` limits how many leading characters of a text are
  * analysed. `nlpConfig` carries the NLP mode and the interval given
  * to the pipeline cache. `classifier` holds the settings of type
  * `TextClassifierConfig`.
  */
case class TextAnalysisConfig(
maxLength: Int,
clearStanfordPipelineInterval: Duration,
nlpConfig: NlpConfig,
classifier: TextClassifierConfig
)
object TextAnalysisConfig {

  /** NLP related settings.
    *
    * @param clearInterval the interval passed to the pipeline cache
    * @param mode which NLP mode to run
    */
  case class NlpConfig(
      clearInterval: Duration,
      mode: NlpMode
  )
}

View File

@ -0,0 +1,23 @@
package docspell.common
/** The mode in which NLP is run: `Full`, `Basic` or `Disabled`. */
sealed trait NlpMode { self: Product =>

  /** The name of this mode, which is the `productPrefix` of the
    * case object (e.g. "Full").
    */
  def name: String =
    self.productPrefix
}

object NlpMode {
  case object Full     extends NlpMode
  case object Basic    extends NlpMode
  case object Disabled extends NlpMode

  /** Parses a mode from its name, ignoring case.
    *
    * @param name one of "full", "basic" or "disabled" in any casing
    * @return the matching mode, or a `Left` with an error message
    *   for any other input
    */
  def fromString(name: String): Either[String, NlpMode] =
    // Lower-case with Locale.ROOT: the no-arg `toLowerCase` uses the
    // JVM default locale, and e.g. under a Turkish locale "DISABLED"
    // becomes "dısabled" (dotless i) and would not match.
    name.toLowerCase(java.util.Locale.ROOT) match {
      case "full"     => Right(Full)
      case "basic"    => Right(Basic)
      case "disabled" => Right(Disabled)
      case _          => Left(s"Unknown nlp-mode: $name")
    }

  /** Like [[fromString]], but fails via `sys.error` on unknown names. */
  def unsafeFromString(name: String): NlpMode =
    fromString(name).fold(sys.error, identity)
}

View File

@ -44,6 +44,9 @@ object Implicits {
/** Reads a `Priority` from a config string using `Priority.fromString`;
  * the error message is converted by `reason`.
  */
implicit val priorityReader: ConfigReader[Priority] = {
  val parse: String => Either[String, Priority] = Priority.fromString
  ConfigReader[String].emap(reason(parse))
}
/** Reads an `NlpMode` from a config string using `NlpMode.fromString`;
  * the error message is converted by `reason`.
  */
implicit val nlpModeReader: ConfigReader[NlpMode] = {
  val parse: String => Either[String, NlpMode] = NlpMode.fromString
  ConfigReader[String].emap(reason(parse))
}
def reason[A: ClassTag](
f: String => Either[String, A]
): String => Either[FailureReason, A] =

View File

@ -277,12 +277,27 @@ docspell.joex {
# files.
working-dir = ${java.io.tmpdir}"/docspell-analysis"
# The StanfordCoreNLP library caches language models which
# requires quite some amount of memory. Setting this interval to a
# positive duration, the cache is cleared after this amount of
# idle time. Set it to 0 to disable it if you have enough memory,
# processing will be faster.
clear-stanford-nlp-interval = "15 minutes"
nlp-config {
# The StanfordCoreNLP library caches language models, which
# requires a considerable amount of memory. When this interval is
# set to a positive duration, the cache is cleared after that
# amount of idle time. Set it to 0 to disable clearing if you
# have enough memory; processing will then be faster.
#
# This only has an effect if mode != disabled.
clear-interval = "15 minutes"
# The mode for configuring NLP models. Currently 3 are available:
#
# 1. full - builds the complete pipeline, run with -Xmx1500M or more
# 2. basic - builds only the ner annotator, run with -Xmx600M or more
# 3. disabled - doesn't use any stanford-nlp feature
#
# The basic variant does a quite good job for German and
# English. It might be worse for French, always depending on the
# type of text that is analysed.
mode = full
}
regex-ner {
# Whether to enable custom NER annotation. This uses the address
@ -295,6 +310,8 @@ docspell.joex {
#
# This setting might be moved to the collective settings in the
# future.
#
# Note, this is only relevant if nlp-config.mode = full.
enabled = true
# The NER annotation uses a file of patterns that is derived from

View File

@ -4,7 +4,8 @@ import java.nio.file.Path
import cats.data.NonEmptyList
import docspell.analysis.{TextAnalysisConfig, classifier}
import docspell.analysis.TextAnalysisConfig
import docspell.analysis.classifier.TextClassifierConfig
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
@ -59,7 +60,7 @@ object Config {
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
clearStanfordNlpInterval: Duration,
nlpConfig: TextAnalysisConfig.NlpConfig,
regexNer: RegexNer,
classification: Classification
) {
@ -67,8 +68,8 @@ object Config {
def textAnalysisConfig: TextAnalysisConfig =
TextAnalysisConfig(
maxLength,
clearStanfordNlpInterval,
classifier.TextClassifierConfig(
nlpConfig,
TextClassifierConfig(
workingDir,
NonEmptyList
.fromList(classification.classifiers)