mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 07:05:59 +00:00
Choose nlp mode in config file
This commit is contained in:
parent
54a09861c4
commit
aa937797be
@ -1,14 +1,17 @@
|
|||||||
package docspell.analysis
|
package docspell.analysis
|
||||||
|
|
||||||
|
import cats.Applicative
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
|
import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
|
||||||
import docspell.analysis.contact.Contact
|
import docspell.analysis.contact.Contact
|
||||||
import docspell.analysis.date.DateFind
|
import docspell.analysis.date.DateFind
|
||||||
import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
|
import docspell.analysis.nlp._
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
|
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||||
|
|
||||||
trait TextAnalyser[F[_]] {
|
trait TextAnalyser[F[_]] {
|
||||||
|
|
||||||
def annotate(
|
def annotate(
|
||||||
@ -33,8 +36,8 @@ object TextAnalyser {
|
|||||||
blocker: Blocker
|
blocker: Blocker
|
||||||
): Resource[F, TextAnalyser[F]] =
|
): Resource[F, TextAnalyser[F]] =
|
||||||
Resource
|
Resource
|
||||||
.liftF(PipelineCache.full(cfg.clearStanfordPipelineInterval))
|
.liftF(Nlp(cfg.nlpConfig))
|
||||||
.map(cache =>
|
.map(stanfordNer =>
|
||||||
new TextAnalyser[F] {
|
new TextAnalyser[F] {
|
||||||
def annotate(
|
def annotate(
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
@ -44,7 +47,7 @@ object TextAnalyser {
|
|||||||
): F[TextAnalyser.Result] =
|
): F[TextAnalyser.Result] =
|
||||||
for {
|
for {
|
||||||
input <- textLimit(logger, text)
|
input <- textLimit(logger, text)
|
||||||
tags0 <- stanfordNer(cacheKey, settings, input)
|
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
|
||||||
tags1 <- contactNer(input)
|
tags1 <- contactNer(input)
|
||||||
dates <- dateNer(settings.lang, input)
|
dates <- dateNer(settings.lang, input)
|
||||||
list = tags0 ++ tags1
|
list = tags0 ++ tags1
|
||||||
@ -62,10 +65,6 @@ object TextAnalyser {
|
|||||||
s" Analysing only first ${cfg.maxLength} characters."
|
s" Analysing only first ${cfg.maxLength} characters."
|
||||||
) *> text.take(cfg.maxLength).pure[F]
|
) *> text.take(cfg.maxLength).pure[F]
|
||||||
|
|
||||||
private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
|
|
||||||
: F[Vector[NerLabel]] =
|
|
||||||
StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
|
|
||||||
|
|
||||||
private def contactNer(text: String): F[Vector[NerLabel]] =
|
private def contactNer(text: String): F[Vector[NerLabel]] =
|
||||||
Sync[F].delay {
|
Sync[F].delay {
|
||||||
Contact.annotate(text)
|
Contact.annotate(text)
|
||||||
@ -78,4 +77,31 @@ object TextAnalyser {
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Chooses the named-entity annotation function according to the configured
  * [[NlpMode]].
  */
private object Nlp {

  /** Bundles the arguments of a single annotation call.
    *
    * @param key      identifier whose `id` is handed to the annotator together with the cache
    * @param settings the NER settings passed through to the annotator
    * @param text     the text to annotate
    */
  final case class Input(key: Ident, settings: StanfordNerSettings, text: String)

  /** Creates the annotation function for the mode given in `cfg`.
    *
    * `Full` and `Basic` each create their own pipeline cache, using
    * `cfg.clearInterval`, and annotate through it. `Disabled` creates no
    * cache and yields a function that always returns an empty vector.
    */
  def apply[F[_]: Concurrent: Timer: BracketThrow](
      cfg: TextAnalysisConfig.NlpConfig
  ): F[Input => F[Vector[NerLabel]]] = {
    val interval = cfg.clearInterval

    cfg.mode match {
      case NlpMode.Disabled =>
        Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])

      case NlpMode.Basic =>
        PipelineCache
          .basic(interval)
          .map(cache => (in: Input) => basic(cache)(in))

      case NlpMode.Full =>
        PipelineCache
          .full(interval)
          .map(cache => (in: Input) => full(cache)(in))
    }
  }

  /** Annotates `input` via `StanfordNerAnnotator`, using the given cache of
    * `StanfordCoreNLP` pipelines.
    */
  def full[F[_]: BracketThrow](
      cache: PipelineCache[F, StanfordCoreNLP]
  )(input: Input): F[Vector[NerLabel]] = {
    val Input(key, settings, text) = input
    StanfordNerAnnotator.nerAnnotate(key.id, cache)(settings, text)
  }

  /** Annotates `input` via `BasicCRFAnnotator`, using the given cache of
    * `BasicCRFAnnotator.Annotator` instances.
    */
  def basic[F[_]: BracketThrow](
      cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
  )(input: Input): F[Vector[NerLabel]] = {
    val Input(key, settings, text) = input
    BasicCRFAnnotator.nerAnnotate(key.id, cache)(settings, text)
  }
}
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,16 @@
|
|||||||
package docspell.analysis
|
package docspell.analysis
|
||||||
|
|
||||||
|
import docspell.analysis.TextAnalysisConfig.NlpConfig
|
||||||
import docspell.analysis.classifier.TextClassifierConfig
|
import docspell.analysis.classifier.TextClassifierConfig
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
case class TextAnalysisConfig(
|
case class TextAnalysisConfig(
|
||||||
maxLength: Int,
|
maxLength: Int,
|
||||||
clearStanfordPipelineInterval: Duration,
|
nlpConfig: NlpConfig,
|
||||||
classifier: TextClassifierConfig
|
classifier: TextClassifierConfig
|
||||||
)
|
)
|
||||||
|
|
||||||
|
object TextAnalysisConfig {

  /** Settings for the NLP part of text analysis.
    *
    * @param clearInterval interval value for the NLP pipeline cache
    *                      (NOTE(review): presumably the idle time after which
    *                      cached models are dropped — confirm against PipelineCache)
    * @param mode          which NLP mode to use
    */
  case class NlpConfig(
      clearInterval: Duration,
      mode: NlpMode
  )
}
|
||||||
|
23
modules/common/src/main/scala/docspell/common/NlpMode.scala
Normal file
23
modules/common/src/main/scala/docspell/common/NlpMode.scala
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
/** The available modes for NLP processing.
  *
  * The self-type limits implementations to `Product`s (the case objects in
  * the companion), which lets `name` be derived from `productPrefix`.
  */
sealed trait NlpMode { self: Product =>

  /** The name of this mode, i.e. the `productPrefix` of the case object
    * (for example `"Full"`). `NlpMode.fromString` accepts it in any letter case.
    */
  def name: String =
    self.productPrefix
}

object NlpMode {
  case object Full     extends NlpMode
  case object Basic    extends NlpMode
  case object Disabled extends NlpMode

  /** Parses a mode name, ignoring letter case.
    *
    * The input is lower-cased with `Locale.ROOT` instead of the JVM default
    * locale. With a Turkish default locale, for example, "DISABLED" would
    * otherwise turn into "dısabled" (dotless i) and be rejected.
    *
    * @param name one of "full", "basic" or "disabled", in any letter case
    * @return the matching mode, or a `Left` with a message quoting the
    *         unrecognised input
    */
  def fromString(name: String): Either[String, NlpMode] =
    name.toLowerCase(java.util.Locale.ROOT) match {
      case "full"     => Right(Full)
      case "basic"    => Right(Basic)
      case "disabled" => Right(Disabled)
      case _          => Left(s"Unknown nlp-mode: $name")
    }

  /** Like [[fromString]], but raises an error via `sys.error` with the
    * failure message when the name is not recognised.
    */
  def unsafeFromString(name: String): NlpMode =
    fromString(name).fold(sys.error, identity)
}
|
@ -44,6 +44,9 @@ object Implicits {
|
|||||||
implicit val priorityReader: ConfigReader[Priority] =
|
implicit val priorityReader: ConfigReader[Priority] =
|
||||||
ConfigReader[String].emap(reason(Priority.fromString))
|
ConfigReader[String].emap(reason(Priority.fromString))
|
||||||
|
|
||||||
|
/** Reads an `NlpMode` from a string setting by delegating to
  * `NlpMode.fromString`; the resulting `Left` message is converted into a
  * `FailureReason` through `reason`.
  */
implicit val nlpModeReader: ConfigReader[NlpMode] =
|
||||||
|
ConfigReader[String].emap(reason(NlpMode.fromString))
|
||||||
|
|
||||||
def reason[A: ClassTag](
|
def reason[A: ClassTag](
|
||||||
f: String => Either[String, A]
|
f: String => Either[String, A]
|
||||||
): String => Either[FailureReason, A] =
|
): String => Either[FailureReason, A] =
|
||||||
|
@ -277,12 +277,27 @@ docspell.joex {
|
|||||||
# files.
|
# files.
|
||||||
working-dir = ${java.io.tmpdir}"/docspell-analysis"
|
working-dir = ${java.io.tmpdir}"/docspell-analysis"
|
||||||
|
|
||||||
# The StanfordCoreNLP library caches language models which
|
nlp-config {
|
||||||
# requires quite some amount of memory. Setting this interval to a
|
# The StanfordCoreNLP library caches language models which
|
||||||
# positive duration, the cache is cleared after this amount of
|
# require quite a lot of memory. If this interval is set to a
|
||||||
# idle time. Set it to 0 to disable it if you have enough memory,
|
# positive duration, the cache is cleared after this amount of
|
||||||
# processing will be faster.
|
# idle time. Set it to 0 to disable it if you have enough memory,
|
||||||
clear-stanford-nlp-interval = "15 minutes"
|
# processing will be faster.
|
||||||
|
#
|
||||||
|
# This only has an effect if mode != disabled.
|
||||||
|
clear-interval = "15 minutes"
|
||||||
|
|
||||||
|
# The mode for configuring NLP models. Currently 3 are available:
|
||||||
|
#
|
||||||
|
# 1. full - builds the complete pipeline, run with -Xmx1500M or more
|
||||||
|
# 2. basic - builds only the ner annotator, run with -Xmx600M or more
|
||||||
|
# 3. disabled - doesn't use any stanford-nlp feature
|
||||||
|
#
|
||||||
|
# The basic variant does a quite good job for German and
|
||||||
|
# English. It might be worse for French, always depending on the
|
||||||
|
# type of text that is analysed.
|
||||||
|
mode = full
|
||||||
|
}
|
||||||
|
|
||||||
regex-ner {
|
regex-ner {
|
||||||
# Whether to enable custom NER annotation. This uses the address
|
# Whether to enable custom NER annotation. This uses the address
|
||||||
@ -295,6 +310,8 @@ docspell.joex {
|
|||||||
#
|
#
|
||||||
# This setting might be moved to the collective settings in the
|
# This setting might be moved to the collective settings in the
|
||||||
# future.
|
# future.
|
||||||
|
#
|
||||||
|
# Note, this is only relevant if nlp-config.mode = full.
|
||||||
enabled = true
|
enabled = true
|
||||||
|
|
||||||
# The NER annotation uses a file of patterns that is derived from
|
# The NER annotation uses a file of patterns that is derived from
|
||||||
|
@ -4,7 +4,8 @@ import java.nio.file.Path
|
|||||||
|
|
||||||
import cats.data.NonEmptyList
|
import cats.data.NonEmptyList
|
||||||
|
|
||||||
import docspell.analysis.{TextAnalysisConfig, classifier}
|
import docspell.analysis.TextAnalysisConfig
|
||||||
|
import docspell.analysis.classifier.TextClassifierConfig
|
||||||
import docspell.backend.Config.Files
|
import docspell.backend.Config.Files
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.convert.ConvertConfig
|
import docspell.convert.ConvertConfig
|
||||||
@ -59,7 +60,7 @@ object Config {
|
|||||||
case class TextAnalysis(
|
case class TextAnalysis(
|
||||||
maxLength: Int,
|
maxLength: Int,
|
||||||
workingDir: Path,
|
workingDir: Path,
|
||||||
clearStanfordNlpInterval: Duration,
|
nlpConfig: TextAnalysisConfig.NlpConfig,
|
||||||
regexNer: RegexNer,
|
regexNer: RegexNer,
|
||||||
classification: Classification
|
classification: Classification
|
||||||
) {
|
) {
|
||||||
@ -67,8 +68,8 @@ object Config {
|
|||||||
def textAnalysisConfig: TextAnalysisConfig =
|
def textAnalysisConfig: TextAnalysisConfig =
|
||||||
TextAnalysisConfig(
|
TextAnalysisConfig(
|
||||||
maxLength,
|
maxLength,
|
||||||
clearStanfordNlpInterval,
|
nlpConfig,
|
||||||
classifier.TextClassifierConfig(
|
TextClassifierConfig(
|
||||||
workingDir,
|
workingDir,
|
||||||
NonEmptyList
|
NonEmptyList
|
||||||
.fromList(classification.classifiers)
|
.fromList(classification.classifiers)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user