Allow configuring stanford-ner and cache based on collective
@@ -5,12 +5,19 @@ import cats.implicits._
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
 
 trait TextAnalyser[F[_]] {
 
-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]
 
 }
 
 object TextAnalyser {
@@ -22,43 +29,47 @@ object TextAnalyser {
   }
 
   def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
-      def annotate(
-          logger: Logger[F],
-          lang: Language,
-          text: String
-      ): F[TextAnalyser.Result] =
-        for {
-          input <- textLimit(logger, text)
-          tags0 <- stanfordNer(lang, input)
-          tags1 <- contactNer(input)
-          dates <- dateNer(lang, input)
-          list = tags0 ++ tags1
-          spans = NerLabelSpan.build(list)
-        } yield Result(spans ++ list, dates)
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
+          def annotate(
+              logger: Logger[F],
+              settings: StanfordSettings,
+              cacheKey: Ident,
+              text: String
+          ): F[TextAnalyser.Result] =
+            for {
+              input <- textLimit(logger, text)
+              tags0 <- stanfordNer(cacheKey, settings, input)
+              tags1 <- contactNer(input)
+              dates <- dateNer(settings.lang, input)
+              list = tags0 ++ tags1
+              spans = NerLabelSpan.build(list)
+            } yield Result(spans ++ list, dates)
 
-      private def textLimit(logger: Logger[F], text: String): F[String] =
-        if (text.length <= cfg.maxLength) text.pure[F]
-        else
-          logger.info(
-            s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
-              s" Analysing only first ${cfg.maxLength} characters."
-          ) *> text.take(cfg.maxLength).pure[F]
+          private def textLimit(logger: Logger[F], text: String): F[String] =
+            if (text.length <= cfg.maxLength) text.pure[F]
+            else
+              logger.info(
+                s"The text to analyse is larger than limit (${text.length} > ${cfg.maxLength})." +
+                  s" Analysing only first ${cfg.maxLength} characters."
+              ) *> text.take(cfg.maxLength).pure[F]
 
-      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          StanfordNerClassifier.nerAnnotate(lang)(text)
-        }
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
 
-      private def contactNer(text: String): F[Vector[NerLabel]] =
-        Sync[F].delay {
-          Contact.annotate(text)
-        }
+          private def contactNer(text: String): F[Vector[NerLabel]] =
+            Sync[F].delay {
+              Contact.annotate(text)
+            }
 
-      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
-        Sync[F].delay {
-          DateFind.findDates(text, lang).toVector
-        }
-    })
+          private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+            Sync[F].delay {
+              DateFind.findDates(text, lang).toVector
+            }
+        }
+      )
 
 }
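The commit changes `annotate` to take the Stanford settings and a cache key instead of just a language. A minimal caller sketch, not part of the commit — the `analyse` helper, the collective id as cache key, and the regexner path are all hypothetical:

import java.nio.file.Path

import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.common._

// Hypothetical caller: each collective uses its id as cache key, so the
// expensive StanfordCoreNLP pipeline is created once per collective and
// rebuilt only when that collective's settings change.
def analyse[F[_]](
    analyser: TextAnalyser[F],
    logger: Logger[F],
    collective: Ident,     // assumed cache key: the collective id
    regexNer: Option[Path] // optional per-collective regexner file
)(text: String): F[TextAnalyser.Result] = {
  val settings = StanfordSettings(Language.German, highRecall = false, regexNer)
  analyser.annotate(logger, settings, collective, text)
}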
@@ -0,0 +1,90 @@
+package docspell.analysis.nlp
+
+import cats.Applicative
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+
+import docspell.common._
+
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.log4s.getLogger
+
+/** Creating the StanfordCoreNLP pipeline is quite expensive as it
+  * involves IO and initializing large objects.
+  *
+  * Since the instances are thread-safe, they are cached and shared.
+  *
+  * **This is an internal API**
+  */
+trait PipelineCache[F[_]] {
+
+  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+
+}
+
+object PipelineCache {
+  private[this] val logger = getLogger
+
+  def none[F[_]: Applicative]: PipelineCache[F] =
+    new PipelineCache[F] {
+      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+        makeClassifier(settings).pure[F]
+    }
+
+  def apply[F[_]: Sync](): F[PipelineCache[F]] =
+    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
+
+  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
+      extends PipelineCache[F] {
+
+    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      for {
+        id  <- makeSettingsId(settings)
+        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
+      } yield nlp
+
+    private def getOrCreate(
+        key: String,
+        id: String,
+        cache: Map[String, Entry],
+        settings: StanfordSettings
+    ): (Map[String, Entry], StanfordCoreNLP) =
+      cache.get(key) match {
+        case Some(entry) =>
+          if (entry.id == id) (cache, entry.value)
+          else {
+            logger.info(
+              s"StanfordNLP settings changed for key $key. Creating new classifier"
+            )
+            val nlp = makeClassifier(settings)
+            val e   = Entry(id, nlp)
+            (cache.updated(key, e), nlp)
+          }
+
+        case None =>
+          val nlp = makeClassifier(settings)
+          val e   = Entry(id, nlp)
+          (cache.updated(key, e), nlp)
+      }
+
+    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+      val base = settings.copy(regexNer = None).toString
+      val size: F[Long] =
+        settings.regexNer match {
+          case Some(p) =>
+            File.size(p)
+          case None =>
+            0L.pure[F]
+        }
+      size.map(len => s"$base-$len")
+    }
+
+  }
+  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
+    new StanfordCoreNLP(Properties.forSettings(settings))
+  }
+
+  private case class Entry(id: String, value: StanfordCoreNLP)
+}
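A hedged sketch of the intended cache behaviour — the keys and settings are hypothetical, and actually running it needs the Stanford model files on the classpath:

import cats.effect.IO

import docspell.analysis.nlp.{PipelineCache, StanfordSettings}
import docspell.common.Language

// Same key + unchanged settings reuses the cached pipeline; changed
// settings under the same key replace the entry for that key.
val demo: IO[(Boolean, Boolean)] =
  for {
    cache <- PipelineCache[IO]()
    de = StanfordSettings(Language.German, highRecall = false, regexNer = None)
    en = StanfordSettings(Language.English, highRecall = false, regexNer = None)
    a <- cache.obtain("collective-1", de) // creates the German pipeline
    b <- cache.obtain("collective-1", de) // same settings id: cached instance
    c <- cache.obtain("collective-1", en) // new settings id: replaces the entry
  } yield ((a eq b), (b eq c)) // expected: (true, false)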
@@ -3,6 +3,7 @@ package docspell.analysis.nlp
 import java.util.{Properties => JProps}
 
+import docspell.analysis.nlp.Properties.Implicits._
 import docspell.common._
 
 object Properties {
 
@@ -13,6 +14,19 @@ object Properties {
     p
   }
 
+  def forSettings(settings: StanfordSettings): JProps = {
+    val regexNerFile = settings.regexNer
+      .map(p => p.normalize().toAbsolutePath().toString())
+    settings.lang match {
+      case Language.German =>
+        Properties.nerGerman(regexNerFile, settings.highRecall)
+      case Language.English =>
+        Properties.nerEnglish(regexNerFile)
+      case Language.French =>
+        Properties.nerFrench(regexNerFile, settings.highRecall)
+    }
+  }
+
   def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
     Properties(
       "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
@@ -1,45 +1,39 @@
 package docspell.analysis.nlp
 
-import java.util.{Properties => JProps}
-
 import scala.jdk.CollectionConverters._
 
+import cats.Applicative
+import cats.implicits._
+
 import docspell.common._
 
 import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
-import org.log4s.getLogger
 
 object StanfordNerClassifier {
-  private[this] val logger = getLogger
-
-  lazy val germanNerClassifier = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
-  lazy val frenchNerClassifier = makeClassifier(Language.French)
+
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, as it involves loading large
+    * model files. The classifiers are thread-safe and so they are
+    * cached. The `cacheKey` defines the "slot" where classifiers are
+    * stored and retrieved. If for a given `cacheKey` the `settings`
+    * change, a new classifier must be created. It will then replace
+    * the previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))
 
-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German => germanNerClassifier
-      case Language.French => frenchNerClassifier
-    }
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
 
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
   }
-
-  private def makeClassifier(lang: Language): StanfordCoreNLP = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    new StanfordCoreNLP(classifierProperties(lang))
-  }
-
-  private def classifierProperties(lang: Language): JProps =
-    lang match {
-      case Language.German =>
-        Properties.nerGerman(None, false)
-      case Language.English =>
-        Properties.nerEnglish(None)
-      case Language.French =>
-        Properties.nerFrench(None, false)
-    }
 }
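A usage sketch for the new entry point — the key and input text are hypothetical, and `PipelineCache.none` builds a fresh classifier per call, which skips the cache but keeps the example self-contained:

import cats.Id

import docspell.analysis.nlp.{PipelineCache, StanfordNerClassifier, StanfordSettings}
import docspell.common._

// Run NER once in the Id effect; a real caller would pass the shared
// cache created in TextAnalyser.create instead of PipelineCache.none.
val settings = StanfordSettings(Language.English, highRecall = false, regexNer = None)
val labels: Vector[NerLabel] =
  StanfordNerClassifier.nerAnnotate[Id]("some-collective", PipelineCache.none[Id])(
    settings,
    "Jane Doe sent an invoice from Acme Corp in Berlin."
  )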
@@ -0,0 +1,22 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import docspell.common._
+
+/** Settings for configuring the stanford NER pipeline.
+  *
+  * The language is mandatory; only the provided ones are supported.
+  * The `highRecall` flag only applies to non-English languages. For
+  * non-English languages the English classifier is run as a second
+  * classifier, and if `highRecall` is true, it is used to tag
+  * otherwise untagged tokens. This may lead to many false positives,
+  * but since English is omnipresent in other languages, whether this
+  * is useful depends on the use case.
+  *
+  * The `regexNer` option allows specifying a text file as described
+  * at https://nlp.stanford.edu/software/regexner.html. It is used as
+  * a last step to tag untagged tokens using the provided list of
+  * regexps.
+  */
+case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
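For example, a collective-specific configuration might be built like this (the file path is made up for the example):

import java.nio.file.Paths

import docspell.analysis.nlp.StanfordSettings
import docspell.common.Language

// German pipeline with high-recall English tagging as a fallback and a
// per-collective regexner mapping file.
val germanSettings = StanfordSettings(
  Language.German,
  highRecall = true,
  regexNer = Some(Paths.get("/var/docspell/collective-1/regexner.txt"))
)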