mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 10:28:27 +00:00
Reorganize nlp pipeline and add nlp-unsupported language italian
Improves and reorganizes how nlp pipelines are set up. Users can now choose from several options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. Support for those languages then involves only text extraction and simple regex-ner processing.
This commit is contained in:
@ -0,0 +1,7 @@
|
||||
package docspell.analysis
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import docspell.common._
|
||||
|
||||
/** Language-independent settings for analysing a text.
  *
  * @param lang       the language of the text
  * @param highRecall forwarded unchanged to the stanford settings when the
  *                   language is supported there
  * @param regexNer   optional path to a regexner mapping file
  */
case class NlpSettings(
    lang: Language,
    highRecall: Boolean,
    regexNer: Option[Path]
)
|
@ -10,13 +10,13 @@ import docspell.analysis.date.DateFind
|
||||
import docspell.analysis.nlp._
|
||||
import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
import org.log4s.getLogger
|
||||
|
||||
trait TextAnalyser[F[_]] {
|
||||
|
||||
def annotate(
|
||||
logger: Logger[F],
|
||||
settings: StanfordNerSettings,
|
||||
settings: NlpSettings,
|
||||
cacheKey: Ident,
|
||||
text: String
|
||||
): F[TextAnalyser.Result]
|
||||
@ -24,6 +24,7 @@ trait TextAnalyser[F[_]] {
|
||||
def classifier: TextClassifier[F]
|
||||
}
|
||||
object TextAnalyser {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
|
||||
|
||||
@ -41,13 +42,13 @@ object TextAnalyser {
|
||||
new TextAnalyser[F] {
|
||||
def annotate(
|
||||
logger: Logger[F],
|
||||
settings: StanfordNerSettings,
|
||||
settings: NlpSettings,
|
||||
cacheKey: Ident,
|
||||
text: String
|
||||
): F[TextAnalyser.Result] =
|
||||
for {
|
||||
input <- textLimit(logger, text)
|
||||
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, input))
|
||||
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input))
|
||||
tags1 <- contactNer(input)
|
||||
dates <- dateNer(settings.lang, input)
|
||||
list = tags0 ++ tags1
|
||||
@ -77,31 +78,36 @@ object TextAnalyser {
|
||||
}
|
||||
)
|
||||
|
||||
/** Provides the nlp pipeline based on the configuration. */
|
||||
private object Nlp {
|
||||
|
||||
def apply[F[_]: Concurrent: Timer: BracketThrow](
|
||||
cfg: TextAnalysisConfig.NlpConfig
|
||||
): F[Input => F[Vector[NerLabel]]] =
|
||||
): F[Input[F] => F[Vector[NerLabel]]] =
|
||||
cfg.mode match {
|
||||
case NlpMode.Full =>
|
||||
PipelineCache.full(cfg.clearInterval).map(cache => full(cache))
|
||||
case NlpMode.Basic =>
|
||||
PipelineCache.basic(cfg.clearInterval).map(cache => basic(cache))
|
||||
case NlpMode.Disabled =>
|
||||
Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
|
||||
Logger.log4s(logger).info("NLP is disabled as defined in config.") *>
|
||||
Applicative[F].pure(_ => Vector.empty[NerLabel].pure[F])
|
||||
case _ =>
|
||||
PipelineCache(cfg.clearInterval)(
|
||||
Annotator[F](cfg.mode),
|
||||
Annotator.clearCaches[F]
|
||||
)
|
||||
.map(annotate[F])
|
||||
}
|
||||
|
||||
final case class Input(key: Ident, settings: StanfordNerSettings, text: String)
|
||||
final case class Input[F[_]](
|
||||
key: Ident,
|
||||
settings: NlpSettings,
|
||||
logger: Logger[F],
|
||||
text: String
|
||||
)
|
||||
|
||||
def full[F[_]: BracketThrow](
|
||||
cache: PipelineCache[F, StanfordCoreNLP]
|
||||
)(input: Input): F[Vector[NerLabel]] =
|
||||
StanfordNerAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
|
||||
|
||||
def basic[F[_]: BracketThrow](
|
||||
cache: PipelineCache[F, BasicCRFAnnotator.Annotator]
|
||||
)(input: Input): F[Vector[NerLabel]] =
|
||||
BasicCRFAnnotator.nerAnnotate(input.key.id, cache)(input.settings, input.text)
|
||||
def annotate[F[_]: BracketThrow](
|
||||
cache: PipelineCache[F]
|
||||
)(input: Input[F]): F[Vector[NerLabel]] =
|
||||
cache
|
||||
.obtain(input.key.id, input.settings)
|
||||
.use(ann => ann.nerAnnotate(input.logger)(input.text))
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -41,23 +41,30 @@ object DateFind {
|
||||
}
|
||||
|
||||
object SimpleDate {
|
||||
val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
def pattern0(lang: Language) = (readYear >> readMonth(lang) >> readDay).map {
|
||||
case ((y, m), d) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
}
|
||||
val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
def pattern1(lang: Language) = (readDay >> readMonth(lang) >> readYear).map {
|
||||
case ((d, m), y) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
}
|
||||
val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
def pattern2(lang: Language) = (readMonth(lang) >> readDay >> readYear).map {
|
||||
case ((m, d), y) =>
|
||||
List(SimpleDate(y, m, d))
|
||||
}
|
||||
|
||||
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
|
||||
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
|
||||
val p0 = pattern0(lang)
|
||||
val p1 = pattern1(lang)
|
||||
val p2 = pattern2(lang)
|
||||
val p = lang match {
|
||||
case Language.English =>
|
||||
p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1)
|
||||
case Language.German => p1.or(p0).or(p2)
|
||||
case Language.French => p1.or(p0).or(p2)
|
||||
case Language.German => p1.or(p0).or(p2)
|
||||
case Language.French => p1.or(p0).or(p2)
|
||||
case Language.Italian => p1.or(p0).or(p2)
|
||||
}
|
||||
p.read(parts) match {
|
||||
case Result.Success(sds, _) =>
|
||||
@ -76,9 +83,11 @@ object DateFind {
|
||||
}
|
||||
)
|
||||
|
||||
def readMonth: Reader[Int] =
|
||||
def readMonth(lang: Language): Reader[Int] =
|
||||
Reader.readFirst(w =>
|
||||
Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1)
|
||||
Some(MonthName.getAll(lang).indexWhere(_.contains(w.value)))
|
||||
.filter(_ >= 0)
|
||||
.map(_ + 1)
|
||||
)
|
||||
|
||||
def readDay: Reader[Int] =
|
||||
@ -150,20 +159,5 @@ object DateFind {
|
||||
Failure
|
||||
}
|
||||
}
|
||||
|
||||
private val months = List(
|
||||
List("jan", "january", "januar", "01"),
|
||||
List("feb", "february", "februar", "02"),
|
||||
List("mar", "march", "märz", "marz", "03"),
|
||||
List("apr", "april", "04"),
|
||||
List("may", "mai", "05"),
|
||||
List("jun", "june", "juni", "06"),
|
||||
List("jul", "july", "juli", "07"),
|
||||
List("aug", "august", "08"),
|
||||
List("sep", "september", "09"),
|
||||
List("oct", "october", "oktober", "10"),
|
||||
List("nov", "november", "11"),
|
||||
List("dec", "december", "dezember", "12")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,101 @@
|
||||
package docspell.analysis.date
|
||||
|
||||
import docspell.common.Language
|
||||
|
||||
/** Month names and abbreviations per language.
  *
  * The date reader looks a word up with an exact `List.contains`, so every
  * spelling that should be recognised must be listed here literally. All
  * entries are lower-case.
  */
object MonthName {

  /** Returns twelve lists, ordered January to December. Each list holds the
    * two-digit month number followed by the names known for `lang`.
    */
  def getAll(lang: Language): List[List[String]] =
    merge(numbers, forLang(lang))

  /** Concatenates the lists position by position. `zip` stops at the shortest
    * input, so every table below must have exactly twelve entries.
    */
  private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
    ns.foldLeft(n0) { (res, el) =>
      res.zip(el).map({ case (a, b) => a ++ b })
    }

  private def forLang(lang: Language): List[List[String]] =
    lang match {
      case Language.English =>
        english
      case Language.German =>
        german
      case Language.French =>
        french
      case Language.Italian =>
        italian
    }

  private val numbers = List(
    List("01"),
    List("02"),
    List("03"),
    List("04"),
    List("05"),
    List("06"),
    List("07"),
    List("08"),
    List("09"),
    List("10"),
    List("11"),
    List("12")
  )

  private val english = List(
    List("jan", "january"),
    List("feb", "february"),
    List("mar", "march"),
    List("apr", "april"),
    List("may"),
    List("jun", "june"),
    List("jul", "july"),
    List("aug", "august"),
    // "sep" was matched by the former shared month list; keep recognising it
    List("sep", "sept", "september"),
    List("oct", "october"),
    List("nov", "november"),
    List("dec", "december")
  )

  private val german = List(
    List("jan", "januar"),
    List("feb", "februar"),
    // "marz" is the ascii spelling, analogous to "fevr"/"aout" for french
    List("märz", "marz"),
    List("apr", "april"),
    List("mai"),
    List("jun", "juni"),
    List("jul", "juli"),
    List("aug", "august"),
    List("sep", "sept", "september"),
    List("okt", "oktober"),
    List("nov", "november"),
    List("dez", "dezember")
  )

  private val french = List(
    List("janv", "janvier"),
    List("févr", "fevr", "février", "fevrier"),
    List("mars"),
    List("avril"),
    List("mai"),
    List("juin"),
    List("juil", "juillet"),
    List("aout", "août"),
    List("sept", "septembre"),
    List("oct", "octobre"),
    List("nov", "novembre"),
    List("dec", "déc", "décembre", "decembre")
  )

  private val italian = List(
    List("genn", "gennaio"),
    List("febbr", "febbraio"),
    List("mar", "marzo"),
    List("apr", "aprile"),
    List("magg", "maggio"),
    List("giugno"),
    List("luglio"),
    List("ag", "agosto"),
    List("sett", "settembre"),
    List("ott", "ottobre"),
    List("nov", "novembre"),
    List("dic", "dicembre")
  )
}
|
@ -0,0 +1,98 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import cats.effect.Sync
|
||||
import cats.implicits._
|
||||
import cats.{Applicative, FlatMap}
|
||||
|
||||
import docspell.analysis.NlpSettings
|
||||
import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
|
||||
/** Finds named entities in a text and reports them as `NerLabel`s. */
trait Annotator[F[_]] { self =>

  /** Annotates `text`; the `logger` is available to implementations for
    * reporting.
    */
  def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]]

  /** Chains two annotators: this one runs first, then `next`, both on the
    * same text. The labels of both are concatenated and duplicates removed.
    */
  def ++(next: Annotator[F])(implicit F: FlatMap[F]): Annotator[F] = {
    val first = self
    new Annotator[F] {
      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
        F.flatMap(first.nerAnnotate(logger)(text)) { fromFirst =>
          F.map(next.nerAnnotate(logger)(text)) { fromNext =>
            (fromFirst ++ fromNext).distinct
          }
        }
    }
  }
}
|
||||
|
||||
object Annotator {

  /** Builds the annotator to use for the given `mode` and `settings`.
    *
    * Outcome per mode:
    *
    *  - disabled: a no-op annotator that always yields an empty vector
    *  - full: the stanford pipeline created from the settings
    *  - basic: only the ner classifier of the language; if a regexNer file
    *    is given, the regexner pipeline runs afterwards and both results
    *    are combined
    *  - regex-only: just the regexner pipeline
    *
    * The no-op annotator is also returned when the settings cannot be
    * converted into stanford settings or, in regex-only mode, when no
    * regexNer file is given.
    */
  def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] = {
    def regexNer(file: java.nio.file.Path): Annotator[F] =
      Annotator.pipeline[F](StanfordNerAnnotator.regexNerPipeline(file))

    mode match {
      case NlpMode.Disabled =>
        Annotator.none[F]

      case NlpMode.RegexOnly =>
        settings.regexNer.fold(Annotator.none[F])(regexNer)

      case NlpMode.Full =>
        StanfordNerSettings
          .fromNlpSettings(settings)
          .fold(Annotator.none[F]) { ss =>
            Annotator.pipeline[F](StanfordNerAnnotator.makePipeline(ss))
          }

      case NlpMode.Basic =>
        StanfordNerSettings.fromNlpSettings(settings) match {
          case None =>
            Annotator.none[F]
          case Some(StanfordNerSettings.RegexOnly(file)) =>
            regexNer(file)
          case Some(StanfordNerSettings.Full(lang, _, regexFile)) =>
            // the classifier is obtained first, the regexner pipeline second
            val crf = Annotator.basic[F](BasicCRFAnnotator.Cache.getAnnotator(lang))
            regexFile.fold(crf)(file => crf ++ regexNer(file))
        }
    }
  }

  /** An annotator that logs a debug message and finds nothing. */
  def none[F[_]: Applicative]: Annotator[F] =
    new Annotator[F] {
      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
        logger
          .debug("Running empty annotator. NLP not supported.")
          .as(Vector.empty[NerLabel])
    }

  /** Wraps the given ner classifier; the classification is suspended in `F`. */
  def basic[F[_]: Sync](ann: BasicCRFAnnotator.Annotator): Annotator[F] =
    new Annotator[F] {
      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
        Sync[F].delay {
          BasicCRFAnnotator.nerAnnotate(ann)(text)
        }
    }

  /** Wraps the given stanford pipeline; the annotation is suspended in `F`. */
  def pipeline[F[_]: Sync](cp: StanfordCoreNLP): Annotator[F] =
    new Annotator[F] {
      def nerAnnotate(logger: Logger[F])(text: String): F[Vector[NerLabel]] =
        Sync[F].delay {
          StanfordNerAnnotator.nerAnnotate(cp, text)
        }
    }

  /** Clears the stanford annotator pool and then the classifier cache. */
  def clearCaches[F[_]: Sync]: F[Unit] =
    Sync[F].delay {
      StanfordCoreNLP.clearAnnotatorPool()
      BasicCRFAnnotator.Cache.clearCache()
    }
}
|
@ -7,9 +7,7 @@ import java.util.zip.GZIPInputStream
|
||||
import scala.jdk.CollectionConverters._
|
||||
import scala.util.Using
|
||||
|
||||
import cats.Applicative
|
||||
import cats.effect.BracketThrow
|
||||
|
||||
import docspell.common.Language.NLPLanguage
|
||||
import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.ie.AbstractSequenceClassifier
|
||||
@ -30,14 +28,6 @@ object BasicCRFAnnotator {
|
||||
|
||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||
|
||||
def nerAnnotate[F[_]: BracketThrow](
|
||||
cacheKey: String,
|
||||
cache: PipelineCache[F, Annotator]
|
||||
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
|
||||
cache
|
||||
.obtain(cacheKey, settings)
|
||||
.use(crf => Applicative[F].pure(nerAnnotate(crf)(text)))
|
||||
|
||||
def nerAnnotate(nerClassifier: Annotator)(text: String): Vector[NerLabel] =
|
||||
nerClassifier
|
||||
.classify(text)
|
||||
@ -52,7 +42,7 @@ object BasicCRFAnnotator {
|
||||
})
|
||||
.toVector
|
||||
|
||||
private def makeClassifier(lang: Language): Annotator = {
|
||||
def makeAnnotator(lang: NLPLanguage): Annotator = {
|
||||
logger.info(s"Creating ${lang.name} Stanford NLP NER-only classifier...")
|
||||
val ner = classifierResource(lang)
|
||||
Using(new GZIPInputStream(ner.openStream())) { in =>
|
||||
@ -60,7 +50,7 @@ object BasicCRFAnnotator {
|
||||
}.fold(throw _, identity)
|
||||
}
|
||||
|
||||
private def classifierResource(lang: Language): URL = {
|
||||
private def classifierResource(lang: NLPLanguage): URL = {
|
||||
def check(name: String): URL =
|
||||
Option(getClass.getResource(name)) match {
|
||||
case None =>
|
||||
@ -79,11 +69,11 @@ object BasicCRFAnnotator {
|
||||
}
|
||||
|
||||
final class Cache {
|
||||
private[this] lazy val germanNerClassifier = makeClassifier(Language.German)
|
||||
private[this] lazy val englishNerClassifier = makeClassifier(Language.English)
|
||||
private[this] lazy val frenchNerClassifier = makeClassifier(Language.French)
|
||||
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||
|
||||
def forLang(language: Language): Annotator =
|
||||
def forLang(language: NLPLanguage): Annotator =
|
||||
language match {
|
||||
case Language.French => frenchNerClassifier
|
||||
case Language.German => germanNerClassifier
|
||||
@ -95,7 +85,7 @@ object BasicCRFAnnotator {
|
||||
|
||||
private[this] val cacheRef = new AtomicReference[Cache](new Cache)
|
||||
|
||||
def getAnnotator(language: Language): Annotator =
|
||||
def getAnnotator(language: NLPLanguage): Annotator =
|
||||
cacheRef.get().forLang(language)
|
||||
|
||||
def clearCache(): Unit =
|
||||
|
@ -3,14 +3,13 @@ package docspell.analysis.nlp
|
||||
import scala.concurrent.duration.{Duration => _, _}
|
||||
|
||||
import cats.Applicative
|
||||
import cats.data.Kleisli
|
||||
import cats.effect._
|
||||
import cats.effect.concurrent.Ref
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.NlpSettings
|
||||
import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
import org.log4s.getLogger
|
||||
|
||||
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
|
||||
@ -20,58 +19,32 @@ import org.log4s.getLogger
|
||||
*
|
||||
* **This is an internal API**
|
||||
*/
|
||||
trait PipelineCache[F[_], A] {
|
||||
trait PipelineCache[F[_]] {
|
||||
|
||||
def obtain(key: String, settings: StanfordNerSettings): Resource[F, A]
|
||||
def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]]
|
||||
|
||||
}
|
||||
|
||||
object PipelineCache {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
def none[F[_]: Applicative, A](
|
||||
creator: Kleisli[F, StanfordNerSettings, A]
|
||||
): PipelineCache[F, A] =
|
||||
new PipelineCache[F, A] {
|
||||
def obtain(
|
||||
ignored: String,
|
||||
settings: StanfordNerSettings
|
||||
): Resource[F, A] =
|
||||
Resource.liftF(creator.run(settings))
|
||||
}
|
||||
|
||||
def apply[F[_]: Concurrent: Timer, A](clearInterval: Duration)(
|
||||
creator: StanfordNerSettings => A,
|
||||
def apply[F[_]: Concurrent: Timer](clearInterval: Duration)(
|
||||
creator: NlpSettings => Annotator[F],
|
||||
release: F[Unit]
|
||||
): F[PipelineCache[F, A]] =
|
||||
): F[PipelineCache[F]] =
|
||||
for {
|
||||
data <- Ref.of(Map.empty[String, Entry[A]])
|
||||
data <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
|
||||
cacheClear <- CacheClearing.create(data, clearInterval, release)
|
||||
} yield new Impl[F, A](data, creator, cacheClear)
|
||||
_ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
|
||||
} yield new Impl[F](data, creator, cacheClear)
|
||||
|
||||
def full[F[_]: Concurrent: Timer](
|
||||
clearInterval: Duration
|
||||
): F[PipelineCache[F, StanfordCoreNLP]] =
|
||||
apply(clearInterval)(
|
||||
StanfordNerAnnotator.makePipeline,
|
||||
StanfordNerAnnotator.clearPipelineCaches
|
||||
)
|
||||
|
||||
def basic[F[_]: Concurrent: Timer](
|
||||
clearInterval: Duration
|
||||
): F[PipelineCache[F, BasicCRFAnnotator.Annotator]] =
|
||||
apply(clearInterval)(
|
||||
settings => BasicCRFAnnotator.Cache.getAnnotator(settings.lang),
|
||||
Sync[F].delay(BasicCRFAnnotator.Cache.clearCache())
|
||||
)
|
||||
|
||||
final private class Impl[F[_]: Sync, A](
|
||||
data: Ref[F, Map[String, Entry[A]]],
|
||||
creator: StanfordNerSettings => A,
|
||||
final private class Impl[F[_]: Sync](
|
||||
data: Ref[F, Map[String, Entry[Annotator[F]]]],
|
||||
creator: NlpSettings => Annotator[F],
|
||||
cacheClear: CacheClearing[F]
|
||||
) extends PipelineCache[F, A] {
|
||||
) extends PipelineCache[F] {
|
||||
|
||||
def obtain(key: String, settings: StanfordNerSettings): Resource[F, A] =
|
||||
def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] =
|
||||
for {
|
||||
_ <- cacheClear.withCache
|
||||
id <- Resource.liftF(makeSettingsId(settings))
|
||||
@ -83,10 +56,10 @@ object PipelineCache {
|
||||
private def getOrCreate(
|
||||
key: String,
|
||||
id: String,
|
||||
cache: Map[String, Entry[A]],
|
||||
settings: StanfordNerSettings,
|
||||
creator: StanfordNerSettings => A
|
||||
): (Map[String, Entry[A]], A) =
|
||||
cache: Map[String, Entry[Annotator[F]]],
|
||||
settings: NlpSettings,
|
||||
creator: NlpSettings => Annotator[F]
|
||||
): (Map[String, Entry[Annotator[F]]], Annotator[F]) =
|
||||
cache.get(key) match {
|
||||
case Some(entry) =>
|
||||
if (entry.id == id) (cache, entry.value)
|
||||
@ -105,7 +78,7 @@ object PipelineCache {
|
||||
(cache.updated(key, e), nlp)
|
||||
}
|
||||
|
||||
private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
|
||||
private def makeSettingsId(settings: NlpSettings): F[String] = {
|
||||
val base = settings.copy(regexNer = None).toString
|
||||
val size: F[Long] =
|
||||
settings.regexNer match {
|
||||
|
@ -1,9 +1,11 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import java.nio.file.Path
|
||||
import java.util.{Properties => JProps}
|
||||
|
||||
import docspell.analysis.nlp.Properties.Implicits._
|
||||
import docspell.common._
|
||||
import docspell.common.syntax.FileSyntax._
|
||||
|
||||
object Properties {
|
||||
|
||||
@ -17,18 +19,21 @@ object Properties {
|
||||
p
|
||||
}
|
||||
|
||||
def forSettings(settings: StanfordNerSettings): JProps = {
|
||||
val regexNerFile = settings.regexNer
|
||||
.map(p => p.normalize().toAbsolutePath().toString())
|
||||
settings.lang match {
|
||||
case Language.German =>
|
||||
Properties.nerGerman(regexNerFile, settings.highRecall)
|
||||
case Language.English =>
|
||||
Properties.nerEnglish(regexNerFile)
|
||||
case Language.French =>
|
||||
Properties.nerFrench(regexNerFile, settings.highRecall)
|
||||
def forSettings(settings: StanfordNerSettings): JProps =
|
||||
settings match {
|
||||
case StanfordNerSettings.Full(lang, highRecall, regexNer) =>
|
||||
val regexNerFile = regexNer.map(p => p.absolutePathAsString)
|
||||
lang match {
|
||||
case Language.German =>
|
||||
Properties.nerGerman(regexNerFile, highRecall)
|
||||
case Language.English =>
|
||||
Properties.nerEnglish(regexNerFile)
|
||||
case Language.French =>
|
||||
Properties.nerFrench(regexNerFile, highRecall)
|
||||
}
|
||||
case StanfordNerSettings.RegexOnly(path) =>
|
||||
Properties.regexNerOnly(path)
|
||||
}
|
||||
}
|
||||
|
||||
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
||||
Properties(
|
||||
@ -76,6 +81,11 @@ object Properties {
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit"
|
||||
).withRegexNer(Some(regexNerMappingFile.absolutePathAsString))
|
||||
|
||||
object Implicits {
|
||||
implicit final class JPropsOps(val p: JProps) extends AnyVal {
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
|
||||
import cats.Applicative
|
||||
import cats.effect._
|
||||
|
||||
import docspell.common._
|
||||
@ -24,24 +25,24 @@ object StanfordNerAnnotator {
|
||||
* a new classifier must be created. It will then replace the
|
||||
* previous one.
|
||||
*/
|
||||
def nerAnnotate[F[_]: BracketThrow](
|
||||
cacheKey: String,
|
||||
cache: PipelineCache[F, StanfordCoreNLP]
|
||||
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
|
||||
cache
|
||||
.obtain(cacheKey, settings)
|
||||
.use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
|
||||
|
||||
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
||||
val doc = new CoreDocument(text)
|
||||
nerClassifier.annotate(doc)
|
||||
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
|
||||
}
|
||||
|
||||
def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP = {
|
||||
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
|
||||
new StanfordCoreNLP(Properties.forSettings(settings))
|
||||
}
|
||||
def makePipeline(settings: StanfordNerSettings): StanfordCoreNLP =
|
||||
settings match {
|
||||
case s: StanfordNerSettings.Full =>
|
||||
logger.info(s"Creating ${s.lang.name} Stanford NLP NER classifier...")
|
||||
new StanfordCoreNLP(Properties.forSettings(settings))
|
||||
case StanfordNerSettings.RegexOnly(path) =>
|
||||
logger.info(s"Creating regexNer-only Stanford NLP NER classifier...")
|
||||
regexNerPipeline(path)
|
||||
}
|
||||
|
||||
def regexNerPipeline(regexNerFile: Path): StanfordCoreNLP =
|
||||
new StanfordCoreNLP(Properties.regexNerOnly(regexNerFile))
|
||||
|
||||
def clearPipelineCaches[F[_]: Sync]: F[Unit] =
|
||||
Sync[F].delay {
|
||||
|
@ -2,25 +2,41 @@ package docspell.analysis.nlp
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import docspell.common._
|
||||
import docspell.analysis.NlpSettings
|
||||
import docspell.common.Language.NLPLanguage
|
||||
|
||||
/** Settings for configuring the stanford NER pipeline.
|
||||
*
|
||||
* The language is mandatory, only the provided ones are supported.
|
||||
* The `highRecall` only applies for non-English languages. For
|
||||
* non-English languages the english classifier is run as second
|
||||
* classifier and if `highRecall` is true, then it will be used to
|
||||
* tag untagged tokens. This may lead to a lot of false positives,
|
||||
* but since English is omnipresent in other languages, too it
|
||||
* depends on the use case for whether this is useful or not.
|
||||
*
|
||||
* The `regexNer` allows to specify a text file as described here:
|
||||
* https://nlp.stanford.edu/software/regexner.html. This will be used
|
||||
* as a last step to tag untagged tokens using the provided list of
|
||||
* regexps.
|
||||
*/
|
||||
case class StanfordNerSettings(
|
||||
lang: Language,
|
||||
highRecall: Boolean,
|
||||
regexNer: Option[Path]
|
||||
)
|
||||
sealed trait StanfordNerSettings
|
||||
|
||||
object StanfordNerSettings {
|
||||
|
||||
/** Settings for configuring the stanford NER pipeline.
|
||||
*
|
||||
* The language is mandatory, only the provided ones are supported.
|
||||
* The `highRecall` only applies for non-English languages. For
|
||||
* non-English languages the english classifier is run as second
|
||||
* classifier and if `highRecall` is true, then it will be used to
|
||||
* tag untagged tokens. This may lead to a lot of false positives,
|
||||
* but since English is omnipresent in other languages, too it
|
||||
* depends on the use case for whether this is useful or not.
|
||||
*
|
||||
* The `regexNer` allows to specify a text file as described here:
|
||||
* https://nlp.stanford.edu/software/regexner.html. This will be used
|
||||
* as a last step to tag untagged tokens using the provided list of
|
||||
* regexps.
|
||||
*/
|
||||
case class Full(
|
||||
lang: NLPLanguage,
|
||||
highRecall: Boolean,
|
||||
regexNer: Option[Path]
|
||||
) extends StanfordNerSettings
|
||||
|
||||
/** Not all languages are supported with predefined statistical models. This variant allows providing regexps only.
|
||||
*/
|
||||
case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings
|
||||
|
||||
def fromNlpSettings(ns: NlpSettings): Option[StanfordNerSettings] =
|
||||
NLPLanguage.all
|
||||
.find(nl => nl == ns.lang)
|
||||
.map(nl => Full(nl, ns.highRecall, ns.regexNer))
|
||||
.orElse(ns.regexNer.map(nrf => RegexOnly(nrf)))
|
||||
}
|
||||
|
@ -1,12 +1,13 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import docspell.common.Language.NLPLanguage
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
|
||||
object BaseCRFAnnotatorSuite extends SimpleTestSuite {
|
||||
|
||||
def annotate(language: Language): String => Vector[NerLabel] =
|
||||
def annotate(language: NLPLanguage): String => Vector[NerLabel] =
|
||||
BasicCRFAnnotator.nerAnnotate(BasicCRFAnnotator.Cache.getAnnotator(language))
|
||||
|
||||
test("find english ner labels") {
|
||||
|
@ -1,8 +1,12 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
import cats.effect.IO
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
import docspell.common.syntax.FileSyntax._
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
|
||||
object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||
@ -68,4 +72,36 @@ object StanfordNerAnnotatorSuite extends SimpleTestSuite {
|
||||
assertEquals(labels, expect)
|
||||
StanfordCoreNLP.clearAnnotatorPool()
|
||||
}
|
||||
|
||||
test("regexner-only annotator") {
  // regexner mapping rules; the columns of each rule are separated by tabs
  val regexNerContent =
    s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
       |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
       |""".stripMargin

  val text = "Hello Andrea Rossi, can you."
  val expected = Vector(
    NerLabel("Andrea", NerTag.Person, 6, 12),
    NerLabel("Rossi", NerTag.Person, 13, 18)
  )

  // write the rules into a temporary directory, build a regexner-only
  // pipeline from that file and compare the labels it produces
  File
    .withTempDir[IO](Paths.get("target"), "test-regex-ner")
    .use { dir =>
      File.writeString[IO](dir / "regex.txt", regexNerContent).flatMap { out =>
        IO {
          val ann    = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
          val labels = StanfordNerAnnotator.nerAnnotate(ann, text)
          assertEquals(labels, expected)
        }
      }
    }
    .unsafeRunSync()
  StanfordCoreNLP.clearAnnotatorPool()
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user