Allow configuring stanford-ner and cache based on collective

Eike Kettner 2020-08-24 00:56:25 +02:00
parent 4e7c00c345
commit 8628a0a8b3
11 changed files with 271 additions and 117 deletions


@@ -5,12 +5,19 @@ import cats.implicits._
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
 
 trait TextAnalyser[F[_]] {
 
-  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+  def annotate(
+      logger: Logger[F],
+      settings: StanfordSettings,
+      cacheKey: Ident,
+      text: String
+  ): F[TextAnalyser.Result]
 
 }
 object TextAnalyser {
@@ -22,17 +29,21 @@ object TextAnalyser {
   }
 
   def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
-    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
+    Resource
+      .liftF(PipelineCache[F]())
+      .map(cache =>
+        new TextAnalyser[F] {
           def annotate(
               logger: Logger[F],
-              lang: Language,
+              settings: StanfordSettings,
+              cacheKey: Ident,
               text: String
           ): F[TextAnalyser.Result] =
            for {
              input <- textLimit(logger, text)
-              tags0 <- stanfordNer(lang, input)
+              tags0 <- stanfordNer(cacheKey, settings, input)
              tags1 <- contactNer(input)
-              dates <- dateNer(lang, input)
+              dates <- dateNer(settings.lang, input)
              list  = tags0 ++ tags1
              spans = NerLabelSpan.build(list)
            } yield Result(spans ++ list, dates)
@@ -45,10 +56,9 @@ object TextAnalyser {
              s" Analysing only first ${cfg.maxLength} characters."
            ) *> text.take(cfg.maxLength).pure[F]
 
-          private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
-            Sync[F].delay {
-              StanfordNerClassifier.nerAnnotate(lang)(text)
-            }
+          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+              : F[Vector[NerLabel]] =
+            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
 
          private def contactNer(text: String): F[Vector[NerLabel]] =
            Sync[F].delay {
@@ -59,6 +69,7 @@ object TextAnalyser {
            Sync[F].delay {
              DateFind.findDates(text, lang).toVector
            }
-        })
+        }
+      )
 
 }
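
Not part of the commit, just an illustration of the new call shape: `annotate` now takes the Stanford settings plus a per-call cache key instead of only a `Language`; in this commit the key is the collective id (see the TextAnalysis change below). A rough sketch using cats-effect IO, assuming a `cfg: TextAnalysisConfig`, a `logger: Logger[IO]` and a `collective: Ident` are in scope:

  val settings = StanfordSettings(Language.German, highRecall = false, regexNer = None)
  TextAnalyser.create[IO](cfg).use { analyser =>
    analyser.annotate(logger, settings, collective, "Zürich, 12. März 2020 ...")
  }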


@@ -0,0 +1,90 @@
+package docspell.analysis.nlp
+
+import cats.Applicative
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+
+import docspell.common._
+
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.log4s.getLogger
+
+/** Creating the StanfordCoreNLP pipeline is quite expensive as it
+  * involves IO and initializing large objects.
+  *
+  * Therefore, the instances are cached, because they are thread-safe.
+  *
+  * **This is an internal API**
+  */
+trait PipelineCache[F[_]] {
+
+  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+
+}
+
+object PipelineCache {
+  private[this] val logger = getLogger
+
+  def none[F[_]: Applicative]: PipelineCache[F] =
+    new PipelineCache[F] {
+      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+        makeClassifier(settings).pure[F]
+    }
+
+  def apply[F[_]: Sync](): F[PipelineCache[F]] =
+    Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
+
+  final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
+      extends PipelineCache[F] {
+
+    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      for {
+        id  <- makeSettingsId(settings)
+        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
+      } yield nlp
+
+    private def getOrCreate(
+        key: String,
+        id: String,
+        cache: Map[String, Entry],
+        settings: StanfordSettings
+    ): (Map[String, Entry], StanfordCoreNLP) =
+      cache.get(key) match {
+        case Some(entry) =>
+          if (entry.id == id) (cache, entry.value)
+          else {
+            logger.info(
+              s"StanfordNLP settings changed for key $key. Creating new classifier"
+            )
+            val nlp = makeClassifier(settings)
+            val e   = Entry(id, nlp)
+            (cache.updated(key, e), nlp)
+          }
+
+        case None =>
+          val nlp = makeClassifier(settings)
+          val e   = Entry(id, nlp)
+          (cache.updated(key, e), nlp)
+      }
+
+    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+      val base = settings.copy(regexNer = None).toString
+      val size: F[Long] =
+        settings.regexNer match {
+          case Some(p) =>
+            File.size(p)
+          case None =>
+            0L.pure[F]
+        }
+      size.map(len => s"$base-$len")
+    }
+  }
+
+  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
+    new StanfordCoreNLP(Properties.forSettings(settings))
+  }
+
+  private case class Entry(id: String, value: StanfordCoreNLP)
+}
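
An illustrative sketch (not from the commit) of how this cache is meant to behave: repeated `obtain` calls with the same key and unchanged settings reuse the stored pipeline, while changed settings under that key create and store a replacement, because the settings id (the settings' toString plus the regex-NER file size) no longer matches. Using cats-effect IO:

  val program =
    for {
      cache <- PipelineCache[IO]()
      de1   <- cache.obtain("collective-1", StanfordSettings(Language.German, false, None))
      de2   <- cache.obtain("collective-1", StanfordSettings(Language.German, false, None))
      en    <- cache.obtain("collective-1", StanfordSettings(Language.English, false, None))
      // de1 and de2 are the same cached StanfordCoreNLP instance;
      // en replaced the entry stored under "collective-1"
    } yield ()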


@@ -3,6 +3,7 @@ package docspell.analysis.nlp
 import java.util.{Properties => JProps}
 
 import docspell.analysis.nlp.Properties.Implicits._
+import docspell.common._
 
 object Properties {
@@ -13,6 +14,19 @@ object Properties {
     p
   }
 
+  def forSettings(settings: StanfordSettings): JProps = {
+    val regexNerFile = settings.regexNer
+      .map(p => p.normalize().toAbsolutePath().toString())
+    settings.lang match {
+      case Language.German =>
+        Properties.nerGerman(regexNerFile, settings.highRecall)
+      case Language.English =>
+        Properties.nerEnglish(regexNerFile)
+      case Language.French =>
+        Properties.nerFrench(regexNerFile, settings.highRecall)
+    }
+  }
+
   def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
     Properties(
       "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",


@@ -1,45 +1,39 @@
 package docspell.analysis.nlp
 
-import java.util.{Properties => JProps}
-
 import scala.jdk.CollectionConverters._
 
+import cats.Applicative
+import cats.implicits._
+
 import docspell.common._
 
 import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
-import org.log4s.getLogger
 
 object StanfordNerClassifier {
-  private[this] val logger = getLogger
 
-  lazy val germanNerClassifier  = makeClassifier(Language.German)
-  lazy val englishNerClassifier = makeClassifier(Language.English)
-  lazy val frenchNerClassifier  = makeClassifier(Language.French)
+  /** Runs named entity recognition on the given `text`.
+    *
+    * This uses the classifier pipeline from stanford-nlp, see
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
+    * classifiers is quite expensive, it involves loading large model
+    * files. The classifiers are thread-safe and so they are cached.
+    * The `cacheKey` defines the "slot" where classifiers are stored
+    * and retrieved. If for a given `cacheKey` the `settings` change,
+    * a new classifier must be created. It will then replace the
+    * previous one.
+    */
+  def nerAnnotate[F[_]: Applicative](
+      cacheKey: String,
+      cache: PipelineCache[F]
+  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+    cache
+      .obtain(cacheKey, settings)
+      .map(crf => runClassifier(crf, text))
 
-  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
-    val nerClassifier = lang match {
-      case Language.English => englishNerClassifier
-      case Language.German  => germanNerClassifier
-      case Language.French  => frenchNerClassifier
-    }
+  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
   }
-
-  private def makeClassifier(lang: Language): StanfordCoreNLP = {
-    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
-    new StanfordCoreNLP(classifierProperties(lang))
-  }
-
-  private def classifierProperties(lang: Language): JProps =
-    lang match {
-      case Language.German =>
-        Properties.nerGerman(None, false)
-      case Language.English =>
-        Properties.nerEnglish(None)
-      case Language.French =>
-        Properties.nerFrench(None, false)
-    }
 }
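
For a one-off annotation without caching (illustration only, not part of this commit), the no-op cache shown earlier can be plugged in; each such call then builds a fresh pipeline. A sketch with cats-effect IO and an example sentence:

  val labels: IO[Vector[NerLabel]] =
    StanfordNerClassifier.nerAnnotate[IO]("unused-key", PipelineCache.none[IO])(
      StanfordSettings(Language.English, false, None),
      "Barack Obama visited Paris in 2016."
    )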


@@ -0,0 +1,22 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import docspell.common._
+
+/** Settings for configuring the stanford NER pipeline.
+  *
+  * The language is mandatory, only the provided ones are supported.
+  * The `highRecall` only applies for non-English languages. For
+  * non-English languages the english classifier is run as second
+  * classifier and if `highRecall` is true, then it will be used to
+  * tag untagged tokens. This may lead to a lot of false positives,
+  * but since English is omnipresent in other languages, too it
+  * depends on the use case for whether this is useful or not.
+  *
+  * The `regexNer` allows to specify a text file as described here:
+  * https://nlp.stanford.edu/software/regexner.html. This will be used
+  * as a last step to tag untagged tokens using the provided list of
+  * regexps.
+  */
+case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
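
A small, hedged example of constructing these settings (the mapping-file path below is hypothetical):

  import java.nio.file.Paths

  val plainGerman = StanfordSettings(Language.German, highRecall = true, regexNer = None)
  val withMappings = StanfordSettings(
    Language.English,
    highRecall = false,
    regexNer = Some(Paths.get("/tmp/additional-ner-mappings.txt"))
  )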


@@ -55,6 +55,9 @@ object File {
   def exists[F[_]: Sync](file: Path): F[Boolean] =
     Sync[F].delay(Files.exists(file))
 
+  def size[F[_]: Sync](file: Path): F[Long] =
+    Sync[F].delay(Files.size(file))
+
   def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
     Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)


@@ -6,6 +6,7 @@ import cats.effect._
 import cats.implicits._
 import fs2.concurrent.SignallingRef
 
+import docspell.analysis.TextAnalyser
 import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
@@ -88,6 +89,7 @@ object JoexAppImpl {
       upload <- OUpload(store, queue, cfg.files, joex)
       fts <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -95,14 +97,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts),
+            ReProcessItem[F](cfg, fts, analyser),
             ReProcessItem.onCancel[F]
           )
         )


@@ -5,6 +5,7 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream
 
+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
@@ -29,11 +30,12 @@ object ItemHandler {
   def newItem[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser))
       .map(_ => ())
 
   def itemStateTask[F[_]: Sync, A](
@@ -51,11 +53,12 @@ object ItemHandler {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({
          case Right(d) =>
            Task.pure(d)
          case Left(ex) =>
@@ -65,7 +68,8 @@ object ItemHandler {
              .andThen(_ => Sync[F].raiseError(ex))
        })
      case false =>
-        ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
+        ProcessItem[F](cfg, itemOps, fts, analyser)(data)
+          .flatMap(itemStateTask(ItemState.Created))
    }
 
   private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =


@@ -2,6 +2,7 @@ package docspell.joex.process
 import cats.effect._
 
+import docspell.analysis.TextAnalyser
 import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
@@ -13,25 +14,28 @@ object ProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       itemOps: OItem[F],
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80)))
       .flatMap(LinkProposal[F])
       .flatMap(SetGivenData[F](itemOps))
       .flatMap(Task.setProgress(99))
 
   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item)
 
   def analysisOnly[F[_]: Sync](
-      cfg: Config
+      cfg: Config,
+      analyser: TextAnalyser[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](cfg.textAnalysis)(item)
+    TextAnalysis[F](analyser)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -39,12 +43,13 @@ object ProcessItem {
   private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg))
+      .flatMap(analysisOnly[F](cfg, analyser))
       .flatMap(Task.setProgress(progress._3))
 }


@@ -4,6 +4,7 @@ import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
 
+import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
@@ -19,10 +20,11 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts))
+      .flatMap(safeProcess[F](cfg, fts, analyser))
       .map(_ => ())
 
   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -70,6 +72,7 @@ object ReProcessItem {
   def processFiles[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
+      analyser: TextAnalyser[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -91,7 +94,7 @@ object ReProcessItem {
     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts)(data)
+        .processAttachments[F](cfg, fts, analyser)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -109,11 +112,12 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
-      fts: FtsClient[F]
+      fts: FtsClient[F],
+      analyser: TextAnalyser[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
      case true =>
-        processFiles[F](cfg, fts, data).attempt
+        processFiles[F](cfg, fts, analyser, data).attempt
          .flatMap({
            case Right(d) =>
              Task.pure(d)
@@ -123,7 +127,7 @@ object ReProcessItem {
            ).andThen(_ => Sync[F].raiseError(ex))
          })
      case false =>
-        processFiles[F](cfg, fts, data)
+        processFiles[F](cfg, fts, analyser, data)
    }
 
   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =


@@ -1,9 +1,10 @@
 package docspell.joex.process
 
-import cats.effect.Sync
+import cats.effect._
 import cats.implicits._
 
-import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.StanfordSettings
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
@@ -12,17 +13,16 @@ import docspell.store.records.RAttachmentMeta
 object TextAnalysis {
 
   def apply[F[_]: Sync](
-      cfg: TextAnalysisConfig
+      analyser: TextAnalyser[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      TextAnalyser.create[F](cfg).use { analyser =>
       for {
        _ <- ctx.logger.info("Starting text analysis")
        s <- Duration.stopTime[F]
        t <-
          item.metas.toList
            .traverse(
-              annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
+              annotateAttachment[F](ctx.args, ctx.logger, analyser)
            )
        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
        _ <- t.traverse(m =>
@@ -33,15 +33,20 @@ object TextAnalysis {
        v = t.toVector
      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
     }
-    }
 
   def annotateAttachment[F[_]: Sync](
-      lang: Language,
+      args: ProcessItemArgs,
       logger: Logger[F],
       analyser: TextAnalyser[F]
-  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
+  )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
+    val settings = StanfordSettings(args.meta.language, false, None)
     for {
-      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+      labels <- analyser.annotate(
+                  logger,
+                  settings,
+                  args.meta.collective,
+                  rm.content.getOrElse("")
+                )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
+  }
 }