mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Allow configuring stanford-ner and cache based on collective
This commit is contained in:
@ -6,6 +6,7 @@ import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.concurrent.SignallingRef
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.backend.ops._
|
||||
import docspell.common._
|
||||
import docspell.ftsclient.FtsClient
|
||||
@ -80,14 +81,15 @@ object JoexAppImpl {
|
||||
for {
|
||||
httpClient <- BlazeClientBuilder[F](clientEC).resource
|
||||
client = JoexClient(httpClient)
|
||||
store <- Store.create(cfg.jdbc, connectEC, blocker)
|
||||
queue <- JobQueue(store)
|
||||
pstore <- PeriodicTaskStore.create(store)
|
||||
nodeOps <- ONode(store)
|
||||
joex <- OJoex(client, store)
|
||||
upload <- OUpload(store, queue, cfg.files, joex)
|
||||
fts <- createFtsClient(cfg)(httpClient)
|
||||
itemOps <- OItem(store, fts, queue, joex)
|
||||
store <- Store.create(cfg.jdbc, connectEC, blocker)
|
||||
queue <- JobQueue(store)
|
||||
pstore <- PeriodicTaskStore.create(store)
|
||||
nodeOps <- ONode(store)
|
||||
joex <- OJoex(client, store)
|
||||
upload <- OUpload(store, queue, cfg.files, joex)
|
||||
fts <- createFtsClient(cfg)(httpClient)
|
||||
itemOps <- OItem(store, fts, queue, joex)
|
||||
analyser <- TextAnalyser.create[F](cfg.textAnalysis)
|
||||
javaEmil =
|
||||
JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
|
||||
sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
|
||||
@ -95,14 +97,14 @@ object JoexAppImpl {
|
||||
.withTask(
|
||||
JobTask.json(
|
||||
ProcessItemArgs.taskName,
|
||||
ItemHandler.newItem[F](cfg, itemOps, fts),
|
||||
ItemHandler.newItem[F](cfg, itemOps, fts, analyser),
|
||||
ItemHandler.onCancel[F]
|
||||
)
|
||||
)
|
||||
.withTask(
|
||||
JobTask.json(
|
||||
ReProcessItemArgs.taskName,
|
||||
ReProcessItem[F](cfg, fts),
|
||||
ReProcessItem[F](cfg, fts, analyser),
|
||||
ReProcessItem.onCancel[F]
|
||||
)
|
||||
)
|
||||
|
@ -5,6 +5,7 @@ import cats.effect._
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.backend.ops.OItem
|
||||
import docspell.common.{ItemState, ProcessItemArgs}
|
||||
import docspell.ftsclient.FtsClient
|
||||
@ -29,11 +30,12 @@ object ItemHandler {
|
||||
/** Builds the task that handles a newly submitted item.
  *
  * The visible chain runs `CreateItem`, then `itemStateTask(ItemState.Processing)`,
  * then `safeProcess`, and finally discards the resulting value so the task yields `Unit`.
  *
  * @param cfg      configuration, passed through to `safeProcess`
  * @param itemOps  item operations, passed through to `safeProcess`
  * @param fts      full-text search client, passed through to `safeProcess`
  * @param analyser text analyser, passed through to `safeProcess`
  */
// NOTE(review): this listing appears to be a diff rendering whose +/- markers were lost;
// the adjacent `fts` parameter lines and the two `safeProcess` calls look like the
// before/after variants of the same line. Confirm which one is current.
def newItem[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
itemOps: OItem[F],
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
): Task[F, Args, Unit] =
|
||||
CreateItem[F]
|
||||
.flatMap(itemStateTask(ItemState.Processing))
|
||||
.flatMap(safeProcess[F](cfg, itemOps, fts))
|
||||
.flatMap(safeProcess[F](cfg, itemOps, fts, analyser))
|
||||
// The processed item data is not needed by callers; only the effect matters.
.map(_ => ())
|
||||
|
||||
def itemStateTask[F[_]: Sync, A](
|
||||
@ -51,11 +53,12 @@ object ItemHandler {
|
||||
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
itemOps: OItem[F],
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
)(data: ItemData): Task[F, Args, ItemData] =
|
||||
isLastRetry[F].flatMap {
|
||||
case true =>
|
||||
ProcessItem[F](cfg, itemOps, fts)(data).attempt.flatMap({
|
||||
ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({
|
||||
case Right(d) =>
|
||||
Task.pure(d)
|
||||
case Left(ex) =>
|
||||
@ -65,7 +68,8 @@ object ItemHandler {
|
||||
.andThen(_ => Sync[F].raiseError(ex))
|
||||
})
|
||||
case false =>
|
||||
ProcessItem[F](cfg, itemOps, fts)(data).flatMap(itemStateTask(ItemState.Created))
|
||||
ProcessItem[F](cfg, itemOps, fts, analyser)(data)
|
||||
.flatMap(itemStateTask(ItemState.Created))
|
||||
}
|
||||
|
||||
private def markItemCreated[F[_]: Sync]: Task[F, Args, Boolean] =
|
||||
|
@ -2,6 +2,7 @@ package docspell.joex.process
|
||||
|
||||
import cats.effect._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.backend.ops.OItem
|
||||
import docspell.common.ProcessItemArgs
|
||||
import docspell.ftsclient.FtsClient
|
||||
@ -13,25 +14,28 @@ object ProcessItem {
|
||||
/** Full processing pipeline for a single item.
  *
  * Steps, in the order written: `ExtractArchive`, progress 20, `processAttachments0`
  * (with progress marks 40/60/80), `LinkProposal`, `SetGivenData(itemOps)`, progress 99.
  *
  * @param cfg      configuration forwarded to `processAttachments0`
  * @param itemOps  item operations used by `SetGivenData`
  * @param fts      full-text search client forwarded to `processAttachments0`
  * @param analyser text analyser forwarded to `processAttachments0`
  * @param item     the item data threaded through every step
  */
// NOTE(review): the duplicated `fts` parameter and `processAttachments0` lines look like
// the old and new sides of a diff hunk shown together. Confirm which one is current.
def apply[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
itemOps: OItem[F],
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
ExtractArchive(item)
|
||||
.flatMap(Task.setProgress(20))
|
||||
.flatMap(processAttachments0(cfg, fts, (40, 60, 80)))
|
||||
.flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80)))
|
||||
.flatMap(LinkProposal[F])
|
||||
.flatMap(SetGivenData[F](itemOps))
|
||||
.flatMap(Task.setProgress(99))
|
||||
|
||||
/** Public entry point for attachment processing only.
  *
  * Delegates to the private `processAttachments0` with the fixed progress marks
  * (30, 60, 90).
  *
  * @param cfg      configuration forwarded to `processAttachments0`
  * @param fts      full-text search client forwarded to `processAttachments0`
  * @param analyser text analyser forwarded to `processAttachments0`
  * @param item     the item data to process
  */
// NOTE(review): the paired `fts` lines and the paired `processAttachments0` calls look
// like before/after variants from a diff. Confirm which one is current.
def processAttachments[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
processAttachments0[F](cfg, fts, (30, 60, 90))(item)
|
||||
processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item)
|
||||
|
||||
/** Runs only the analysis part of the pipeline on an item.
  *
  * Visible steps: `TextAnalysis`, then `FindProposal(cfg.processing)`, then
  * `EvalProposals`, then `SaveProposals`.
  *
  * @param cfg      configuration; `cfg.processing` is handed to `FindProposal`
  * @param analyser text analyser handed to `TextAnalysis`
  * @param item     the item data to analyse
  */
// NOTE(review): one `TextAnalysis` call takes `cfg.textAnalysis`, the other takes the
// `analyser` instance; these look like the old and new sides of a diff. The listing
// also stops right after `SaveProposals`, so confirm no further steps follow.
def analysisOnly[F[_]: Sync](
|
||||
cfg: Config
|
||||
cfg: Config,
|
||||
analyser: TextAnalyser[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
TextAnalysis[F](cfg.textAnalysis)(item)
|
||||
TextAnalysis[F](analyser)(item)
|
||||
.flatMap(FindProposal[F](cfg.processing))
|
||||
.flatMap(EvalProposals[F])
|
||||
.flatMap(SaveProposals[F])
|
||||
@ -39,12 +43,13 @@ object ProcessItem {
|
||||
/** Shared attachment pipeline with caller-supplied progress marks.
  *
  * Steps, in the order written: `ConvertPdf(cfg.convert, item)`, progress `_1`,
  * `TextExtraction(cfg.extraction, fts)`, progress `_2`, `analysisOnly`, progress `_3`.
  *
  * @param cfg      configuration; `cfg.convert` and `cfg.extraction` are read here
  * @param fts      full-text search client handed to `TextExtraction`
  * @param analyser text analyser handed to `analysisOnly`
  * @param progress the three progress values reported after each stage
  * @param item     the item data to process
  */
// NOTE(review): the two `analysisOnly` calls look like before/after variants from a
// diff. Confirm which one is current.
private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F],
|
||||
progress: (Int, Int, Int)
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
ConvertPdf(cfg.convert, item)
|
||||
.flatMap(Task.setProgress(progress._1))
|
||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||
.flatMap(Task.setProgress(progress._2))
|
||||
.flatMap(analysisOnly[F](cfg))
|
||||
.flatMap(analysisOnly[F](cfg, analyser))
|
||||
.flatMap(Task.setProgress(progress._3))
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.common._
|
||||
import docspell.ftsclient.FtsClient
|
||||
import docspell.joex.Config
|
||||
@ -19,10 +20,11 @@ object ReProcessItem {
|
||||
|
||||
/** Builds the task that re-processes an existing item.
  *
  * The visible chain runs `loadItem`, feeds the result into `safeProcess`, and
  * discards the resulting value so the task yields `Unit`.
  *
  * @param cfg      configuration forwarded to `safeProcess`
  * @param fts      full-text search client forwarded to `safeProcess`
  * @param analyser text analyser forwarded to `safeProcess`
  */
// NOTE(review): the paired `fts` lines and the paired `safeProcess` calls look like
// the old and new sides of a diff hunk. Confirm which one is current.
def apply[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
): Task[F, Args, Unit] =
|
||||
loadItem[F]
|
||||
.flatMap(safeProcess[F](cfg, fts))
|
||||
.flatMap(safeProcess[F](cfg, fts, analyser))
|
||||
.map(_ => ())
|
||||
|
||||
def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
|
||||
@ -70,6 +72,7 @@ object ReProcessItem {
|
||||
def processFiles[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F],
|
||||
data: ItemData
|
||||
): Task[F, Args, ItemData] = {
|
||||
|
||||
@ -91,7 +94,7 @@ object ReProcessItem {
|
||||
|
||||
getLanguage[F].flatMap { lang =>
|
||||
ProcessItem
|
||||
.processAttachments[F](cfg, fts)(data)
|
||||
.processAttachments[F](cfg, fts, analyser)(data)
|
||||
.contramap[Args](convertArgs(lang))
|
||||
}
|
||||
}
|
||||
@ -109,11 +112,12 @@ object ReProcessItem {
|
||||
|
||||
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
|
||||
cfg: Config,
|
||||
fts: FtsClient[F]
|
||||
fts: FtsClient[F],
|
||||
analyser: TextAnalyser[F]
|
||||
)(data: ItemData): Task[F, Args, ItemData] =
|
||||
isLastRetry[F].flatMap {
|
||||
case true =>
|
||||
processFiles[F](cfg, fts, data).attempt
|
||||
processFiles[F](cfg, fts, analyser, data).attempt
|
||||
.flatMap({
|
||||
case Right(d) =>
|
||||
Task.pure(d)
|
||||
@ -123,7 +127,7 @@ object ReProcessItem {
|
||||
).andThen(_ => Sync[F].raiseError(ex))
|
||||
})
|
||||
case false =>
|
||||
processFiles[F](cfg, fts, data)
|
||||
processFiles[F](cfg, fts, analyser, data)
|
||||
}
|
||||
|
||||
private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
|
||||
|
@ -1,9 +1,10 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.effect.Sync
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.StanfordSettings
|
||||
import docspell.common._
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Task
|
||||
@ -12,36 +13,40 @@ import docspell.store.records.RAttachmentMeta
|
||||
/** Task step that annotates the text of an item's attachments and stores the labels. */
// NOTE(review): this listing appears to be a diff rendering without +/- markers. Two
// bodies of `apply` are shown back to back (one wrapped in `TextAnalyser.create(cfg).use`,
// one using the `analyser` parameter directly), and `annotateAttachment` shows two
// parameter and call variants. Confirm which side is current before relying on it.
object TextAnalysis {
|
||||
|
||||
/** Runs text analysis over every attachment meta of `item`.
  *
  * Both visible bodies do the same sequence: log the start, start a timer with
  * `Duration.stopTime`, call `annotateAttachment` for each entry of `item.metas`,
  * log the results with `content` blanked out, store each meta's `nerlabels` via
  * `RAttachmentMeta.updateLabels`, log the elapsed time, and return a copy of `item`
  * with `metas` and `dateLabels` replaced by the annotated results.
  *
  * @param cfg      (first variant) analysis config used to create an analyser per call
  * @param analyser (second variant) analyser supplied by the caller
  * @param item     the item whose attachment metas are annotated
  */
def apply[F[_]: Sync](
|
||||
cfg: TextAnalysisConfig
|
||||
analyser: TextAnalyser[F]
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
// First variant: an analyser is created from `cfg` and used only for this call.
TextAnalyser.create[F](cfg).use { analyser =>
|
||||
for {
|
||||
_ <- ctx.logger.info("Starting text analysis")
|
||||
s <- Duration.stopTime[F]
|
||||
t <-
|
||||
item.metas.toList
|
||||
.traverse(
|
||||
annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser)
|
||||
)
|
||||
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
|
||||
_ <- t.traverse(m =>
|
||||
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
|
||||
)
|
||||
e <- s
|
||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||
v = t.toVector
|
||||
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||
}
|
||||
// Second variant: same steps, but using the `analyser` parameter and passing the
// whole `ctx.args` to `annotateAttachment` instead of only the language.
for {
|
||||
_ <- ctx.logger.info("Starting text analysis")
|
||||
s <- Duration.stopTime[F]
|
||||
t <-
|
||||
item.metas.toList
|
||||
.traverse(
|
||||
annotateAttachment[F](ctx.args, ctx.logger, analyser)
|
||||
)
|
||||
// `content` is set to None in the logged copy only; the stored metas are untouched.
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
|
||||
_ <- t.traverse(m =>
|
||||
ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
|
||||
)
|
||||
e <- s
|
||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||
v = t.toVector
|
||||
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||
}
|
||||
|
||||
/** Annotates the content of one attachment meta.
  *
  * Calls `analyser.annotate` on `rm.content` (an empty string when the content is
  * absent) and returns `rm` with `nerlabels` set to `labels.all.toList`, paired with
  * `AttachmentDates(rm, labels.dates)`. `AttachmentDates` receives the original `rm`,
  * not the updated copy.
  *
  * @param lang     (first variant) language passed straight to `annotate`
  * @param args     (second variant) job arguments; `args.meta.language` and
  *                 `args.meta.collective` are read
  * @param logger   logger handed to the analyser
  * @param analyser the analyser performing the annotation
  * @param rm       the attachment meta to annotate
  */
def annotateAttachment[F[_]: Sync](
|
||||
lang: Language,
|
||||
args: ProcessItemArgs,
|
||||
logger: Logger[F],
|
||||
analyser: TextAnalyser[F]
|
||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
|
||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
||||
// NOTE(review): the meaning of the `false` and `None` arguments is not visible in
// this listing. Check the `StanfordSettings` definition.
val settings = StanfordSettings(args.meta.language, false, None)
|
||||
for {
|
||||
labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
|
||||
labels <- analyser.annotate(
|
||||
logger,
|
||||
settings,
|
||||
args.meta.collective,
|
||||
rm.content.getOrElse("")
|
||||
)
|
||||
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
|
||||
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user