mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Improve text analysis
- Search for consecutive labels
- Sort list of candidates by a weight
- Search for organizations using person labels
This commit is contained in:
@ -0,0 +1,66 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records.RAttachmentMeta
|
||||
|
||||
/** Reorders the candidates of every proposal so that the most probable
  * fits come first.
  *
  * Each candidate gets a numeric weight (see [[weight]]); candidates are
  * sorted ascending by that weight, i.e. a smaller weight means an earlier
  * position in the list.
  */
object EvalProposals {

  /** Creates a task that returns `data` with the proposals of each
    * attachment meta reordered via [[reorderCandidates]].
    *
    * The task context is not used; the result is simply lifted into `F`.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      data.copy(metas = data.metas.map(reorderCandidates)).pure[F]
    }

  /** Returns a copy of `rm` whose proposal list contains, for every
    * proposal type that can be looked up, the same proposal with its
    * candidate values sorted by [[weight]].
    */
  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val proposals = rm.proposals
    val reordered =
      for {
        proposalType <- proposals.getTypes.toList
        // a type without a matching proposal contributes nothing
        proposal <- proposals.find(proposalType).toList
      } yield proposal.copy(values = proposal.values.sortBy(weight(rm, proposal)))

    rm.copy(proposals = MetaProposalList(reordered))
  }

  /** Computes the sort weight of a single candidate; smaller is better.
    *
    * The weight is the product of:
    *  - the inverse of the largest word count among the origin labels
    *    (labels are split on a single space),
    *  - the inverse of the number of origin labels,
    *  - the [[positionWeight]] of the smallest start position relative to
    *    the length of the attachment content (0 if there is no content),
    *  - the smallest [[nerTagFactor]] among the origin labels.
    *
    * NOTE: `cand.origin` must not be empty, because `min`/`max` are
    * applied to values derived from it.
    */
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = {
    val labels        = cand.origin.toList
    val contentLength = rm.content.map(_.length).getOrElse(0)
    val firstPosition = labels.map(_.startPosition).min
    val maxWords      = labels.map(_.label.split(' ').length).max.toDouble
    val tagFactor     = labels.map(l => nerTagFactor(l.tag, mp.proposalType)).min

    val wordFactor  = 1 / maxWords
    val countFactor = 1 / cand.origin.size.toDouble
    wordFactor * countFactor * positionWeight(firstPosition, contentLength) * tagFactor
  }

  /** Maps a position inside a text of length `total` to a factor.
    *
    * Returns 1 when `total` is not positive. Otherwise the relative
    * position `|pos / total|` is returned, halved when it is below 0.7.
    */
  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1.0
    else
      math.abs(pos.toDouble / total.toDouble) match {
        case relative if relative < 0.7 => relative / 2
        case relative                   => relative
      }

  /** Factor depending on the NER tag of a label and the proposal type it
    * is evaluated for.
    *
    * E-mail and website labels yield 0.5. Organization labels yield 0.8
    * for `CorrOrg` proposals, person labels yield 0.8 for `CorrPerson` and
    * `ConcPerson` proposals. Everything else listed here yields 1.0.
    */
  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      case NerTag.Email | NerTag.Website =>
        0.5
      case NerTag.Organization if mt == MetaProposalType.CorrOrg =>
        0.8
      case NerTag.Person
          if mt == MetaProposalType.CorrPerson || mt == MetaProposalType.ConcPerson =>
        0.8
      case NerTag.Date | NerTag.Location | NerTag.Misc | NerTag.Organization |
          NerTag.Person =>
        1.0
    }
}
|
@ -24,16 +24,10 @@ object FindProposal {
|
||||
ctx.logger.info("Starting find-proposal") *>
|
||||
rmas
|
||||
.traverse(rm =>
|
||||
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
|
||||
)
|
||||
.flatMap(rmv =>
|
||||
rmv
|
||||
.traverse(rm =>
|
||||
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
|
||||
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
|
||||
)
|
||||
.map(_ => data.copy(metas = rmv))
|
||||
processAttachment(rm, data.findDates(rm), ctx)
|
||||
.map(ml => rm.copy(proposals = ml))
|
||||
)
|
||||
.map(rmv => data.copy(metas = rmv))
|
||||
}
|
||||
|
||||
def processAttachment[F[_]: Sync](
|
||||
@ -56,13 +50,19 @@ object FindProposal {
|
||||
val dueDates = MetaProposalList.fromSeq1(
|
||||
MetaProposalType.DueDate,
|
||||
after.map(ndl =>
|
||||
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
|
||||
Candidate(
|
||||
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
|
||||
Set(ndl.label)
|
||||
)
|
||||
)
|
||||
)
|
||||
val itemDates = MetaProposalList.fromSeq1(
|
||||
MetaProposalType.DocDate,
|
||||
before.map(ndl =>
|
||||
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
|
||||
Candidate(
|
||||
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
|
||||
Set(ndl.label)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
@ -71,13 +71,13 @@ object FindProposal {
|
||||
|
||||
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
|
||||
labels
|
||||
.sortBy(_.startPosition)
|
||||
.foldLeft((Set.empty[String], List.empty[NerLabel])) {
|
||||
case ((seen, result), el) =>
|
||||
if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
|
||||
else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
|
||||
}
|
||||
._2
|
||||
.sortBy(_.startPosition)
|
||||
|
||||
trait Finder[F[_]] { self =>
|
||||
def find(labels: Seq[NerLabel]): F[MetaProposalList]
|
||||
@ -91,7 +91,9 @@ object FindProposal {
|
||||
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
|
||||
labels => self.find(labels).flatMap(ml => f(ml).find(labels))
|
||||
|
||||
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
|
||||
def map(
|
||||
f: MetaProposalList => MetaProposalList
|
||||
)(implicit F: Applicative[F]): Finder[F] =
|
||||
labels => self.find(labels).map(f)
|
||||
|
||||
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
|
||||
@ -118,10 +120,12 @@ object FindProposal {
|
||||
_ => value.pure[F]
|
||||
|
||||
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
|
||||
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
|
||||
labels =>
|
||||
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
|
||||
|
||||
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
|
||||
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
|
||||
labels =>
|
||||
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
|
||||
}
|
||||
|
||||
private def search[F[_]: Sync](
|
||||
@ -154,10 +158,15 @@ object FindProposal {
|
||||
val s2 = ctx.store
|
||||
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
|
||||
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
|
||||
ctx.logger.debug(s"Looking for persons: $value") *> (for {
|
||||
val s3 =
|
||||
ctx.store
|
||||
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
|
||||
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
|
||||
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
|
||||
ml0 <- s1
|
||||
ml1 <- s2
|
||||
} yield ml0 |+| ml1)
|
||||
ml2 <- s3
|
||||
} yield ml0 |+| ml1 |+| ml2)
|
||||
|
||||
case NerTag.Location =>
|
||||
ctx.logger
|
||||
|
@ -16,7 +16,16 @@ object ProcessItem {
|
||||
.flatMap(TextAnalysis[F])
|
||||
.flatMap(Task.setProgress(50))
|
||||
.flatMap(FindProposal[F])
|
||||
.flatMap(EvalProposals[F])
|
||||
.flatMap(SaveProposals[F])
|
||||
.flatMap(Task.setProgress(75))
|
||||
.flatMap(LinkProposal[F])
|
||||
.flatMap(Task.setProgress(99))
|
||||
|
||||
/** Runs only the analysis part for the given item: text analysis,
  * followed by finding, evaluating and saving proposals, in that order.
  */
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] = {
  val analysed  = TextAnalysis[F](item)
  val proposed  = analysed.flatMap(FindProposal[F])
  val evaluated = proposed.flatMap(EvalProposals[F])
  evaluated.flatMap(SaveProposals[F])
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,24 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records._
|
||||
|
||||
/** Saves the proposals of all attachment metas in the database.
  */
object SaveProposals {

  /** Creates a task that writes the proposals of every attachment meta
    * in `data` to the store and then returns `data` unchanged.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      // logs and stores the proposals of a single attachment meta
      def persist(meta: RAttachmentMeta) =
        ctx.logger.debug(s"Storing attachment proposals: ${meta.proposals}") *>
          ctx.store.transact(RAttachmentMeta.updateProposals(meta.id, meta.proposals))

      for {
        _ <- ctx.logger.info("Storing proposals")
        _ <- data.metas.traverse(meta => persist(meta))
      } yield data
    }
}
|
@ -34,8 +34,10 @@ object TextAnalysis {
|
||||
for {
|
||||
list0 <- stanfordNer[F](lang, rm)
|
||||
list1 <- contactNer[F](rm)
|
||||
list = list0 ++ list1
|
||||
spans = NerLabelSpan.build(list.toSeq)
|
||||
dates <- dateNer[F](rm, lang)
|
||||
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
|
||||
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
|
||||
|
||||
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
|
||||
Sync[F].delay {
|
||||
|
@ -33,14 +33,14 @@ object Context {
|
||||
private[this] val log = getLogger
|
||||
|
||||
def create[F[_]: Functor, A](
|
||||
job: RJob,
|
||||
jobId: Ident,
|
||||
arg: A,
|
||||
config: SchedulerConfig,
|
||||
log: Logger[F],
|
||||
store: Store[F],
|
||||
blocker: Blocker
|
||||
): Context[F, A] =
|
||||
new ContextImpl(arg, log, store, blocker, config, job.id)
|
||||
new ContextImpl(arg, log, store, blocker, config, jobId)
|
||||
|
||||
def apply[F[_]: Concurrent, A](
|
||||
job: RJob,
|
||||
@ -54,7 +54,7 @@ object Context {
|
||||
_ <- log.ftrace("Creating logger for task run")
|
||||
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
|
||||
_ <- log.ftrace("Logger created, instantiating context")
|
||||
ctx = create[F, A](job, arg, config, logger, store, blocker)
|
||||
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
|
||||
} yield ctx
|
||||
|
||||
final private class ContextImpl[F[_]: Functor, A](
|
||||
|
Reference in New Issue
Block a user