Improve text analysis

- Search for consecutive labels

- Sort the list of candidates by weight

- Search for organizations using person labels
Eike Kettner
2020-03-17 22:34:50 +01:00
parent a4c97d5d57
commit 00ca6b5697
10 changed files with 241 additions and 22 deletions

File: EvalProposals.scala (new)

@@ -0,0 +1,66 @@
package docspell.joex.process

import cats.implicits._
import cats.effect.Sync

import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta

/** Reorders the proposals to put most probable fits first.
  */
object EvalProposals {

  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      val metas = data.metas.map(reorderCandidates)
      data.copy(metas = metas).pure[F]
    }

  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val list = rm.proposals.getTypes.toList
      .map(mpt => rm.proposals.find(mpt) match {
        case Some(mp) =>
          val v = mp.values.sortBy(weight(rm, mp))
          Some(mp.copy(values = v))
        case None =>
          None
      })

    rm.copy(proposals = MetaProposalList(list.flatMap(identity)))
  }

  // Lower weight sorts first: candidates backed by more words and more labels,
  // found near the beginning of the text and carrying a fitting NER tag, are
  // considered more probable.
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = {
    val textLen = rm.content.map(_.length).getOrElse(0)
    val tagCount = cand.origin.size.toDouble
    val pos = cand.origin.map(_.startPosition).min
    val words = cand.origin.map(_.label.split(' ').length).max.toDouble
    val nerFac = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
    (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
  }

  def positionWeight(pos: Int, total: Int): Double = {
    if (total <= 0) 1
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2
      else p
    }
  }

  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      case NerTag.Date => 1.0
      case NerTag.Email => 0.5
      case NerTag.Location => 1.0
      case NerTag.Misc => 1.0
      case NerTag.Organization =>
        if (mt == MetaProposalType.CorrOrg) 0.8
        else 1.0
      case NerTag.Person =>
        if (mt == MetaProposalType.CorrPerson ||
            mt == MetaProposalType.ConcPerson) 0.8
        else 1.0
      case NerTag.Website => 0.5
    }
}
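To illustrate the ordering, here is a small standalone sketch that mirrors the weight formula above with simplified stand-in types and made-up values (it is not the docspell code; textLen and nerFac are passed in directly instead of being derived from the attachment and proposal type). More words, more supporting labels, an early position in the text, and a fitting NER tag all lower the weight, and lower weights sort first.

// Standalone sketch with hypothetical values; simplified stand-ins for the docspell types.
object WeightSketch {
  final case class Label(text: String, startPosition: Int)
  final case class Candidate(origin: List[Label])

  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2 else p
    }

  // Same shape as EvalProposals.weight: a lower result means a more probable candidate.
  def weight(textLen: Int, nerFac: Double)(cand: Candidate): Double = {
    val tagCount = cand.origin.size.toDouble
    val pos = cand.origin.map(_.startPosition).min
    val words = cand.origin.map(_.text.split(' ').length).max.toDouble
    (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
  }

  def main(args: Array[String]): Unit = {
    // Two supporting labels, two words, found near the top of the text ...
    val strong = Candidate(List(Label("Acme Corp", 10), Label("Acme", 400)))
    // ... versus a single label far down in the text.
    val weak = Candidate(List(Label("Foo Ltd", 900)))
    val sorted = List(weak, strong).sortBy(weight(textLen = 1000, nerFac = 0.8))
    // Prints "Acme Corp" first: its weight (0.001) is lower than that of "Foo Ltd" (0.36).
    sorted.foreach(c => println(c.origin.head.text))
  }
}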

File: FindProposal.scala

@@ -24,16 +24,10 @@ object FindProposal {
ctx.logger.info("Starting find-proposal") *>
rmas
.traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
)
.flatMap(rmv =>
rmv
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
)
.map(_ => data.copy(metas = rmv))
processAttachment(rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml))
)
.map(rmv => data.copy(metas = rmv))
}
def processAttachment[F[_]: Sync](
@@ -56,13 +50,19 @@ object FindProposal {
val dueDates = MetaProposalList.fromSeq1(
MetaProposalType.DueDate,
after.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
val itemDates = MetaProposalList.fromSeq1(
MetaProposalType.DocDate,
before.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
@@ -71,13 +71,13 @@ object FindProposal {
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
labels
.sortBy(_.startPosition)
.foldLeft((Set.empty[String], List.empty[NerLabel])) {
case ((seen, result), el) =>
if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
}
._2
.sortBy(_.startPosition)
trait Finder[F[_]] { self =>
def find(labels: Seq[NerLabel]): F[MetaProposalList]
@@ -91,7 +91,9 @@ object FindProposal {
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
labels => self.find(labels).flatMap(ml => f(ml).find(labels))
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
def map(
f: MetaProposalList => MetaProposalList
)(implicit F: Applicative[F]): Finder[F] =
labels => self.find(labels).map(f)
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@@ -118,10 +120,12 @@ object FindProposal {
_ => value.pure[F]
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
}
private def search[F[_]: Sync](
@@ -154,10 +158,15 @@ object FindProposal {
val s2 = ctx.store
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
ctx.logger.debug(s"Looking for persons: $value") *> (for {
val s3 =
ctx.store
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
ml0 <- s1
ml1 <- s2
} yield ml0 |+| ml1)
ml2 <- s3
} yield ml0 |+| ml1 |+| ml2)
case NerTag.Location =>
ctx.logger
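The removeDuplicates helper shown earlier in this file keeps only the first occurrence (by start position) of each tag/label pair. The following standalone sketch replays that logic with a simplified stand-in for NerLabel and made-up example labels:

// Simplified stand-in type, only for illustration.
object DedupSketch {
  final case class SimpleLabel(tag: String, label: String, startPosition: Int)

  def removeDuplicates(labels: List[SimpleLabel]): List[SimpleLabel] =
    labels
      .sortBy(_.startPosition)
      .foldLeft((Set.empty[String], List.empty[SimpleLabel])) {
        case ((seen, result), el) =>
          val key = el.tag + el.label.toLowerCase
          if (seen.contains(key)) (seen, result)
          else (seen + key, el :: result)
      }
      ._2
      .sortBy(_.startPosition) // foldLeft prepends, so restore positional order

  def main(args: Array[String]): Unit = {
    val labels = List(
      SimpleLabel("PERSON", "Jane Doe", 40),
      SimpleLabel("PERSON", "jane doe", 10),
      SimpleLabel("ORGANIZATION", "Acme", 25)
    )
    // Prints the labels at positions 10 and 25; the duplicate at 40 is dropped
    // because it differs only in case.
    removeDuplicates(labels).foreach(println)
  }
}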

File: ProcessItem.scala

@@ -16,7 +16,16 @@ object ProcessItem {
.flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50))
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
.flatMap(Task.setProgress(75))
.flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99))
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](item)
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
}

File: SaveProposals.scala (new)

@@ -0,0 +1,24 @@
package docspell.joex.process

import cats.implicits._
import cats.effect.Sync

import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records._

/** Saves the proposals in the database.
  */
object SaveProposals {

  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      ctx.logger.info("Storing proposals") *>
        data.metas
          .traverse(rm =>
            ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
              ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
          )
          .map(_ => data)
    }
}

File: TextAnalysis.scala

@@ -34,8 +34,10 @@ object TextAnalysis {
for {
list0 <- stanfordNer[F](lang, rm)
list1 <- contactNer[F](rm)
list = list0 ++ list1
spans = NerLabelSpan.build(list.toSeq)
dates <- dateNer[F](rm, lang)
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
Sync[F].delay {
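The new spans value presumably implements the "Search for consecutive labels" item from the commit message; NerLabelSpan.build itself is not part of this diff. The sketch below is only a guess at the idea, with simplified stand-in types: labels of the same tag that directly follow each other in the text (for example a first and a last name) are joined into one longer label.

// Hypothetical sketch; not the actual NerLabelSpan implementation.
object SpanSketch {
  final case class SimpleLabel(tag: String, label: String, start: Int, end: Int)

  // Join labels of the same tag whose positions are (almost) adjacent.
  def buildSpans(labels: Seq[SimpleLabel]): Seq[SimpleLabel] =
    labels
      .sortBy(_.start)
      .foldLeft(List.empty[SimpleLabel]) {
        case (last :: rest, cur) if last.tag == cur.tag && cur.start - last.end <= 1 =>
          last.copy(label = s"${last.label} ${cur.label}", end = cur.end) :: rest
        case (acc, cur) =>
          cur :: acc
      }
      .reverse
      .filter(_.label.contains(' ')) // keep only labels that were actually joined

  def main(args: Array[String]): Unit = {
    val labels = Seq(
      SimpleLabel("PERSON", "Jane", 0, 4),
      SimpleLabel("PERSON", "Doe", 5, 8),
      SimpleLabel("ORGANIZATION", "Acme", 20, 24)
    )
    // Prints one extra label: PERSON "Jane Doe" spanning positions 0 to 8.
    buildSpans(labels).foreach(println)
  }
}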

File: Context.scala

@@ -33,14 +33,14 @@ object Context {
private[this] val log = getLogger
def create[F[_]: Functor, A](
job: RJob,
jobId: Ident,
arg: A,
config: SchedulerConfig,
log: Logger[F],
store: Store[F],
blocker: Blocker
): Context[F, A] =
new ContextImpl(arg, log, store, blocker, config, job.id)
new ContextImpl(arg, log, store, blocker, config, jobId)
def apply[F[_]: Concurrent, A](
job: RJob,
@@ -54,7 +54,7 @@ object Context {
_ <- log.ftrace("Creating logger for task run")
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
_ <- log.ftrace("Logger created, instantiating context")
ctx = create[F, A](job, arg, config, logger, store, blocker)
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
} yield ctx
final private class ContextImpl[F[_]: Functor, A](