Improve text analysis

- Search for consecutive labels

- Sort list of candidates by a weight

- Search for organizations using person labels
This commit is contained in:
Eike Kettner 2020-03-17 22:34:50 +01:00
parent a4c97d5d57
commit 00ca6b5697
10 changed files with 241 additions and 22 deletions

View File

@ -40,6 +40,13 @@ val testSettings = Seq(
libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
)
// Settings that switch off publishing: `publish` and `publishLocal` are
// replaced by no-op tasks and `publishArtifact` is set to false.
lazy val noPublish = Seq(
publish := {},
publishLocal := {},
publishArtifact := false
)
val elmSettings = Seq(
elmCompileMode := ElmCompileMode.Debug,
Compile/resourceGenerators += Def.task {
@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
val root = project.in(file(".")).
settings(sharedSettings).
settings(noPublish).
settings(
name := "docspell-root"
).

View File

@ -0,0 +1,56 @@
package docspell.common
/** A run of labels that directly follow each other in the text.
  *
  * Adjacency is decided only by positions: a label continues the span
  * when it starts exactly one position after the previous label ends.
  * The tags of the labels are not compared.
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels collected in this span. */
  def size: Int = labels.size

  /** Appends `label` if this span is empty or if `label` starts exactly
    * one position after the end of the last label. Otherwise this very
    * instance is returned unchanged.
    */
  def +(label: NerLabel): NerLabelSpan = {
    val continuesSpan =
      labels.lastOption.forall(prev => label.startPosition - prev.endPosition == 1)

    if (continuesSpan) NerLabelSpan(labels :+ label)
    else this
  }

  /** Merges the span into a single label: the texts are joined by a
    * space, the tag and start position come from the first label and the
    * end position from the last one. An empty span yields `None`.
    */
  def asLabel: Option[NerLabel] =
    for {
      first <- labels.headOption
      last  <- labels.lastOption
    } yield NerLabel(
      labels.map(_.label).mkString(" "),
      first.tag,
      first.startPosition,
      last.endPosition
    )
}
object NerLabelSpan {

  /** A span without any labels; the starting point for building spans. */
  val empty = NerLabelSpan(Vector.empty)

  /** Sorts the labels by start position and groups directly consecutive
    * ones into spans. Only spans with more than one label are returned.
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val ordered = labels.sortBy(_.startPosition)

    val allSpans = ordered.foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
      acc.lastOption.map(_ + label) match {
        // `+` hands back the identical instance when the label is not
        // adjacent, so a different reference means the span has grown.
        case Some(grown) if grown ne acc.last =>
          acc.init :+ grown
        case _ =>
          // no span yet, or the label does not continue the last one
          acc :+ (empty + label)
      }
    }

    allSpans.filter(_.size > 1)
  }

  /** Builds the spans for `labels` and converts each into one merged label. */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    buildSpans(labels).flatMap(_.asLabel)
}

View File

@ -0,0 +1,45 @@
package docspell.common
import minitest._
/** Checks that directly consecutive labels are merged into single labels. */
object NerLabelSpanTest extends SimpleTestSuite {

  test("build") {
    // The date labels at the end are out of position order on purpose;
    // the builder sorts by start position itself.
    val input = List(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Syrup", NerTag.Organization, 162, 167),
      NerLabel("Production", NerTag.Organization, 168, 178),
      NerLabel("Old", NerTag.Organization, 179, 182),
      NerLabel("Sticky", NerTag.Organization, 183, 189),
      NerLabel("Pancake", NerTag.Organization, 190, 197),
      NerLabel("Company", NerTag.Organization, 198, 205),
      NerLabel("Maple", NerTag.Location, 210, 215),
      NerLabel("Lane", NerTag.Location, 216, 220),
      NerLabel("Forest", NerTag.Location, 222, 238),
      NerLabel("Hemptown", NerTag.Location, 243, 251),
      NerLabel("Little", NerTag.Organization, 351, 357),
      NerLabel("League", NerTag.Organization, 358, 364),
      NerLabel("Derek", NerTag.Person, 1121, 1126),
      NerLabel("Jeter", NerTag.Person, 1127, 1132),
      NerLabel("2016-11-07", NerTag.Date, 50, 60),
      NerLabel("2016-11-07", NerTag.Date, 119, 129),
      NerLabel("2019-09-03", NerTag.Date, 253, 264),
      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
    )

    // Only runs of two or more adjacent labels are expected in the result.
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel(
        "Syrup Production Old Sticky Pancake Company",
        NerTag.Organization,
        162,
        205
      ),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    val merged = NerLabelSpan.build(input)
    assertEquals(merged, expected)
  }
}

View File

@ -0,0 +1,66 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
/** Reorders the proposals to put most probable fits first.
  *
  * The candidates of every proposal are sorted ascending by [[weight]],
  * so a smaller weight places a candidate earlier in the list.
  */
object EvalProposals {

  /** Task that reorders the candidates of all attachment metas in `data`.
    * It does not use the task context.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      val metas = data.metas.map(reorderCandidates)
      data.copy(metas = metas).pure[F]
    }

  /** Returns `rm` with the candidates of each of its proposals sorted by
    * their weight.
    */
  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val sorted = rm.proposals.getTypes.toList.flatMap { mpt =>
      rm.proposals
        .find(mpt)
        .map(mp => mp.copy(values = mp.values.sortBy(weight(rm, mp))))
        .toList
    }
    rm.copy(proposals = MetaProposalList(sorted))
  }

  /** Calculates the sort weight of a candidate; lower values sort first.
    *
    * The weight is the product of
    *  - `1 / words`, with `words` being the largest number of
    *    space-separated words among the origin labels,
    *  - `1 / tagCount`, with `tagCount` being the number of origin labels,
    *  - the position weight of the earliest origin label and
    *  - the smallest ner-tag factor among the origin labels.
    *
    * A candidate without origin labels gets the weight `1.0`. Without
    * labels `min`/`max` would throw; `1.0` is the largest value the
    * product takes as long as label positions lie within the text, so
    * such a candidate sorts behind candidates that have evidence.
    */
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double =
    if (cand.origin.isEmpty) 1.0
    else {
      val textLen  = rm.content.map(_.length).getOrElse(0)
      val tagCount = cand.origin.size.toDouble
      val pos      = cand.origin.map(_.startPosition).min
      val words    = cand.origin.map(_.label.split(' ').length).max.toDouble
      val nerFac   = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
      (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
    }

  /** Weight derived from the relative position `pos / total`.
    *
    * Returns 1 when `total` is not positive. Otherwise the relative
    * position is returned, halved if it is below 0.7.
    */
  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2
      else p
    }

  /** Factor for a label's tag with respect to the proposal type.
    *
    * Organization labels for `CorrOrg` and person labels for
    * `CorrPerson`/`ConcPerson` get 0.8, email and website labels always
    * get 0.5, everything else 1.0.
    */
  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      case NerTag.Date     => 1.0
      case NerTag.Email    => 0.5
      case NerTag.Location => 1.0
      case NerTag.Misc     => 1.0
      case NerTag.Organization =>
        if (mt == MetaProposalType.CorrOrg) 0.8
        else 1.0
      case NerTag.Person =>
        if (mt == MetaProposalType.CorrPerson ||
            mt == MetaProposalType.ConcPerson) 0.8
        else 1.0
      case NerTag.Website => 0.5
    }
}

View File

@ -24,16 +24,10 @@ object FindProposal {
ctx.logger.info("Starting find-proposal") *>
rmas
.traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
)
.flatMap(rmv =>
rmv
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
)
.map(_ => data.copy(metas = rmv))
processAttachment(rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml))
)
.map(rmv => data.copy(metas = rmv))
}
def processAttachment[F[_]: Sync](
@ -56,13 +50,19 @@ object FindProposal {
val dueDates = MetaProposalList.fromSeq1(
MetaProposalType.DueDate,
after.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
val itemDates = MetaProposalList.fromSeq1(
MetaProposalType.DocDate,
before.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
@ -71,13 +71,13 @@ object FindProposal {
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
labels
.sortBy(_.startPosition)
.foldLeft((Set.empty[String], List.empty[NerLabel])) {
case ((seen, result), el) =>
if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
}
._2
.sortBy(_.startPosition)
trait Finder[F[_]] { self =>
def find(labels: Seq[NerLabel]): F[MetaProposalList]
@ -91,7 +91,9 @@ object FindProposal {
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
labels => self.find(labels).flatMap(ml => f(ml).find(labels))
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
def map(
f: MetaProposalList => MetaProposalList
)(implicit F: Applicative[F]): Finder[F] =
labels => self.find(labels).map(f)
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@ -118,10 +120,12 @@ object FindProposal {
_ => value.pure[F]
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
}
private def search[F[_]: Sync](
@ -154,10 +158,15 @@ object FindProposal {
val s2 = ctx.store
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
ctx.logger.debug(s"Looking for persons: $value") *> (for {
val s3 =
ctx.store
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
ml0 <- s1
ml1 <- s2
} yield ml0 |+| ml1)
ml2 <- s3
} yield ml0 |+| ml1 |+| ml2)
case NerTag.Location =>
ctx.logger

View File

@ -16,7 +16,16 @@ object ProcessItem {
.flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50))
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
.flatMap(Task.setProgress(75))
.flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99))
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](item)
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
}

View File

@ -0,0 +1,24 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records._
/** Saves the proposals in the database.
  *
  * For every attachment meta of the item the proposals are written via
  * `RAttachmentMeta.updateProposals`; the item data is passed through
  * unchanged.
  */
object SaveProposals {

  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info("Storing proposals")
        _ <- data.metas.traverse { rm =>
          ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
            ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
        }
      } yield data
    }
}

View File

@ -34,8 +34,10 @@ object TextAnalysis {
for {
list0 <- stanfordNer[F](lang, rm)
list1 <- contactNer[F](rm)
list = list0 ++ list1
spans = NerLabelSpan.build(list.toSeq)
dates <- dateNer[F](rm, lang)
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
Sync[F].delay {

View File

@ -33,14 +33,14 @@ object Context {
private[this] val log = getLogger
def create[F[_]: Functor, A](
job: RJob,
jobId: Ident,
arg: A,
config: SchedulerConfig,
log: Logger[F],
store: Store[F],
blocker: Blocker
): Context[F, A] =
new ContextImpl(arg, log, store, blocker, config, job.id)
new ContextImpl(arg, log, store, blocker, config, jobId)
def apply[F[_]: Concurrent, A](
job: RJob,
@ -54,7 +54,7 @@ object Context {
_ <- log.ftrace("Creating logger for task run")
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
_ <- log.ftrace("Logger created, instantiating context")
ctx = create[F, A](job, arg, config, logger, store, blocker)
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
} yield ctx
final private class ContextImpl[F[_]: Functor, A](

View File

@ -24,7 +24,7 @@ object ONode {
def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
for {
node <- RNode(appId, nodeType, uri)
_ <- logger.finfo(s"Registering node $node")
_ <- logger.finfo(s"Registering node ${node.id.id}")
_ <- store.transact(RNode.set(node))
} yield ()