Improve text analysis

- Search for consecutive labels

- Sort list of candidates by a weight

- Search for organizations using person labels
This commit is contained in:
Eike Kettner 2020-03-17 22:34:50 +01:00
parent a4c97d5d57
commit 00ca6b5697
10 changed files with 241 additions and 22 deletions

View File

@ -40,6 +40,13 @@ val testSettings = Seq(
libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test) libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
) )
// Settings for projects that must not be published: the `publish` and
// `publishLocal` tasks are replaced by no-ops and `publishArtifact` is
// switched off.
lazy val noPublish = Seq(
publish := {},
publishLocal := {},
publishArtifact := false
)
val elmSettings = Seq( val elmSettings = Seq(
elmCompileMode := ElmCompileMode.Debug, elmCompileMode := ElmCompileMode.Debug,
Compile/resourceGenerators += Def.task { Compile/resourceGenerators += Def.task {
@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
val root = project.in(file(".")). val root = project.in(file(".")).
settings(sharedSettings). settings(sharedSettings).
settings(noPublish).
settings( settings(
name := "docspell-root" name := "docspell-root"
). ).

View File

@ -0,0 +1,56 @@
package docspell.common
/** A run of NER labels that directly follow each other in the text and
  * carry the same tag.
  *
  * Spans are grown one label at a time with `+` and can be collapsed into
  * a single combined label with `asLabel`.
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels collected in this span. */
  def size: Int = labels.size

  /** Tries to extend this span with `label`.
    *
    * An empty span accepts any label. A non-empty span accepts the label
    * only if it starts exactly one position after the end of the last
    * label (i.e. one separating character) and has the same tag as that
    * last label. Without the tag check, a person name directly followed
    * by an organization name would be merged into one label that claims
    * the tag of its first element.
    *
    * @return a new, extended span, or this very instance (same reference)
    *         when the label was not appended; `NerLabelSpan.buildSpans`
    *         detects a rejected label through reference equality
    */
  def +(label: NerLabel): NerLabelSpan =
    labels.lastOption match {
      case None =>
        NerLabelSpan(Vector(label))
      case Some(prev) =>
        val adjacent = label.startPosition - prev.endPosition == 1
        val sameTag  = label.tag == prev.tag
        if (adjacent && sameTag) NerLabelSpan(labels :+ label)
        else this
    }

  /** Collapses the span into a single label.
    *
    * The texts of all labels are joined with a single space, the tag and
    * start position are taken from the first label and the end position
    * from the last one.
    *
    * @return the combined label, or `None` for an empty span
    */
  def asLabel: Option[NerLabel] =
    for {
      first <- labels.headOption
      last  <- labels.lastOption
    } yield NerLabel(
      labels.map(_.label).mkString(" "),
      first.tag,
      first.startPosition,
      last.endPosition
    )
}
object NerLabelSpan {

  /** The span without any labels; starting point for building spans. */
  val empty: NerLabelSpan = NerLabelSpan(Vector.empty)

  /** Groups labels into spans of consecutive labels.
    *
    * The labels are ordered by their start position and then folded into
    * spans: each label is offered to the span currently being built and
    * opens a new span when it is rejected. Spans that end up with a
    * single label are dropped from the result.
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val ordered = labels.sortBy(_.startPosition)
    val collected = ordered.foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
      acc.lastOption match {
        case None =>
          Vector(empty + label)
        case Some(current) =>
          val grown = current + label
          // `+` hands back the identical instance when it rejects a label
          if (grown ne current) acc.dropRight(1) :+ grown
          else acc :+ (empty + label)
      }
    }
    collected.filter(span => span.size > 1)
  }

  /** Builds the spans for `labels` and returns each one as a single,
    * combined label.
    */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    for {
      span   <- buildSpans(labels)
      merged <- span.asLabel.toVector
    } yield merged
}

View File

@ -0,0 +1,45 @@
package docspell.common
import minitest._
object NerLabelSpanTest extends SimpleTestSuite {

  test("build") {
    // Input in the order it is handed to `build`; note that the date
    // labels come last and are therefore not ordered by position.
    val input = List(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Syrup", NerTag.Organization, 162, 167),
      NerLabel("Production", NerTag.Organization, 168, 178),
      NerLabel("Old", NerTag.Organization, 179, 182),
      NerLabel("Sticky", NerTag.Organization, 183, 189),
      NerLabel("Pancake", NerTag.Organization, 190, 197),
      NerLabel("Company", NerTag.Organization, 198, 205),
      NerLabel("Maple", NerTag.Location, 210, 215),
      NerLabel("Lane", NerTag.Location, 216, 220),
      NerLabel("Forest", NerTag.Location, 222, 238),
      NerLabel("Hemptown", NerTag.Location, 243, 251),
      NerLabel("Little", NerTag.Organization, 351, 357),
      NerLabel("League", NerTag.Organization, 358, 364),
      NerLabel("Derek", NerTag.Person, 1121, 1126),
      NerLabel("Jeter", NerTag.Person, 1127, 1132),
      NerLabel("2016-11-07", NerTag.Date, 50, 60),
      NerLabel("2016-11-07", NerTag.Date, 119, 129),
      NerLabel("2019-09-03", NerTag.Date, 253, 264),
      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
    )

    // Only runs of directly consecutive labels are expected, each one
    // combined into a single label; stand-alone labels are left out.
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    assertEquals(NerLabelSpan.build(input), expected)
  }
}

View File

@ -0,0 +1,66 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
/** Reorders the proposals to put most probable fits first.
  *
  * The candidates of every proposal are sorted by [[EvalProposals.weight]];
  * assuming the usual ascending order of `sortBy`, a smaller weight means
  * a better fit.
  */
object EvalProposals {

  /** Creates a task that returns `data` with the candidates of all
    * attachment proposals reordered. The task context is not used.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      val metas = data.metas.map(reorderCandidates)
      data.copy(metas = metas).pure[F]
    }

  /** Sorts the candidates of each proposal of `rm` by their weight and
    * returns `rm` with the resulting proposal list.
    */
  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val reordered = rm.proposals.getTypes.toList
      .flatMap(mpt => rm.proposals.find(mpt).toList)
      .map(mp => mp.copy(values = mp.values.sortBy(weight(rm, mp))))
    rm.copy(proposals = MetaProposalList(reordered))
  }

  /** Computes the sort weight of a candidate.
    *
    * The weight is the product of
    *  - the inverse of the largest word count among the originating labels,
    *  - the inverse of the number of originating labels,
    *  - the position weight of the earliest originating label, and
    *  - the smallest NER tag factor among the originating labels.
    *
    * A candidate without any originating label gets the weight `1.0`
    * instead of failing on `min`/`max` of an empty collection.
    * NOTE(review): `1.0` is chosen so that such candidates sort behind
    * candidates backed by text evidence; confirm this is the desired rank.
    *
    * @param rm   the attachment meta data; its content length is used to
    *             relate label positions to the whole text
    * @param mp   the proposal the candidate belongs to
    * @param cand the candidate to weigh
    */
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double =
    if (cand.origin.isEmpty) 1.0
    else {
      val textLen  = rm.content.map(_.length).getOrElse(0)
      val tagCount = cand.origin.size.toDouble
      val pos      = cand.origin.map(_.startPosition).min
      // A label made up of separators only splits into zero words; count it
      // as one word so the weight never becomes Infinity or NaN.
      val words  = math.max(1, cand.origin.map(_.label.split(' ').length).max).toDouble
      val nerFac = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
      (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
    }

  /** Weighs a position relative to the total text length.
    *
    * Returns `1` when `total` is not positive. Otherwise the relative
    * position `|pos / total|` is returned, halved when it is below `0.7`.
    */
  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1.0
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2
      else p
    }

  /** Factor applied for the NER tag of an originating label.
    *
    * Organization labels get `0.8` for `CorrOrg` proposals, person labels
    * get `0.8` for `CorrPerson` and `ConcPerson` proposals, and email and
    * website labels always get `0.5`. Every other combination yields `1.0`.
    */
  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      case NerTag.Date     => 1.0
      case NerTag.Email    => 0.5
      case NerTag.Location => 1.0
      case NerTag.Misc     => 1.0
      case NerTag.Organization =>
        if (mt == MetaProposalType.CorrOrg) 0.8
        else 1.0
      case NerTag.Person =>
        if (mt == MetaProposalType.CorrPerson ||
            mt == MetaProposalType.ConcPerson) 0.8
        else 1.0
      case NerTag.Website => 0.5
    }
}

View File

@ -24,16 +24,10 @@ object FindProposal {
ctx.logger.info("Starting find-proposal") *> ctx.logger.info("Starting find-proposal") *>
rmas rmas
.traverse(rm => .traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml)) processAttachment(rm, data.findDates(rm), ctx)
) .map(ml => rm.copy(proposals = ml))
.flatMap(rmv =>
rmv
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
)
.map(_ => data.copy(metas = rmv))
) )
.map(rmv => data.copy(metas = rmv))
} }
def processAttachment[F[_]: Sync]( def processAttachment[F[_]: Sync](
@ -56,13 +50,19 @@ object FindProposal {
val dueDates = MetaProposalList.fromSeq1( val dueDates = MetaProposalList.fromSeq1(
MetaProposalType.DueDate, MetaProposalType.DueDate,
after.map(ndl => after.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label)) Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
) )
) )
val itemDates = MetaProposalList.fromSeq1( val itemDates = MetaProposalList.fromSeq1(
MetaProposalType.DocDate, MetaProposalType.DocDate,
before.map(ndl => before.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label)) Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
) )
) )
@ -71,13 +71,13 @@ object FindProposal {
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] = def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
labels labels
.sortBy(_.startPosition)
.foldLeft((Set.empty[String], List.empty[NerLabel])) { .foldLeft((Set.empty[String], List.empty[NerLabel])) {
case ((seen, result), el) => case ((seen, result), el) =>
if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result) if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
else (seen + (el.tag.name + el.label.toLowerCase), el :: result) else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
} }
._2 ._2
.sortBy(_.startPosition)
trait Finder[F[_]] { self => trait Finder[F[_]] { self =>
def find(labels: Seq[NerLabel]): F[MetaProposalList] def find(labels: Seq[NerLabel]): F[MetaProposalList]
@ -91,7 +91,9 @@ object FindProposal {
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] = def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
labels => self.find(labels).flatMap(ml => f(ml).find(labels)) labels => self.find(labels).flatMap(ml => f(ml).find(labels))
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] = def map(
f: MetaProposalList => MetaProposalList
)(implicit F: Applicative[F]): Finder[F] =
labels => self.find(labels).map(f) labels => self.find(labels).map(f)
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] = def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@ -118,10 +120,12 @@ object FindProposal {
_ => value.pure[F] _ => value.pure[F]
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] = def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten) labels =>
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] = def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten) labels =>
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
} }
private def search[F[_]: Sync]( private def search[F[_]: Sync](
@ -154,10 +158,15 @@ object FindProposal {
val s2 = ctx.store val s2 = ctx.store
.transact(RPerson.findLike(ctx.args.meta.collective, value, false)) .transact(RPerson.findLike(ctx.args.meta.collective, value, false))
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt)) .map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
ctx.logger.debug(s"Looking for persons: $value") *> (for { val s3 =
ctx.store
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
ml0 <- s1 ml0 <- s1
ml1 <- s2 ml1 <- s2
} yield ml0 |+| ml1) ml2 <- s3
} yield ml0 |+| ml1 |+| ml2)
case NerTag.Location => case NerTag.Location =>
ctx.logger ctx.logger

View File

@ -16,7 +16,16 @@ object ProcessItem {
.flatMap(TextAnalysis[F]) .flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50)) .flatMap(Task.setProgress(50))
.flatMap(FindProposal[F]) .flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
.flatMap(Task.setProgress(75)) .flatMap(Task.setProgress(75))
.flatMap(LinkProposal[F]) .flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99)) .flatMap(Task.setProgress(99))
/** Runs only the analysis steps for `item`: text analysis, finding
  * proposals, evaluating them and saving them.
  */
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] = {
  val analysed  = TextAnalysis[F](item)
  val proposed  = analysed.flatMap(FindProposal[F])
  val evaluated = proposed.flatMap(EvalProposals[F])
  evaluated.flatMap(SaveProposals[F])
}
} }

View File

@ -0,0 +1,24 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records._
/** Saves the proposals in the database.
  *
  * For every attachment meta record of the item the proposal list is
  * written through `RAttachmentMeta.updateProposals`; the item data is
  * passed on unchanged.
  */
object SaveProposals {

  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info("Storing proposals")
        _ <- data.metas.traverse { rm =>
          for {
            _      <- ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}")
            result <- ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
          } yield result
        }
      } yield data
    }
}

View File

@ -34,8 +34,10 @@ object TextAnalysis {
for { for {
list0 <- stanfordNer[F](lang, rm) list0 <- stanfordNer[F](lang, rm)
list1 <- contactNer[F](rm) list1 <- contactNer[F](rm)
list = list0 ++ list1
spans = NerLabelSpan.build(list.toSeq)
dates <- dateNer[F](rm, lang) dates <- dateNer[F](rm, lang)
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates) } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] = def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
Sync[F].delay { Sync[F].delay {

View File

@ -33,14 +33,14 @@ object Context {
private[this] val log = getLogger private[this] val log = getLogger
def create[F[_]: Functor, A]( def create[F[_]: Functor, A](
job: RJob, jobId: Ident,
arg: A, arg: A,
config: SchedulerConfig, config: SchedulerConfig,
log: Logger[F], log: Logger[F],
store: Store[F], store: Store[F],
blocker: Blocker blocker: Blocker
): Context[F, A] = ): Context[F, A] =
new ContextImpl(arg, log, store, blocker, config, job.id) new ContextImpl(arg, log, store, blocker, config, jobId)
def apply[F[_]: Concurrent, A]( def apply[F[_]: Concurrent, A](
job: RJob, job: RJob,
@ -54,7 +54,7 @@ object Context {
_ <- log.ftrace("Creating logger for task run") _ <- log.ftrace("Creating logger for task run")
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink) logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
_ <- log.ftrace("Logger created, instantiating context") _ <- log.ftrace("Logger created, instantiating context")
ctx = create[F, A](job, arg, config, logger, store, blocker) ctx = create[F, A](job.id, arg, config, logger, store, blocker)
} yield ctx } yield ctx
final private class ContextImpl[F[_]: Functor, A]( final private class ContextImpl[F[_]: Functor, A](

View File

@ -24,7 +24,7 @@ object ONode {
def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] = def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
for { for {
node <- RNode(appId, nodeType, uri) node <- RNode(appId, nodeType, uri)
_ <- logger.finfo(s"Registering node $node") _ <- logger.finfo(s"Registering node ${node.id.id}")
_ <- store.transact(RNode.set(node)) _ <- store.transact(RNode.set(node))
} yield () } yield ()