From 00ca6b569761f59cb97dd26fa3d72e696626b5f8 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 17 Mar 2020 22:34:50 +0100 Subject: [PATCH] Improve text analysis - Search for consecutive labels - Sort list of candidates by a weight - Search for organizations using person labels --- build.sbt | 8 +++ .../scala/docspell/common/NerLabelSpan.scala | 56 ++++++++++++++++ .../docspell/common/NerLabelSpanTest.scala | 45 +++++++++++++ .../docspell/joex/process/EvalProposals.scala | 66 +++++++++++++++++++ .../docspell/joex/process/FindProposal.scala | 43 +++++++----- .../docspell/joex/process/ProcessItem.scala | 9 +++ .../docspell/joex/process/SaveProposals.scala | 24 +++++++ .../docspell/joex/process/TextAnalysis.scala | 4 +- .../docspell/joex/scheduler/Context.scala | 6 +- .../main/scala/docspell/store/ops/ONode.scala | 2 +- 10 files changed, 241 insertions(+), 22 deletions(-) create mode 100644 modules/common/src/main/scala/docspell/common/NerLabelSpan.scala create mode 100644 modules/common/src/test/scala/docspell/common/NerLabelSpanTest.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala diff --git a/build.sbt b/build.sbt index 68b11ccc..b5b674e1 100644 --- a/build.sbt +++ b/build.sbt @@ -40,6 +40,13 @@ val testSettings = Seq( libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test) ) +lazy val noPublish = Seq( + publish := {}, + publishLocal := {}, + publishArtifact := false +) + + val elmSettings = Seq( elmCompileMode := ElmCompileMode.Debug, Compile/resourceGenerators += Def.task { @@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")). val root = project.in(file(".")). settings(sharedSettings). + settings(noPublish). settings( name := "docspell-root" ). diff --git a/modules/common/src/main/scala/docspell/common/NerLabelSpan.scala b/modules/common/src/main/scala/docspell/common/NerLabelSpan.scala new file mode 100644 index 00000000..12e87618 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/NerLabelSpan.scala @@ -0,0 +1,56 @@ +package docspell.common + +final case class NerLabelSpan private ( + labels: Vector[NerLabel] +) { + + def size: Int = labels.size + + def +(label: NerLabel): NerLabelSpan = + labels.lastOption match { + case None => + NerLabelSpan(Vector(label)) + case Some(el) => + if (label.startPosition - el.endPosition == 1) NerLabelSpan(labels :+ label) + else this + } + + def asLabel: Option[NerLabel] = + (labels.headOption, labels.lastOption) match { + case (Some(s), Some(e)) => + Some( + NerLabel( + labels.map(_.label).mkString(" "), + s.tag, + s.startPosition, + e.endPosition + ) + ) + case _ => + None + } +} + +object NerLabelSpan { + + val empty = NerLabelSpan(Vector.empty) + + def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = { + val sorted = labels.sortBy(_.startPosition) + sorted + .foldLeft(Vector.empty[NerLabelSpan]) { (span, el) => + span.lastOption match { + case Some(last) => + val next = last + el + if (next eq last) span :+ (empty + el) + else span.dropRight(1) :+ next + case None => + Vector(empty + el) + } + } + .filter(_.size > 1) + } + + def build(labels: Seq[NerLabel]): Vector[NerLabel] = + buildSpans(labels).flatMap(_.asLabel) +} diff --git a/modules/common/src/test/scala/docspell/common/NerLabelSpanTest.scala b/modules/common/src/test/scala/docspell/common/NerLabelSpanTest.scala new file mode 100644 index 00000000..a2a677d4 --- /dev/null +++ b/modules/common/src/test/scala/docspell/common/NerLabelSpanTest.scala @@ -0,0 +1,45 @@ +package docspell.common + +import minitest._ + +object NerLabelSpanTest extends SimpleTestSuite { + + test("build") { + val labels = List( + NerLabel("Derek", NerTag.Person, 0, 5), + NerLabel("Jeter", NerTag.Person, 6, 11), + NerLabel("Treesville", NerTag.Person, 27, 37), + NerLabel("Derek", NerTag.Person, 68, 73), + NerLabel("Jeter", NerTag.Person, 74, 79), + NerLabel("Treesville", NerTag.Location, 95, 105), + NerLabel("Syrup", NerTag.Organization, 162, 167), + NerLabel("Production", NerTag.Organization, 168, 178), + NerLabel("Old", NerTag.Organization, 179, 182), + NerLabel("Sticky", NerTag.Organization, 183, 189), + NerLabel("Pancake", NerTag.Organization, 190, 197), + NerLabel("Company", NerTag.Organization, 198, 205), + NerLabel("Maple", NerTag.Location, 210, 215), + NerLabel("Lane", NerTag.Location, 216, 220), + NerLabel("Forest", NerTag.Location, 222, 238), + NerLabel("Hemptown", NerTag.Location, 243, 251), + NerLabel("Little", NerTag.Organization, 351, 357), + NerLabel("League", NerTag.Organization, 358, 364), + NerLabel("Derek", NerTag.Person, 1121, 1126), + NerLabel("Jeter", NerTag.Person, 1127, 1132), + NerLabel("2016-11-07", NerTag.Date, 50, 60), + NerLabel("2016-11-07", NerTag.Date, 119, 129), + NerLabel("2019-09-03", NerTag.Date, 253, 264), + NerLabel("2016-12-12", NerTag.Date, 1080, 1091) + ) + + val spans = NerLabelSpan.build(labels) + assertEquals(spans, Vector( + NerLabel("Derek Jeter", NerTag.Person, 0, 11), + NerLabel("Derek Jeter", NerTag.Person, 68, 79), + NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205), + NerLabel("Maple Lane", NerTag.Location, 210, 220), + NerLabel("Little League", NerTag.Organization, 351, 364), + NerLabel("Derek Jeter", NerTag.Person, 1121, 1132) + )) + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala new file mode 100644 index 00000000..021cb097 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala @@ -0,0 +1,66 @@ +package docspell.joex.process + +import cats.implicits._ +import cats.effect.Sync +import docspell.common._ +import docspell.joex.scheduler.Task +import docspell.store.records.RAttachmentMeta + +/** Reorders the proposals to put most probable fits first. + */ +object EvalProposals { + + def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = + Task { _ => + val metas = data.metas.map(reorderCandidates) + data.copy(metas = metas).pure[F] + } + + def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = { + val list = rm.proposals.getTypes.toList + .map(mpt => rm.proposals.find(mpt) match { + case Some(mp) => + val v = mp.values.sortBy(weight(rm, mp)) + Some(mp.copy(values = v)) + case None => + None + }) + + rm.copy(proposals = MetaProposalList(list.flatMap(identity))) + } + + def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = { + val textLen = rm.content.map(_.length).getOrElse(0) + val tagCount = cand.origin.size.toDouble + val pos = cand.origin.map(_.startPosition).min + val words = cand.origin.map(_.label.split(' ').length).max.toDouble + val nerFac = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min + (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac + } + + def positionWeight(pos: Int, total: Int): Double = { + if (total <= 0) 1 + else { + val p = math.abs(pos.toDouble / total.toDouble) + if (p < 0.7) p / 2 + else p + } + } + + + def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double = + tag match { + case NerTag.Date => 1.0 + case NerTag.Email => 0.5 + case NerTag.Location => 1.0 + case NerTag.Misc => 1.0 + case NerTag.Organization => + if (mt == MetaProposalType.CorrOrg) 0.8 + else 1.0 + case NerTag.Person => + if (mt == MetaProposalType.CorrPerson || + mt == MetaProposalType.ConcPerson) 0.8 + else 1.0 + case NerTag.Website => 0.5 + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index 8f3f1a04..b4eaf0f7 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -24,16 +24,10 @@ object FindProposal { ctx.logger.info("Starting find-proposal") *> rmas .traverse(rm => - processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml)) - ) - .flatMap(rmv => - rmv - .traverse(rm => - ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> - ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) - ) - .map(_ => data.copy(metas = rmv)) + processAttachment(rm, data.findDates(rm), ctx) + .map(ml => rm.copy(proposals = ml)) ) + .map(rmv => data.copy(metas = rmv)) } def processAttachment[F[_]: Sync]( @@ -56,13 +50,19 @@ object FindProposal { val dueDates = MetaProposalList.fromSeq1( MetaProposalType.DueDate, after.map(ndl => - Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label)) + Candidate( + IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), + Set(ndl.label) + ) ) ) val itemDates = MetaProposalList.fromSeq1( MetaProposalType.DocDate, before.map(ndl => - Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label)) + Candidate( + IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), + Set(ndl.label) + ) ) ) @@ -71,13 +71,13 @@ object FindProposal { def removeDuplicates(labels: List[NerLabel]): List[NerLabel] = labels + .sortBy(_.startPosition) .foldLeft((Set.empty[String], List.empty[NerLabel])) { case ((seen, result), el) => if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result) else (seen + (el.tag.name + el.label.toLowerCase), el :: result) } ._2 - .sortBy(_.startPosition) trait Finder[F[_]] { self => def find(labels: Seq[NerLabel]): F[MetaProposalList] @@ -91,7 +91,9 @@ object FindProposal { def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] = labels => self.find(labels).flatMap(ml => f(ml).find(labels)) - def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] = + def map( + f: MetaProposalList => MetaProposalList + )(implicit F: Applicative[F]): Finder[F] = labels => self.find(labels).map(f) def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] = @@ -118,10 +120,12 @@ object FindProposal { _ => value.pure[F] def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] = - labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten) + labels => + labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten) def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] = - labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten) + labels => + labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten) } private def search[F[_]: Sync]( @@ -154,10 +158,15 @@ object FindProposal { val s2 = ctx.store .transact(RPerson.findLike(ctx.args.meta.collective, value, false)) .map(MetaProposalList.from(MetaProposalType.CorrPerson, nt)) - ctx.logger.debug(s"Looking for persons: $value") *> (for { + val s3 = + ctx.store + .transact(ROrganization.findLike(ctx.args.meta.collective, value)) + .map(MetaProposalList.from(MetaProposalType.CorrOrg, nt)) + ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for { ml0 <- s1 ml1 <- s2 - } yield ml0 |+| ml1) + ml2 <- s3 + } yield ml0 |+| ml1 |+| ml2) case NerTag.Location => ctx.logger diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index bb67fe03..b79aa40a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -16,7 +16,16 @@ object ProcessItem { .flatMap(TextAnalysis[F]) .flatMap(Task.setProgress(50)) .flatMap(FindProposal[F]) + .flatMap(EvalProposals[F]) + .flatMap(SaveProposals[F]) .flatMap(Task.setProgress(75)) .flatMap(LinkProposal[F]) .flatMap(Task.setProgress(99)) + + def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] = + TextAnalysis[F](item) + .flatMap(FindProposal[F]) + .flatMap(EvalProposals[F]) + .flatMap(SaveProposals[F]) + } diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala new file mode 100644 index 00000000..c9c74e11 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -0,0 +1,24 @@ +package docspell.joex.process + +import cats.implicits._ +import cats.effect.Sync +import docspell.common._ +import docspell.joex.scheduler.Task +import docspell.store.records._ + +/** Saves the proposals in the database + * + */ +object SaveProposals { + + def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + ctx.logger.info("Storing proposals") *> + data.metas + .traverse(rm => + ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> + ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) + ) + .map(_ => data) + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 14aabdc4..b64f8997 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -34,8 +34,10 @@ object TextAnalysis { for { list0 <- stanfordNer[F](lang, rm) list1 <- contactNer[F](rm) + list = list0 ++ list1 + spans = NerLabelSpan.build(list.toSeq) dates <- dateNer[F](rm, lang) - } yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates) + } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates) def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] = Sync[F].delay { diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala index b1cef3fb..ca16c1a8 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala @@ -33,14 +33,14 @@ object Context { private[this] val log = getLogger def create[F[_]: Functor, A]( - job: RJob, + jobId: Ident, arg: A, config: SchedulerConfig, log: Logger[F], store: Store[F], blocker: Blocker ): Context[F, A] = - new ContextImpl(arg, log, store, blocker, config, job.id) + new ContextImpl(arg, log, store, blocker, config, jobId) def apply[F[_]: Concurrent, A]( job: RJob, @@ -54,7 +54,7 @@ object Context { _ <- log.ftrace("Creating logger for task run") logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink) _ <- log.ftrace("Logger created, instantiating context") - ctx = create[F, A](job, arg, config, logger, store, blocker) + ctx = create[F, A](job.id, arg, config, logger, store, blocker) } yield ctx final private class ContextImpl[F[_]: Functor, A]( diff --git a/modules/store/src/main/scala/docspell/store/ops/ONode.scala b/modules/store/src/main/scala/docspell/store/ops/ONode.scala index 42f8c91e..8e682b18 100644 --- a/modules/store/src/main/scala/docspell/store/ops/ONode.scala +++ b/modules/store/src/main/scala/docspell/store/ops/ONode.scala @@ -24,7 +24,7 @@ object ONode { def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] = for { node <- RNode(appId, nodeType, uri) - _ <- logger.finfo(s"Registering node $node") + _ <- logger.finfo(s"Registering node ${node.id.id}") _ <- store.transact(RNode.set(node)) } yield ()