mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-30 03:55:07 +00:00
Improve text analysis
- Search for consecutive labels
- Sort list of candidates by a weight
- Search for organizations using person labels
This commit is contained in:
parent
a4c97d5d57
commit
00ca6b5697
@ -40,6 +40,13 @@ val testSettings = Seq(
|
||||
libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
|
||||
)
|
||||
|
||||
// Settings that switch off publishing for a project: both publish tasks become
// no-ops and no artifact is produced.
lazy val noPublish = Seq(
  publishArtifact := false,
  publish := {},
  publishLocal := {}
)
|
||||
|
||||
|
||||
val elmSettings = Seq(
|
||||
elmCompileMode := ElmCompileMode.Debug,
|
||||
Compile/resourceGenerators += Def.task {
|
||||
@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
|
||||
|
||||
val root = project.in(file(".")).
|
||||
settings(sharedSettings).
|
||||
settings(noPublish).
|
||||
settings(
|
||||
name := "docspell-root"
|
||||
).
|
||||
|
@ -0,0 +1,56 @@
|
||||
package docspell.common
|
||||
|
||||
/** A run of NER labels that directly follow each other in the text and share
  * the same tag, e.g. the tokens of a multi-word name.
  *
  * @param labels the labels of this span, in the order they were appended
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels collected in this span. */
  def size: Int = labels.size

  /** Appends `label` if it continues this span.
    *
    * An empty span accepts any label. Otherwise the label must carry the same
    * tag as the last label and start exactly one position after its end.
    * Requiring the same tag prevents merging neighbouring tokens of different
    * kinds into a single label that would carry only the first token's tag.
    *
    * @return a new span including `label`, or this very instance (same
    *         reference) when the label does not continue the span
    */
  def +(label: NerLabel): NerLabelSpan =
    labels.lastOption match {
      case None =>
        NerLabelSpan(Vector(label))
      case Some(prev) if continues(prev, label) =>
        NerLabelSpan(labels :+ label)
      case Some(_) =>
        // Returned unchanged on purpose: callers detect a rejected label via `eq`.
        this
    }

  /** True if `next` has the tag of `prev` and starts one position after it ends. */
  private def continues(prev: NerLabel, next: NerLabel): Boolean =
    prev.tag == next.tag && next.startPosition - prev.endPosition == 1

  /** Collapses the span into one label.
    *
    * The text is the member texts joined by a single space, the tag is taken
    * from the first label and the positions range from the start of the first
    * to the end of the last label.
    *
    * @return the merged label, or `None` for an empty span
    */
  def asLabel: Option[NerLabel] =
    for {
      first <- labels.headOption
      last  <- labels.lastOption
    } yield NerLabel(
      labels.map(_.label).mkString(" "),
      first.tag,
      first.startPosition,
      last.endPosition
    )
}
|
||||
|
||||
object NerLabelSpan {

  /** The span without any labels; the starting point for every new span. */
  val empty: NerLabelSpan = NerLabelSpan(Vector.empty)

  /** Groups the labels into spans of consecutive labels.
    *
    * Labels are processed in order of their start position. Each label is
    * offered to the most recent span; if that span hands back the very same
    * instance, the label was not accepted and opens a new span instead.
    *
    * @return only spans consisting of more than one label
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val collected =
      labels
        .sortBy(_.startPosition)
        .foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
          acc.lastOption.fold(Vector(empty + label)) { current =>
            val extended = current + label
            // `+` returns the identical instance when the label was rejected
            if (extended eq current) acc :+ (empty + label)
            else acc.init :+ extended
          }
        }

    collected.filter(_.size > 1)
  }

  /** Builds the multi-label spans of `labels` and merges each into one label. */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    buildSpans(labels).flatMap(span => span.asLabel)
}
|
@ -0,0 +1,45 @@
|
||||
package docspell.common
|
||||
|
||||
import minitest._
|
||||
|
||||
object NerLabelSpanTest extends SimpleTestSuite {

  test("build") {
    // Single-token labels, deliberately not ordered by position (dates come last).
    val input = List(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Syrup", NerTag.Organization, 162, 167),
      NerLabel("Production", NerTag.Organization, 168, 178),
      NerLabel("Old", NerTag.Organization, 179, 182),
      NerLabel("Sticky", NerTag.Organization, 183, 189),
      NerLabel("Pancake", NerTag.Organization, 190, 197),
      NerLabel("Company", NerTag.Organization, 198, 205),
      NerLabel("Maple", NerTag.Location, 210, 215),
      NerLabel("Lane", NerTag.Location, 216, 220),
      NerLabel("Forest", NerTag.Location, 222, 238),
      NerLabel("Hemptown", NerTag.Location, 243, 251),
      NerLabel("Little", NerTag.Organization, 351, 357),
      NerLabel("League", NerTag.Organization, 358, 364),
      NerLabel("Derek", NerTag.Person, 1121, 1126),
      NerLabel("Jeter", NerTag.Person, 1127, 1132),
      NerLabel("2016-11-07", NerTag.Date, 50, 60),
      NerLabel("2016-11-07", NerTag.Date, 119, 129),
      NerLabel("2019-09-03", NerTag.Date, 253, 264),
      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
    )

    // Only runs of directly adjacent labels are expected, merged and in text order.
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    assertEquals(NerLabelSpan.build(input), expected)
  }
}
|
@ -0,0 +1,66 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records.RAttachmentMeta
|
||||
|
||||
/** Reorders the proposals to put most probable fits first.
  *
  * The candidates of every proposal are sorted ascending by [[weight]], so a
  * smaller weight means an earlier position in the list.
  */
object EvalProposals {

  /** Task that returns `data` with the candidates of all attachment metas reordered. */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      val metas = data.metas.map(reorderCandidates)
      data.copy(metas = metas).pure[F]
    }

  /** Sorts the candidates of each proposal of `rm` by their weight.
    *
    * @return a copy of `rm` whose proposal list holds the reordered proposals
    */
  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val list = rm.proposals.getTypes.toList
      .flatMap(mpt => rm.proposals.find(mpt).toList)
      .map(mp => mp.copy(values = mp.values.sortBy(weight(rm, mp))))

    rm.copy(proposals = MetaProposalList(list))
  }

  /** Calculates the sort weight of a candidate; smaller is better.
    *
    * The weight is the product of
    *  - `1 / words`, with `words` the largest word count among the origin labels,
    *  - `1 / tagCount`, with `tagCount` the number of origin labels,
    *  - the [[positionWeight]] of the earliest origin label, and
    *  - the smallest [[nerTagFactor]] among the origin labels.
    *
    * A candidate without origin labels gets the weight `1.0`. Taking `min` or
    * `max` of an empty collection would otherwise throw and fail the whole
    * task. The other factors never exceed 1, so `1.0` sorts such a candidate
    * behind those with textual evidence (assuming label positions lie within
    * the text).
    *
    * @param rm   the attachment meta; its content length is used to relate positions
    * @param mp   the proposal the candidate belongs to
    * @param cand the candidate to weigh
    */
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double =
    if (cand.origin.isEmpty) 1.0
    else {
      val textLen  = rm.content.map(_.length).getOrElse(0)
      val tagCount = cand.origin.size.toDouble
      val pos      = cand.origin.map(_.startPosition).min
      val words    = cand.origin.map(_.label.split(' ').length).max.toDouble
      val nerFac   = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
      (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
    }

  /** Turns a text position into a factor based on its relative position
    * `p = |pos / total|`: positions with `p < 0.7` yield `p / 2`, later ones `p`.
    *
    * @param pos   the position in the text
    * @param total the text length; if not positive, the factor is `1.0`
    */
  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1.0
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2
      else p
    }

  /** Factor for a label of kind `tag` that backs a proposal of type `mt`.
    *
    * E-mail and website labels get `0.5`. Organization labels get `0.8` for
    * organization proposals, person labels get `0.8` for person proposals.
    * Everything else gets `1.0`.
    */
  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      case NerTag.Date     => 1.0
      case NerTag.Email    => 0.5
      case NerTag.Location => 1.0
      case NerTag.Misc     => 1.0
      case NerTag.Organization =>
        if (mt == MetaProposalType.CorrOrg) 0.8
        else 1.0
      case NerTag.Person =>
        if (mt == MetaProposalType.CorrPerson ||
            mt == MetaProposalType.ConcPerson) 0.8
        else 1.0
      case NerTag.Website => 0.5
    }
}
|
@ -24,16 +24,10 @@ object FindProposal {
|
||||
ctx.logger.info("Starting find-proposal") *>
|
||||
rmas
|
||||
.traverse(rm =>
|
||||
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
|
||||
)
|
||||
.flatMap(rmv =>
|
||||
rmv
|
||||
.traverse(rm =>
|
||||
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
|
||||
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
|
||||
)
|
||||
.map(_ => data.copy(metas = rmv))
|
||||
processAttachment(rm, data.findDates(rm), ctx)
|
||||
.map(ml => rm.copy(proposals = ml))
|
||||
)
|
||||
.map(rmv => data.copy(metas = rmv))
|
||||
}
|
||||
|
||||
def processAttachment[F[_]: Sync](
|
||||
@ -56,13 +50,19 @@ object FindProposal {
|
||||
val dueDates = MetaProposalList.fromSeq1(
|
||||
MetaProposalType.DueDate,
|
||||
after.map(ndl =>
|
||||
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
|
||||
Candidate(
|
||||
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
|
||||
Set(ndl.label)
|
||||
)
|
||||
)
|
||||
)
|
||||
val itemDates = MetaProposalList.fromSeq1(
|
||||
MetaProposalType.DocDate,
|
||||
before.map(ndl =>
|
||||
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
|
||||
Candidate(
|
||||
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
|
||||
Set(ndl.label)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
@ -71,13 +71,13 @@ object FindProposal {
|
||||
|
||||
/** Removes labels that repeat an earlier one.
  *
  * Two labels count as equal if they have the same tag name and the same text
  * ignoring case. Of equal labels, the one that comes first by start position
  * is kept. The result is sorted by start position.
  */
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] = {
  def key(nl: NerLabel): String = nl.tag.name + nl.label.toLowerCase

  val (_, keptReversed) =
    labels
      .sortBy(_.startPosition)
      .foldLeft((Set.empty[String], List.empty[NerLabel])) { (acc, nl) =>
        val (seen, kept) = acc
        val k            = key(nl)
        if (seen(k)) acc
        else (seen + k, nl :: kept)
      }

  // prepending reversed the order, so sort once more
  keptReversed.sortBy(_.startPosition)
}
|
||||
|
||||
trait Finder[F[_]] { self =>
|
||||
def find(labels: Seq[NerLabel]): F[MetaProposalList]
|
||||
@ -91,7 +91,9 @@ object FindProposal {
|
||||
/** Runs this finder and passes its result to `f`; the finder returned by `f`
  * is then run against the same labels and supplies the final result.
  */
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
  new Finder[F] {
    def find(labels: Seq[NerLabel]): F[MetaProposalList] =
      F.flatMap(self.find(labels))(ml => f(ml).find(labels))
  }
|
||||
|
||||
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
|
||||
def map(
|
||||
f: MetaProposalList => MetaProposalList
|
||||
)(implicit F: Applicative[F]): Finder[F] =
|
||||
labels => self.find(labels).map(f)
|
||||
|
||||
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
|
||||
@ -118,10 +120,12 @@ object FindProposal {
|
||||
_ => value.pure[F]
|
||||
|
||||
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
|
||||
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
|
||||
labels =>
|
||||
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
|
||||
|
||||
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
|
||||
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
|
||||
labels =>
|
||||
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
|
||||
}
|
||||
|
||||
private def search[F[_]: Sync](
|
||||
@ -154,10 +158,15 @@ object FindProposal {
|
||||
val s2 = ctx.store
|
||||
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
|
||||
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
|
||||
ctx.logger.debug(s"Looking for persons: $value") *> (for {
|
||||
val s3 =
|
||||
ctx.store
|
||||
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
|
||||
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
|
||||
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
|
||||
ml0 <- s1
|
||||
ml1 <- s2
|
||||
} yield ml0 |+| ml1)
|
||||
ml2 <- s3
|
||||
} yield ml0 |+| ml1 |+| ml2)
|
||||
|
||||
case NerTag.Location =>
|
||||
ctx.logger
|
||||
|
@ -16,7 +16,16 @@ object ProcessItem {
|
||||
.flatMap(TextAnalysis[F])
|
||||
.flatMap(Task.setProgress(50))
|
||||
.flatMap(FindProposal[F])
|
||||
.flatMap(EvalProposals[F])
|
||||
.flatMap(SaveProposals[F])
|
||||
.flatMap(Task.setProgress(75))
|
||||
.flatMap(LinkProposal[F])
|
||||
.flatMap(Task.setProgress(99))
|
||||
|
||||
/** Runs the analysis steps in order: text analysis, finding proposals,
  * evaluating them and saving them.
  */
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
  TextAnalysis[F](item)
    .flatMap(analysed => FindProposal[F](analysed))
    .flatMap(proposed => EvalProposals[F](proposed))
    .flatMap(evaluated => SaveProposals[F](evaluated))
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,24 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records._
|
||||
|
||||
/** Saves the proposals in the database.
  *
  * For every attachment meta of the item, its proposals are written via
  * `RAttachmentMeta.updateProposals`. The item data is returned unchanged.
  */
object SaveProposals {

  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      for {
        _ <- ctx.logger.info("Storing proposals")
        _ <- data.metas.traverse(rm =>
          ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
            ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
        )
      } yield data
    }
}
|
@ -34,8 +34,10 @@ object TextAnalysis {
|
||||
for {
|
||||
list0 <- stanfordNer[F](lang, rm)
|
||||
list1 <- contactNer[F](rm)
|
||||
list = list0 ++ list1
|
||||
spans = NerLabelSpan.build(list.toSeq)
|
||||
dates <- dateNer[F](rm, lang)
|
||||
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
|
||||
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
|
||||
|
||||
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
|
||||
Sync[F].delay {
|
||||
|
@ -33,14 +33,14 @@ object Context {
|
||||
private[this] val log = getLogger
|
||||
|
||||
def create[F[_]: Functor, A](
|
||||
job: RJob,
|
||||
jobId: Ident,
|
||||
arg: A,
|
||||
config: SchedulerConfig,
|
||||
log: Logger[F],
|
||||
store: Store[F],
|
||||
blocker: Blocker
|
||||
): Context[F, A] =
|
||||
new ContextImpl(arg, log, store, blocker, config, job.id)
|
||||
new ContextImpl(arg, log, store, blocker, config, jobId)
|
||||
|
||||
def apply[F[_]: Concurrent, A](
|
||||
job: RJob,
|
||||
@ -54,7 +54,7 @@ object Context {
|
||||
_ <- log.ftrace("Creating logger for task run")
|
||||
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
|
||||
_ <- log.ftrace("Logger created, instantiating context")
|
||||
ctx = create[F, A](job, arg, config, logger, store, blocker)
|
||||
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
|
||||
} yield ctx
|
||||
|
||||
final private class ContextImpl[F[_]: Functor, A](
|
||||
|
@ -24,7 +24,7 @@ object ONode {
|
||||
def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
|
||||
for {
|
||||
node <- RNode(appId, nodeType, uri)
|
||||
_ <- logger.finfo(s"Registering node $node")
|
||||
_ <- logger.finfo(s"Registering node ${node.id.id}")
|
||||
_ <- store.transact(RNode.set(node))
|
||||
} yield ()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user