Improve text analysis

- Search for consecutive labels
- Sort list of candidates by a weight
- Search for organizations using person labels

parent a4c97d5d57
commit 00ca6b5697
build.sbt:

@@ -40,6 +40,13 @@ val testSettings = Seq(
   libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
 )
 
+lazy val noPublish = Seq(
+  publish := {},
+  publishLocal := {},
+  publishArtifact := false
+)
+
+
 val elmSettings = Seq(
   elmCompileMode := ElmCompileMode.Debug,
   Compile/resourceGenerators += Def.task {
@@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
 
 val root = project.in(file(".")).
   settings(sharedSettings).
+  settings(noPublish).
   settings(
     name := "docspell-root"
   ).
New file: NerLabelSpan (package docspell.common)

@@ -0,0 +1,56 @@
+package docspell.common
+
+final case class NerLabelSpan private (
+    labels: Vector[NerLabel]
+) {
+
+  def size: Int = labels.size
+
+  def +(label: NerLabel): NerLabelSpan =
+    labels.lastOption match {
+      case None =>
+        NerLabelSpan(Vector(label))
+      case Some(el) =>
+        if (label.startPosition - el.endPosition == 1) NerLabelSpan(labels :+ label)
+        else this
+    }
+
+  def asLabel: Option[NerLabel] =
+    (labels.headOption, labels.lastOption) match {
+      case (Some(s), Some(e)) =>
+        Some(
+          NerLabel(
+            labels.map(_.label).mkString(" "),
+            s.tag,
+            s.startPosition,
+            e.endPosition
+          )
+        )
+      case _ =>
+        None
+    }
+}
+
+object NerLabelSpan {
+
+  val empty = NerLabelSpan(Vector.empty)
+
+  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
+    val sorted = labels.sortBy(_.startPosition)
+    sorted
+      .foldLeft(Vector.empty[NerLabelSpan]) { (span, el) =>
+        span.lastOption match {
+          case Some(last) =>
+            val next = last + el
+            if (next eq last) span :+ (empty + el)
+            else span.dropRight(1) :+ next
+          case None =>
+            Vector(empty + el)
+        }
+      }
+      .filter(_.size > 1)
+  }
+
+  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
+    buildSpans(labels).flatMap(_.asLabel)
+}
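For orientation, a minimal usage sketch of the new helper (not part of the commit; the object name NerLabelSpanExample is made up, the label values are borrowed from the test below). Labels whose positions are directly adjacent are folded into one combined label:

import docspell.common._

object NerLabelSpanExample extends App {
  // "Jeter" starts right after "Derek" ends (6 - 5 == 1), so the two labels merge.
  val merged: Vector[NerLabel] =
    NerLabelSpan.build(
      List(
        NerLabel("Derek", NerTag.Person, 0, 5),
        NerLabel("Jeter", NerTag.Person, 6, 11)
      )
    )
  println(merged) // merged == Vector(NerLabel("Derek Jeter", NerTag.Person, 0, 11))
}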
New file: NerLabelSpanTest (package docspell.common)

@@ -0,0 +1,45 @@
+package docspell.common
+
+import minitest._
+
+object NerLabelSpanTest extends SimpleTestSuite {
+
+  test("build") {
+    val labels = List(
+      NerLabel("Derek", NerTag.Person, 0, 5),
+      NerLabel("Jeter", NerTag.Person, 6, 11),
+      NerLabel("Treesville", NerTag.Person, 27, 37),
+      NerLabel("Derek", NerTag.Person, 68, 73),
+      NerLabel("Jeter", NerTag.Person, 74, 79),
+      NerLabel("Treesville", NerTag.Location, 95, 105),
+      NerLabel("Syrup", NerTag.Organization, 162, 167),
+      NerLabel("Production", NerTag.Organization, 168, 178),
+      NerLabel("Old", NerTag.Organization, 179, 182),
+      NerLabel("Sticky", NerTag.Organization, 183, 189),
+      NerLabel("Pancake", NerTag.Organization, 190, 197),
+      NerLabel("Company", NerTag.Organization, 198, 205),
+      NerLabel("Maple", NerTag.Location, 210, 215),
+      NerLabel("Lane", NerTag.Location, 216, 220),
+      NerLabel("Forest", NerTag.Location, 222, 238),
+      NerLabel("Hemptown", NerTag.Location, 243, 251),
+      NerLabel("Little", NerTag.Organization, 351, 357),
+      NerLabel("League", NerTag.Organization, 358, 364),
+      NerLabel("Derek", NerTag.Person, 1121, 1126),
+      NerLabel("Jeter", NerTag.Person, 1127, 1132),
+      NerLabel("2016-11-07", NerTag.Date, 50, 60),
+      NerLabel("2016-11-07", NerTag.Date, 119, 129),
+      NerLabel("2019-09-03", NerTag.Date, 253, 264),
+      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
+    )
+
+    val spans = NerLabelSpan.build(labels)
+    assertEquals(spans, Vector(
+      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
+      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
+      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
+      NerLabel("Maple Lane", NerTag.Location, 210, 220),
+      NerLabel("Little League", NerTag.Organization, 351, 364),
+      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
+    ))
+  }
+}
New file: EvalProposals (package docspell.joex.process)

@@ -0,0 +1,66 @@
+package docspell.joex.process
+
+import cats.implicits._
+import cats.effect.Sync
+import docspell.common._
+import docspell.joex.scheduler.Task
+import docspell.store.records.RAttachmentMeta
+
+/** Reorders the proposals to put most probable fits first.
+  */
+object EvalProposals {
+
+  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    Task { _ =>
+      val metas = data.metas.map(reorderCandidates)
+      data.copy(metas = metas).pure[F]
+    }
+
+  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
+    val list = rm.proposals.getTypes.toList
+      .map(mpt => rm.proposals.find(mpt) match {
+        case Some(mp) =>
+          val v = mp.values.sortBy(weight(rm, mp))
+          Some(mp.copy(values = v))
+        case None =>
+          None
+      })
+
+    rm.copy(proposals = MetaProposalList(list.flatMap(identity)))
+  }
+
+  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = {
+    val textLen = rm.content.map(_.length).getOrElse(0)
+    val tagCount = cand.origin.size.toDouble
+    val pos = cand.origin.map(_.startPosition).min
+    val words = cand.origin.map(_.label.split(' ').length).max.toDouble
+    val nerFac = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
+    (1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
+  }
+
+  def positionWeight(pos: Int, total: Int): Double = {
+    if (total <= 0) 1
+    else {
+      val p = math.abs(pos.toDouble / total.toDouble)
+      if (p < 0.7) p / 2
+      else p
+    }
+  }
+
+
+  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
+    tag match {
+      case NerTag.Date => 1.0
+      case NerTag.Email => 0.5
+      case NerTag.Location => 1.0
+      case NerTag.Misc => 1.0
+      case NerTag.Organization =>
+        if (mt == MetaProposalType.CorrOrg) 0.8
+        else 1.0
+      case NerTag.Person =>
+        if (mt == MetaProposalType.CorrPerson ||
+            mt == MetaProposalType.ConcPerson) 0.8
+        else 1.0
+      case NerTag.Website => 0.5
+    }
+}
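A hand-worked check of the weight formula above (not part of the commit; the object name WeightArithmetic and the positions 100 and 800 in a 1000-character text are made up). Both candidates are single-word, single-origin person labels proposed as CorrPerson, so words = tagCount = 1 and nerFac = 0.8; candidates are sorted ascending by weight, so the smaller value ranks first and the early mention wins:

object WeightArithmetic extends App {
  // Copied from EvalProposals above: positions before 70% of the text are halved.
  def positionWeight(pos: Int, total: Int): Double =
    if (total <= 0) 1
    else {
      val p = math.abs(pos.toDouble / total.toDouble)
      if (p < 0.7) p / 2 else p
    }

  val early = (1 / 1.0) * (1 / 1.0) * positionWeight(100, 1000) * 0.8 // ≈ 0.04
  val late  = (1 / 1.0) * (1 / 1.0) * positionWeight(800, 1000) * 0.8 // ≈ 0.64
  println(s"early=$early late=$late") // early sorts first, so it is proposed first
}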
@@ -24,16 +24,10 @@ object FindProposal {
      ctx.logger.info("Starting find-proposal") *>
        rmas
          .traverse(rm =>
-           processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
-         )
-         .flatMap(rmv =>
-           rmv
-             .traverse(rm =>
-               ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
-                 ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
-             )
-             .map(_ => data.copy(metas = rmv))
+           processAttachment(rm, data.findDates(rm), ctx)
+             .map(ml => rm.copy(proposals = ml))
          )
+         .map(rmv => data.copy(metas = rmv))
    }
 
  def processAttachment[F[_]: Sync](
@@ -56,13 +50,19 @@ object FindProposal {
     val dueDates = MetaProposalList.fromSeq1(
       MetaProposalType.DueDate,
       after.map(ndl =>
-        Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
+        Candidate(
+          IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
+          Set(ndl.label)
+        )
       )
     )
     val itemDates = MetaProposalList.fromSeq1(
       MetaProposalType.DocDate,
       before.map(ndl =>
-        Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
+        Candidate(
+          IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
+          Set(ndl.label)
+        )
       )
     )
 
@@ -71,13 +71,13 @@ object FindProposal {
 
   def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
     labels
+      .sortBy(_.startPosition)
       .foldLeft((Set.empty[String], List.empty[NerLabel])) {
         case ((seen, result), el) =>
           if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
           else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
       }
       ._2
-      .sortBy(_.startPosition)
 
   trait Finder[F[_]] { self =>
     def find(labels: Seq[NerLabel]): F[MetaProposalList]
@@ -91,7 +91,9 @@ object FindProposal {
     def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
       labels => self.find(labels).flatMap(ml => f(ml).find(labels))
 
-    def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
+    def map(
+        f: MetaProposalList => MetaProposalList
+    )(implicit F: Applicative[F]): Finder[F] =
       labels => self.find(labels).map(f)
 
     def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@@ -118,10 +120,12 @@ object FindProposal {
       _ => value.pure[F]
 
     def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
-      labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
+      labels =>
+        labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
 
     def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
-      labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
+      labels =>
+        labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
   }
 
   private def search[F[_]: Sync](
@@ -154,10 +158,15 @@ object FindProposal {
         val s2 = ctx.store
           .transact(RPerson.findLike(ctx.args.meta.collective, value, false))
           .map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
-        ctx.logger.debug(s"Looking for persons: $value") *> (for {
+        val s3 =
+          ctx.store
+            .transact(ROrganization.findLike(ctx.args.meta.collective, value))
+            .map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
+        ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
           ml0 <- s1
           ml1 <- s2
-        } yield ml0 |+| ml1)
+          ml2 <- s3
+        } yield ml0 |+| ml1 |+| ml2)
 
       case NerTag.Location =>
         ctx.logger
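Note on the hunk above: a value carrying a person label is now also looked up among organizations (the new s3 query via ROrganization.findLike), and the result lists are merged with |+|. This appears to be what the commit message calls "search for organizations using person labels": an organization name that the NER tagger classified as a person can still surface as a correspondent-organization proposal.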
@@ -16,7 +16,16 @@ object ProcessItem {
       .flatMap(TextAnalysis[F])
       .flatMap(Task.setProgress(50))
       .flatMap(FindProposal[F])
+      .flatMap(EvalProposals[F])
+      .flatMap(SaveProposals[F])
       .flatMap(Task.setProgress(75))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))
+
+  def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    TextAnalysis[F](item)
+      .flatMap(FindProposal[F])
+      .flatMap(EvalProposals[F])
+      .flatMap(SaveProposals[F])
+
 }
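Note on the hunk above: the two new tasks slot in between FindProposal and the 75% progress step, so candidates are first re-ordered by EvalProposals and then persisted by SaveProposals; the persisting code that used to live inside FindProposal (removed in the first FindProposal hunk) now lives in the new SaveProposals task below. The new analysisOnly pipeline appears to expose just these analysis steps without the rest of item processing.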
New file: SaveProposals (package docspell.joex.process)

@@ -0,0 +1,24 @@
+package docspell.joex.process
+
+import cats.implicits._
+import cats.effect.Sync
+import docspell.common._
+import docspell.joex.scheduler.Task
+import docspell.store.records._
+
+/** Saves the proposals in the database
+  *
+  */
+object SaveProposals {
+
+  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      ctx.logger.info("Storing proposals") *>
+        data.metas
+          .traverse(rm =>
+            ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
+              ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
+          )
+          .map(_ => data)
+    }
+}
@@ -34,8 +34,10 @@ object TextAnalysis {
     for {
       list0 <- stanfordNer[F](lang, rm)
       list1 <- contactNer[F](rm)
+      list = list0 ++ list1
+      spans = NerLabelSpan.build(list.toSeq)
       dates <- dateNer[F](rm, lang)
-    } yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
+    } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
 
   def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
     Sync[F].delay {
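Note on the hunk above: the merged spans are placed in front of the raw labels (spans ++ list ++ dates.toNerLabel), so the later proposal search sees both the combined phrases (for example "Derek Jeter") and the individual tokens they were built from.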
@@ -33,14 +33,14 @@ object Context {
   private[this] val log = getLogger
 
   def create[F[_]: Functor, A](
-      job: RJob,
+      jobId: Ident,
       arg: A,
       config: SchedulerConfig,
       log: Logger[F],
       store: Store[F],
       blocker: Blocker
   ): Context[F, A] =
-    new ContextImpl(arg, log, store, blocker, config, job.id)
+    new ContextImpl(arg, log, store, blocker, config, jobId)
 
   def apply[F[_]: Concurrent, A](
       job: RJob,
@@ -54,7 +54,7 @@ object Context {
       _ <- log.ftrace("Creating logger for task run")
       logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
       _ <- log.ftrace("Logger created, instantiating context")
-      ctx = create[F, A](job, arg, config, logger, store, blocker)
+      ctx = create[F, A](job.id, arg, config, logger, store, blocker)
     } yield ctx
 
   final private class ContextImpl[F[_]: Functor, A](
@@ -24,7 +24,7 @@ object ONode {
     def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
       for {
         node <- RNode(appId, nodeType, uri)
-        _ <- logger.finfo(s"Registering node $node")
+        _ <- logger.finfo(s"Registering node ${node.id.id}")
         _ <- store.transact(RNode.set(node))
       } yield ()
 