Improve text analysis
- Search for consecutive labels - Sort list of candidates by a weight - Search for organizations using person labels
This commit is contained in:
@ -40,6 +40,13 @@ val testSettings = Seq(
libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
lazy val noPublish = Seq(
publish := {},
publishLocal := {},
publishArtifact := false
val elmSettings = Seq(
elmCompileMode := ElmCompileMode.Debug,
Compile/resourceGenerators += Def.task {
@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
val root = project.in(file(".")).
name := "docspell-root"
@ -0,0 +1,56 @@
package docspell.common
final case class NerLabelSpan private (
labels: Vector[NerLabel]
) {
def size: Int = labels.size
def +(label: NerLabel): NerLabelSpan =
labels.lastOption match {
case None =>
case Some(el) =>
if (label.startPosition - el.endPosition == 1) NerLabelSpan(labels :+ label)
else this
def asLabel: Option[NerLabel] =
(labels.headOption, labels.lastOption) match {
case (Some(s), Some(e)) =>
labels.map(_.label).mkString(" "),
case _ =>
object NerLabelSpan {
val empty = NerLabelSpan(Vector.empty)
def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
val sorted = labels.sortBy(_.startPosition)
.foldLeft(Vector.empty[NerLabelSpan]) { (span, el) =>
span.lastOption match {
case Some(last) =>
val next = last + el
if (next eq last) span :+ (empty + el)
else span.dropRight(1) :+ next
case None =>
Vector(empty + el)
.filter(_.size > 1)
def build(labels: Seq[NerLabel]): Vector[NerLabel] =
@ -0,0 +1,45 @@
package docspell.common
import minitest._
object NerLabelSpanTest extends SimpleTestSuite {
test("build") {
val labels = List(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
NerLabel("Derek", NerTag.Person, 68, 73),
NerLabel("Jeter", NerTag.Person, 74, 79),
NerLabel("Treesville", NerTag.Location, 95, 105),
NerLabel("Syrup", NerTag.Organization, 162, 167),
NerLabel("Production", NerTag.Organization, 168, 178),
NerLabel("Old", NerTag.Organization, 179, 182),
NerLabel("Sticky", NerTag.Organization, 183, 189),
NerLabel("Pancake", NerTag.Organization, 190, 197),
NerLabel("Company", NerTag.Organization, 198, 205),
NerLabel("Maple", NerTag.Location, 210, 215),
NerLabel("Lane", NerTag.Location, 216, 220),
NerLabel("Forest", NerTag.Location, 222, 238),
NerLabel("Hemptown", NerTag.Location, 243, 251),
NerLabel("Little", NerTag.Organization, 351, 357),
NerLabel("League", NerTag.Organization, 358, 364),
NerLabel("Derek", NerTag.Person, 1121, 1126),
NerLabel("Jeter", NerTag.Person, 1127, 1132),
NerLabel("2016-11-07", NerTag.Date, 50, 60),
NerLabel("2016-11-07", NerTag.Date, 119, 129),
NerLabel("2019-09-03", NerTag.Date, 253, 264),
NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
val spans = NerLabelSpan.build(labels)
assertEquals(spans, Vector(
NerLabel("Derek Jeter", NerTag.Person, 0, 11),
NerLabel("Derek Jeter", NerTag.Person, 68, 79),
NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
NerLabel("Maple Lane", NerTag.Location, 210, 220),
NerLabel("Little League", NerTag.Organization, 351, 364),
NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
@ -0,0 +1,66 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
/** Reorders the proposals to put the most probable fits first. */
object EvalProposals {
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { _ =>
val metas = data.metas.map(reorderCandidates)
data.copy(metas = metas).pure[F]
def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
val list = rm.proposals.getTypes.toList
.map(mpt => rm.proposals.find(mpt) match {
case Some(mp) =>
val v = mp.values.sortBy(weight(rm, mp))
Some(mp.copy(values = v))
case None =>
rm.copy(proposals = MetaProposalList(list.flatMap(identity)))
/** Computes the sort key for one candidate of the proposal `mp`.
  *
  * The result is the product of four factors: the inverse of the largest
  * word count among the candidate's origin labels, the inverse of the
  * number of origin labels, the position weight of the earliest origin
  * label, and the smallest NER tag factor of the origin labels.
  * `reorderCandidates` passes this function to `sortBy`, so candidates
  * with a smaller value end up earlier in the list.
  *
  * NOTE(review): `min` and `max` throw on an empty collection and
  * `1 / tagCount` is infinite for zero labels, so this assumes
  * `cand.origin` is never empty -- confirm against the code that
  * creates the candidates.
  */
def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = {
// Length of the attachment text; 0 when `rm.content` is empty.
val textLen = rm.content.map(_.length).getOrElse(0)
// Number of labels this candidate originates from.
val tagCount = cand.origin.size.toDouble
// Smallest start position among the origin labels.
val pos = cand.origin.map(_.startPosition).min
// Largest number of space-separated pieces in any origin label.
val words = cand.origin.map(_.label.split(' ').length).max.toDouble
// Smallest tag factor among the origin labels for this proposal type.
val nerFac = cand.origin.map(label => nerTagFactor(label.tag, mp.proposalType)).min
// More words and more origin labels both shrink the product.
(1 / words) * (1 / tagCount) * positionWeight(pos, textLen) * nerFac
/** Maps a position within a text to a weighting factor.
  *
  * @param pos   a position in the text (`weight` passes the smallest
  *              start position of a candidate's origin labels)
  * @param total the length of the whole text
  * @return 1 if `total` is not positive; otherwise the relative position
  *         `|pos / total|`, halved when it is below 0.7
  */
def positionWeight(pos: Int, total: Int): Double = {
// Without a positive text length there is nothing to relate `pos` to.
if (total <= 0) 1
else {
// Relative position of `pos` within the text.
val p = math.abs(pos.toDouble / total.toDouble)
// Below 0.7 the factor is half the ratio; from 0.7 on it is the ratio
// itself, so the factor jumps from just under 0.35 to 0.7 at p = 0.7.
if (p < 0.7) p / 2
else p
/** Returns the multiplier contributed by a label's NER tag when weighting
  * a candidate for a proposal of type `mt`.
  *
  * Email and website tags always yield 0.5. Organization tags yield 0.8
  * for `CorrOrg` proposals, and person tags yield 0.8 for `CorrPerson`
  * and `ConcPerson` proposals. Every other combination yields 1.0.
  *
  * @param tag the NER tag of the label
  * @param mt  the type of the proposal being weighted
  */
def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
tag match {
case NerTag.Date => 1.0
case NerTag.Email => 0.5
case NerTag.Location => 1.0
case NerTag.Misc => 1.0
case NerTag.Organization =>
// Reduced factor only when the tag matches the proposal type.
if (mt == MetaProposalType.CorrOrg) 0.8
else 1.0
case NerTag.Person =>
// Reduced factor for both person-related proposal types.
if (mt == MetaProposalType.CorrPerson ||
mt == MetaProposalType.ConcPerson) 0.8
else 1.0
case NerTag.Website => 0.5
@ -24,16 +24,10 @@ object FindProposal {
ctx.logger.info("Starting find-proposal") *>
.traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
.flatMap(rmv =>
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
.map(_ => data.copy(metas = rmv))
processAttachment(rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml))
.map(rmv => data.copy(metas = rmv))
def processAttachment[F[_]: Sync](
@ -56,13 +50,19 @@ object FindProposal {
val dueDates = MetaProposalList.fromSeq1(
after.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
val itemDates = MetaProposalList.fromSeq1(
before.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
@ -71,13 +71,13 @@ object FindProposal {
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] =
.foldLeft((Set.empty[String], List.empty[NerLabel])) {
case ((seen, result), el) =>
if (seen.contains(el.tag.name + el.label.toLowerCase)) (seen, result)
else (seen + (el.tag.name + el.label.toLowerCase), el :: result)
trait Finder[F[_]] { self =>
def find(labels: Seq[NerLabel]): F[MetaProposalList]
@ -91,7 +91,9 @@ object FindProposal {
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
labels => self.find(labels).flatMap(ml => f(ml).find(labels))
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
def map(
f: MetaProposalList => MetaProposalList
)(implicit F: Applicative[F]): Finder[F] =
labels => self.find(labels).map(f)
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@ -118,10 +120,12 @@ object FindProposal {
_ => value.pure[F]
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
private def search[F[_]: Sync](
@ -154,10 +158,15 @@ object FindProposal {
val s2 = ctx.store
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
ctx.logger.debug(s"Looking for persons: $value") *> (for {
val s3 =
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
ml0 <- s1
ml1 <- s2
} yield ml0 |+| ml1)
ml2 <- s3
} yield ml0 |+| ml1 |+| ml2)
case NerTag.Location =>
@ -16,7 +16,16 @@ object ProcessItem {
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
@ -0,0 +1,24 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records._
/** Saves the proposals in the database. */
object SaveProposals {
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
ctx.logger.info("Storing proposals") *>
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
.map(_ => data)
@ -34,8 +34,10 @@ object TextAnalysis {
for {
list0 <- stanfordNer[F](lang, rm)
list1 <- contactNer[F](rm)
list = list0 ++ list1
spans = NerLabelSpan.build(list.toSeq)
dates <- dateNer[F](rm, lang)
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
Sync[F].delay {
@ -33,14 +33,14 @@ object Context {
private[this] val log = getLogger
def create[F[_]: Functor, A](
job: RJob,
jobId: Ident,
arg: A,
config: SchedulerConfig,
log: Logger[F],
store: Store[F],
blocker: Blocker
): Context[F, A] =
new ContextImpl(arg, log, store, blocker, config, job.id)
new ContextImpl(arg, log, store, blocker, config, jobId)
def apply[F[_]: Concurrent, A](
job: RJob,
@ -54,7 +54,7 @@ object Context {
_ <- log.ftrace("Creating logger for task run")
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
_ <- log.ftrace("Logger created, instantiating context")
ctx = create[F, A](job, arg, config, logger, store, blocker)
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
} yield ctx
final private class ContextImpl[F[_]: Functor, A](
@ -24,7 +24,7 @@ object ONode {
def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
for {
node <- RNode(appId, nodeType, uri)
_ <- logger.finfo(s"Registering node $node")
_ <- logger.finfo(s"Registering node ${node.id.id}")
_ <- store.transact(RNode.set(node))
} yield ()
