Improve text analysis

- Search for consecutive labels

- Sort list of candidates by a weight

- Search for organizations using person labels
This commit is contained in:
Eike Kettner
2020-03-17 22:34:50 +01:00
parent a4c97d5d57
commit 00ca6b5697
10 changed files with 241 additions and 22 deletions

View File

@ -0,0 +1,56 @@
package docspell.common
/** A run of directly neighbouring [[NerLabel]]s that share the same tag and
  * can be collapsed into one combined label.
  *
  * Instances are grown label by label via [[+]]; start from
  * `NerLabelSpan.empty`.
  *
  * @param labels the labels collected so far, in the order they were added
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels collected in this span. */
  def size: Int = labels.size

  /** Tries to extend this span with `label`.
    *
    * An empty span accepts any label. A non-empty span accepts `label` only
    * if it starts exactly one position after the end of the last collected
    * label and carries the same tag as that label.
    *
    * @return a new span including `label`, or this very instance (same
    *         reference) when `label` does not continue the span. Callers may
    *         rely on reference equality to detect the rejection.
    */
  def +(label: NerLabel): NerLabelSpan =
    labels.lastOption match {
      case None =>
        NerLabelSpan(Vector(label))
      case Some(el) =>
        val adjacent = label.startPosition - el.endPosition == 1
        // `asLabel` tags the merged label with the first element's tag, so a
        // span must never mix tags; otherwise e.g. a person name directly
        // followed by an organization would be merged into one wrongly
        // tagged label.
        val sameTag = label.tag == el.tag
        if (adjacent && sameTag) NerLabelSpan(labels :+ label)
        else this
    }

  /** Collapses the span into a single label.
    *
    * The text is the space-separated concatenation of all label texts, the
    * tag and start position come from the first label and the end position
    * from the last one.
    *
    * @return the merged label, or `None` if the span is empty
    */
  def asLabel: Option[NerLabel] =
    for {
      first <- labels.headOption
      last  <- labels.lastOption
    } yield NerLabel(
      labels.map(_.label).mkString(" "),
      first.tag,
      first.startPosition,
      last.endPosition
    )
}
/** Helpers for grouping [[NerLabel]]s into [[NerLabelSpan]]s. */
object NerLabelSpan {

  /** A span that holds no labels; the starting point for building spans. */
  val empty = NerLabelSpan(Vector.empty)

  /** Groups the given labels into spans of labels that continue each other.
    *
    * The labels are ordered by their start position first. Each label is
    * then offered to the most recent span; if that span does not take it, a
    * fresh span is started with the label. Spans that end up with fewer than
    * two labels are dropped from the result.
    *
    * @param labels the labels to group, in any order
    * @return all spans consisting of at least two labels
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val byPosition = labels.sortBy(_.startPosition)

    val allSpans = byPosition.foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
      acc.lastOption match {
        case None =>
          Vector(empty + label)
        case Some(current) =>
          val extended = current + label
          // `+` hands back the identical instance when it rejects the label,
          // hence the reference comparison.
          if (extended ne current) acc.init :+ extended
          else acc :+ (empty + label)
      }
    }

    allSpans.filter(span => span.size > 1)
  }

  /** Builds the spans for `labels` and collapses each one into a single
    * merged label.
    *
    * @param labels the labels to group, in any order
    * @return one merged label per span found by [[buildSpans]]
    */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    for {
      span   <- buildSpans(labels)
      merged <- span.asLabel
    } yield merged
}

View File

@ -0,0 +1,45 @@
package docspell.common
import minitest._
/** Checks that neighbouring labels are merged into combined labels. */
object NerLabelSpanTest extends SimpleTestSuite {

  test("build") {
    // The date labels are deliberately listed out of positional order, and
    // "Lane"/"Forest" are separated by a gap of two, so they must not merge.
    val input = List(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Syrup", NerTag.Organization, 162, 167),
      NerLabel("Production", NerTag.Organization, 168, 178),
      NerLabel("Old", NerTag.Organization, 179, 182),
      NerLabel("Sticky", NerTag.Organization, 183, 189),
      NerLabel("Pancake", NerTag.Organization, 190, 197),
      NerLabel("Company", NerTag.Organization, 198, 205),
      NerLabel("Maple", NerTag.Location, 210, 215),
      NerLabel("Lane", NerTag.Location, 216, 220),
      NerLabel("Forest", NerTag.Location, 222, 238),
      NerLabel("Hemptown", NerTag.Location, 243, 251),
      NerLabel("Little", NerTag.Organization, 351, 357),
      NerLabel("League", NerTag.Organization, 358, 364),
      NerLabel("Derek", NerTag.Person, 1121, 1126),
      NerLabel("Jeter", NerTag.Person, 1127, 1132),
      NerLabel("2016-11-07", NerTag.Date, 50, 60),
      NerLabel("2016-11-07", NerTag.Date, 119, 129),
      NerLabel("2019-09-03", NerTag.Date, 253, 264),
      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
    )

    // Only runs of two or more adjacent labels show up in the result.
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    val merged = NerLabelSpan.build(input)
    assertEquals(merged, expected)
  }
}