mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Improve text analysis
- Search for consecutive labels - Sort list of candidates by a weight - Search for organizations using person labels
This commit is contained in:
@ -0,0 +1,56 @@
|
||||
package docspell.common
|
||||
|
||||
/** A run of directly consecutive [[NerLabel]]s that can be collapsed into a
  * single, combined label.
  *
  * The primary constructor is private; spans are grown by starting from
  * `NerLabelSpan.empty` and adding labels with `+`.
  *
  * @param labels the labels collected so far, in the order they were added
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels contained in this span. */
  def size: Int = labels.size

  /** Tries to extend this span with `label`.
    *
    * An empty span accepts any label. A non-empty span accepts `label` only
    * if it starts exactly one position after the end of the last contained
    * label and carries the same tag. Without the tag check, neighbouring
    * labels of different kinds would be merged and reported under the tag of
    * the first label (see `asLabel`).
    *
    * @param label the candidate label to append
    * @return a new span including `label`, or this very instance (reference
    *         identical) if `label` was rejected; callers may use `eq` to
    *         detect the rejection
    */
  def +(label: NerLabel): NerLabelSpan =
    labels.lastOption match {
      case None =>
        NerLabelSpan(Vector(label))
      case Some(prev) =>
        val adjacent = label.startPosition - prev.endPosition == 1
        if (adjacent && label.tag == prev.tag) NerLabelSpan(labels :+ label)
        else this
    }

  /** Collapses this span into one label.
    *
    * The text is the contained label texts joined by a single space; tag and
    * start position come from the first label, the end position from the
    * last one.
    *
    * @return the combined label, or `None` if this span is empty
    */
  def asLabel: Option[NerLabel] =
    for {
      first <- labels.headOption
      last  <- labels.lastOption
    } yield NerLabel(
      labels.map(_.label).mkString(" "),
      first.tag,
      first.startPosition,
      last.endPosition
    )
}
|
||||
|
||||
object NerLabelSpan {

  /** A span without any labels; the starting point for growing a span. */
  val empty: NerLabelSpan = NerLabelSpan(Vector.empty)

  /** Groups the given labels into spans of consecutive labels.
    *
    * Labels are processed ordered by their start position. Each label is
    * offered to the most recently opened span; if that span does not accept
    * it, a new span is opened with this label. Spans that end up with only a
    * single label are dropped from the result.
    *
    * @param labels the labels to group, in any order
    * @return all spans that contain more than one label
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val grouped =
      labels
        .sortBy(_.startPosition)
        .foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
          acc.lastOption.fold(Vector(empty + label)) { current =>
            val extended = current + label
            // `+` hands back the identical instance when it rejects the label
            if (extended eq current) acc :+ (empty + label)
            else acc.init :+ extended
          }
        }

    grouped.filter(_.size > 1)
  }

  /** Builds the combined labels for all multi-label spans found in `labels`.
    *
    * @param labels the labels to inspect, in any order
    * @return one combined label per span returned by `buildSpans`
    */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    buildSpans(labels).flatMap(span => span.asLabel)
}
|
@ -0,0 +1,45 @@
|
||||
package docspell.common
|
||||
|
||||
import minitest._
|
||||
|
||||
object NerLabelSpanTest extends SimpleTestSuite {

  test("build") {
    // Single-token labels; the dates at the end are deliberately not in
    // position order.
    val input = List(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 68, 73),
      NerLabel("Jeter", NerTag.Person, 74, 79),
      NerLabel("Treesville", NerTag.Location, 95, 105),
      NerLabel("Syrup", NerTag.Organization, 162, 167),
      NerLabel("Production", NerTag.Organization, 168, 178),
      NerLabel("Old", NerTag.Organization, 179, 182),
      NerLabel("Sticky", NerTag.Organization, 183, 189),
      NerLabel("Pancake", NerTag.Organization, 190, 197),
      NerLabel("Company", NerTag.Organization, 198, 205),
      NerLabel("Maple", NerTag.Location, 210, 215),
      NerLabel("Lane", NerTag.Location, 216, 220),
      NerLabel("Forest", NerTag.Location, 222, 238),
      NerLabel("Hemptown", NerTag.Location, 243, 251),
      NerLabel("Little", NerTag.Organization, 351, 357),
      NerLabel("League", NerTag.Organization, 358, 364),
      NerLabel("Derek", NerTag.Person, 1121, 1126),
      NerLabel("Jeter", NerTag.Person, 1127, 1132),
      NerLabel("2016-11-07", NerTag.Date, 50, 60),
      NerLabel("2016-11-07", NerTag.Date, 119, 129),
      NerLabel("2019-09-03", NerTag.Date, 253, 264),
      NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
    )

    // Only runs of two or more directly consecutive labels are expected back,
    // each collapsed into one label.
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    val merged = NerLabelSpan.build(input)
    assertEquals(merged, expected)
  }
}
|
Reference in New Issue
Block a user