Merge pull request #362 from eikek/fix-ner-regexes

Fix regex patterns used for NER
This commit is contained in:
mergify[bot] 2020-10-20 23:08:27 +00:00 committed by GitHub
commit cad5991507
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 14 deletions

View File

@ -75,25 +75,26 @@ object NerFile {
}
object Pattern {
def apply(weight: Int)(str: String): Vector[Pattern] = {
def apply(weight: Int)(str: String): List[Pattern] = {
val delims = " \t\n\r".toSet
val words =
val splitted =
TextSplitter
.split(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.map(w => s"(?i)${w}")
val tokens =
TextSplitter
.splitToken(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.take(3)
.map(w => s"(?i)${w}")
.toList
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
.take(3)
.map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
}
/** Characters that get a backslash prefix before a string is embedded in a
  * regex pattern, so that they are matched literally.
  */
private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet

/** Turns arbitrary text into a literal, lower-cased regex fragment.
  *
  * The input is trimmed and lower-cased, then every character contained in
  * `invalidChars` is prefixed with a backslash.
  *
  * @param str the raw text, e.g. an organization or person name
  * @return the escaped text, safe to append to a pattern such as `(?i)`
  */
private def sanitizeRegex(str: String): String =
  str.trim
    // Locale.ROOT keeps the result independent of the JVM default locale;
    // e.g. a Turkish default would lower-case 'I' to a dotless 'ı'.
    .toLowerCase(java.util.Locale.ROOT)
    // Escape per character without rebuilding the whole string each step.
    .flatMap(ch => if (invalidChars.contains(ch)) s"\\$ch" else ch.toString)
}
}

View File

@ -232,7 +232,7 @@ object FindProposal {
// The backslash *must* be stripped from search strings.
private[this] val invalidSearch =
"…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
"…[]^<>=ſ{}|`\"';\\".toSet
/** Lower-cases `str` and drops every character that is a member of
  * `invalidSearch`.
  */
private def normalizeSearchValue(str: String): String = {
  val lowered = str.toLowerCase
  lowered.filterNot(invalidSearch.contains)
}

View File

@ -0,0 +1,33 @@
package docspell.joex.analysis
import minitest._
import NerFile.Pattern
import java.{util => ju}
/** Tests for the regex patterns that `NerFile.Pattern` creates from names. */
object NerFileTest extends SimpleTestSuite {

  test("create valid case insensitive patterns") {
    // input name -> expected value of the first pattern returned for it
    val names = List(
      "Some company AG" -> "(?i)some company ag",
      "Acme GmbH" -> "(?i)acme gmbh",
      "UP" -> "(?i)up",
      "1 & 1" -> "(?i)1 & 1",
      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
      "X-corp (this)*-*[one]" -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
    )

    for ((name, first) <- names) {
      val ps = Pattern(1)(name).distinct

      //check if it compiles to a regex pattern
      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
      ps.foreach(_.value.r)

      // A bare `matches` call verifies nothing, so each result is asserted:
      // the first pattern must match the name regardless of its casing.
      val regex = ps.head.value.r
      assert(regex.matches(name), s"pattern does not match '$name'")
      assert(
        regex.matches(name.toLowerCase(ju.Locale.ROOT)),
        s"pattern does not match lower-cased '$name'"
      )
      assert(
        regex.matches(name.toUpperCase(ju.Locale.ROOT)),
        s"pattern does not match upper-cased '$name'"
      )

      assertEquals(ps.head.value, first)
    }
  }
}