Fix regex patterns used for NER

Patterns are split on whitespace by the NLP library and each resulting piece
is then compiled separately, so every "word" must be a valid regex on its own.

Fixes: #356
This commit is contained in:
Eike Kettner
2020-10-21 00:48:09 +02:00
parent 0c873f732b
commit 3c0b86cb19
3 changed files with 48 additions and 14 deletions

View File

@ -75,25 +75,26 @@ object NerFile {
}
object Pattern {
def apply(weight: Int)(str: String): Vector[Pattern] = {
def apply(weight: Int)(str: String): List[Pattern] = {
val delims = " \t\n\r".toSet
val words =
val splitted =
TextSplitter
.split(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.map(w => s"(?i)${w}")
val tokens =
TextSplitter
.splitToken(str, delims)
.map(_.toLower.value.trim)
.filter(_.nonEmpty)
.toVector
.take(3)
.map(w => s"(?i)${w}")
.toList
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
.take(3)
.map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
}
/** Characters that `sanitizeRegex` prefixes with a backslash. */
private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet

/** Trims and lower-cases `str`, then puts a backslash in front of every
  * character contained in `invalidChars`. All other characters are copied
  * through unchanged.
  */
private def sanitizeRegex(str: String): String =
  str.trim.toLowerCase.map { c =>
    // Escape the characters listed above; keep everything else as is.
    if (invalidChars(c)) s"\\$c" else c.toString
  }.mkString
}
}

View File

@ -232,7 +232,7 @@ object FindProposal {
// The backslash *must* be stripped from search strings.
private[this] val invalidSearch =
"…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
"…[]^<>=ſ{}|`\"';\\".toSet
/** Lower-cases `str` and drops every character contained in `invalidSearch`. */
private def normalizeSearchValue(str: String): String = {
  val lowered = str.toLowerCase
  lowered.filterNot(invalidSearch.contains)
}