mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-23 19:08:26 +00:00
Fix regex patterns used for NER
Patterns are split on whitespace by the nlp library and then compiled, so each "word" must be a valid regex. Fixes: #356
This commit is contained in:
@ -75,25 +75,26 @@ object NerFile {
|
||||
}
|
||||
|
||||
object Pattern {
|
||||
def apply(weight: Int)(str: String): Vector[Pattern] = {
|
||||
def apply(weight: Int)(str: String): List[Pattern] = {
|
||||
val delims = " \t\n\r".toSet
|
||||
val words =
|
||||
val splitted =
|
||||
TextSplitter
|
||||
.split(str, delims)
|
||||
.map(_.toLower.value.trim)
|
||||
.filter(_.nonEmpty)
|
||||
.toVector
|
||||
.map(w => s"(?i)${w}")
|
||||
val tokens =
|
||||
TextSplitter
|
||||
.splitToken(str, delims)
|
||||
.map(_.toLower.value.trim)
|
||||
.filter(_.nonEmpty)
|
||||
.toVector
|
||||
.take(3)
|
||||
.map(w => s"(?i)${w}")
|
||||
.toList
|
||||
|
||||
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
|
||||
Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
|
||||
.take(3)
|
||||
.map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
|
||||
}
|
||||
|
||||
/** Characters that [[sanitizeRegex]] prefixes with a backslash so that they
  * are matched literally instead of being interpreted by the regex engine.
  */
private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet

/** Converts free text into a regex fragment that matches the text literally.
  *
  * The input is trimmed and lower-cased, then every character contained in
  * `invalidChars` is preceded by a backslash. Whitespace inside the text is
  * kept as is.
  *
  * @param str the raw text to turn into a pattern
  * @return the trimmed, lower-cased text with all special characters escaped
  */
private def sanitizeRegex(str: String): String =
  str.trim
    // Locale.ROOT keeps the result independent of the JVM default locale
    // (e.g. a Turkish locale would lower-case "I" to a dotless "ı").
    .toLowerCase(java.util.Locale.ROOT)
    // flatMap builds the result in one pass instead of re-copying an
    // accumulator string for every character.
    .flatMap(ch => if (invalidChars.contains(ch)) s"\\$ch" else ch.toString)
|
||||
}
|
||||
}
|
||||
|
@ -232,7 +232,7 @@ object FindProposal {
|
||||
|
||||
// The backslash *must* be stripped from search strings.
|
||||
private[this] val invalidSearch =
|
||||
"…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
|
||||
"…[]^<>=ſ{}|`\"';\\".toSet
|
||||
|
||||
/** Lower-cases the given string and drops every character that is contained
  * in `invalidSearch`.
  *
  * @param str the raw search value
  * @return the lower-cased value without any of the `invalidSearch` characters
  */
private def normalizeSearchValue(str: String): String =
  str.toLowerCase.filterNot(invalidSearch.contains)
|
||||
|
Reference in New Issue
Block a user