Fix regex patterns used for NER

Patterns are split on whitespace by the NLP library and then compiled, so each "word" must be a valid regex. Fixes: #356
2025-09-28 23:58:21 +00:00 · 2020-10-21 00:48:09 +02:00
parent 0c873f732b
commit 3c0b86cb19
3 changed files with 48 additions and 14 deletions
--- a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala
+++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala
@@ -75,25 +75,26 @@ object NerFile {
  }

  object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
+    def apply(weight: Int)(str: String): List[Pattern] = {
      val delims = " \t\n\r".toSet
-      val words =
+      val splitted =
        TextSplitter
          .split(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
-          .toVector
-          .map(w => s"(?i)${w}")
-      val tokens =
-        TextSplitter
-          .splitToken(str, delims)
-          .map(_.toLower.value.trim)
-          .filter(_.nonEmpty)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w}")
+          .toList

-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+      Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
+        .take(3)
+        .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
    }
+
+    private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet
+
+    private def sanitizeRegex(str: String): String =
+      str.trim.toLowerCase.foldLeft("") { (res, ch) =>
+        if (invalidChars.contains(ch)) s"${res}\\$ch"
+        else s"$res$ch"
+      }
  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
@@ -232,7 +232,7 @@ object FindProposal {

  // The backslash *must* be stripped from search strings.
  private[this] val invalidSearch =
-    "…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
+    "…[]^<>=ſ{}|`\"';\\".toSet

  private def normalizeSearchValue(str: String): String =
    str.toLowerCase.filter(c => !invalidSearch.contains(c))