Merge pull request #362 from eikek/fix-ner-regexes

Fix regex patterns used for NER
2025-06-21 18:08:25 +00:00 · 2020-10-20 23:08:27 +00:00
parent 0c873f732b 3c0b86cb19
commit cad5991507
3 changed files with 48 additions and 14 deletions
--- a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala
+++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala
@ -75,25 +75,26 @@ object NerFile {
  }

  object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
+    def apply(weight: Int)(str: String): List[Pattern] = {
      val delims = " \t\n\r".toSet
-      val words =
+      val splitted =
        TextSplitter
          .split(str, delims)
          .map(_.toLower.value.trim)
          .filter(_.nonEmpty)
-          .toVector
-          .map(w => s"(?i)${w}")
-      val tokens =
-        TextSplitter
-          .splitToken(str, delims)
-          .map(_.toLower.value.trim)
-          .filter(_.nonEmpty)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w}")
+          .toList

-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+      Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
+        .take(3)
+        .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
    }
+
+    private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet // chars sanitizeRegex escapes
+    // Trims, lower-cases via Locale.ROOT (not the JVM default locale), then escapes invalidChars.
+    private def sanitizeRegex(str: String): String =
+      str.trim.toLowerCase(java.util.Locale.ROOT).flatMap { ch =>
+        if (invalidChars.contains(ch)) s"\\$ch"
+        else ch.toString
+      }
  }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
@ -232,7 +232,7 @@ object FindProposal {

  // The backslash *must* be stripped from search strings.
  private[this] val invalidSearch =
-    "…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
+    "…[]^<>=ſ{}|`\"';\\".toSet

  private def normalizeSearchValue(str: String): String =
    str.toLowerCase.filter(c => !invalidSearch.contains(c))
--- a/modules/joex/src/test/scala/docspell/joex/analysis/NerFileTest.scala
+++ b/modules/joex/src/test/scala/docspell/joex/analysis/NerFileTest.scala
@ -0,0 +1,33 @@
+package docspell.joex.analysis
+
+import minitest._
+import NerFile.Pattern
+import java.{util => ju}
+
+object NerFileTest extends SimpleTestSuite {
+  // Each input name is paired with the expected value of the first generated pattern.
+  test("create valid case insensitive patterns") {
+    val names = List(
+      "Some company AG"            -> "(?i)some company ag",
+      "Acme GmbH"                  -> "(?i)acme gmbh",
+      "UP"                         -> "(?i)up",
+      "1 & 1"                      -> "(?i)1 & 1",
+      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
+      "X-corp (this)*-*[one]"      -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
+    )
+
+    for ((name, first) <- names) {
+      val ps = Pattern(1)(name).distinct
+      // check that every pattern, and each whitespace-separated part of it, compiles as a regex
+      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
+      ps.foreach(_.value.r)
+
+      val regex = ps.head.value.r
+      assert(regex.matches(name), s"no match: $name")
+      assert(regex.matches(name.toLowerCase(ju.Locale.ROOT)), s"no lower-case match: $name")
+      assert(regex.matches(name.toUpperCase(ju.Locale.ROOT)), s"no upper-case match: $name")
+
+      assertEquals(ps.head.value, first)
+    }
+  }
+}