	Fix regex patterns used for NER
Patterns are split on whitespace by the nlp library and then compiled, so each "word" must be a valid regex. Fixes: #356
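For context, a minimal standalone sketch (plain Scala, not docspell code) of the failure mode: when each whitespace-separated piece of a name is compiled as its own regular expression, unescaped metacharacters such as parentheses make compilation throw, while quoting each piece yields valid patterns. Pattern.quote stands in here for the escaping the commit adds via sanitizeRegex.

    import scala.util.Try

    val name = "1 & 1 (Telefon / Internet)"

    // Unescaped: pieces like "(Telefon" and "Internet)" are not valid regexes,
    // so compiling them throws java.util.regex.PatternSyntaxException.
    val unescaped = name.split("\\s+").map(w => Try(w.r))

    // Escaped (Pattern.quote here; the commit uses its own sanitizeRegex):
    val escaped =
      name.split("\\s+").map(w => ("(?i)" + java.util.regex.Pattern.quote(w)).r)
    // every piece now compiles and matches its word case-insensitively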
@@ -75,25 +75,26 @@ object NerFile {
   }
 
   object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
+    def apply(weight: Int)(str: String): List[Pattern] = {
       val delims = " \t\n\r".toSet
-      val words =
+      val splitted =
         TextSplitter
-          .split(str, delims)
-          .map(_.toLower.value.trim)
-          .filter(_.nonEmpty)
-          .toVector
-          .map(w => s"(?i)${w}")
-      val tokens =
-        TextSplitter
           .splitToken(str, delims)
           .map(_.toLower.value.trim)
           .filter(_.nonEmpty)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w}")
+          .toList
 
-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+      Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
+        .take(3)
+        .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
     }
+
+    private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet
+
+    private def sanitizeRegex(str: String): String =
+      str.trim.toLowerCase.foldLeft("") { (res, ch) =>
+        if (invalidChars.contains(ch)) s"${res}\\$ch"
+        else s"$res$ch"
+      }
   }
 }
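To make the new escaping concrete: for the test case "X-corp (this)*-*[one]" (see the test added below), sanitizeRegex lowercases the input and backslash-escapes every character from invalidChars, so the full-name pattern compiles and matches the literal name case-insensitively. A small standalone check (a sketch, not generated docspell output):

    // Expected head pattern for "X-corp (this)*-*[one]" from the test data below.
    val fullName = """(?i)x\-corp \(this\)\*\-\*\[one\]""".r

    fullName.matches("X-corp (this)*-*[one]")  // true, original casing
    fullName.matches("X-CORP (THIS)*-*[ONE]")  // true, case-insensitive
    fullName.matches("x-corp (this) [one]")    // false, only the literal text matches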
@@ -232,7 +232,7 @@ object FindProposal {
 
   // The backslash *must* be stripped from search strings.
   private[this] val invalidSearch =
-    "…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet
+    "…[]^<>=ſ{}|`\"';\\".toSet
 
   private def normalizeSearchValue(str: String): String =
     str.toLowerCase.filter(c => !invalidSearch.contains(c))
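The narrowed invalidSearch set keeps characters such as '&', '/', '(' and '-' in search values, presumably so names containing them can still be found; only characters likely to break a search (brackets, quotes, the backslash, and a few others) are still dropped. A standalone sketch mirroring the two definitions above:

    val invalidSearch = "…[]^<>=ſ{}|`\"';\\".toSet

    def normalizeSearchValue(str: String): String =
      str.toLowerCase.filter(c => !invalidSearch.contains(c))

    normalizeSearchValue("1 & 1 (Telefon / Internet)") // "1 & 1 (telefon / internet)"
    normalizeSearchValue("""foo\bar [baz]""")          // "foobar baz"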
@@ -0,0 +1,33 @@
+package docspell.joex.analysis
+
+import minitest._
+import NerFile.Pattern
+import java.{util => ju}
+
+object NerFileTest extends SimpleTestSuite {
+
+  test("create valid case insensitive patterns") {
+    val names = List(
+      "Some company AG"            -> "(?i)some company ag",
+      "Acme GmbH"                  -> "(?i)acme gmbh",
+      "UP"                         -> "(?i)up",
+      "1 & 1"                      -> "(?i)1 & 1",
+      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
+      "X-corp (this)*-*[one]"      -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
+    )
+
+    for ((name, first) <- names) {
+      val ps = Pattern(1)(name).distinct
+      //check if it compiles to a regex pattern
+      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
+      ps.foreach(_.value.r)
+
+      val regex = ps.head.value.r
+      regex.matches(name)
+      regex.matches(name.toLowerCase(ju.Locale.ROOT))
+      regex.matches(name.toUpperCase(ju.Locale.ROOT))
+
+      assertEquals(ps.head.value, first)
+    }
+  }
+}