diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala index f7abe029..3939fc26 100644 --- a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala +++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala @@ -75,25 +75,26 @@ object NerFile { } object Pattern { - def apply(weight: Int)(str: String): Vector[Pattern] = { + def apply(weight: Int)(str: String): List[Pattern] = { val delims = " \t\n\r".toSet - val words = + val splitted = TextSplitter .split(str, delims) .map(_.toLower.value.trim) .filter(_.nonEmpty) - .toVector - .map(w => s"(?i)${w}") - val tokens = - TextSplitter - .splitToken(str, delims) - .map(_.toLower.value.trim) - .filter(_.nonEmpty) - .toVector - .take(3) - .map(w => s"(?i)${w}") + .toList - tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) + Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted + .take(3) + .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight)) } + + private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet + + private def sanitizeRegex(str: String): String = + str.trim.toLowerCase.foldLeft("") { (res, ch) => + if (invalidChars.contains(ch)) s"${res}\\$ch" + else s"$res$ch" + } } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index ad37d179..2de0de71 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -232,7 +232,7 @@ object FindProposal { // The backslash *must* be stripped from search strings. private[this] val invalidSearch = - "…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet + "…[]^<>=ſ{}|`\"';\\".toSet private def normalizeSearchValue(str: String): String = str.toLowerCase.filter(c => !invalidSearch.contains(c)) diff --git a/modules/joex/src/test/scala/docspell/joex/analysis/NerFileTest.scala b/modules/joex/src/test/scala/docspell/joex/analysis/NerFileTest.scala new file mode 100644 index 00000000..03cdcbf4 --- /dev/null +++ b/modules/joex/src/test/scala/docspell/joex/analysis/NerFileTest.scala @@ -0,0 +1,33 @@ +package docspell.joex.analysis + +import minitest._ +import NerFile.Pattern +import java.{util => ju} + +object NerFileTest extends SimpleTestSuite { + + test("create valid case insensitive patterns") { + val names = List( + "Some company AG" -> "(?i)some company ag", + "Acme GmbH" -> "(?i)acme gmbh", + "UP" -> "(?i)up", + "1 & 1" -> "(?i)1 & 1", + "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)", + "X-corp (this)*-*[one]" -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]" + ) + + for ((name, first) <- names) { + val ps = Pattern(1)(name).distinct + //check if it compiles to a regex pattern + ps.flatMap(_.value.split("\\s+").toList).foreach(_.r) + ps.foreach(_.value.r) + + val regex = ps.head.value.r + regex.matches(name) + regex.matches(name.toLowerCase(ju.Locale.ROOT)) + regex.matches(name.toUpperCase(ju.Locale.ROOT)) + + assertEquals(ps.head.value, first) + } + } +}