mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-25 16:45:05 +00:00
Merge pull request #362 from eikek/fix-ner-regexes
Fix regex patterns used for NER
This commit is contained in:
commit
cad5991507
@ -75,25 +75,26 @@ object NerFile {
|
||||
}
|
||||
|
||||
object Pattern {

  /** Builds the case-insensitive regex patterns for one name.
    *
    * The first pattern covers the complete (escaped, lower-cased) input.
    * It is followed by one pattern for each of the first three
    * non-empty parts that `TextSplitter.split` yields for the
    * whitespace delimiters below.
    *
    * @param weight the weight passed on to every created pattern
    * @param str    the raw name to create patterns for
    * @return the full-name pattern followed by up to three part patterns
    */
  def apply(weight: Int)(str: String): List[Pattern] = {
    val delims = " \t\n\r".toSet
    val splitted =
      TextSplitter
        .split(str, delims)
        .map(_.toLower.value.trim)
        .filter(_.nonEmpty)
        .toList

    Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted
      .take(3)
      .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))
  }

  // Characters that get a backslash prefix so they are taken literally
  // when the resulting string is compiled as a regex.
  private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet

  /** Trims and lower-cases `str` and prefixes every character contained
    * in `invalidChars` with a backslash.
    *
    * Lower-casing uses `Locale.ROOT` so the result does not depend on the
    * JVM default locale (e.g. a Turkish locale would turn `I` into a
    * dotless `ı`).
    */
  private def sanitizeRegex(str: String): String =
    str.trim
      .toLowerCase(java.util.Locale.ROOT)
      .flatMap(ch => if (invalidChars.contains(ch)) s"\\$ch" else ch.toString)
}
|
||||
}
|
||||
|
@ -232,7 +232,7 @@ object FindProposal {
|
||||
|
||||
// The backslash *must* be stripped from search strings.
// Set of characters that normalizeSearchValue removes from a search value.
private[this] val invalidSearch =
  "…[]^<>=ſ{}|`\"';\\".toSet
|
||||
|
||||
/** Lower-cases `str` and removes every character contained in
  * `invalidSearch`.
  *
  * `Locale.ROOT` is used so the result is independent of the JVM
  * default locale.
  */
private def normalizeSearchValue(str: String): String =
  str.toLowerCase(java.util.Locale.ROOT).filterNot(invalidSearch.contains)
|
||||
|
@ -0,0 +1,33 @@
|
||||
package docspell.joex.analysis
|
||||
|
||||
import minitest._
|
||||
import NerFile.Pattern
|
||||
import java.{util => ju}
|
||||
|
||||
/** Checks that `NerFile.Pattern` produces regex strings that compile and
  * that the first pattern matches the original name regardless of case.
  */
object NerFileTest extends SimpleTestSuite {

  test("create valid case insensitive patterns") {
    // input name -> expected value of the first generated pattern
    val names = List(
      "Some company AG"            -> "(?i)some company ag",
      "Acme GmbH"                  -> "(?i)acme gmbh",
      "UP"                         -> "(?i)up",
      "1 & 1"                      -> "(?i)1 & 1",
      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
      "X-corp (this)*-*[one]"      -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
    )

    for ((name, first) <- names) {
      val ps = Pattern(1)(name).distinct
      //check if it compiles to a regex pattern
      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
      ps.foreach(_.value.r)

      // The results of `matches` must be asserted; a bare call would
      // silently discard a failed match.
      val regex = ps.head.value.r
      assert(regex.matches(name))
      assert(regex.matches(name.toLowerCase(ju.Locale.ROOT)))
      assert(regex.matches(name.toUpperCase(ju.Locale.ROOT)))

      assertEquals(ps.head.value, first)
    }
  }
}
|
Loading…
x
Reference in New Issue
Block a user