mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-28 17:55:06 +00:00
Merge pull request #362 from eikek/fix-ner-regexes
Fix regex patterns used for NER
This commit is contained in:
commit
cad5991507
@ -75,25 +75,26 @@ object NerFile {
|
|||||||
}
|
}
|
||||||
|
|
||||||
object Pattern {

  /** Characters with a special meaning in a regular expression. Each of
    * them is prefixed with a backslash by `sanitizeRegex` so that a name
    * is matched literally.
    */
  private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet

  /** Creates the case-insensitive regex patterns for a single name.
    *
    * The first element of the result is the pattern for the complete
    * (trimmed, lower-cased, escaped) input string. It is followed by one
    * pattern for each of the first three non-empty tokens returned by
    * `TextSplitter.split` when using space, tab, newline and carriage
    * return as delimiters. The result is therefore never empty.
    *
    * @param weight the weight that is attached to every created pattern
    * @param str the name to create patterns for
    * @return the pattern for the whole name, followed by up to three
    *         token patterns
    */
  def apply(weight: Int)(str: String): List[Pattern] = {
    val delims = " \t\n\r".toSet
    val splitted =
      TextSplitter
        .split(str, delims)
        .map(_.toLower.value.trim)
        .filter(_.nonEmpty)
        .toList

    // Only the first three tokens are used for the single-token patterns.
    val tokenPatterns =
      splitted
        .take(3)
        .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight))

    Pattern("(?i)" + sanitizeRegex(str), weight) :: tokenPatterns
  }

  /** Trims and lower-cases the given string and escapes every regex meta
    * character (see `invalidChars`) with a backslash.
    *
    * Lower-casing uses `Locale.ROOT`: with the JVM default locale the
    * result would depend on the environment (e.g. `I` becomes a dotless
    * `ı` under a Turkish locale), and such a character is not matched by
    * the ASCII-only `(?i)` flag that prefixes the patterns.
    */
  private def sanitizeRegex(str: String): String = {
    val lowered = str.trim.toLowerCase(java.util.Locale.ROOT)
    // In the worst case every character needs an escaping backslash.
    val escaped = new StringBuilder(lowered.length * 2)
    lowered.foreach { ch =>
      if (invalidChars.contains(ch)) escaped.append('\\')
      escaped.append(ch)
    }
    escaped.toString
  }
}
|
||||||
}
|
}
|
||||||
|
@ -232,7 +232,7 @@ object FindProposal {
|
|||||||
|
|
||||||
// The backslash *must* be stripped from search strings.
// Every character in this set is removed by `normalizeSearchValue`.
private[this] val invalidSearch =
  "…[]^<>=ſ{}|`\"';\\".toSet

/** Normalizes a value before it is used as a search string: the value is
  * lower-cased and all characters contained in `invalidSearch` are
  * removed.
  *
  * Lower-casing uses `Locale.ROOT` so the result does not depend on the
  * JVM default locale (under a Turkish locale `I` would otherwise become
  * a dotless `ı`).
  */
private def normalizeSearchValue(str: String): String =
  str
    .toLowerCase(java.util.Locale.ROOT)
    .filterNot(c => invalidSearch.contains(c))
|
||||||
|
@ -0,0 +1,33 @@
|
|||||||
|
package docspell.joex.analysis
|
||||||
|
|
||||||
|
import minitest._
|
||||||
|
import NerFile.Pattern
|
||||||
|
import java.{util => ju}
|
||||||
|
|
||||||
|
/** Tests for the regex patterns that `NerFile.Pattern` creates from names. */
object NerFileTest extends SimpleTestSuite {

  test("create valid case insensitive patterns") {
    // name -> expected first pattern (the one for the complete name)
    val names = List(
      "Some company AG" -> "(?i)some company ag",
      "Acme GmbH" -> "(?i)acme gmbh",
      "UP" -> "(?i)up",
      "1 & 1" -> "(?i)1 & 1",
      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
      "X-corp (this)*-*[one]" -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
    )

    for ((name, first) <- names) {
      val ps = Pattern(1)(name).distinct
      //check if it compiles to a regex pattern
      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
      ps.foreach(_.value.r)

      // The results of `matches` must be asserted; evaluating them
      // without an assertion would let a non-matching pattern pass.
      val regex = ps.head.value.r
      assert(regex.matches(name), s"pattern must match: $name")
      assert(
        regex.matches(name.toLowerCase(ju.Locale.ROOT)),
        s"pattern must match lower-cased: $name"
      )
      assert(
        regex.matches(name.toUpperCase(ju.Locale.ROOT)),
        s"pattern must match upper-cased: $name"
      )

      assertEquals(ps.head.value, first)
    }
  }
}
|
Loading…
x
Reference in New Issue
Block a user