mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 17:50:11 +00:00 
			
		
		
		
	Merge pull request #362 from eikek/fix-ner-regexes
Fix regex patterns used for NER
This commit is contained in:
		| @@ -75,25 +75,26 @@ object NerFile { | ||||
|   } | ||||
|  | ||||
|   object Pattern { | ||||
|     def apply(weight: Int)(str: String): Vector[Pattern] = { | ||||
|     def apply(weight: Int)(str: String): List[Pattern] = { | ||||
|       val delims = " \t\n\r".toSet | ||||
|       val words = | ||||
|       val splitted = | ||||
|         TextSplitter | ||||
|           .split(str, delims) | ||||
|           .map(_.toLower.value.trim) | ||||
|           .filter(_.nonEmpty) | ||||
|           .toVector | ||||
|           .map(w => s"(?i)${w}") | ||||
|       val tokens = | ||||
|         TextSplitter | ||||
|           .splitToken(str, delims) | ||||
|           .map(_.toLower.value.trim) | ||||
|           .filter(_.nonEmpty) | ||||
|           .toVector | ||||
|           .take(3) | ||||
|           .map(w => s"(?i)${w}") | ||||
|           .toList | ||||
|  | ||||
|       tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) | ||||
|       Pattern("(?i)" + sanitizeRegex(str), weight) :: splitted | ||||
|         .take(3) | ||||
|         .map(t => Pattern(s"(?i)${sanitizeRegex(t)}", weight)) | ||||
|     } | ||||
|  | ||||
|     private val invalidChars = """<([{\^-=$!|]})?*+.>""".toSet | ||||
|  | ||||
|     private def sanitizeRegex(str: String): String = | ||||
|       str.trim.toLowerCase.foldLeft("") { (res, ch) => | ||||
|         if (invalidChars.contains(ch)) s"${res}\\$ch" | ||||
|         else s"$res$ch" | ||||
|       } | ||||
|   } | ||||
| } | ||||
|   | ||||
| @@ -232,7 +232,7 @@ object FindProposal { | ||||
|  | ||||
|   // The backslash *must* be stripped from search strings. | ||||
|   private[this] val invalidSearch = | ||||
|     "…_[]^<>=&ſ/{}*?@#$|~`+%\"';\\".toSet | ||||
|     "…[]^<>=ſ{}|`\"';\\".toSet | ||||
|  | ||||
|   private def normalizeSearchValue(str: String): String = | ||||
|     str.toLowerCase.filter(c => !invalidSearch.contains(c)) | ||||
|   | ||||
| @@ -0,0 +1,33 @@ | ||||
| package docspell.joex.analysis | ||||
|  | ||||
| import minitest._ | ||||
| import NerFile.Pattern | ||||
| import java.{util => ju} | ||||
|  | ||||
/** Checks the regex patterns that `NerFile.Pattern` creates from plain
  * names.
  */
object NerFileTest extends SimpleTestSuite {

  test("create valid case insensitive patterns") {
    // plain name -> expected value of the first (whole name) pattern
    val names = List(
      "Some company AG"            -> "(?i)some company ag",
      "Acme GmbH"                  -> "(?i)acme gmbh",
      "UP"                         -> "(?i)up",
      "1 & 1"                      -> "(?i)1 & 1",
      "1 & 1 (Telefon / Internet)" -> "(?i)1 & 1 \\(telefon / internet\\)",
      "X-corp (this)*-*[one]"      -> "(?i)x\\-corp \\(this\\)\\*\\-\\*\\[one\\]"
    )

    for ((name, first) <- names) {
      val ps = Pattern(1)(name).distinct
      // Every pattern, and every whitespace separated part of it, must
      // compile to a regex; `.r` throws if it does not.
      ps.flatMap(_.value.split("\\s+").toList).foreach(_.r)
      ps.foreach(_.value.r)

      // The results of `matches` must be asserted; calling it without
      // checking the returned boolean verifies nothing.
      val regex = ps.head.value.r
      assert(regex.matches(name), s"'${ps.head.value}' must match '$name'")
      assert(
        regex.matches(name.toLowerCase(ju.Locale.ROOT)),
        s"'${ps.head.value}' must match lower-cased '$name'"
      )
      assert(
        regex.matches(name.toUpperCase(ju.Locale.ROOT)),
        s"'${ps.head.value}' must match upper-cased '$name'"
      )

      assertEquals(ps.head.value, first)
    }
  }
}
		Reference in New Issue
	
	Block a user