mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Add Polish to processing languages
Solr doesn't support Polish out of the box; plugins are required for it. The language has been added with basic support only. For better results, a manual setup of Solr is required. Closes: #1345
This commit is contained in:
@ -46,13 +46,16 @@ object DateFind {
|
||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||
|
||||
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||
val stext =
|
||||
val sep = " -\t.,\n\r/"
|
||||
val (separators, stext) =
|
||||
if (lang == Language.Japanese) {
|
||||
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||
} else text
|
||||
(sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||
} else if (lang == Language.Lithuanian) {
|
||||
(sep + "md") -> text
|
||||
} else sep -> text
|
||||
|
||||
TextSplitter
|
||||
.splitToken(stext, " -\t.,\n\r/年月日md".toSet)
|
||||
.splitToken(stext, separators.toSet)
|
||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||
}
|
||||
@ -106,6 +109,7 @@ object DateFind {
|
||||
case Language.Japanese => ymd
|
||||
case Language.Hebrew => dmy
|
||||
case Language.Lithuanian => ymd
|
||||
case Language.Polish => dmy
|
||||
}
|
||||
p.read(parts) match {
|
||||
case Result.Success(sds, _) =>
|
||||
|
@ -58,6 +58,8 @@ object MonthName {
|
||||
hebrew
|
||||
case Language.Lithuanian =>
|
||||
lithuanian
|
||||
case Language.Polish =>
|
||||
polish
|
||||
}
|
||||
|
||||
private val numbers = List(
|
||||
@ -358,4 +360,19 @@ object MonthName {
|
||||
List("lapkritis", "lapkričio", "lapkr"),
|
||||
List("gruodis", "gruodžio", "gruod")
|
||||
)
|
||||
|
||||
private val polish = List(
|
||||
List("stycznia", "sty"),
|
||||
List("lutego", "lut"),
|
||||
List("marca", "mar"),
|
||||
List("kwietnia", "kwi"),
|
||||
List("maja", "maj"),
|
||||
List("czerwca", "cze"),
|
||||
List("lipca", "lip"),
|
||||
List("sierpnia", "sie"),
|
||||
List("września", "wrz"),
|
||||
List("października", "paź"),
|
||||
List("listopada", "lis"),
|
||||
List("grudnia", "gru")
|
||||
)
|
||||
}
|
||||
|
@ -231,4 +231,32 @@ class DateFindTest extends FunSuite {
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
test("find polish dates") {
|
||||
assertEquals(
|
||||
DateFind
|
||||
.findDates(
|
||||
"Some text in polish 21 maja 2022 and stuff",
|
||||
Language.Polish
|
||||
)
|
||||
.toVector,
|
||||
Vector(
|
||||
NerDateLabel(
|
||||
LocalDate.of(2022, 5, 21),
|
||||
NerLabel("21 maja 2022", NerTag.Date, 20, 32)
|
||||
)
|
||||
)
|
||||
)
|
||||
assertEquals(
|
||||
DateFind
|
||||
.findDates("19.11.2021", Language.Polish)
|
||||
.toVector,
|
||||
Vector(
|
||||
NerDateLabel(
|
||||
LocalDate.of(2021, 11, 19),
|
||||
NerLabel("19.11.2021", NerTag.Date, 0, 10)
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user