Add Polish to processing languages

Solr doesn't support Polish out of the box; plugins are required for
it. The language has been added with basic support only. For
better results, a manual setup of Solr is required.

Closes: #1345
This commit is contained in:
eikek
2022-05-21 14:34:48 +02:00
parent 81f7e4e322
commit 5ec311c331
9 changed files with 102 additions and 6 deletions

View File

@ -32,6 +32,7 @@ RUN JDKPKG="openjdk11-jre"; \
tesseract-ocr-data-jpn \ tesseract-ocr-data-jpn \
tesseract-ocr-data-heb \ tesseract-ocr-data-heb \
tesseract-ocr-data-lit \ tesseract-ocr-data-lit \
tesseract-ocr-data-pol \
unpaper \ unpaper \
wkhtmltopdf \ wkhtmltopdf \
libreoffice \ libreoffice \

View File

@ -46,13 +46,16 @@ object DateFind {
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = { private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
val stext = val sep = " -\t.,\n\r/"
val (separators, stext) =
if (lang == Language.Japanese) { if (lang == Language.Japanese) {
text.map(c => if (jpnChars.contains(c)) c else ' ') (sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
} else text } else if (lang == Language.Lithuanian) {
(sep + "md") -> text
} else sep -> text
TextSplitter TextSplitter
.splitToken(stext, " -\t.,\n\r/年月日md".toSet) .splitToken(stext, separators.toSet)
.filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de") .filter(w => lang != Language.Spanish || w.value != "de")
} }
@ -106,6 +109,7 @@ object DateFind {
case Language.Japanese => ymd case Language.Japanese => ymd
case Language.Hebrew => dmy case Language.Hebrew => dmy
case Language.Lithuanian => ymd case Language.Lithuanian => ymd
case Language.Polish => dmy
} }
p.read(parts) match { p.read(parts) match {
case Result.Success(sds, _) => case Result.Success(sds, _) =>

View File

@ -58,6 +58,8 @@ object MonthName {
hebrew hebrew
case Language.Lithuanian => case Language.Lithuanian =>
lithuanian lithuanian
case Language.Polish =>
polish
} }
private val numbers = List( private val numbers = List(
@ -358,4 +360,19 @@ object MonthName {
List("lapkritis", "lapkričio", "lapkr"), List("lapkritis", "lapkričio", "lapkr"),
List("gruodis", "gruodžio", "gruod") List("gruodis", "gruodžio", "gruod")
) )
/** Polish month names in calendar order (January to December).
  *
  * Each month is listed as its genitive form (the form used after a day
  * number, e.g. "21 maja 2022") followed by a three-letter abbreviation.
  */
private val polish: List[List[String]] =
  List(
    "stycznia" -> "sty",
    "lutego" -> "lut",
    "marca" -> "mar",
    "kwietnia" -> "kwi",
    "maja" -> "maj",
    "czerwca" -> "cze",
    "lipca" -> "lip",
    "sierpnia" -> "sie",
    "września" -> "wrz",
    "października" -> "paź",
    "listopada" -> "lis",
    "grudnia" -> "gru"
  ).map { case (genitive, abbreviation) => List(genitive, abbreviation) }
} }

View File

@ -231,4 +231,32 @@ class DateFindTest extends FunSuite {
) )
) )
} }
test("find polish dates") {
  // Case 1: a date with a written-out month name embedded in running text.
  // The last two NerLabel arguments coincide with the positions where the
  // matched text starts and ends inside the input string.
  val sentence = "Some text in polish 21 maja 2022 and stuff"
  val foundInSentence = DateFind.findDates(sentence, Language.Polish).toVector
  val expectedInSentence = Vector(
    NerDateLabel(
      LocalDate.of(2022, 5, 21),
      NerLabel("21 maja 2022", NerTag.Date, 20, 32)
    )
  )
  assertEquals(foundInSentence, expectedInSentence)

  // Case 2: a purely numeric date, read as day.month.year.
  val numeric = "19.11.2021"
  val foundNumeric = DateFind.findDates(numeric, Language.Polish).toVector
  val expectedNumeric = Vector(
    NerDateLabel(
      LocalDate.of(2021, 11, 19),
      NerLabel("19.11.2021", NerTag.Date, 0, 10)
    )
  )
  assertEquals(foundNumeric, expectedNumeric)
}
} }

View File

@ -128,6 +128,11 @@ object Language {
val iso3 = "lit" val iso3 = "lit"
} }
/** The Polish language, identified by its two-letter code "pl" and its
  * three-letter code "pol".
  */
case object Polish extends Language {
val iso2 = "pl"
val iso3 = "pol"
}
val all: List[Language] = val all: List[Language] =
List( List(
German, German,
@ -148,7 +153,8 @@ object Language {
Latvian, Latvian,
Japanese, Japanese,
Hebrew, Hebrew,
Lithuanian Lithuanian,
Polish
) )
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {

View File

@ -194,5 +194,6 @@ object FtsRepository extends DoobieMeta {
case Language.Japanese => "simple" case Language.Japanese => "simple"
case Language.Hebrew => "simple" case Language.Hebrew => "simple"
case Language.Lithuanian => "simple" case Language.Lithuanian => "simple"
case Language.Polish => "simple"
} }
} }

View File

@ -144,7 +144,18 @@ object SolrSetup {
"Add lithuanian", "Add lithuanian",
addContentField(Language.Lithuanian) addContentField(Language.Lithuanian)
), ),
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field") SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"),
SolrMigration[F](
24,
"Add new field type for polish content",
addFieldType(AddFieldType.textPol)
),
SolrMigration[F](
25,
"Add polish",
addContentField(Language.Polish)
),
SolrMigration.reIndexAll(26, "Re-Index after adding polish content field")
) )
def addFolderField: F[Unit] = def addFolderField: F[Unit] =
@ -297,6 +308,17 @@ object SolrSetup {
) )
) )
/** Field type `text_pl` used for Polish content.
  *
  * The analyzer only splits text with the standard tokenizer and
  * lower-cases the tokens; no other filters are configured here.
  */
val textPol = {
  val standardTokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
  AddFieldType(
    "text_pl",
    "solr.TextField",
    Analyzer(tokenizer = standardTokenizer, filter = List(lowerCase))
  )
}
final case class Filter(`class`: String, attr: Map[String, String]) final case class Filter(`class`: String, attr: Map[String, String])
final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String])
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@ -33,6 +33,7 @@ type Language
| Hebrew | Hebrew
| Hungarian | Hungarian
| Lithuanian | Lithuanian
| Polish
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -94,6 +95,9 @@ fromString str =
else if str == "lit" || str == "lt" || str == "lithuanian" then else if str == "lit" || str == "lt" || str == "lithuanian" then
Just Lithuanian Just Lithuanian
else if str == "pol" || str == "pl" || str == "polish" then
Just Polish
else else
Nothing Nothing
@ -158,6 +162,9 @@ toIso3 lang =
Lithuanian -> Lithuanian ->
"lit" "lit"
Polish ->
"pol"
all : List Language all : List Language
all = all =
@ -180,4 +187,5 @@ all =
, Hebrew , Hebrew
, Hungarian , Hungarian
, Lithuanian , Lithuanian
, Polish
] ]

View File

@ -74,6 +74,9 @@ gb lang =
Lithuanian -> Lithuanian ->
"Lithuanian" "Lithuanian"
Polish ->
"Polish"
de : Language -> String de : Language -> String
de lang = de lang =
@ -135,6 +138,9 @@ de lang =
Lithuanian -> Lithuanian ->
"Litauisch" "Litauisch"
Polish ->
"Polnisch"
fr : Language -> String fr : Language -> String
fr lang = fr lang =
@ -195,3 +201,6 @@ fr lang =
Lithuanian -> Lithuanian ->
"Lituanien" "Lituanien"
Polish ->
"Polonais"