Add Lithuanian to processing languages

Solr doesn't support Lithuanian out of the box; support can perhaps be
added via plugins, but that would require a manual Solr setup. For now,
Lithuanian has been added to Solr with basic support only.

Closes: #1540
This commit is contained in:
eikek 2022-05-21 14:11:38 +02:00
parent 0f1c3abd6e
commit 9d69401fea
9 changed files with 96 additions and 4 deletions

View File

@ -31,6 +31,7 @@ RUN JDKPKG="openjdk11-jre"; \
tesseract-ocr-data-lav \
tesseract-ocr-data-jpn \
tesseract-ocr-data-heb \
tesseract-ocr-data-lit \
unpaper \
wkhtmltopdf \
libreoffice \

View File

@ -52,7 +52,7 @@ object DateFind {
} else text
TextSplitter
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
.splitToken(stext, " -\t.,\n\r/年月日md".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de")
}
@ -105,6 +105,7 @@ object DateFind {
case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd
case Language.Hebrew => dmy
case Language.Lithuanian => ymd
}
p.read(parts) match {
case Result.Success(sds, _) =>

View File

@ -56,6 +56,8 @@ object MonthName {
japanese
case Language.Hebrew =>
hebrew
case Language.Lithuanian =>
lithuanian
}
private val numbers = List(
@ -341,4 +343,19 @@ object MonthName {
List("XI", "nov", "november"),
List("XII", "dec", "december")
)
/** Lithuanian month names, January through December.
  *
  * Each inner list holds the accepted spellings of one month: the
  * nominative form, the genitive form (the one that appears in written
  * dates such as "2022 m. gegužės 21 d.") and abbreviations.
  */
private val lithuanian = List(
  List("sausis", "sausio", "saus"),
  List("vasaris", "vasario", "vas"),
  List("kovas", "kovo", "kov"),
  List("balandis", "balandžio", "bal"),
  // "gegužė" is the standard nominative; "gegužis" is kept so that
  // anything already matching it continues to match.
  List("gegužis", "gegužės", "geg", "gegužė"),
  List("birželis", "birželio", "birž"),
  List("liepa", "liepos", "liep"),
  // "rugpj" is the standard abbreviation; "rugp" is kept for compatibility.
  List("rugpjūtis", "rugpjūčio", "rugp", "rugpj"),
  List("rugsėjis", "rugsėjo", "rugs"),
  List("spalis", "spalio", "spal"),
  List("lapkritis", "lapkričio", "lapkr"),
  List("gruodis", "gruodžio", "gruod")
)
}

View File

@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
)
)
)
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
assertEquals(
DateFind
.findDates("2021-11-19", Language.Spanish)
@ -204,4 +203,32 @@ class DateFindTest extends FunSuite {
)
)
}
test("find lithuanian dates") {
  // Long written form: "<year> m. <month> <day> d."
  val longFormFound = DateFind
    .findDates(
      "Some text in lithuanian 2022 m. gegužės 21 d. and stuff",
      Language.Lithuanian
    )
    .toVector
  val longFormExpected = Vector(
    NerDateLabel(
      LocalDate.of(2022, 5, 21),
      NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
    )
  )
  assertEquals(longFormFound, longFormExpected)

  // Purely numeric year-month-day form
  val numericFound = DateFind.findDates("2021-11-19", Language.Lithuanian).toVector
  val numericExpected = Vector(
    NerDateLabel(
      LocalDate.of(2021, 11, 19),
      NerLabel("2021-11-19", NerTag.Date, 0, 10)
    )
  )
  assertEquals(numericFound, numericExpected)
}
}

View File

@ -123,6 +123,11 @@ object Language {
val iso3 = "heb"
}
/** The Lithuanian language, with its two-letter ("lt") and three-letter
  * ("lit") language codes.
  */
case object Lithuanian extends Language {
val iso2 = "lt"
val iso3 = "lit"
}
val all: List[Language] =
List(
German,
@ -142,7 +147,8 @@ object Language {
Romanian,
Latvian,
Japanese,
Hebrew
Hebrew,
Lithuanian
)
def fromString(str: String): Either[String, Language] = {

View File

@ -193,5 +193,6 @@ object FtsRepository extends DoobieMeta {
case Language.Latvian => "simple"
case Language.Japanese => "simple"
case Language.Hebrew => "simple"
case Language.Lithuanian => "simple"
}
}

View File

@ -133,7 +133,18 @@ object SolrSetup {
"Add hungarian",
addContentField(Language.Hungarian)
),
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
SolrMigration[F](
21,
"Add new field type for lithuanian content",
addFieldType(AddFieldType.textLit)
),
SolrMigration[F](
22,
"Add lithuanian",
addContentField(Language.Lithuanian)
),
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field")
)
def addFolderField: F[Unit] =
@ -275,6 +286,17 @@ object SolrSetup {
)
)
/** Field type `text_lt` for Lithuanian content.
  *
  * Basic analysis only: the standard tokenizer followed by lower-casing.
  * No language-specific filters are configured.
  */
val textLit = AddFieldType(
  "text_lt",
  "solr.TextField",
  Analyzer(
    tokenizer = Tokenizer(`class` = "solr.StandardTokenizerFactory", attr = Map.empty),
    filter = List(
      Filter(`class` = "solr.LowerCaseFilterFactory", attr = Map.empty)
    )
  )
)
/** A filter of an [[Analyzer]]: the filter's class name plus its attributes. */
final case class Filter(`class`: String, attr: Map[String, String])
/** The tokenizer of an [[Analyzer]]: the tokenizer's class name plus its attributes. */
final case class Tokenizer(`class`: String, attr: Map[String, String])
/** An analyzer made of one tokenizer and a list of filters. */
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@ -32,6 +32,7 @@ type Language
| Japanese
| Hebrew
| Hungarian
| Lithuanian
fromString : String -> Maybe Language
@ -90,6 +91,9 @@ fromString str =
else if str == "hun" || str == "hu" || str == "hungarian" then
Just Hungarian
else if str == "lit" || str == "lt" || str == "lithuanian" then
Just Lithuanian
else
Nothing
@ -151,6 +155,9 @@ toIso3 lang =
Hungarian ->
"hun"
Lithuanian ->
"lit"
all : List Language
all =
@ -172,4 +179,5 @@ all =
, Japanese
, Hebrew
, Hungarian
, Lithuanian
]

View File

@ -71,6 +71,9 @@ gb lang =
Hungarian ->
"Hungarian"
Lithuanian ->
"Lithuanian"
de : Language -> String
de lang =
@ -129,6 +132,9 @@ de lang =
Hungarian ->
"Ungarisch"
Lithuanian ->
"Litauisch"
fr : Language -> String
fr lang =
@ -186,3 +192,6 @@ fr lang =
Hungarian ->
"Hongrois"
Lithuanian ->
"Lituanien"