Add Lithuanian to processing languages

Solr has no built-in support for Lithuanian; it might be possible to add it
via plugins, but that would require a manual Solr setup. For now, Lithuanian
has been added to Solr with basic support only.

Closes: #1540
This commit is contained in:
eikek
2022-05-21 14:11:38 +02:00
parent 0f1c3abd6e
commit 9d69401fea
9 changed files with 96 additions and 4 deletions

View File

@ -31,6 +31,7 @@ RUN JDKPKG="openjdk11-jre"; \
tesseract-ocr-data-lav \ tesseract-ocr-data-lav \
tesseract-ocr-data-jpn \ tesseract-ocr-data-jpn \
tesseract-ocr-data-heb \ tesseract-ocr-data-heb \
tesseract-ocr-data-lit \
unpaper \ unpaper \
wkhtmltopdf \ wkhtmltopdf \
libreoffice \ libreoffice \

View File

@ -52,7 +52,7 @@ object DateFind {
} else text } else text
TextSplitter TextSplitter
.splitToken(stext, " -\t.,\n\r/年月日".toSet) .splitToken(stext, " -\t.,\n\r/年月日md".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de") .filter(w => lang != Language.Spanish || w.value != "de")
} }
@ -105,6 +105,7 @@ object DateFind {
case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd case Language.Japanese => ymd
case Language.Hebrew => dmy case Language.Hebrew => dmy
case Language.Lithuanian => ymd
} }
p.read(parts) match { p.read(parts) match {
case Result.Success(sds, _) => case Result.Success(sds, _) =>

View File

@ -56,6 +56,8 @@ object MonthName {
japanese japanese
case Language.Hebrew => case Language.Hebrew =>
hebrew hebrew
case Language.Lithuanian =>
lithuanian
} }
private val numbers = List( private val numbers = List(
@ -341,4 +343,19 @@ object MonthName {
List("XI", "nov", "november"), List("XI", "nov", "november"),
List("XII", "dec", "december") List("XII", "dec", "december")
) )
/** Lithuanian month names, one inner list per month in calendar order
  * (January first). Each entry holds the accepted spellings of that
  * month: the nominative form, the genitive form (the one used in
  * written dates such as "2022 m. gegužės 21 d.") and an abbreviation.
  */
private val lithuanian = List(
  List("sausis", "sausio", "saus"),
  List("vasaris", "vasario", "vas"),
  List("kovas", "kovo", "kov"),
  List("balandis", "balandžio", "bal"),
  // The nominative of May is "gegužė"; the previously listed "gegužis"
  // is kept as well so that nothing that matched before stops matching.
  List("gegužė", "gegužis", "gegužės", "geg"),
  List("birželis", "birželio", "birž"),
  List("liepa", "liepos", "liep"),
  List("rugpjūtis", "rugpjūčio", "rugp"),
  List("rugsėjis", "rugsėjo", "rugs"),
  List("spalis", "spalio", "spal"),
  List("lapkritis", "lapkričio", "lapkr"),
  List("gruodis", "gruodžio", "gruod")
)
} }

View File

@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
) )
) )
) )
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
assertEquals( assertEquals(
DateFind DateFind
.findDates("2021-11-19", Language.Spanish) .findDates("2021-11-19", Language.Spanish)
@ -204,4 +203,32 @@ class DateFindTest extends FunSuite {
) )
) )
} }
test("find lithuanian dates") {
  // Long written form: year, "m." marker, genitive month name, day.
  // The expected label ends after the day number, i.e. the trailing
  // "d." is not part of the recognised date text.
  val longFormText = "Some text in lithuanian 2022 m. gegužės 21 d. and stuff"
  val longFormFound = DateFind.findDates(longFormText, Language.Lithuanian).toVector
  val longFormExpected = Vector(
    NerDateLabel(
      LocalDate.of(2022, 5, 21),
      NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
    )
  )
  assertEquals(longFormFound, longFormExpected)

  // Purely numeric year-month-day form.
  val isoText = "2021-11-19"
  val isoFound = DateFind.findDates(isoText, Language.Lithuanian).toVector
  val isoExpected = Vector(
    NerDateLabel(
      LocalDate.of(2021, 11, 19),
      NerLabel("2021-11-19", NerTag.Date, 0, 10)
    )
  )
  assertEquals(isoFound, isoExpected)
}
} }

View File

@ -123,6 +123,11 @@ object Language {
val iso3 = "heb" val iso3 = "heb"
} }
/** The Lithuanian language, identified by the codes "lt" (two-letter)
  * and "lit" (three-letter).
  */
case object Lithuanian extends Language {
  val iso2: String = "lt"
  val iso3: String = "lit"
}
val all: List[Language] = val all: List[Language] =
List( List(
German, German,
@ -142,7 +147,8 @@ object Language {
Romanian, Romanian,
Latvian, Latvian,
Japanese, Japanese,
Hebrew Hebrew,
Lithuanian
) )
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {

View File

@ -193,5 +193,6 @@ object FtsRepository extends DoobieMeta {
case Language.Latvian => "simple" case Language.Latvian => "simple"
case Language.Japanese => "simple" case Language.Japanese => "simple"
case Language.Hebrew => "simple" case Language.Hebrew => "simple"
case Language.Lithuanian => "simple"
} }
} }

View File

@ -133,7 +133,18 @@ object SolrSetup {
"Add hungarian", "Add hungarian",
addContentField(Language.Hungarian) addContentField(Language.Hungarian)
), ),
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field") SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
SolrMigration[F](
21,
"Add new field type for lithuanian content",
addFieldType(AddFieldType.textLit)
),
SolrMigration[F](
22,
"Add lithuanian",
addContentField(Language.Lithuanian)
),
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field")
) )
def addFolderField: F[Unit] = def addFolderField: F[Unit] =
@ -275,6 +286,17 @@ object SolrSetup {
) )
) )
/** Field type "text_lt" used for Lithuanian content. The analyzer only
  * tokenizes with the standard tokenizer and lower-cases the tokens; no
  * further filters are configured.
  */
val textLit = {
  val standardTokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)

  AddFieldType(
    "text_lt",
    "solr.TextField",
    Analyzer(standardTokenizer, List(lowerCase))
  )
}
final case class Filter(`class`: String, attr: Map[String, String]) final case class Filter(`class`: String, attr: Map[String, String])
final case class Tokenizer(`class`: String, attr: Map[String, String]) final case class Tokenizer(`class`: String, attr: Map[String, String])
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter]) final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@ -32,6 +32,7 @@ type Language
| Japanese | Japanese
| Hebrew | Hebrew
| Hungarian | Hungarian
| Lithuanian
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -90,6 +91,9 @@ fromString str =
else if str == "hun" || str == "hu" || str == "hungarian" then else if str == "hun" || str == "hu" || str == "hungarian" then
Just Hungarian Just Hungarian
else if str == "lit" || str == "lt" || str == "lithuanian" then
Just Lithuanian
else else
Nothing Nothing
@ -151,6 +155,9 @@ toIso3 lang =
Hungarian -> Hungarian ->
"hun" "hun"
Lithuanian ->
"lit"
all : List Language all : List Language
all = all =
@ -172,4 +179,5 @@ all =
, Japanese , Japanese
, Hebrew , Hebrew
, Hungarian , Hungarian
, Lithuanian
] ]

View File

@ -71,6 +71,9 @@ gb lang =
Hungarian -> Hungarian ->
"Hungarian" "Hungarian"
Lithuanian ->
"Lithuanian"
de : Language -> String de : Language -> String
de lang = de lang =
@ -129,6 +132,9 @@ de lang =
Hungarian -> Hungarian ->
"Ungarisch" "Ungarisch"
Lithuanian ->
"Litauisch"
fr : Language -> String fr : Language -> String
fr lang = fr lang =
@ -186,3 +192,6 @@ fr lang =
Hungarian -> Hungarian ->
"Hongrois" "Hongrois"
Lithuanian ->
"Lituanien"