mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-02-15 20:33:26 +00:00
Add Lithuanian to processing languages
Solr has no built-in support for Lithuanian; it might be possible to add it via plugins, which would then require a manual Solr setup. For now, Lithuanian has been added with basic support only. Closes: #1540
This commit is contained in:
parent
0f1c3abd6e
commit
9d69401fea
@ -31,6 +31,7 @@ RUN JDKPKG="openjdk11-jre"; \
|
||||
tesseract-ocr-data-lav \
|
||||
tesseract-ocr-data-jpn \
|
||||
tesseract-ocr-data-heb \
|
||||
tesseract-ocr-data-lit \
|
||||
unpaper \
|
||||
wkhtmltopdf \
|
||||
libreoffice \
|
||||
|
@ -52,7 +52,7 @@ object DateFind {
|
||||
} else text
|
||||
|
||||
TextSplitter
|
||||
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
|
||||
.splitToken(stext, " -\t.,\n\r/年月日md".toSet)
|
||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||
}
|
||||
@ -105,6 +105,7 @@ object DateFind {
|
||||
case Language.Latvian => dmy.or(lavLong).or(ymd)
|
||||
case Language.Japanese => ymd
|
||||
case Language.Hebrew => dmy
|
||||
case Language.Lithuanian => ymd
|
||||
}
|
||||
p.read(parts) match {
|
||||
case Result.Success(sds, _) =>
|
||||
|
@ -56,6 +56,8 @@ object MonthName {
|
||||
japanese
|
||||
case Language.Hebrew =>
|
||||
hebrew
|
||||
case Language.Lithuanian =>
|
||||
lithuanian
|
||||
}
|
||||
|
||||
private val numbers = List(
|
||||
@ -341,4 +343,19 @@ object MonthName {
|
||||
List("XI", "nov", "november"),
|
||||
List("XII", "dec", "december")
|
||||
)
|
||||
|
||||
/** Lithuanian month names, one inner list per month in calendar order
  * (January first, December last). Each inner list holds the spellings
  * accepted for that month: the nominative form, the genitive form used in
  * written dates (e.g. "2022 m. gegužės 21 d.") and a short abbreviation.
  */
private val lithuanian = List(
  List("sausis", "sausio", "saus"),
  List("vasaris", "vasario", "vas"),
  List("kovas", "kovo", "kov"),
  List("balandis", "balandžio", "bal"),
  // The standard nominative for May is "gegužė"; the previously listed
  // "gegužis" is kept so that anything relying on it keeps working.
  List("gegužė", "gegužis", "gegužės", "geg"),
  List("birželis", "birželio", "birž"),
  List("liepa", "liepos", "liep"),
  List("rugpjūtis", "rugpjūčio", "rugp"),
  List("rugsėjis", "rugsėjo", "rugs"),
  List("spalis", "spalio", "spal"),
  List("lapkritis", "lapkričio", "lapkr"),
  List("gruodis", "gruodžio", "gruod")
)
|
||||
}
|
||||
|
@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
|
||||
)
|
||||
)
|
||||
)
|
||||
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
|
||||
assertEquals(
|
||||
DateFind
|
||||
.findDates("2021-11-19", Language.Spanish)
|
||||
@ -204,4 +203,32 @@ class DateFindTest extends FunSuite {
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
// Checks Lithuanian date detection for two inputs: the long written form
// "<year> m. <month name> <day> d." embedded in a sentence, and a plain
// year-month-day string.
test("find lithuanian dates") {
  val sentence = "Some text in lithuanian 2022 m. gegužės 21 d. and stuff"
  val foundInSentence = DateFind.findDates(sentence, Language.Lithuanian).toVector
  val expectedInSentence = Vector(
    NerDateLabel(
      LocalDate.of(2022, 5, 21),
      // The label covers the text from the year up to the day number.
      NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
    )
  )
  assertEquals(foundInSentence, expectedInSentence)

  val foundIso = DateFind.findDates("2021-11-19", Language.Lithuanian).toVector
  val expectedIso = Vector(
    NerDateLabel(
      LocalDate.of(2021, 11, 19),
      NerLabel("2021-11-19", NerTag.Date, 0, 10)
    )
  )
  assertEquals(foundIso, expectedIso)
}
|
||||
}
|
||||
|
@ -123,6 +123,11 @@ object Language {
|
||||
val iso3 = "heb"
|
||||
}
|
||||
|
||||
/** The Lithuanian language, identified by the two-letter code "lt" and the
  * three-letter code "lit".
  */
case object Lithuanian extends Language {
  val iso2: String = "lt"
  val iso3: String = "lit"
}
|
||||
|
||||
val all: List[Language] =
|
||||
List(
|
||||
German,
|
||||
@ -142,7 +147,8 @@ object Language {
|
||||
Romanian,
|
||||
Latvian,
|
||||
Japanese,
|
||||
Hebrew
|
||||
Hebrew,
|
||||
Lithuanian
|
||||
)
|
||||
|
||||
def fromString(str: String): Either[String, Language] = {
|
||||
|
@ -193,5 +193,6 @@ object FtsRepository extends DoobieMeta {
|
||||
case Language.Latvian => "simple"
|
||||
case Language.Japanese => "simple"
|
||||
case Language.Hebrew => "simple"
|
||||
case Language.Lithuanian => "simple"
|
||||
}
|
||||
}
|
||||
|
@ -133,7 +133,18 @@ object SolrSetup {
|
||||
"Add hungarian",
|
||||
addContentField(Language.Hungarian)
|
||||
),
|
||||
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
|
||||
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
|
||||
SolrMigration[F](
|
||||
21,
|
||||
"Add new field type for lithuanian content",
|
||||
addFieldType(AddFieldType.textLit)
|
||||
),
|
||||
SolrMigration[F](
|
||||
22,
|
||||
"Add lithuanian",
|
||||
addContentField(Language.Lithuanian)
|
||||
),
|
||||
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field")
|
||||
)
|
||||
|
||||
def addFolderField: F[Unit] =
|
||||
@ -275,6 +286,17 @@ object SolrSetup {
|
||||
)
|
||||
)
|
||||
|
||||
/** Field type `text_lt` for Lithuanian content: a `solr.TextField` whose
  * analyzer uses the standard tokenizer followed by a single lower-case
  * filter.
  */
val textLit = {
  val standardTokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCaseFilter = Filter("solr.LowerCaseFilterFactory", Map.empty)
  AddFieldType(
    "text_lt",
    "solr.TextField",
    Analyzer(standardTokenizer, List(lowerCaseFilter))
  )
}
|
||||
|
||||
/** A filter step of an analyzer: the filter's class name (for example
  * "solr.LowerCaseFilterFactory") together with its attributes.
  */
final case class Filter(`class`: String, attr: Map[String, String])
|
||||
/** The tokenizer of an analyzer: the tokenizer's class name (for example
  * "solr.StandardTokenizerFactory") together with its attributes.
  */
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
||||
/** An analyzer definition: one tokenizer plus a list of filters. */
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
||||
|
@ -32,6 +32,7 @@ type Language
|
||||
| Japanese
|
||||
| Hebrew
|
||||
| Hungarian
|
||||
| Lithuanian
|
||||
|
||||
|
||||
fromString : String -> Maybe Language
|
||||
@ -90,6 +91,9 @@ fromString str =
|
||||
else if str == "hun" || str == "hu" || str == "hungarian" then
|
||||
Just Hungarian
|
||||
|
||||
else if str == "lit" || str == "lt" || str == "lithuanian" then
|
||||
Just Lithuanian
|
||||
|
||||
else
|
||||
Nothing
|
||||
|
||||
@ -151,6 +155,9 @@ toIso3 lang =
|
||||
Hungarian ->
|
||||
"hun"
|
||||
|
||||
Lithuanian ->
|
||||
"lit"
|
||||
|
||||
|
||||
all : List Language
|
||||
all =
|
||||
@ -172,4 +179,5 @@ all =
|
||||
, Japanese
|
||||
, Hebrew
|
||||
, Hungarian
|
||||
, Lithuanian
|
||||
]
|
||||
|
@ -71,6 +71,9 @@ gb lang =
|
||||
Hungarian ->
|
||||
"Hungarian"
|
||||
|
||||
Lithuanian ->
|
||||
"Lithuanian"
|
||||
|
||||
|
||||
de : Language -> String
|
||||
de lang =
|
||||
@ -129,6 +132,9 @@ de lang =
|
||||
Hungarian ->
|
||||
"Ungarisch"
|
||||
|
||||
Lithuanian ->
|
||||
"Litauisch"
|
||||
|
||||
|
||||
fr : Language -> String
|
||||
fr lang =
|
||||
@ -186,3 +192,6 @@ fr lang =
|
||||
|
||||
Hungarian ->
|
||||
"Hongrois"
|
||||
|
||||
Lithuanian ->
|
||||
"Lituanien"
|
||||
|
Loading…
Reference in New Issue
Block a user