mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Merge pull request #2208 from mprasil/add-slovak-language-support
Add support for Slovak language
This commit is contained in:
@ -33,6 +33,7 @@ RUN apk update && \
|
|||||||
tesseract-ocr-data-pol \
|
tesseract-ocr-data-pol \
|
||||||
tesseract-ocr-data-est \
|
tesseract-ocr-data-est \
|
||||||
tesseract-ocr-data-ukr \
|
tesseract-ocr-data-ukr \
|
||||||
|
tesseract-ocr-data-slk \
|
||||||
unpaper \
|
unpaper \
|
||||||
weasyprint \
|
weasyprint \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -131,6 +131,7 @@ object DateFind {
|
|||||||
case Language.Estonian => dmy
|
case Language.Estonian => dmy
|
||||||
case Language.Khmer => dmy
|
case Language.Khmer => dmy
|
||||||
case Language.Ukrainian => dmy.or(ymd)
|
case Language.Ukrainian => dmy.or(ymd)
|
||||||
|
case Language.Slovak => dmy.or(ymd)
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
|
@ -66,6 +66,8 @@ object MonthName {
|
|||||||
ukrainian
|
ukrainian
|
||||||
case Language.Khmer =>
|
case Language.Khmer =>
|
||||||
khmer
|
khmer
|
||||||
|
case Language.Slovak =>
|
||||||
|
slovak
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -426,4 +428,19 @@ object MonthName {
|
|||||||
List("листопада", "лист", "лис"),
|
List("листопада", "лист", "лис"),
|
||||||
List("грудня", "груд", "гру")
|
List("грудня", "груд", "гру")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Slovak month names, January through December. Each inner list holds the
// short form first, then the nominative, then the genitive (the form used
// in written dates such as "15. júna 2019").
private val slovak: List[List[String]] = List(
  List("jan", "január", "januára"), // 1
  List("feb", "február", "februára"), // 2
  List("mar", "marec", "marca"), // 3
  List("apr", "apríl", "apríla"), // 4
  List("maj", "máj", "mája"), // 5
  List("jun", "jún", "júna"), // 6
  List("jul", "júl", "júla"), // 7
  List("aug", "august", "augusta"), // 8
  List("sep", "september", "septembra"), // 9
  List("okt", "október", "októbra"), // 10
  List("nov", "november", "novembra"), // 11
  List("dec", "december", "decembra") // 12
)
|
||||||
}
|
}
|
||||||
|
@ -365,4 +365,57 @@ class DateFindTest extends FunSuite {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find slovak dates") {
  // Runs the date finder with Language.Slovak over `text` and expects exactly
  // one match: `date`, found as the substring `matched` at [begin, end).
  def expectSingle(
      text: String,
      date: LocalDate,
      matched: String,
      begin: Int,
      end: Int
  ): Unit =
    assertEquals(
      DateFind.findDates(text, Language.Slovak).toVector,
      Vector(NerDateLabel(date, NerLabel(matched, NerTag.Date, begin, end)))
    )

  // Day, genitive month name, year inside running text.
  expectSingle(
    "Do funkcie bola inaugurovaná 15. júna 2019 pred Národnou radou SR",
    LocalDate.of(2019, 6, 15),
    "15. júna 2019",
    29,
    42
  )

  // Day, abbreviated month name, year.
  expectSingle(
    "Dátum narodenia: 14. feb 2015",
    LocalDate.of(2015, 2, 14),
    "14. feb 2015",
    17,
    29
  )

  // Purely numeric day.month.year.
  expectSingle("19.11.2021", LocalDate.of(2021, 11, 19), "19.11.2021", 0, 10)

  // Purely numeric year.month.day.
  expectSingle("Dátum: 2022.11.05", LocalDate.of(2022, 11, 5), "2022.11.05", 7, 17)
}
|
||||||
}
|
}
|
||||||
|
@ -148,6 +148,11 @@ object Language {
|
|||||||
val iso3 = "ukr"
|
val iso3 = "ukr"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** The Slovak language.
  *
  * `iso2` is the ISO 639-1 code and `iso3` the ISO 639-2/T (and 639-3) code.
  * The three-letter language code for Slovak is "slk"; "svk" is the ISO
  * 3166-1 alpha-3 code of the country Slovakia and does not identify a
  * language. Tools that key their language data by ISO 639 code (Tesseract
  * names its Slovak model `slk`) cannot resolve "svk".
  *
  * NOTE(review): any client code or persisted setting that still refers to
  * this language as "svk" has to be switched to "slk" as well.
  */
case object Slovak extends Language {
  val iso2 = "sk"
  val iso3 = "slk"
}
|
||||||
|
|
||||||
val all: List[Language] =
|
val all: List[Language] =
|
||||||
List(
|
List(
|
||||||
German,
|
German,
|
||||||
@ -172,6 +177,7 @@ object Language {
|
|||||||
Polish,
|
Polish,
|
||||||
Estonian,
|
Estonian,
|
||||||
Ukrainian,
|
Ukrainian,
|
||||||
|
Slovak,
|
||||||
Khmer
|
Khmer
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -207,5 +207,6 @@ object FtsRepository extends DoobieMeta {
|
|||||||
case Language.Estonian => "simple"
|
case Language.Estonian => "simple"
|
||||||
case Language.Ukrainian => "simple"
|
case Language.Ukrainian => "simple"
|
||||||
case Language.Khmer => "simple"
|
case Language.Khmer => "simple"
|
||||||
|
case Language.Slovak => "simple"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -183,7 +183,18 @@ object SolrSetup {
|
|||||||
"Add Khmer",
|
"Add Khmer",
|
||||||
addContentField(Language.Khmer)
|
addContentField(Language.Khmer)
|
||||||
),
|
),
|
||||||
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer")
|
SolrMigration.reIndexAll(34, "Re-Index after adding Khmer"),
|
||||||
|
SolrMigration[F](
|
||||||
|
35,
|
||||||
|
"Add new field type for slovak content",
|
||||||
|
addFieldType(AddFieldType.textSvk)
|
||||||
|
),
|
||||||
|
SolrMigration[F](
|
||||||
|
36,
|
||||||
|
"Add Slovak",
|
||||||
|
addContentField(Language.Slovak)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(37, "Re-Index after adding Slovak")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
@ -368,6 +379,17 @@ object SolrSetup {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Solr field type "text_sk" (class solr.TextField) for Slovak content:
// the standard tokenizer followed by a single lower-casing filter.
val textSvk = {
  val tokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
  AddFieldType("text_sk", "solr.TextField", Analyzer(tokenizer, List(lowerCase)))
}
|
||||||
|
|
||||||
final case class Filter(`class`: String, attr: Map[String, String])
|
final case class Filter(`class`: String, attr: Map[String, String])
|
||||||
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
||||||
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
||||||
|
@ -37,6 +37,7 @@ type Language
|
|||||||
| Estonian
|
| Estonian
|
||||||
| Ukrainian
|
| Ukrainian
|
||||||
| Khmer
|
| Khmer
|
||||||
|
| Slovak
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -110,6 +111,9 @@ fromString str =
|
|||||||
else if str == "khm" || str == "kh" || str == "khmer" then
|
else if str == "khm" || str == "kh" || str == "khmer" then
|
||||||
Just Khmer
|
Just Khmer
|
||||||
|
|
||||||
|
else if str == "svk" || str == "sk" || str == "slovak" then
|
||||||
|
Just Slovak
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -186,6 +190,9 @@ toIso3 lang =
|
|||||||
Khmer ->
|
Khmer ->
|
||||||
"khm"
|
"khm"
|
||||||
|
|
||||||
|
Slovak ->
|
||||||
|
"svk"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -212,4 +219,5 @@ all =
|
|||||||
, Estonian
|
, Estonian
|
||||||
, Ukrainian
|
, Ukrainian
|
||||||
, Khmer
|
, Khmer
|
||||||
|
, Slovak
|
||||||
]
|
]
|
||||||
|
@ -86,6 +86,9 @@ gb lang =
|
|||||||
Khmer ->
|
Khmer ->
|
||||||
"Khmer"
|
"Khmer"
|
||||||
|
|
||||||
|
Slovak ->
|
||||||
|
"Slovak"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -159,6 +162,9 @@ de lang =
|
|||||||
Khmer ->
|
Khmer ->
|
||||||
"Khmer"
|
"Khmer"
|
||||||
|
|
||||||
|
Slovak ->
|
||||||
|
"Slowakisch"
|
||||||
|
|
||||||
|
|
||||||
fr : Language -> String
|
fr : Language -> String
|
||||||
fr lang =
|
fr lang =
|
||||||
@ -231,3 +237,6 @@ fr lang =
|
|||||||
|
|
||||||
Khmer ->
|
Khmer ->
|
||||||
"Khmer"
|
"Khmer"
|
||||||
|
|
||||||
|
Slovak ->
|
||||||
|
"Slovaquie"
|
||||||
|
Reference in New Issue
Block a user