Add 9 more languages to the list of document languages

This commit is contained in:
Eike Kettner 2021-01-17 22:53:12 +01:00
parent 94bb18c152
commit 3f75af0807
7 changed files with 371 additions and 12 deletions

View File

@ -17,6 +17,15 @@ RUN apk add --no-cache openjdk11-jre \
tesseract-ocr-data-fra \
tesseract-ocr-data-ita \
tesseract-ocr-data-spa \
tesseract-ocr-data-por \
tesseract-ocr-data-ces \
tesseract-ocr-data-nld \
tesseract-ocr-data-dan \
tesseract-ocr-data-fin \
tesseract-ocr-data-nor \
tesseract-ocr-data-swe \
tesseract-ocr-data-rus \
tesseract-ocr-data-ron \
unpaper \
wkhtmltopdf \
libreoffice \

View File

@ -56,16 +56,26 @@ object DateFind {
// ymd , ydm, dmy , dym, myd, mdy
def fromParts(parts: List[Word], lang: Language): List[SimpleDate] = {
val p0 = pattern0(lang)
val p1 = pattern1(lang)
val p2 = pattern2(lang)
val ymd = pattern0(lang)
val dmy = pattern1(lang)
val mdy = pattern2(lang)
// most is from wikipedia
val p = lang match {
case Language.English =>
p2.alt(p1).map(t => t._1 ++ t._2).or(p2).or(p0).or(p1)
case Language.German => p1.or(p0).or(p2)
case Language.French => p1.or(p0).or(p2)
case Language.Italian => p1.or(p0).or(p2)
case Language.Spanish => p1.or(p0).or(p2)
mdy.alt(dmy).map(t => t._1 ++ t._2).or(mdy).or(ymd).or(dmy)
case Language.German => dmy.or(ymd).or(mdy)
case Language.French => dmy.or(ymd).or(mdy)
case Language.Italian => dmy.or(ymd).or(mdy)
case Language.Spanish => dmy.or(ymd).or(mdy)
case Language.Czech => dmy.or(ymd).or(mdy)
case Language.Danish => dmy.or(ymd).or(mdy)
case Language.Finnish => dmy.or(ymd).or(mdy)
case Language.Norwegian => dmy.or(ymd).or(mdy)
case Language.Portuguese => dmy.or(ymd).or(mdy)
case Language.Romanian => dmy.or(ymd).or(mdy)
case Language.Russian => dmy.or(ymd).or(mdy)
case Language.Swedish => ymd.or(dmy).or(mdy)
case Language.Dutch => dmy.or(ymd).or(mdy)
}
p.read(parts) match {
case Result.Success(sds, _) =>

View File

@ -24,6 +24,24 @@ object MonthName {
italian
case Language.Spanish =>
spanish
case Language.Swedish =>
swedish
case Language.Norwegian =>
norwegian
case Language.Dutch =>
dutch
case Language.Czech =>
czech
case Language.Danish =>
danish
case Language.Portuguese =>
portuguese
case Language.Romanian =>
romanian
case Language.Finnish =>
finnish
case Language.Russian =>
russian
}
private val numbers = List(
@ -115,4 +133,138 @@ object MonthName {
List("nov", "noviembre"),
List("dic", "diciembre")
)
// Swedish month names, January through December. Each entry holds the
// abbreviation (where one is listed) followed by the full name, lower case.
private val swedish: List[List[String]] =
  List(
    "jan" :: "januari" :: Nil,
    "febr" :: "februari" :: Nil,
    "mars" :: Nil,
    "april" :: Nil,
    "maj" :: Nil,
    "juni" :: Nil,
    "juli" :: Nil,
    "aug" :: "augusti" :: Nil,
    "sept" :: "september" :: Nil,
    "okt" :: "oktober" :: Nil,
    "nov" :: "november" :: Nil,
    "dec" :: "december" :: Nil
  )
// Norwegian month names, January through December. Each entry holds the
// abbreviation (where one is listed) followed by the full name, lower case.
private val norwegian: List[List[String]] =
  List(
    "jan" :: "januar" :: Nil,
    "febr" :: "februar" :: Nil,
    "mars" :: Nil,
    "april" :: Nil,
    "mai" :: Nil,
    "juni" :: Nil,
    "juli" :: Nil,
    "aug" :: "august" :: Nil,
    "sept" :: "september" :: Nil,
    "okt" :: "oktober" :: Nil,
    "nov" :: "november" :: Nil,
    "des" :: "desember" :: Nil
  )
// Czech month names, January through December. Each entry lists the accepted
// spellings for one month: abbreviations, the nominative name and the genitive
// name (the form used in written dates, e.g. "17. ledna 2021"), each with and
// without diacritics where they differ.
private val czech = List(
  List("led", "leden", "ledna"),
  List("un", "ún", "únor", "unor", "února", "unora"),
  List("brez", "březen", "brezen", "března", "brezna"),
  List("dub", "duben", "dubna"),
  List("kvet", "květen", "kveten", "května", "kvetna"),
  // "cerven" is the ASCII spelling of "červen" (June), so it is listed here
  // and not under July.
  List("cerv", "červen", "cerven", "června", "cervna"),
  List("červenec", "cervenec", "července", "cervence"),
  List("srp", "srpen", "srpna"),
  List("zari", "září"),
  List("ríj", "rij", "říj", "říjen", "rijen", "října", "rijna"),
  List("list", "listopad", "listopadu"),
  List("pros", "prosinec", "prosince")
)
// Romanian month names, January through December. Each entry holds the
// listed abbreviations followed by the full name, lower case.
private val romanian: List[List[String]] =
  List(
    "ian" :: "ianuarie" :: Nil,
    "feb" :: "februarie" :: Nil,
    "mar" :: "martie" :: Nil,
    "apr" :: "aprilie" :: Nil,
    "mai" :: Nil,
    "iunie" :: Nil,
    "iulie" :: Nil,
    "aug" :: "august" :: Nil,
    "sept" :: "septembrie" :: Nil,
    "oct" :: "octombrie" :: Nil,
    "noem" :: "nov" :: "noiembrie" :: Nil,
    "dec" :: "decembrie" :: Nil
  )
// Danish month names, January through December. Each entry holds the
// abbreviation (where one is listed) followed by the full name, lower case.
private val danish: List[List[String]] =
  List(
    "jan" :: "januar" :: Nil,
    "febr" :: "februar" :: Nil,
    "marts" :: Nil,
    "april" :: Nil,
    "maj" :: Nil,
    "juni" :: Nil,
    "juli" :: Nil,
    "aug" :: "august" :: Nil,
    "sept" :: "september" :: Nil,
    "okt" :: "oktober" :: Nil,
    "nov" :: "november" :: Nil,
    "dec" :: "december" :: Nil
  )
// Portuguese month names, January through December. Each entry holds the
// abbreviation (where one is listed) followed by the full name, lower case;
// March is given with and without the cedilla.
private val portuguese: List[List[String]] =
  List(
    "jan" :: "janeiro" :: Nil,
    "fev" :: "fevereiro" :: Nil,
    "março" :: "marco" :: Nil,
    "abril" :: Nil,
    "maio" :: Nil,
    "junho" :: Nil,
    "julho" :: Nil,
    "agosto" :: Nil,
    "set" :: "setembro" :: Nil,
    "out" :: "outubro" :: Nil,
    "nov" :: "novembro" :: Nil,
    "dez" :: "dezembro" :: Nil
  )
// Finnish month names, January through December. Written Finnish dates put
// the month in the partitive case ("17. tammikuuta 2021"), so each entry
// lists the nominative name followed by its partitive form.
private val finnish = List(
  List("tammikuu", "tammikuuta"),
  List("helmikuu", "helmikuuta"),
  List("maaliskuu", "maaliskuuta"),
  List("huhtikuu", "huhtikuuta"),
  List("toukokuu", "toukokuuta"),
  List("kesäkuu", "kesäkuuta"),
  List("heinäkuu", "heinäkuuta"),
  List("elokuu", "elokuuta"),
  List("syyskuu", "syyskuuta"),
  List("lokakuu", "lokakuuta"),
  List("marraskuu", "marraskuuta"),
  List("joulukuu", "joulukuuta")
)
// Russian month names, January through December. Written Russian dates put
// the month in the genitive case ("17 января 2021"), so each entry lists the
// nominative name followed by its genitive form.
private val russian = List(
  List("январь", "января"),
  List("февраль", "февраля"),
  List("март", "марта"),
  List("апрель", "апреля"),
  List("май", "мая"),
  List("июнь", "июня"),
  List("июль", "июля"),
  List("август", "августа"),
  List("сентябрь", "сентября"),
  List("октябрь", "октября"),
  List("ноябрь", "ноября"),
  List("декабрь", "декабря")
)
// Dutch month names, January through December. Each entry holds the listed
// abbreviations followed by the full name, lower case.
private val dutch: List[List[String]] =
  List(
    "jan" :: "januari" :: Nil,
    "feb" :: "februari" :: Nil,
    "maart" :: Nil,
    "apr" :: "april" :: Nil,
    "mei" :: Nil,
    "juni" :: Nil,
    "juli" :: Nil,
    "aug" :: "augustus" :: Nil,
    "sept" :: "september" :: Nil,
    "okt" :: "oct" :: "oktober" :: Nil,
    "nov" :: "november" :: Nil,
    "dec" :: "december" :: Nil
  )
}

View File

@ -52,7 +52,68 @@ object Language {
val iso3 = "spa"
}
val all: List[Language] = List(German, English, French, Italian, Spanish)
/** Portuguese: two-letter code "pt", three-letter code "por". */
case object Portuguese extends Language {
  val iso2: String = "pt"
  val iso3: String = "por"
}
/** Czech: two-letter code "cs", three-letter code "ces". */
case object Czech extends Language {
  val iso2: String = "cs"
  val iso3: String = "ces"
}
/** Danish: two-letter code "da", three-letter code "dan". */
case object Danish extends Language {
  val iso2: String = "da"
  val iso3: String = "dan"
}
/** Finnish: two-letter code "fi", three-letter code "fin". */
case object Finnish extends Language {
  val iso2: String = "fi"
  val iso3: String = "fin"
}
/** Norwegian: two-letter code "no", three-letter code "nor". */
case object Norwegian extends Language {
  val iso2: String = "no"
  val iso3: String = "nor"
}
/** Swedish: two-letter code "sv", three-letter code "swe". */
case object Swedish extends Language {
  val iso2: String = "sv"
  val iso3: String = "swe"
}
/** Russian: two-letter code "ru", three-letter code "rus". */
case object Russian extends Language {
  val iso2: String = "ru"
  val iso3: String = "rus"
}
/** Romanian: two-letter code "ro", three-letter code "ron". */
case object Romanian extends Language {
  val iso2: String = "ro"
  val iso3: String = "ron"
}
/** Dutch: two-letter code "nl", three-letter code "nld". */
case object Dutch extends Language {
  val iso2: String = "nl"
  val iso3: String = "nld"
}
/** The known `Language` values, in the order they are written here. */
val all: List[Language] =
  German :: English :: French :: Italian :: Spanish ::
    Dutch :: Portuguese :: Czech :: Danish :: Finnish ::
    Norwegian :: Swedish :: Russian :: Romanian :: Nil
def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase

View File

@ -32,7 +32,8 @@ object Field {
.map(contentField)
// Returns the content field for the given language: `content_cz` for Czech,
// `content_<iso2>` for every other language.
// NOTE(review): the unconditional `Field(s"content_${lang.iso2}")` line directly
// under the signature looks like the pre-change body shown next to its
// replacement -- confirm that only the conditional is meant to remain.
def contentField(lang: Language): Field =
Field(s"content_${lang.iso2}")
if (lang == Language.Czech) Field(s"content_cz")
else Field(s"content_${lang.iso2}")
// A field is encoded to JSON as its name, using the string encoder.
implicit val jsonEncoder: Encoder[Field] =
  Encoder.encodeString.contramap((field: Field) => field.name)

View File

@ -75,12 +75,33 @@ object SolrSetup {
solrEngine,
"Add content_es field",
addContentField(Language.Spanish).map(_ => FtsMigration.Result.reIndexAll)
),
FtsMigration[F](
9,
solrEngine,
"Add more content fields",
addMoreContentFields.map(_ => FtsMigration.Result.reIndexAll)
)
)
/** Passes `Field.folderId` to `addStringField`. */
def addFolderField: F[Unit] = {
  val folderField = Field.folderId
  addStringField(folderField)
}
/** Runs `addContentField` for each language listed below and discards the
  * individual results.
  */
def addMoreContentFields: F[Unit] = {
  val languages: List[Language] =
    Language.Norwegian ::
      Language.Romanian ::
      Language.Swedish ::
      Language.Finnish ::
      Language.Danish ::
      Language.Czech ::
      Language.Dutch ::
      Language.Portuguese ::
      Language.Russian ::
      Nil

  languages
    .traverse(lang => addContentField(lang))
    .map(_ => ())
}
def setupCoreSchema: F[Unit] = {
val cmds0 =
List(
@ -162,7 +183,8 @@ object SolrSetup {
AddField(field, "text_general", true, true, false)
// Builds the `AddField` value for a language-specific text field. The second
// argument (presumably the Solr field type -- confirm) is `text_cz` for Czech
// and `text_<iso2>` for every other language.
// NOTE(review): the unconditional `AddField(...)` line directly under the
// signature looks like the pre-change body shown next to its replacement --
// confirm that only the conditional is meant to remain.
def textLang(field: Field, lang: Language): AddField =
AddField(field, s"text_${lang.iso2}", true, true, false)
if (lang == Language.Czech) AddField(field, s"text_cz", true, true, false)
else AddField(field, s"text_${lang.iso2}", true, true, false)
}
case class DeleteField(name: Field)

View File

@ -13,6 +13,15 @@ type Language
| French
| Italian
| Spanish
| Portuguese
| Czech
| Danish
| Finnish
| Norwegian
| Swedish
| Russian
| Romanian
| Dutch
fromString : String -> Maybe Language
@ -32,6 +41,33 @@ fromString str =
else if str == "spa" || str == "es" || str == "spanish" then
Just Spanish
else if str == "por" || str == "pt" || str == "portuguese" then
Just Portuguese
else if str == "ces" || str == "cs" || str == "czech" then
Just Czech
else if str == "dan" || str == "da" || str == "danish" then
Just Danish
-- The two-letter code for Dutch is "nl" (the three-letter code is "nld").
else if str == "nld" || str == "nl" || str == "dutch" then
    Just Dutch
else if str == "fin" || str == "fi" || str == "finnish" then
Just Finnish
else if str == "nor" || str == "no" || str == "norwegian" then
Just Norwegian
else if str == "swe" || str == "sv" || str == "swedish" then
Just Swedish
else if str == "rus" || str == "ru" || str == "russian" then
Just Russian
else if str == "ron" || str == "ro" || str == "romanian" then
Just Romanian
else
Nothing
@ -54,6 +90,33 @@ toIso3 lang =
Spanish ->
"spa"
Portuguese ->
"por"
Czech ->
"ces"
Danish ->
"dan"
Finnish ->
"fin"
Norwegian ->
"nor"
Swedish ->
"swe"
Russian ->
"rus"
Romanian ->
"ron"
Dutch ->
"nld"
toName : Language -> String
toName lang =
@ -73,7 +136,48 @@ toName lang =
Spanish ->
"Spanish"
Portuguese ->
"Portuguese"
Czech ->
"Czech"
Danish ->
"Danish"
Finnish ->
"Finnish"
Norwegian ->
"Norwegian"
Swedish ->
"Swedish"
Russian ->
"Russian"
Romanian ->
"Romanian"
Dutch ->
"Dutch"
{-| The list of `Language` values.

NOTE(review): the one-line list directly under `all =` looks like the
pre-change value shown next to its multi-line replacement -- confirm that only
the multi-line list is meant to remain.

-}
all : List Language
all =
[ German, English, French, Italian, Spanish ]
[ German
, English
, French
, Italian
, Spanish
, Portuguese
, Czech
, Dutch
, Danish
, Finnish
, Norwegian
, Swedish
, Russian
, Romanian
]