Add Spanish as an example

Adding a new language without NLP support now only requires filling in
these pieces:

- define a list of month names to support date recognition
- add it to joex's Dockerfile so that it is available to tesseract
- update the solr migration/field definitions
- update the elm file so it shows up on the client
This commit is contained in:
Eike Kettner 2021-01-17 12:18:23 +01:00
parent 360cad3304
commit 26dff18ae0
8 changed files with 49 additions and 2 deletions

View File

@ -16,6 +16,7 @@ RUN apk add --no-cache openjdk11-jre \
tesseract-ocr-data-deu \ tesseract-ocr-data-deu \
tesseract-ocr-data-fra \ tesseract-ocr-data-fra \
tesseract-ocr-data-ita \ tesseract-ocr-data-ita \
tesseract-ocr-data-spa \
unpaper \ unpaper \
wkhtmltopdf \ wkhtmltopdf \
libreoffice \ libreoffice \

View File

@ -65,6 +65,7 @@ object DateFind {
case Language.German => p1.or(p0).or(p2) case Language.German => p1.or(p0).or(p2)
case Language.French => p1.or(p0).or(p2) case Language.French => p1.or(p0).or(p2)
case Language.Italian => p1.or(p0).or(p2) case Language.Italian => p1.or(p0).or(p2)
case Language.Spanish => p1.or(p0).or(p2)
} }
p.read(parts) match { p.read(parts) match {
case Result.Success(sds, _) => case Result.Success(sds, _) =>

View File

@ -22,6 +22,8 @@ object MonthName {
french french
case Language.Italian => case Language.Italian =>
italian italian
case Language.Spanish =>
spanish
} }
private val numbers = List( private val numbers = List(
@ -98,4 +100,19 @@ object MonthName {
List("nov", "novembre"), List("nov", "novembre"),
List("dic", "dicembre") List("dic", "dicembre")
) )
// Spanish month names in calendar order (January first, December last).
// Each inner list holds the accepted spellings for one month: the
// three-letter abbreviation followed by the full name. June and July carry
// their full names ("junio", "julio") as well, so that written-out months
// are covered the same way as for every other month.
private val spanish = List(
  List("ene", "enero"),
  List("feb", "febrero"),
  List("mar", "marzo"),
  List("abr", "abril"),
  List("may", "mayo"),
  List("jun", "junio"),
  List("jul", "julio"),
  List("ago", "agosto"),
  List("sep", "septiembre"),
  List("oct", "octubre"),
  List("nov", "noviembre"),
  List("dic", "diciembre")
)
} }

View File

@ -47,7 +47,12 @@ object Language {
val iso3 = "ita" val iso3 = "ita"
} }
val all: List[Language] = List(German, English, French, Italian) case object Spanish extends Language {
val iso2 = "es"
val iso3 = "spa"
}
val all: List[Language] = List(German, English, French, Italian, Spanish)
def fromString(str: String): Either[String, Language] = { def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase val lang = str.toLowerCase

View File

@ -25,6 +25,7 @@ object Field {
val content_en = Field("content_en") val content_en = Field("content_en")
val content_fr = Field("content_fr") val content_fr = Field("content_fr")
val content_it = Field("content_it") val content_it = Field("content_it")
val content_es = Field("content_es")
val itemName = Field("itemName") val itemName = Field("itemName")
val itemNotes = Field("itemNotes") val itemNotes = Field("itemNotes")
val folderId = Field("folder") val folderId = Field("folder")
@ -39,6 +40,8 @@ object Field {
Field.content_fr Field.content_fr
case Language.Italian => case Language.Italian =>
Field.content_it Field.content_it
case Language.Spanish =>
Field.content_es
} }
implicit val jsonEncoder: Encoder[Field] = implicit val jsonEncoder: Encoder[Field] =

View File

@ -41,6 +41,7 @@ object SolrQuery {
Field.content_en, Field.content_en,
Field.content_fr, Field.content_fr,
Field.content_it, Field.content_it,
Field.content_es,
Field.itemName, Field.itemName,
Field.itemNotes, Field.itemNotes,
Field.attachmentName Field.attachmentName

View File

@ -69,6 +69,14 @@ object SolrSetup {
solrEngine, solrEngine,
"Add content_it field", "Add content_it field",
addContentItField.map(_ => FtsMigration.Result.reIndexAll) addContentItField.map(_ => FtsMigration.Result.reIndexAll)
),
FtsMigration[F](
8,
solrEngine,
"Add content_es field",
addTextField(Some(Language.Spanish))(Field.content_es).map(_ =>
FtsMigration.Result.reIndexAll
)
) )
) )

View File

@ -12,6 +12,7 @@ type Language
| English | English
| French | French
| Italian | Italian
| Spanish
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -27,6 +28,10 @@ fromString str =
else if str == "ita" || str == "it" || str == "italian" then else if str == "ita" || str == "it" || str == "italian" then
Just Italian Just Italian
else if str == "spa" || str == "es" || str == "spanish" then
Just Spanish
else else
Nothing Nothing
@ -46,6 +51,9 @@ toIso3 lang =
Italian -> Italian ->
"ita" "ita"
Spanish ->
"spa"
toName : Language -> String toName : Language -> String
toName lang = toName lang =
@ -62,7 +70,10 @@ toName lang =
Italian -> Italian ->
"Italian" "Italian"
Spanish ->
"Spanish"
all : List Language all : List Language
all = all =
[ German, English, French, Italian ] [ German, English, French, Italian, Spanish ]