Merge pull request #1559 from eikek/more-languages

More languages
This commit is contained in:
mergify[bot] 2022-05-21 12:53:54 +00:00 committed by GitHub
commit 6b913873e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 265 additions and 13 deletions

View File

@ -31,6 +31,8 @@ RUN JDKPKG="openjdk11-jre"; \
tesseract-ocr-data-lav \
tesseract-ocr-data-jpn \
tesseract-ocr-data-heb \
tesseract-ocr-data-lit \
tesseract-ocr-data-pol \
unpaper \
wkhtmltopdf \
libreoffice \

View File

@ -46,13 +46,16 @@ object DateFind {
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
/** Splits `text` into words for date recognition, using a set of separator
  * characters that depends on `lang`.
  *
  * NOTE(review): this span is a rendered diff hunk; it appears to interleave
  * removed lines (`val stext =`, the old Japanese branch, the `splitToken`
  * call with the hard-coded separator string) with their replacements
  * (`sep` / `separators`). Confirm against the real file before editing.
  */
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
val stext =
// Separator characters shared by all languages.
val sep = " -\t.,\n\r/"
val (separators, stext) =
if (lang == Language.Japanese) {
text.map(c => if (jpnChars.contains(c)) c else ' ')
} else text
// Japanese: 年, 月 and 日 are extra separators, and every character that is
// not in `jpnChars` is replaced by a blank before splitting.
(sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
} else if (lang == Language.Lithuanian) {
// Lithuanian: the letters 'm' and 'd' are extra separators.
// NOTE(review): these are ordinary letters, so month names containing 'd'
// (e.g. "gruodis" / "gruodžio") are presumably split as well — confirm
// against TextSplitter.splitToken and add a December test.
(sep + "md") -> text
} else sep -> text
TextSplitter
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
.splitToken(stext, separators.toSet)
// Drop the word "gada" for Latvian and the word "de" for Spanish.
.filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de")
}
@ -105,6 +108,8 @@ object DateFind {
case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd
case Language.Hebrew => dmy
case Language.Lithuanian => ymd
case Language.Polish => dmy
}
p.read(parts) match {
case Result.Success(sds, _) =>

View File

@ -56,6 +56,10 @@ object MonthName {
japanese
case Language.Hebrew =>
hebrew
case Language.Lithuanian =>
lithuanian
case Language.Polish =>
polish
}
private val numbers = List(
@ -341,4 +345,34 @@ object MonthName {
List("XI", "nov", "november"),
List("XII", "dec", "december")
)
/** Lithuanian month names, January first; the position in the outer list
  * is the month number minus one.
  *
  * Each entry lists the nominative form, the genitive form (the one used in
  * written dates such as "2022 m. gegužės 21 d.") and a short form.
  */
private val lithuanian = List(
  List("sausis", "sausio", "saus"),
  List("vasaris", "vasario", "vas"),
  List("kovas", "kovo", "kov"),
  List("balandis", "balandžio", "bal"),
  // "gegužė" is the standard nominative for May; the previously listed
  // spelling "gegužis" is kept so that nothing that matched before stops
  // matching.
  List("gegužė", "gegužis", "gegužės", "geg"),
  List("birželis", "birželio", "birž"),
  List("liepa", "liepos", "liep"),
  List("rugpjūtis", "rugpjūčio", "rugp"),
  List("rugsėjis", "rugsėjo", "rugs"),
  List("spalis", "spalio", "spal"),
  List("lapkritis", "lapkričio", "lapkr"),
  List("gruodis", "gruodžio", "gruod")
)
/** Polish month names, January first; the position in the outer list is
  * the month number minus one.
  *
  * Each entry holds the genitive form (as in "21 maja 2022") followed by a
  * three-letter short form. Nominative forms ("styczeń", "luty", ...) are
  * not listed.
  */
private val polish = List(
List("stycznia", "sty"),
List("lutego", "lut"),
List("marca", "mar"),
List("kwietnia", "kwi"),
List("maja", "maj"),
List("czerwca", "cze"),
List("lipca", "lip"),
List("sierpnia", "sie"),
List("września", "wrz"),
List("października", "paź"),
List("listopada", "lis"),
List("grudnia", "gru")
)
}

View File

@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
)
)
)
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
assertEquals(
DateFind
.findDates("2021-11-19", Language.Spanish)
@ -204,4 +203,60 @@ class DateFindTest extends FunSuite {
)
)
}
test("find lithuanian dates") {
  // Written-out form: year, "m.", month name, day, "d."
  val longText = "Some text in lithuanian 2022 m. gegužės 21 d. and stuff"
  val longExpected = NerDateLabel(
    LocalDate.of(2022, 5, 21),
    NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
  )
  val longFound = DateFind.findDates(longText, Language.Lithuanian).toVector
  assertEquals(longFound, Vector(longExpected))

  // Purely numeric year-month-day form.
  val numericText = "2021-11-19"
  val numericExpected = NerDateLabel(
    LocalDate.of(2021, 11, 19),
    NerLabel("2021-11-19", NerTag.Date, 0, 10)
  )
  val numericFound = DateFind.findDates(numericText, Language.Lithuanian).toVector
  assertEquals(numericFound, Vector(numericExpected))
}
test("find polish dates") {
  // Written-out form: day, month name, year.
  val longText = "Some text in polish 21 maja 2022 and stuff"
  val longExpected = NerDateLabel(
    LocalDate.of(2022, 5, 21),
    NerLabel("21 maja 2022", NerTag.Date, 20, 32)
  )
  val longFound = DateFind.findDates(longText, Language.Polish).toVector
  assertEquals(longFound, Vector(longExpected))

  // Purely numeric day.month.year form.
  val numericText = "19.11.2021"
  val numericExpected = NerDateLabel(
    LocalDate.of(2021, 11, 19),
    NerLabel("19.11.2021", NerTag.Date, 0, 10)
  )
  val numericFound = DateFind.findDates(numericText, Language.Polish).toVector
  assertEquals(numericFound, Vector(numericExpected))
}
}

View File

@ -123,6 +123,16 @@ object Language {
val iso3 = "heb"
}
/** Lithuanian, identified by the two-letter code "lt" and the three-letter
  * code "lit".
  */
case object Lithuanian extends Language {
val iso2 = "lt"
val iso3 = "lit"
}
/** Polish, identified by the two-letter code "pl" and the three-letter code
  * "pol".
  */
case object Polish extends Language {
val iso2 = "pl"
val iso3 = "pol"
}
val all: List[Language] =
List(
German,
@ -142,7 +152,9 @@ object Language {
Romanian,
Latvian,
Japanese,
Hebrew
Hebrew,
Lithuanian,
Polish
)
def fromString(str: String): Either[String, Language] = {

View File

@ -193,5 +193,7 @@ object FtsRepository extends DoobieMeta {
case Language.Latvian => "simple"
case Language.Japanese => "simple"
case Language.Hebrew => "simple"
case Language.Lithuanian => "simple"
case Language.Polish => "simple"
}
}

View File

@ -133,7 +133,29 @@ object SolrSetup {
"Add hungarian",
addContentField(Language.Hungarian)
),
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
SolrMigration[F](
21,
"Add new field type for lithuanian content",
addFieldType(AddFieldType.textLit)
),
SolrMigration[F](
22,
"Add lithuanian",
addContentField(Language.Lithuanian)
),
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"),
SolrMigration[F](
24,
"Add new field type for polish content",
addFieldType(AddFieldType.textPol)
),
SolrMigration[F](
25,
"Add polish",
addContentField(Language.Polish)
),
SolrMigration.reIndexAll(26, "Re-Index after adding polish content field")
)
def addFolderField: F[Unit] =
@ -275,6 +297,28 @@ object SolrSetup {
)
)
/** Field type `text_lt` for Lithuanian content: a `solr.TextField` whose
  * analyzer uses the standard tokenizer and a single lower-case filter.
  */
val textLit = {
  val standardTokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
  AddFieldType("text_lt", "solr.TextField", Analyzer(standardTokenizer, List(lowerCase)))
}
/** Field type `text_pl` for Polish content: a `solr.TextField` whose
  * analyzer uses the standard tokenizer and a single lower-case filter.
  */
val textPol = {
  val standardTokenizer = Tokenizer("solr.StandardTokenizerFactory", Map.empty)
  val lowerCase = Filter("solr.LowerCaseFilterFactory", Map.empty)
  AddFieldType("text_pl", "solr.TextField", Analyzer(standardTokenizer, List(lowerCase)))
}
/** A filter entry of an analyzer: a factory class name (e.g.
  * "solr.LowerCaseFilterFactory") plus its attributes.
  */
final case class Filter(`class`: String, attr: Map[String, String])
/** The tokenizer of an analyzer: a factory class name (e.g.
  * "solr.StandardTokenizerFactory") plus its attributes.
  */
final case class Tokenizer(`class`: String, attr: Map[String, String])
/** An analyzer definition: one tokenizer and a list of filters. */
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@ -32,6 +32,8 @@ type Language
| Japanese
| Hebrew
| Hungarian
| Lithuanian
| Polish
fromString : String -> Maybe Language
@ -90,6 +92,12 @@ fromString str =
else if str == "hun" || str == "hu" || str == "hungarian" then
Just Hungarian
else if str == "lit" || str == "lt" || str == "lithuanian" then
Just Lithuanian
else if str == "pol" || str == "pl" || str == "polish" then
Just Polish
else
Nothing
@ -151,6 +159,12 @@ toIso3 lang =
Hungarian ->
"hun"
Lithuanian ->
"lit"
Polish ->
"pol"
all : List Language
all =
@ -172,4 +186,6 @@ all =
, Japanese
, Hebrew
, Hungarian
, Lithuanian
, Polish
]

View File

@ -71,6 +71,12 @@ gb lang =
Hungarian ->
"Hungarian"
Lithuanian ->
"Lithuanian"
Polish ->
"Polish"
de : Language -> String
de lang =
@ -129,6 +135,12 @@ de lang =
Hungarian ->
"Ungarisch"
Lithuanian ->
"Litauisch"
Polish ->
"Polnisch"
fr : Language -> String
fr lang =
@ -186,3 +198,9 @@ fr lang =
Hungarian ->
"Hongrois"
Lithuanian ->
"Lituanien"
Polish ->
"Polonais"

View File

@ -0,0 +1,70 @@
+++
title = "Adding new language"
weight = 30
+++
# Adding a new language for document processing
Then there are other commits and issues to look at:
- [Add Lithuanian](https://github.com/eikek/docspell/issues/1540) and [PR](https://github.com/eikek/docspell/pull/1559/commits/9d69401fea8ff07330c8a9116bd0d987827317c9)
- [Add Polish](https://github.com/eikek/docspell/issues/1345) and [PR](https://github.com/eikek/docspell/pull/1559/commits/1228937574ec52b36d5d77925c5fcdb1f536220c)
- [Add Spanish language](https://github.com/eikek/docspell/commit/26dff18ae0d32ce2b32b4d11ce381ada0e99314f)
- [Add Latvian language](https://github.com/eikek/docspell/issues/679) and [PR](https://github.com/eikek/docspell/pull/694/commits/9991ad5fcc43ccefe011a6cc4d01bdae4bcd4573)
- [Add Japanese language](https://github.com/eikek/docspell/issues/948) and [PR](https://github.com/eikek/docspell/pull/961/commits/f994d4b2488e64668ee064676f8c6469d9ccc1be), had some corrections: [1](https://github.com/eikek/docspell/commit/c59d4f8a6d021ec4b01a92320c211248503f16a5), [Issue](https://github.com/eikek/docspell/issues/973)
- [Add Hebrew language](https://github.com/eikek/docspell/pull/1027)
Some older commits may be a bit out of date, but still show the
relevant things to do. These are:
- add it to `Language.scala`, create a new `case object` and add it to
the `all` list (then fix compile errors)
- define a list of month names to support date recognition and update
`DateFind.scala` to recognize date patterns for that language. Add
some tests to `DateFindTest`.
- add it to joex' dockerfile to be available for tesseract
- update the solr migration/field definitions in `SolrSetup`. Create a
new solr migration that adds the content field for the new
language - it is a copy&paste from other similar changes.
- update `FtsRepository` for the PostgreSQL fulltext search variant:
if not sure, use `simple` here
- update the Elm file so the language shows up in the client. This also
requires adding translations in `Messages.Data.Language`
## Test
Check if everything is fine with `sbt Test/compile`. After the project
compiles without errors, run `sbt fix` to apply formatting fixes.
It would be good to start up docspell and check the new language a bit,
including whether fulltext search is working.
Sometimes, SOLR doesn't support a language. In this case the migration
needs to first add the new *field type*. There are examples for
Lithuanian and Hebrew in the code.
For the docker image, you can run
```bash
PLATFORMS=linux/amd64 ./build.sh 0.36.0-SNAPSHOT
```
in `docker/dockerfile` directory to build the docker image (just
choose some version, it doesn't matter).
## Non-NLP only
Note that this is without support for NLP. Including support for NLP
means that the [stanford nlp](https://github.com/stanfordnlp/CoreNLP)
library needs to provide models for it and these must be included in
the build and tested a bit.
## Opening issues on Github
You can also open an issue on GitHub requesting support for a language.
I kindly ask you to include all necessary information, like in
[this](https://github.com/eikek/docspell/issues/1540) issue. I know
that I can dig it out from websites, but it would be nice to have
everything ready. Also, it is better to learn some details from a local
person, such as which date patterns are more likely to appear than
others.

View File

@ -206,9 +206,3 @@ publishing the release. However, for the nightly releases, this
doesn't matter - everything must be automated here obviously. I also
wanted the docker images to be built from the exact same artifacts
that have been released at github (in contrast to being built again).
# Background Info
There is a list of [ADRs](@/docs/dev/adr/_index.md) containing
internal/background info for various topics.