Merge pull request #1559 from eikek/more-languages

More languages
mergify[bot] 2022-05-21 12:53:54 +00:00 committed by GitHub
commit 6b913873e6
11 changed files with 265 additions and 13 deletions

View File

@@ -31,6 +31,8 @@ RUN JDKPKG="openjdk11-jre"; \
    tesseract-ocr-data-lav \
    tesseract-ocr-data-jpn \
    tesseract-ocr-data-heb \
+   tesseract-ocr-data-lit \
+   tesseract-ocr-data-pol \
    unpaper \
    wkhtmltopdf \
    libreoffice \

View File

@@ -46,13 +46,16 @@ object DateFind {
    ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet

  private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
-    val stext =
+    val sep = " -\t.,\n\r/"
+    val (separators, stext) =
      if (lang == Language.Japanese) {
-        text.map(c => if (jpnChars.contains(c)) c else ' ')
-      } else text
+        (sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
+      } else if (lang == Language.Lithuanian) {
+        (sep + "md") -> text
+      } else sep -> text
    TextSplitter
-      .splitToken(stext, " -\t.,\n\r/年月日".toSet)
+      .splitToken(stext, separators.toSet)
      .filter(w => lang != Language.Latvian || w.value != "gada")
      .filter(w => lang != Language.Spanish || w.value != "de")
  }
@@ -105,6 +108,8 @@ object DateFind {
      case Language.Latvian => dmy.or(lavLong).or(ymd)
      case Language.Japanese => ymd
      case Language.Hebrew => dmy
+      case Language.Lithuanian => ymd
+      case Language.Polish => dmy
    }
    p.read(parts) match {
      case Result.Success(sds, _) =>

View File

@@ -56,6 +56,10 @@ object MonthName {
        japanese
      case Language.Hebrew =>
        hebrew
+      case Language.Lithuanian =>
+        lithuanian
+      case Language.Polish =>
+        polish
    }

  private val numbers = List(
@@ -341,4 +345,34 @@ object MonthName {
    List("XI", "nov", "november"),
    List("XII", "dec", "december")
  )
+
+  private val lithuanian = List(
+    List("sausis", "sausio", "saus"),
+    List("vasaris", "vasario", "vas"),
+    List("kovas", "kovo", "kov"),
+    List("balandis", "balandžio", "bal"),
+    List("gegužis", "gegužės", "geg"),
+    List("birželis", "birželio", "birž"),
+    List("liepa", "liepos", "liep"),
+    List("rugpjūtis", "rugpjūčio", "rugp"),
+    List("rugsėjis", "rugsėjo", "rugs"),
+    List("spalis", "spalio", "spal"),
+    List("lapkritis", "lapkričio", "lapkr"),
+    List("gruodis", "gruodžio", "gruod")
+  )
+
+  private val polish = List(
+    List("stycznia", "sty"),
+    List("lutego", "lut"),
+    List("marca", "mar"),
+    List("kwietnia", "kwi"),
+    List("maja", "maj"),
+    List("czerwca", "cze"),
+    List("lipca", "lip"),
+    List("sierpnia", "sie"),
+    List("września", "wrz"),
+    List("października", "paź"),
+    List("listopada", "lis"),
+    List("grudnia", "gru")
+  )
}

View File

@@ -191,7 +191,6 @@ class DateFindTest extends FunSuite {
        )
      )
    )
-    println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
    assertEquals(
      DateFind
        .findDates("2021-11-19", Language.Spanish)
@@ -204,4 +203,60 @@ class DateFindTest extends FunSuite {
      )
    )
  }
+
+  test("find lithuanian dates") {
+    assertEquals(
+      DateFind
+        .findDates(
+          "Some text in lithuanian 2022 m. gegužės 21 d. and stuff",
+          Language.Lithuanian
+        )
+        .toVector,
+      Vector(
+        NerDateLabel(
+          LocalDate.of(2022, 5, 21),
+          NerLabel("2022 m. gegužės 21", NerTag.Date, 24, 42)
+        )
+      )
+    )
+    assertEquals(
+      DateFind
+        .findDates("2021-11-19", Language.Lithuanian)
+        .toVector,
+      Vector(
+        NerDateLabel(
+          LocalDate.of(2021, 11, 19),
+          NerLabel("2021-11-19", NerTag.Date, 0, 10)
+        )
+      )
+    )
+  }
+
+  test("find polish dates") {
+    assertEquals(
+      DateFind
+        .findDates(
+          "Some text in polish 21 maja 2022 and stuff",
+          Language.Polish
+        )
+        .toVector,
+      Vector(
+        NerDateLabel(
+          LocalDate.of(2022, 5, 21),
+          NerLabel("21 maja 2022", NerTag.Date, 20, 32)
+        )
+      )
+    )
+    assertEquals(
+      DateFind
+        .findDates("19.11.2021", Language.Polish)
+        .toVector,
+      Vector(
+        NerDateLabel(
+          LocalDate.of(2021, 11, 19),
+          NerLabel("19.11.2021", NerTag.Date, 0, 10)
+        )
+      )
+    )
+  }
}

View File

@@ -123,6 +123,16 @@ object Language {
    val iso3 = "heb"
  }

+  case object Lithuanian extends Language {
+    val iso2 = "lt"
+    val iso3 = "lit"
+  }
+
+  case object Polish extends Language {
+    val iso2 = "pl"
+    val iso3 = "pol"
+  }
+
  val all: List[Language] =
    List(
      German,
@@ -142,7 +152,9 @@ object Language {
      Romanian,
      Latvian,
      Japanese,
-      Hebrew
+      Hebrew,
+      Lithuanian,
+      Polish
    )

  def fromString(str: String): Either[String, Language] = {

View File

@@ -193,5 +193,7 @@ object FtsRepository extends DoobieMeta {
    case Language.Latvian => "simple"
    case Language.Japanese => "simple"
    case Language.Hebrew => "simple"
+    case Language.Lithuanian => "simple"
+    case Language.Polish => "simple"
  }
}

View File

@@ -133,7 +133,29 @@ object SolrSetup {
        "Add hungarian",
        addContentField(Language.Hungarian)
      ),
-      SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
+      SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field"),
+      SolrMigration[F](
+        21,
+        "Add new field type for lithuanian content",
+        addFieldType(AddFieldType.textLit)
+      ),
+      SolrMigration[F](
+        22,
+        "Add lithuanian",
+        addContentField(Language.Lithuanian)
+      ),
+      SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"),
+      SolrMigration[F](
+        24,
+        "Add new field type for polish content",
+        addFieldType(AddFieldType.textPol)
+      ),
+      SolrMigration[F](
+        25,
+        "Add polish",
+        addContentField(Language.Polish)
+      ),
+      SolrMigration.reIndexAll(26, "Re-Index after adding polish content field")
    )

  def addFolderField: F[Unit] =
@@ -275,6 +297,28 @@ object SolrSetup {
      )
    )

+    val textLit = AddFieldType(
+      "text_lt",
+      "solr.TextField",
+      Analyzer(
+        Tokenizer("solr.StandardTokenizerFactory", Map.empty),
+        List(
+          Filter("solr.LowerCaseFilterFactory", Map.empty)
+        )
+      )
+    )
+
+    val textPol = AddFieldType(
+      "text_pl",
+      "solr.TextField",
+      Analyzer(
+        Tokenizer("solr.StandardTokenizerFactory", Map.empty),
+        List(
+          Filter("solr.LowerCaseFilterFactory", Map.empty)
+        )
+      )
+    )
+
  final case class Filter(`class`: String, attr: Map[String, String])
  final case class Tokenizer(`class`: String, attr: Map[String, String])
  final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])

View File

@@ -32,6 +32,8 @@ type Language
    | Japanese
    | Hebrew
    | Hungarian
+    | Lithuanian
+    | Polish


fromString : String -> Maybe Language
@@ -90,6 +92,12 @@ fromString str =
    else if str == "hun" || str == "hu" || str == "hungarian" then
        Just Hungarian

+    else if str == "lit" || str == "lt" || str == "lithuanian" then
+        Just Lithuanian
+
+    else if str == "pol" || str == "pl" || str == "polish" then
+        Just Polish
+
    else
        Nothing
@@ -151,6 +159,12 @@ toIso3 lang =
        Hungarian ->
            "hun"

+        Lithuanian ->
+            "lit"
+
+        Polish ->
+            "pol"
+
all : List Language
all =
@@ -172,4 +186,6 @@ all =
    , Japanese
    , Hebrew
    , Hungarian
+    , Lithuanian
+    , Polish
    ]

View File

@@ -71,6 +71,12 @@ gb lang =
        Hungarian ->
            "Hungarian"

+        Lithuanian ->
+            "Lithuanian"
+
+        Polish ->
+            "Polish"
+

de : Language -> String
de lang =
@@ -129,6 +135,12 @@ de lang =
        Hungarian ->
            "Ungarisch"

+        Lithuanian ->
+            "Litauisch"
+
+        Polish ->
+            "Polnisch"
+

fr : Language -> String
fr lang =
@@ -186,3 +198,9 @@ fr lang =
        Hungarian ->
            "Hongrois"

+        Lithuanian ->
+            "Lituanien"
+
+        Polish ->
+            "Polonais"

View File

@@ -0,0 +1,70 @@
+++
title = "Adding a new language"
weight = 30
+++
# Adding a new language for document processing
There are several previous commits and issues to look at:
- [Add Lithuanian](https://github.com/eikek/docspell/issues/1540) and [PR](https://github.com/eikek/docspell/pull/1559/commits/9d69401fea8ff07330c8a9116bd0d987827317c9)
- [Add Polish](https://github.com/eikek/docspell/issues/1345) and [PR](https://github.com/eikek/docspell/pull/1559/commits/1228937574ec52b36d5d77925c5fcdb1f536220c)
- [Add Spanish language](https://github.com/eikek/docspell/commit/26dff18ae0d32ce2b32b4d11ce381ada0e99314f)
- [Add Latvian language](https://github.com/eikek/docspell/issues/679) and [PR](https://github.com/eikek/docspell/pull/694/commits/9991ad5fcc43ccefe011a6cc4d01bdae4bcd4573)
- [Add Japanese language](https://github.com/eikek/docspell/issues/948) and [PR](https://github.com/eikek/docspell/pull/961/commits/f994d4b2488e64668ee064676f8c6469d9ccc1be), had some corrections: [1](https://github.com/eikek/docspell/commit/c59d4f8a6d021ec4b01a92320c211248503f16a5), [Issue](https://github.com/eikek/docspell/issues/973)
- [Add Hebrew language](https://github.com/eikek/docspell/pull/1027)
Some older commits may be a bit out of date, but still show the
relevant things to do. These are:
- add it to `Language.scala`: create a new `case object` and add it
  to the `all` list, then fix the resulting compile errors (see the
  sketch after this list)
- define a list of month names in `MonthName.scala` to support date
  recognition and update `DateFind.scala` to recognize date patterns
  for that language. Add some tests to `DateFindTest`.
- add the language's data package to the joex Dockerfile so it is
  available to tesseract
- update the Solr migration/field definitions in `SolrSetup`: create
  a new Solr migration that adds the content field for the new
  language - it is mostly copy & paste from other, similar changes.
- update `FtsRepository` for the PostgreSQL full-text search variant;
  if unsure, use `simple` here
- update the Elm `Language` module so the new language shows up in
  the web client; this also requires adding translations in
  `Messages.Data.Language`
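
The first two bullets are mostly mechanical. Below is a minimal
sketch of their shape only, using a made-up placeholder language
`Xyzish`; the real definitions live in `Language.scala`,
`MonthName.scala` and `DateFind.scala` (see the Lithuanian and Polish
changes in this PR):

```scala
// Sketch only, not the actual docspell sources. `Xyzish` and the month
// strings are placeholders for the language being added.
sealed trait Language {
  def iso2: String
  def iso3: String
}

object Language {
  case object Xyzish extends Language {
    val iso2 = "xy"  // ISO 639-1 code
    val iso3 = "xyz" // ISO 639-3 code
  }

  // the new case object must also be appended to the `all` list
  val all: List[Language] = List(Xyzish)
}

object MonthName {
  // one inner list per month, holding the spellings (full, inflected,
  // abbreviated) that typically appear in documents of that language
  private val xyzish: List[List[String]] = List(
    List("xanuary", "xan"),
    List("xebruary", "xeb")
    // ... one entry per month, twelve in total
  )

  def getAll(lang: Language): List[List[String]] =
    lang match {
      case Language.Xyzish => xyzish
    }
}
```

In `DateFind` the new language then only needs a case in the
date-pattern match (for example `ymd` for Lithuanian and `dmy` for
Polish, as the diff above shows), plus tests in `DateFindTest` along
the lines of the Lithuanian and Polish ones added here.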
## Test
Check if everything is fine with `sbt Test/compile`. After the project
compiles without errors, run `sbt fix` to apply formatting fixes.
It would be good to start up docspell and check the new language a
bit, including whether full-text search is working.
Sometimes Solr doesn't have built-in support for a language. In this
case the migration first needs to add a new *field type*. There are
examples for Lithuanian and Hebrew in the code.
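
As a rough template (placeholder names only: `XyzFieldType`,
`textXyz`, `text_xy`; the real helpers `addFieldType`/`addContentField`
and the migration list live in `SolrSetup.scala`), such a field type
can simply tokenize and lower-case, mirroring what this PR does for
Lithuanian and Polish:

```scala
// Sketch only: the case-class shapes are copied from SolrSetup above,
// the "Xyz"/"text_xy" names are made up for illustration.
final case class Filter(`class`: String, attr: Map[String, String])
final case class Tokenizer(`class`: String, attr: Map[String, String])
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
final case class AddFieldType(name: String, `class`: String, analyzer: Analyzer)

object XyzFieldType {
  // A plain field type for a language Solr has no dedicated analyzer for:
  // standard tokenizer plus lower-casing.
  val textXyz = AddFieldType(
    "text_xy",
    "solr.TextField",
    Analyzer(
      Tokenizer("solr.StandardTokenizerFactory", Map.empty),
      List(Filter("solr.LowerCaseFilterFactory", Map.empty))
    )
  )
}
```

The migration list in `SolrSetup` then gains three entries for the
language: add the field type, add the content field, and re-index,
analogous to migrations 21-26 in this PR.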
For the docker image, you can run
```bash
PLATFORMS=linux/amd64 ./build.sh 0.36.0-SNAPSHOT
```
in the `docker/dockerfile` directory to build the docker image (just
pick some version; it doesn't matter which).
## Non-NLP only
Note that this adds the language without NLP support. Including NLP
support means that the [Stanford NLP](https://github.com/stanfordnlp/CoreNLP)
library needs to provide models for the language, and these must be
included in the build and tested a bit.
## Opening issues on GitHub
You can also open an issue on GitHub requesting support for a
language. I kindly ask that you include all the necessary
information, as in [this](https://github.com/eikek/docspell/issues/1540)
issue. I know that I can dig it out from websites, but it is nice to
have everything ready. It is also better to learn some details from a
native speaker, for example which date patterns are more likely to
appear than others.

View File

@@ -206,9 +206,3 @@ publishing the release. However, for the nightly releases, this
doesn't matter - everything must be automated here obviously. I also
wanted the docker images to be built from the exact same artifacts
that have been released at github (in contrast to being built again).
-
-# Background Info
-
-There is a list of [ADRs](@/docs/dev/adr/_index.md) containing
-internal/background info for various topics.