mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Add Polish to processing languages
Solr doesn't support Polish out of the box; plugins are required for it. The language has been added with basic support only. For better results, a manual setup of Solr is required. Closes: #1345
This commit is contained in:
@ -32,6 +32,7 @@ RUN JDKPKG="openjdk11-jre"; \
|
|||||||
tesseract-ocr-data-jpn \
|
tesseract-ocr-data-jpn \
|
||||||
tesseract-ocr-data-heb \
|
tesseract-ocr-data-heb \
|
||||||
tesseract-ocr-data-lit \
|
tesseract-ocr-data-lit \
|
||||||
|
tesseract-ocr-data-pol \
|
||||||
unpaper \
|
unpaper \
|
||||||
wkhtmltopdf \
|
wkhtmltopdf \
|
||||||
libreoffice \
|
libreoffice \
|
||||||
|
@ -46,13 +46,16 @@ object DateFind {
|
|||||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||||
|
|
||||||
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||||
val stext =
|
val sep = " -\t.,\n\r/"
|
||||||
|
val (separators, stext) =
|
||||||
if (lang == Language.Japanese) {
|
if (lang == Language.Japanese) {
|
||||||
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
(sep + "年月日") -> text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||||
} else text
|
} else if (lang == Language.Lithuanian) {
|
||||||
|
(sep + "md") -> text
|
||||||
|
} else sep -> text
|
||||||
|
|
||||||
TextSplitter
|
TextSplitter
|
||||||
.splitToken(stext, " -\t.,\n\r/年月日md".toSet)
|
.splitToken(stext, separators.toSet)
|
||||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||||
}
|
}
|
||||||
@ -106,6 +109,7 @@ object DateFind {
|
|||||||
case Language.Japanese => ymd
|
case Language.Japanese => ymd
|
||||||
case Language.Hebrew => dmy
|
case Language.Hebrew => dmy
|
||||||
case Language.Lithuanian => ymd
|
case Language.Lithuanian => ymd
|
||||||
|
case Language.Polish => dmy
|
||||||
}
|
}
|
||||||
p.read(parts) match {
|
p.read(parts) match {
|
||||||
case Result.Success(sds, _) =>
|
case Result.Success(sds, _) =>
|
||||||
|
@ -58,6 +58,8 @@ object MonthName {
|
|||||||
hebrew
|
hebrew
|
||||||
case Language.Lithuanian =>
|
case Language.Lithuanian =>
|
||||||
lithuanian
|
lithuanian
|
||||||
|
case Language.Polish =>
|
||||||
|
polish
|
||||||
}
|
}
|
||||||
|
|
||||||
private val numbers = List(
|
private val numbers = List(
|
||||||
@ -358,4 +360,19 @@ object MonthName {
|
|||||||
List("lapkritis", "lapkričio", "lapkr"),
|
List("lapkritis", "lapkričio", "lapkr"),
|
||||||
List("gruodis", "gruodžio", "gruod")
|
List("gruodis", "gruodžio", "gruod")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private val polish = List(
|
||||||
|
List("stycznia", "sty"),
|
||||||
|
List("lutego", "lut"),
|
||||||
|
List("marca", "mar"),
|
||||||
|
List("kwietnia", "kwi"),
|
||||||
|
List("maja", "maj"),
|
||||||
|
List("czerwca", "cze"),
|
||||||
|
List("lipca", "lip"),
|
||||||
|
List("sierpnia", "sie"),
|
||||||
|
List("września", "wrz"),
|
||||||
|
List("października", "paź"),
|
||||||
|
List("listopada", "lis"),
|
||||||
|
List("grudnia", "gru")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -231,4 +231,32 @@ class DateFindTest extends FunSuite {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find polish dates") {
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates(
|
||||||
|
"Some text in polish 21 maja 2022 and stuff",
|
||||||
|
Language.Polish
|
||||||
|
)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2022, 5, 21),
|
||||||
|
NerLabel("21 maja 2022", NerTag.Date, 20, 32)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("19.11.2021", Language.Polish)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2021, 11, 19),
|
||||||
|
NerLabel("19.11.2021", NerTag.Date, 0, 10)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -128,6 +128,11 @@ object Language {
|
|||||||
val iso3 = "lit"
|
val iso3 = "lit"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case object Polish extends Language {
|
||||||
|
val iso2 = "pl"
|
||||||
|
val iso3 = "pol"
|
||||||
|
}
|
||||||
|
|
||||||
val all: List[Language] =
|
val all: List[Language] =
|
||||||
List(
|
List(
|
||||||
German,
|
German,
|
||||||
@ -148,7 +153,8 @@ object Language {
|
|||||||
Latvian,
|
Latvian,
|
||||||
Japanese,
|
Japanese,
|
||||||
Hebrew,
|
Hebrew,
|
||||||
Lithuanian
|
Lithuanian,
|
||||||
|
Polish
|
||||||
)
|
)
|
||||||
|
|
||||||
def fromString(str: String): Either[String, Language] = {
|
def fromString(str: String): Either[String, Language] = {
|
||||||
|
@ -194,5 +194,6 @@ object FtsRepository extends DoobieMeta {
|
|||||||
case Language.Japanese => "simple"
|
case Language.Japanese => "simple"
|
||||||
case Language.Hebrew => "simple"
|
case Language.Hebrew => "simple"
|
||||||
case Language.Lithuanian => "simple"
|
case Language.Lithuanian => "simple"
|
||||||
|
case Language.Polish => "simple"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -144,7 +144,18 @@ object SolrSetup {
|
|||||||
"Add lithuanian",
|
"Add lithuanian",
|
||||||
addContentField(Language.Lithuanian)
|
addContentField(Language.Lithuanian)
|
||||||
),
|
),
|
||||||
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field")
|
SolrMigration.reIndexAll(23, "Re-Index after adding lithuanian content field"),
|
||||||
|
SolrMigration[F](
|
||||||
|
24,
|
||||||
|
"Add new field type for polish content",
|
||||||
|
addFieldType(AddFieldType.textPol)
|
||||||
|
),
|
||||||
|
SolrMigration[F](
|
||||||
|
25,
|
||||||
|
"Add polish",
|
||||||
|
addContentField(Language.Polish)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(26, "Re-Index after adding polish content field")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
@ -297,6 +308,17 @@ object SolrSetup {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
val textPol = AddFieldType(
|
||||||
|
"text_pl",
|
||||||
|
"solr.TextField",
|
||||||
|
Analyzer(
|
||||||
|
Tokenizer("solr.StandardTokenizerFactory", Map.empty),
|
||||||
|
List(
|
||||||
|
Filter("solr.LowerCaseFilterFactory", Map.empty)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
final case class Filter(`class`: String, attr: Map[String, String])
|
final case class Filter(`class`: String, attr: Map[String, String])
|
||||||
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
final case class Tokenizer(`class`: String, attr: Map[String, String])
|
||||||
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
|
||||||
|
@ -33,6 +33,7 @@ type Language
|
|||||||
| Hebrew
|
| Hebrew
|
||||||
| Hungarian
|
| Hungarian
|
||||||
| Lithuanian
|
| Lithuanian
|
||||||
|
| Polish
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -94,6 +95,9 @@ fromString str =
|
|||||||
else if str == "lit" || str == "lt" || str == "lithuanian" then
|
else if str == "lit" || str == "lt" || str == "lithuanian" then
|
||||||
Just Lithuanian
|
Just Lithuanian
|
||||||
|
|
||||||
|
else if str == "pol" || str == "pl" || str == "polish" then
|
||||||
|
Just Polish
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -158,6 +162,9 @@ toIso3 lang =
|
|||||||
Lithuanian ->
|
Lithuanian ->
|
||||||
"lit"
|
"lit"
|
||||||
|
|
||||||
|
Polish ->
|
||||||
|
"pol"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -180,4 +187,5 @@ all =
|
|||||||
, Hebrew
|
, Hebrew
|
||||||
, Hungarian
|
, Hungarian
|
||||||
, Lithuanian
|
, Lithuanian
|
||||||
|
, Polish
|
||||||
]
|
]
|
||||||
|
@ -74,6 +74,9 @@ gb lang =
|
|||||||
Lithuanian ->
|
Lithuanian ->
|
||||||
"Lithuanian"
|
"Lithuanian"
|
||||||
|
|
||||||
|
Polish ->
|
||||||
|
"Polish"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -135,6 +138,9 @@ de lang =
|
|||||||
Lithuanian ->
|
Lithuanian ->
|
||||||
"Litauisch"
|
"Litauisch"
|
||||||
|
|
||||||
|
Polish ->
|
||||||
|
"Polnisch"
|
||||||
|
|
||||||
|
|
||||||
fr : Language -> String
|
fr : Language -> String
|
||||||
fr lang =
|
fr lang =
|
||||||
@ -195,3 +201,6 @@ fr lang =
|
|||||||
|
|
||||||
Lithuanian ->
|
Lithuanian ->
|
||||||
"Lituanien"
|
"Lituanien"
|
||||||
|
|
||||||
|
Polish ->
|
||||||
|
"Polonais"
|
||||||
|
Reference in New Issue
Block a user