Reorganize nlp pipeline and add nlp-unsupported language italian

Improves and reorganizes how nlp pipelines are set up. Users can now
choose from several options, depending on their hardware and usage
scenario.

This is the basis for supporting more languages without depending on
what stanford-nlp supports. For such languages, support is then limited
to text extraction and simple regex-ner processing.
This commit is contained in:
Eike Kettner
2021-01-16 23:43:24 +01:00
parent a70e9ab614
commit f01646aeb5
29 changed files with 676 additions and 255 deletions

View File

@ -24,6 +24,7 @@ object Field {
val content_de = Field("content_de")
val content_en = Field("content_en")
val content_fr = Field("content_fr")
val content_it = Field("content_it")
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")
val folderId = Field("folder")
@ -36,6 +37,8 @@ object Field {
Field.content_en
case Language.French =>
Field.content_fr
case Language.Italian =>
Field.content_it
}
implicit val jsonEncoder: Encoder[Field] =

View File

@ -40,6 +40,7 @@ object SolrQuery {
Field.content_de,
Field.content_en,
Field.content_fr,
Field.content_it,
Field.itemName,
Field.itemNotes,
Field.attachmentName

View File

@ -63,6 +63,12 @@ object SolrSetup {
solrEngine,
"Index all from database",
FtsMigration.Result.indexAll.pure[F]
),
FtsMigration[F](
7,
solrEngine,
"Add content_it field",
addContentItField.map(_ => FtsMigration.Result.reIndexAll)
)
)
@ -72,6 +78,9 @@ object SolrSetup {
/** Adds the `content_fr` text field by delegating to `addTextField`
  * with French as the field's language.
  */
def addContentFrField: F[Unit] =
addTextField(Some(Language.French))(Field.content_fr)
/** Adds the `content_it` text field by delegating to `addTextField`
  * with Italian as the field's language.
  *
  * Used by the "Add content_it field" migration, which maps the result
  * to a re-index-all request.
  */
def addContentItField: F[Unit] =
addTextField(Some(Language.Italian))(Field.content_it)
def setupCoreSchema: F[Unit] = {
val cmds0 =
List(
@ -90,13 +99,15 @@ object SolrSetup {
)
.traverse(addTextField(None))
val cntLang = Language.all.traverse {
val cntLang = List(Language.German, Language.English, Language.French).traverse {
case l @ Language.German =>
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
case l @ Language.French =>
addTextField(l.some)(Field.content_fr)
case _ =>
().pure[F]
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
@ -125,6 +136,9 @@ object SolrSetup {
case Some(Language.French) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textFR(field)))
case Some(Language.Italian) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textIT(field)))
}
}
}
@ -161,6 +175,9 @@ object SolrSetup {
/** Builds an `AddField` for `field` with the type name `"text_fr"`.
  *
  * The three boolean flags are passed positionally; their meaning is
  * defined by the `AddField` constructor, which is not visible here.
  * NOTE(review): `"text_fr"` is presumably Solr's French text field
  * type -- confirm against the core's schema.
  */
def textFR(field: Field): AddField =
AddField(field, "text_fr", true, true, false)
/** Builds an `AddField` for `field` with the type name `"text_it"`.
  *
  * The three boolean flags are passed positionally; their meaning is
  * defined by the `AddField` constructor, which is not visible here.
  * NOTE(review): `"text_it"` is presumably Solr's Italian text field
  * type -- confirm that the target core's schema defines it.
  */
def textIT(field: Field): AddField =
AddField(field, "text_it", true, true, false)
}
case class DeleteField(name: Field)