From 637f11d0f667204416128f7e741b9a7ffa4be8bb Mon Sep 17 00:00:00 2001 From: eikek Date: Sat, 28 Aug 2021 00:10:36 +0200 Subject: [PATCH] Fix solr setup by adding a text_he field This field is used for the Hebrew language. Solr doesn't support it out of the box. The new field type is just a very basic field using the standard tokenizer and lowercase filter. It will very likely not provide good results. Hebrew is really difficult and it requires at least installing plugins for solr - this is out of scope for docspell. Users can set up their solr however they like and run a re-index afterwards. --- .../scala/docspell/ftssolr/SolrSetup.scala | 66 ++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 8ebd323b..02fe1f15 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -18,6 +18,7 @@ import org.http4s._ import org.http4s.circe._ import org.http4s.client.Client import org.http4s.client.dsl.Http4sClientDsl +import org.log4s.getLogger trait SolrSetup[F[_]] { @@ -29,6 +30,7 @@ trait SolrSetup[F[_]] { object SolrSetup { private val versionDocId = "6d8f09f4-8d7e-4bc9-98b8-7c89223b36dd" + private[this] val logger = getLogger def apply[F[_]: Async](cfg: SolrConfig, client: Client[F]): SolrSetup[F] = { val dsl = new Http4sClientDsl[F] {} @@ -117,10 +119,15 @@ object SolrSetup { SolrMigration.reIndexAll(15, "Re-Index after adding japanese content field"), SolrMigration[F]( 16, + "Add new field type for hebrew content", + addFieldType(AddFieldType.textHe) + ), + SolrMigration[F]( + 17, "Add hebrew content field", addContentField(Language.Hebrew) ), - SolrMigration.reIndexAll(17, "Re-Index after adding hebrew content field") + SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field") ) def addFolderField: 
F[Unit] =
@@ -194,6 +201,15 @@ object SolrSetup {
         run(DeleteField.command(DeleteField(field))).attempt *>
           run(AddField.command(AddField.textLang(field, lang)))
       }
+      /** Runs the add-field-type command for ft; a failure is logged as a warning and not re-raised. */
+      private def addFieldType(ft: AddFieldType): F[Unit] =
+        run(AddFieldType.command(ft)).attempt.flatMap {
+          case Right(_) => ().pure[F]
+          case Left(ex) => // the error is only logged, so the effect still completes with Unit
+            Async[F].delay(
+              logger.warn(s"Adding new field type '$ft' failed: ${ex.getMessage()}")
+            )
+        }
     }
   }
 
@@ -234,4 +250,52 @@ object SolrSetup {
     def command(body: DeleteField): Json =
       Map("delete-field" -> body.asJson).asJson
   }
+  /** Body of the add-field-type command: the field type name, its class and its analyzer. */
+  final case class AddFieldType(
+      name: String,
+      `class`: String,
+      analyzer: AddFieldType.Analyzer
+  )
+  object AddFieldType {
+    /** Field type text_he: solr.TextField with a standard tokenizer and a lowercase filter, no attributes. */
+    val textHe = AddFieldType(
+      "text_he",
+      "solr.TextField",
+      Analyzer(
+        Tokenizer("solr.StandardTokenizerFactory", Map.empty),
+        List(
+          Filter("solr.LowerCaseFilterFactory", Map.empty)
+        )
+      )
+    )
+    // Analyzer parts: each filter and tokenizer is a class name plus a map of string attributes.
+    final case class Filter(`class`: String, attr: Map[String, String])
+    final case class Tokenizer(`class`: String, attr: Map[String, String])
+    final case class Analyzer(tokenizer: Tokenizer, filter: List[Filter])
+    // Filter and Tokenizer are encoded as one flat JSON object: the attr entries plus a class entry.
+    object Filter {
+      implicit val jsonEncoder: Encoder[Filter] =
+        Encoder.encodeJson.contramap { filter =>
+          val m = filter.attr.updated("class", filter.`class`) // replaces any class key already in attr
+          m.asJson
+        }
+    }
+    object Tokenizer {
+      implicit val jsonEncoder: Encoder[Tokenizer] =
+        Encoder.encodeJson.contramap { tokenizer =>
+          val m = tokenizer.attr.updated("class", tokenizer.`class`) // replaces any class key already in attr
+          m.asJson
+        }
+    }
+    object Analyzer {
+      implicit val jsonEncoder: Encoder[Analyzer] =
+        deriveEncoder[Analyzer]
+    }
+    /** Wraps the encoded body in a JSON object under the add-field-type key. */
+    def command(body: AddFieldType): Json =
+      Map("add-field-type" -> body.asJson).asJson
+    // Encoder for the command body, obtained via deriveEncoder.
+    implicit val jsonEncoder: Encoder[AddFieldType] =
+      deriveEncoder[AddFieldType]
+  }
 }