From 3576c45d1a8c53a5a054d6529ce78e7424effe6f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sat, 20 Jun 2020 01:57:24 +0200 Subject: [PATCH] First basic working solr search --- .../docspell/backend/ops/OFulltext.scala | 7 +- .../main/scala/docspell/common/Duration.scala | 3 + .../main/scala/docspell/common/Ident.scala | 2 +- .../docspell/ftsclient/FtsBasicResult.scala | 19 ---- .../scala/docspell/ftsclient/FtsClient.scala | 2 +- .../scala/docspell/ftsclient/FtsQuery.scala | 2 +- .../scala/docspell/ftsclient/FtsResult.scala | 28 ++++++ .../main/scala/docspell/ftssolr/Field.scala | 29 ++++++ .../main/scala/docspell/ftssolr/Fields.scala | 19 ---- .../scala/docspell/ftssolr/JsonCodec.scala | 93 ++++++++++++++----- .../scala/docspell/ftssolr/QueryData.scala | 56 +++++++++++ .../docspell/ftssolr/SolrFtsClient.scala | 26 ++---- .../scala/docspell/ftssolr/SolrQuery.scala | 54 +++++++++++ .../scala/docspell/ftssolr/SolrSetup.scala | 44 +++++---- 14 files changed, 277 insertions(+), 107 deletions(-) delete mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala create mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala create mode 100644 modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala delete mode 100644 modules/fts-solr/src/main/scala/docspell/ftssolr/Fields.scala create mode 100644 modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala create mode 100644 modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala index aa404108..c58213a5 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala @@ -20,7 +20,6 @@ object OFulltext { // then run a query // check if supported by mariadb, postgres and h2. seems like it is supported everywhere - def apply[F[_]: Effect]( itemSearch: OItemSearch[F], fts: FtsClient[F] @@ -43,21 +42,21 @@ object OFulltext { .compile .toVector - private def findItemsFts[A]( q: Query, ftsQ: String, batch: Batch, search: (Query, Batch) => F[Vector[A]] ): Stream[F, A] = { - val fq = FtsQuery(ftsQ, q.collective, batch.limit, batch.offset) + val fq = FtsQuery(ftsQ, q.collective, batch.limit, batch.offset, Nil) val qres = for { items <- fts .searchBasic(fq) - .map(_.item) + .flatMap(r => Stream.emits(r.results)) + .map(_.itemId) .compile .toVector .map(_.toSet) diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index bb47059e..dfda4652 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -25,6 +25,9 @@ case class Duration(nanos: Long) { def formatExact: String = s"$millis ms" + + override def toString(): String = + s"Duration(${millis}ms)" } object Duration { diff --git a/modules/common/src/main/scala/docspell/common/Ident.scala b/modules/common/src/main/scala/docspell/common/Ident.scala index b9bfa95c..08c008f6 100644 --- a/modules/common/src/main/scala/docspell/common/Ident.scala +++ b/modules/common/src/main/scala/docspell/common/Ident.scala @@ -17,7 +17,7 @@ case class Ident(id: String) { !isEmpty def / (next: Ident): Ident = - new Ident(id + "/" + next.id) + new Ident(id + "." + next.id) } object Ident { diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala deleted file mode 100644 index 3e0b5e61..00000000 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala +++ /dev/null @@ -1,19 +0,0 @@ -package docspell.ftsclient - -import cats.data.NonEmptyList -import cats.implicits._ -import docspell.common._ - -import FtsBasicResult.AttachmentMatch - -final case class FtsBasicResult(item: Ident, attachments: NonEmptyList[AttachmentMatch]) { - - def score: Double = - attachments.map(_.score).toList.max -} - -object FtsBasicResult { - - case class AttachmentMatch(id: Ident, score: Double) - -} diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala index b4db64c0..94d45f1c 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala @@ -19,7 +19,7 @@ trait FtsClient[F[_]] { */ def initialize: F[Unit] - def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult] + def searchBasic(q: FtsQuery): Stream[F, FtsResult] def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] } diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala index 6cbee19f..93dff968 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala @@ -7,4 +7,4 @@ import docspell.common._ * The query itself is a raw string. Each implementation may * interpret it according to the system in use. */ -final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int) +final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int, items: List[Ident]) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala new file mode 100644 index 00000000..8aa85dd3 --- /dev/null +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala @@ -0,0 +1,28 @@ +package docspell.ftsclient + +import docspell.common._ + +import FtsResult.ItemMatch + +final case class FtsResult( + qtime: Duration, + count: Int, + maxScore: Double, + highlight: Map[Ident, List[String]], + results: List[ItemMatch] +) {} + +object FtsResult { + + sealed trait MatchData + case class AttachmentData(attachId: Ident) extends MatchData + case object ItemData extends MatchData + + case class ItemMatch( + id: Ident, + itemId: Ident, + collectiveId: Ident, + score: Double, + data: MatchData + ) +} diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala new file mode 100644 index 00000000..be6cd0eb --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -0,0 +1,29 @@ +package docspell.ftssolr + +import io.circe._ + +final class Field(val name: String) extends AnyVal { + + override def toString(): String = s"Field($name)" +} + +object Field { + + def apply(name: String): Field = + new Field(name) + + + val discriminator = Field("discriminator") + val id = Field("id") + val itemId = Field("itemId") + val collectiveId = Field("collectiveId") + val attachmentId = Field("attachmentId") + val attachmentName = Field("attachmentName") + val content = Field("content") + val itemName = Field("itemName") + val itemNotes = Field("itemNotes") + + + implicit val jsonEncoder: Encoder[Field] = + Encoder.encodeString.contramap(_.name) +} diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Fields.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Fields.scala deleted file mode 100644 index f9ecc354..00000000 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Fields.scala +++ /dev/null @@ -1,19 +0,0 @@ -package docspell.ftssolr - -object Fields { - val discriminator = "discriminator" - val id = "id" - val itemId = "itemId" - val collectiveId = "collectiveId" - - object Attachment { - val attachmentId = "attachmentId" - val attachmentName = "attachmentName" - val content = "content" - } - - object Item { - val itemName = "itemName" - val itemNotes = "itemNotes" - } -} diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala index 9ad35645..6b55efb3 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala @@ -1,43 +1,92 @@ package docspell.ftssolr +//import cats.implicits._ +import io.circe._ import docspell.common._ import docspell.ftsclient._ -import io.circe._ -import Fields.{Item, Attachment} trait JsonCodec { - implicit def attachmentEncoder: Encoder[TextData.Attachment] = + implicit def attachmentEncoder(implicit + enc: Encoder[Ident] + ): Encoder[TextData.Attachment] = new Encoder[TextData.Attachment] { - final def apply(td: TextData.Attachment): Json = Json.obj( - (Fields.id, Ident.encodeIdent(td.id)), - (Fields.itemId, Ident.encodeIdent(td.item)), - (Fields.collectiveId, Ident.encodeIdent(td.collective)), - (Attachment.attachmentId, Ident.encodeIdent(td.attachId)), - (Attachment.attachmentName, Json.fromString(td.name.getOrElse(""))), - (Attachment.content, Json.fromString(td.text.getOrElse(""))), - (Fields.discriminator, Json.fromString("attachment")) - ) + final def apply(td: TextData.Attachment): Json = + Json.obj( + (Field.id.name, enc(td.id)), + (Field.itemId.name, enc(td.item)), + (Field.collectiveId.name, enc(td.collective)), + (Field.attachmentId.name, enc(td.attachId)), + (Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))), + (Field.content.name, Json.fromString(td.text.getOrElse(""))), + (Field.discriminator.name, Json.fromString("attachment")) + ) } - implicit def itemEncoder: Encoder[TextData.Item] = + implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] = new Encoder[TextData.Item] { - final def apply(td: TextData.Item): Json = Json.obj( - (Fields.id, Ident.encodeIdent(td.id)), - (Fields.itemId, Ident.encodeIdent(td.item)), - (Fields.collectiveId, Ident.encodeIdent(td.collective)), - (Item.itemName, Json.fromString(td.name.getOrElse(""))), - (Item.itemNotes, Json.fromString(td.notes.getOrElse(""))), - (Fields.discriminator, Json.fromString("item")) - ) + final def apply(td: TextData.Item): Json = + Json.obj( + (Field.id.name, enc(td.id)), + (Field.itemId.name, enc(td.item)), + (Field.collectiveId.name, enc(td.collective)), + (Field.itemName.name, Json.fromString(td.name.getOrElse(""))), + (Field.itemNotes.name, Json.fromString(td.notes.getOrElse(""))), + (Field.discriminator.name, Json.fromString("item")) + ) } - implicit def textDataEncoder(implicit ae: Encoder[TextData.Attachment], ie: Encoder[TextData.Item] ): Encoder[TextData] = Encoder(_.fold(ae.apply, ie.apply)) + + implicit def ftsResultDecoder: Decoder[FtsResult] = + new Decoder[FtsResult] { + final def apply(c: HCursor): Decoder.Result[FtsResult] = + for { + qtime <- c.downField("responseHeader").get[Duration]("QTime") + count <- c.downField("response").get[Int]("numFound") + maxScore <- c.downField("response").get[Double]("maxScore") + results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs") + highligh <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting") + highline = highligh.map(kv => kv._1 -> kv._2.values.flatten.toList) + } yield FtsResult(qtime, count, maxScore, highline, results) + } + + implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] = + new Decoder[FtsResult.ItemMatch] { + final def apply(c: HCursor): Decoder.Result[FtsResult.ItemMatch] = + for { + itemId <- c.get[Ident]("itemId") + id <- c.get[Ident]("id") + coll <- c.get[Ident]("collectiveId") + score <- c.get[Double]("score") + md <- decodeMatchData(c) + } yield FtsResult.ItemMatch(id, itemId, coll, score, md) + } + + def decodeMatchData: Decoder[FtsResult.MatchData] = + new Decoder[FtsResult.MatchData] { + final def apply(c: HCursor): Decoder.Result[FtsResult.MatchData] = + for { + disc <- c.get[String]("discriminator") + md <- + if ("attachment" == disc) + c.get[Ident]("attachmentId").map(FtsResult.AttachmentData.apply) + else Right(FtsResult.ItemData) + } yield md + } + + implicit def identKeyEncoder: KeyEncoder[Ident] = + new KeyEncoder[Ident] { + override def apply(ident: Ident): String = ident.id + } + implicit def identKeyDecoder: KeyDecoder[Ident] = + new KeyDecoder[Ident] { + override def apply(ident: String): Option[Ident] = Ident(ident).toOption + } } object JsonCodec extends JsonCodec diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala new file mode 100644 index 00000000..8c877c3b --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala @@ -0,0 +1,56 @@ +package docspell.ftssolr + +import io.circe._ +import io.circe.generic.semiauto._ +import docspell.ftsclient.FtsQuery + +final case class QueryData( + query: String, + filter: String, + limit: Int, + offset: Int, + fields: List[Field], + params: Map[String, String] +) { + + def nextPage: QueryData = + copy(offset = offset + limit) + + def withHighLight(fields: List[Field], pre: String, post: String): QueryData = + copy(params = + params ++ Map( + "hl" -> "on", + "hl.requireFieldMatch" -> "true", + "hl.fl" -> fields.map(_.name).mkString(","), + "hl.simple.pre" -> pre, + "hl.simple.post" -> post + ) + ) +} + +object QueryData { + + implicit val jsonEncoder: Encoder[QueryData] = + deriveEncoder[QueryData] + + def apply(search: List[Field], fields: List[Field], fq: FtsQuery): QueryData = { + val q = sanitize(fq.q) + val extQ = search.map(f => s"${f.name}:($q)").mkString(" OR ") + val items = fq.items.map(_.id).mkString(" ") + val collQ = s"""${Field.collectiveId.name}:"${fq.collective.id}"""" + val filterQ = fq.items match { + case Nil => + collQ + case _ => + (collQ :: List(s"""${Field.itemId.name}:($items)""")).mkString(" AND ") + } + QueryData(extQ, filterQ, fq.limit, fq.offset, fields, Map.empty).withHighLight( + search, + "**", + "**" + ) + } + + private def sanitize(q: String): String = + q.replaceAll("[\\(,\\)]+", " ") +} diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala index 14eaa8d2..cc25726a 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala @@ -5,32 +5,20 @@ import cats.effect._ import cats.implicits._ import org.http4s.client.Client -import cats.data.NonEmptyList import docspell.common._ import docspell.ftsclient._ -import docspell.ftsclient.FtsBasicResult._ final class SolrFtsClient[F[_]: Effect]( solrUpdate: SolrUpdate[F], - solrSetup: SolrSetup[F] + solrSetup: SolrSetup[F], + solrQuery: SolrQuery[F] ) extends FtsClient[F] { def initialize: F[Unit] = solrSetup.setupSchema - def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult] = - Stream.emits( - Seq( - FtsBasicResult( - Ident.unsafe("5J4zvCiTE2j-UEznDUsUCsA-5px6ftrSwfs-FpUWCaHh2Ei"), - NonEmptyList.of(AttachmentMatch(Ident.unsafe("a"), 0.2)) - ), - FtsBasicResult( - Ident.unsafe("8B8UNoC1U4y-dqnqjdFG7ue-LG5ktz9pWVt-diFemCLrLAa"), - NonEmptyList.of(AttachmentMatch(Ident.unsafe("b"), 0.5)) - ) - ) - ) + def searchBasic(q: FtsQuery): Stream[F, FtsResult] = + Stream.eval(solrQuery.query(q)) def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = (for { @@ -53,7 +41,11 @@ object SolrFtsClient { httpClient: Client[F] ): Resource[F, FtsClient[F]] = Resource.pure[F, FtsClient[F]]( - new SolrFtsClient(SolrUpdate(cfg, httpClient), SolrSetup(cfg, httpClient)) + new SolrFtsClient( + SolrUpdate(cfg, httpClient), + SolrSetup(cfg, httpClient), + SolrQuery(cfg, httpClient) + ) ) } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala new file mode 100644 index 00000000..435402e2 --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -0,0 +1,54 @@ +package docspell.ftssolr + +import cats.effect._ +import org.http4s._ +import org.http4s.client.Client +import org.http4s.circe._ +import org.http4s.circe.CirceEntityDecoder._ +import org.http4s.client.dsl.Http4sClientDsl +import _root_.io.circe.syntax._ +import org.log4s.getLogger + +import docspell.ftsclient._ +import JsonCodec._ + +trait SolrQuery[F[_]] { + + def query(q: QueryData): F[FtsResult] + + def query(q: FtsQuery): F[FtsResult] = { + val fq = QueryData( + List(Field.content, Field.itemName, Field.itemNotes, Field.attachmentName), + List( + Field.id, + Field.itemId, + Field.collectiveId, + Field("score"), + Field.attachmentId, + Field.discriminator + ), + q + ) + query(fq) + } +} + +object SolrQuery { + private[this] val logger = getLogger + + def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrQuery[F] = { + val dsl = new Http4sClientDsl[F] {} + import dsl._ + + new SolrQuery[F] { + val url = Uri.unsafeFromString(cfg.url.asString) / "query" + + def query(q: QueryData): F[FtsResult] = { + val req = Method.POST(q.asJson, url) + logger.debug(s"Running query: $req : ${q.asJson}") + client.expect[FtsResult](req) + } + + } + } +} diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 616dfaff..13d43d17 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -11,8 +11,6 @@ import _root_.io.circe.syntax._ import _root_.io.circe._ import _root_.io.circe.generic.semiauto._ -import Fields.{Attachment, Item} - trait SolrSetup[F[_]] { def setupSchema: F[Unit] @@ -33,18 +31,18 @@ object SolrSetup { def setupSchema: F[Unit] = { val cmds0 = List( - Fields.id, - Fields.itemId, - Fields.collectiveId, - Fields.discriminator, - Attachment.attachmentId + Field.id, + Field.itemId, + Field.collectiveId, + Field.discriminator, + Field.attachmentId ) .traverse(addStringField) val cmds1 = List( - Attachment.attachmentName, - Attachment.content, - Item.itemName, - Item.itemNotes + Field.attachmentName, + Field.content, + Field.itemName, + Field.itemNotes ) .traverse(addTextField) @@ -57,13 +55,13 @@ object SolrSetup { client.expect[String](req).map(r => logger.debug(s"Response: $r")) } - private def addStringField(name: String): F[Unit] = - run(DeleteField.command(DeleteField(name))).attempt *> - run(AddField.command(AddField.string(name))) + private def addStringField(field: Field): F[Unit] = + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.string(field))) - private def addTextField(name: String): F[Unit] = - run(DeleteField.command(DeleteField(name))).attempt *> - run(AddField.command(AddField.text(name))) + private def addTextField(field: Field): F[Unit] = + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.text(field))) } } @@ -71,7 +69,7 @@ object SolrSetup { // Schema Commands case class AddField( - name: String, + name: Field, `type`: String, stored: Boolean, indexed: Boolean, @@ -84,14 +82,14 @@ object SolrSetup { def command(body: AddField): Json = Map("add-field" -> body.asJson).asJson - def string(name: String): AddField = - AddField(name, "string", true, true, false) + def string(field: Field): AddField = + AddField(field, "string", true, true, false) - def text(name: String): AddField = - AddField(name, "text_general", true, true, false) + def text(field: Field): AddField = + AddField(field, "text_general", true, true, false) } - case class DeleteField(name: String) + case class DeleteField(name: Field) object DeleteField { implicit val encoder: Encoder[DeleteField] = deriveEncoder[DeleteField]