From ffbb16db4510b5eed6b744354f37a8973f8a8902 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 22 Jun 2020 22:54:39 +0200 Subject: [PATCH] Transport highlighting information to the client --- .../docspell/backend/ops/OFulltext.scala | 99 ++++++++++++++----- .../scala/docspell/ftsclient/FtsQuery.scala | 12 ++- .../scala/docspell/ftssolr/QueryData.scala | 4 +- .../joex/src/main/resources/reference.conf | 4 +- .../src/main/resources/docspell-openapi.yml | 23 +++++ .../src/main/resources/reference.conf | 4 +- .../restserver/conv/Conversions.scala | 48 ++++++++- .../restserver/routes/ItemRoutes.scala | 56 +++++++---- 8 files changed, 202 insertions(+), 48 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala index 18501aad..30d87230 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala @@ -13,10 +13,18 @@ import OItemSearch.{Batch, ListItem, ListItemWithTags, Query} trait OFulltext[F[_]] { - def findItems(q: Query, fts: String, batch: Batch): F[Vector[ListItem]] + def findItems( + q: Query, + fts: OFulltext.FtsInput, + batch: Batch + ): F[Vector[OFulltext.FtsItem]] /** Same as `findItems` but does more queries per item to find all tags. */ - def findItemsWithTags(q: Query, fts: String, batch: Batch): F[Vector[ListItemWithTags]] + def findItemsWithTags( + q: Query, + fts: OFulltext.FtsInput, + batch: Batch + ): F[Vector[OFulltext.FtsItemWithTags]] /** Clears the full-text index completely and launches a task that * indexes all data. @@ -30,9 +38,26 @@ trait OFulltext[F[_]] { } object OFulltext { - // maybe use a temporary table? could run fts and do .take(batch.limit) and store this in sql - // then run a query - // check if supported by mariadb, postgres and h2. 
seems like it is supported everywhere + + case class FtsInput( + query: String, + highlightPre: String = "***", + highlightPost: String = "***" + ) + + case class FtsDataItem( + score: Double, + matchData: FtsResult.MatchData, + context: List[String] + ) + case class FtsData( + maxScore: Double, + count: Int, + qtime: Duration, + items: List[FtsDataItem] + ) + case class FtsItem(item: ListItem, ftsData: FtsData) + case class FtsItemWithTags(item: ListItemWithTags, ftsData: FtsData) def apply[F[_]: Effect]( itemSearch: OItemSearch[F], @@ -59,53 +84,83 @@ object OFulltext { else queue.insertIfNew(job) *> joex.notifyAllNodes } yield () - def findItems(q: Query, ftsQ: String, batch: Batch): F[Vector[ListItem]] = - findItemsFts(q, ftsQ, batch.first, itemSearch.findItems) + def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] = + findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem]) .drop(batch.offset.toLong) .take(batch.limit.toLong) + .map({ case (li, fd) => FtsItem(li, fd) }) .compile .toVector def findItemsWithTags( q: Query, - ftsQ: String, + ftsQ: FtsInput, batch: Batch - ): F[Vector[ListItemWithTags]] = - findItemsFts(q, ftsQ, batch.first, itemSearch.findItemsWithTags) + ): F[Vector[FtsItemWithTags]] = + findItemsFts( + q, + ftsQ, + batch.first, + itemSearch.findItemsWithTags, + convertFtsData[ListItemWithTags] + ) .drop(batch.offset.toLong) .take(batch.limit.toLong) + .map({ case (li, fd) => FtsItemWithTags(li, fd) }) .compile .toVector - private def findItemsFts[A: ItemId]( + private def findItemsFts[A: ItemId, B]( q: Query, - ftsQ: String, + ftsQ: FtsInput, batch: Batch, - search: (Query, Batch) => F[Vector[A]] - ): Stream[F, A] = { + search: (Query, Batch) => F[Vector[A]], + convert: ( + FtsResult, + Map[Ident, List[FtsResult.ItemMatch]] + ) => PartialFunction[A, (A, FtsData)] + ): Stream[F, (A, FtsData)] = { val sqlResult = search(q, batch) - val fq = FtsQuery(ftsQ, q.collective, Set.empty, batch.limit, 
batch.offset) + val fq = FtsQuery( + ftsQ.query, + q.collective, + Set.empty, + batch.limit, + batch.offset, + FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost) + ) val qres = for { items <- sqlResult ids = items.map(a => ItemId[A].itemId(a)) ftsQ = fq.copy(items = ids.toSet) - ftsR <- - fts - .search(ftsQ) - .map(_.results.map(_.itemId)) - .map(_.toSet) - res = items.filter(a => ftsR.contains(ItemId[A].itemId(a))) + ftsR <- fts.search(ftsQ) + ftsItems = ftsR.results.groupBy(_.itemId) + res = items.collect(convert(ftsR, ftsItems)) } yield res Stream.eval(qres).flatMap { v => val results = Stream.emits(v) if (v.size < batch.limit) results - else results ++ findItemsFts(q, ftsQ, batch.next, search) + else results ++ findItemsFts(q, ftsQ, batch.next, search, convert) } } + + private def convertFtsData[A: ItemId]( + ftr: FtsResult, + ftrItems: Map[Ident, List[FtsResult.ItemMatch]] + ): PartialFunction[A, (A, FtsData)] = { + case a if ftrItems.contains(ItemId[A].itemId(a)) => + val ftsDataItems = ftrItems + .get(ItemId[A].itemId(a)) + .getOrElse(Nil) + .map(im => + FtsDataItem(im.score, im.data, ftr.highlight.get(im.id).getOrElse(Nil)) + ) + (a, FtsData(ftr.maxScore, ftr.count, ftr.qtime, ftsDataItems)) + } }) trait ItemId[A] { diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala index 276d7589..d2fe953a 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala @@ -15,9 +15,19 @@ final case class FtsQuery( collective: Ident, items: Set[Ident], limit: Int, - offset: Int + offset: Int, + highlight: FtsQuery.HighlightSetting ) { def nextPage: FtsQuery = copy(offset = limit + offset) } + +object FtsQuery { + + case class HighlightSetting(pre: String, post: String) + + object HighlightSetting { + val default = HighlightSetting("**", "**") + } +} diff --git 
a/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala index b2638ac3..a88736bf 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala @@ -46,8 +46,8 @@ object QueryData { } QueryData(extQ, filterQ, fq.limit, fq.offset, fields, Map.empty).withHighLight( search, - "**", - "**" + fq.highlight.pre, + fq.highlight.post ) } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 3bcfd5e6..62b48f99 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -367,12 +367,12 @@ docspell.joex { # Configuration of the full-text search engine. full-text-search { - enabled = true + enabled = false migration = { index-all-chunk = 10 } solr = { - url = "http://localhost:8983/solr/docspell_core" + url = "http://localhost:8983/solr/docspell" commit-within = 1000 } } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index cdb897d0..0b29e938 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -3502,6 +3502,29 @@ components: type: array items: $ref: "#/components/schemas/Tag" + highlighting: + description: | + Optional contextual information of a search query. Each + item refers to some field where a search match was found + (e.g. the name of an attachment or the item notes) and a + list of lines giving surrounding context of the match. + type: array + items: + $ref: "#/components/schemas/HighlightEntry" + HighlightEntry: + description: | + Highlighting information for a single field (maybe attachment + name or item notes). 
+ required: + - name + - lines + properties: + name: + type: string + lines: + type: array + items: + type: string IdName: description: | The identifier and a human readable name of some entity. diff --git a/modules/restserver/src/main/resources/reference.conf b/modules/restserver/src/main/resources/reference.conf index b298fba6..798704ea 100644 --- a/modules/restserver/src/main/resources/reference.conf +++ b/modules/restserver/src/main/resources/reference.conf @@ -91,7 +91,7 @@ docspell.server { # memory and disk space. It can be enabled later any time. # # Currently the SOLR search platform is supported. - enabled = true + enabled = false # When re-creating the complete index via a REST call, this key # is required. If left empty (the default), recreating the index @@ -103,7 +103,7 @@ docspell.server { # Configuration for the SOLR backend. solr = { - url = "http://localhost:8983/solr/docspell_core" + url = "http://localhost:8983/solr/docspell" commit-within = 1000 } } diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index b02491b3..f217ca30 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -14,8 +14,9 @@ import bitpeace.FileMeta import docspell.backend.ops.OCollective.{InsightData, PassChangeResult} import docspell.backend.ops.OJob.JobCancelResult import docspell.backend.ops.OUpload.{UploadData, UploadMeta, UploadResult} -import docspell.backend.ops.{OItemSearch, OJob, OOrganization, OUpload} +import docspell.backend.ops.{OFulltext, OItemSearch, OJob, OOrganization, OUpload} import docspell.store.AddResult +import docspell.ftsclient.FtsResult import org.http4s.multipart.Multipart import org.http4s.headers.`Content-Type` import org.log4s.Logger @@ -139,6 +140,17 @@ trait Conversions { ItemLightList(gs) } + def 
mkItemListFts(v: Vector[OFulltext.FtsItem]): ItemLightList = { + val groups = v.groupBy(item => item.item.date.toUtcDate.toString.substring(0, 7)) + + def mkGroup(g: (String, Vector[OFulltext.FtsItem])): ItemLightGroup = + ItemLightGroup(g._1, g._2.map(mkItemLight).toList) + + val gs = + groups.map(mkGroup _).toList.sortWith((g1, g2) => g1.name.compareTo(g2.name) >= 0) + ItemLightList(gs) + } + def mkItemListWithTags(v: Vector[OItemSearch.ListItemWithTags]): ItemLightList = { val groups = v.groupBy(ti => ti.item.date.toUtcDate.toString.substring(0, 7)) @@ -150,6 +162,17 @@ trait Conversions { ItemLightList(gs) } + def mkItemListWithTagsFts(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = { + val groups = v.groupBy(ti => ti.item.item.date.toUtcDate.toString.substring(0, 7)) + + def mkGroup(g: (String, Vector[OFulltext.FtsItemWithTags])): ItemLightGroup = + ItemLightGroup(g._1, g._2.map(mkItemLightWithTags).toList) + + val gs = + groups.map(mkGroup _).toList.sortWith((g1, g2) => g1.name.compareTo(g2.name) >= 0) + ItemLightList(gs) + } + def mkItemLight(i: OItemSearch.ListItem): ItemLight = ItemLight( i.id, @@ -164,12 +187,35 @@ trait Conversions { i.concPerson.map(mkIdName), i.concEquip.map(mkIdName), i.fileCount, + Nil, Nil ) + def mkItemLight(i: OFulltext.FtsItem): ItemLight = { + val il = mkItemLight(i.item) + val highlight = mkHighlight(i.ftsData) + il.copy(highlighting = highlight) + } + def mkItemLightWithTags(i: OItemSearch.ListItemWithTags): ItemLight = mkItemLight(i.item).copy(tags = i.tags.map(mkTag)) + def mkItemLightWithTags(i: OFulltext.FtsItemWithTags): ItemLight = { + val il = mkItemLightWithTags(i.item) + val highlight = mkHighlight(i.ftsData) + il.copy(highlighting = highlight) + } + + private def mkHighlight(ftsData: OFulltext.FtsData): List[HighlightEntry] = + ftsData.items.filter(_.context.nonEmpty).sortBy(-_.score).map { fdi => + fdi.matchData match { + case FtsResult.AttachmentData(_, aName) => + HighlightEntry(aName, fdi.context) + case 
FtsResult.ItemData => + HighlightEntry("Item", fdi.context) + } + } + // job def mkJobQueueState(state: OJob.CollectiveQueueState): JobQueueState = { def desc(f: JobDetail => Option[Timestamp])(j1: JobDetail, j2: JobDetail): Boolean = { diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala index a4725291..766a33e3 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala @@ -5,6 +5,7 @@ import cats.implicits._ import docspell.backend.BackendApp import docspell.backend.auth.AuthToken import docspell.backend.ops.OItemSearch.Batch +import docspell.backend.ops.OFulltext import docspell.common.{Ident, ItemState} import org.http4s.HttpRoutes import org.http4s.dsl.Http4sDsl @@ -34,11 +35,25 @@ object ItemRoutes { _ <- logger.ftrace(s"Got search mask: $mask") query = Conversions.mkQuery(mask, user.account.collective) _ <- logger.ftrace(s"Running query: $query") - items <- backend.itemSearch.findItems( - query, - Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) - ) - resp <- Ok(Conversions.mkItemList(items)) + resp <- mask.fullText match { + case Some(fq) if cfg.fullTextSearch.enabled => + for { + items <- backend.fulltext.findItems( + query, + OFulltext.FtsInput(fq), + Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) + ) + ok <- Ok(Conversions.mkItemListFts(items)) + } yield ok + case _ => + for { + items <- backend.itemSearch.findItems( + query, + Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) + ) + ok <- Ok(Conversions.mkItemList(items)) + } yield ok + } } yield resp case req @ POST -> Root / "searchWithTags" => @@ -47,20 +62,25 @@ object ItemRoutes { _ <- logger.ftrace(s"Got search mask: $mask") query = Conversions.mkQuery(mask, user.account.collective) _ <- logger.ftrace(s"Running 
query: $query") - items <- mask.fullText match { - case None => - backend.itemSearch.findItemsWithTags( - query, - Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) - ) - case Some(fq) => - backend.fulltext.findItemsWithTags( - query, - fq, - Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) - ) + resp <- mask.fullText match { + case Some(fq) if cfg.fullTextSearch.enabled => + for { + items <- backend.fulltext.findItemsWithTags( + query, + OFulltext.FtsInput(fq), + Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) + ) + ok <- Ok(Conversions.mkItemListWithTagsFts(items)) + } yield ok + case _ => + for { + items <- backend.itemSearch.findItemsWithTags( + query, + Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) + ) + ok <- Ok(Conversions.mkItemListWithTags(items)) + } yield ok } - resp <- Ok(Conversions.mkItemListWithTags(items)) } yield resp case GET -> Root / Ident(id) =>