From d5c9923a6daa26a35f18179930099c30aebc607f Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 23 Jun 2020 23:02:58 +0200 Subject: [PATCH] Add a route that only searches the full-text index It returns the results in the same order as received from the index to preserve the relevance ordering. --- .../docspell/backend/ops/OFulltext.scala | 42 +++++++++++ .../scala/docspell/ftssolr/SolrQuery.scala | 2 +- .../dev/adr/0014_fulltext_search_engine.md | 26 ++++--- .../dev/adr/0015_fulltext_search_design.md | 16 ----- .../src/main/resources/docspell-openapi.yml | 69 +++++++++++++++++-- .../restserver/conv/Conversions.scala | 5 ++ .../restserver/routes/ItemRoutes.scala | 20 ++++++ .../scala/docspell/store/queries/QItem.scala | 43 +++++++++--- 8 files changed, 178 insertions(+), 45 deletions(-) delete mode 100644 modules/microsite/docs/dev/adr/0015_fulltext_search_design.md diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala index 6514bc21..33bedffc 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala @@ -9,6 +9,7 @@ import docspell.backend.JobFactory import docspell.store.Store import docspell.store.records.RJob import docspell.store.queue.JobQueue +import docspell.store.queries.QItem import OItemSearch.{Batch, ListItem, ListItemWithTags, Query} trait OFulltext[F[_]] { @@ -26,6 +27,12 @@ trait OFulltext[F[_]] { batch: Batch ): F[Vector[OFulltext.FtsItemWithTags]] + def findIndexOnly( + fts: OFulltext.FtsInput, + collective: Ident, + batch: Batch + ): F[Vector[OFulltext.FtsItemWithTags]] + /** Clears the full-text index completely and launches a task that * indexes all data. 
*/ @@ -84,6 +91,41 @@ object OFulltext { else queue.insertIfNew(job) *> joex.notifyAllNodes } yield () + def findIndexOnly( + ftsQ: OFulltext.FtsInput, + collective: Ident, + batch: Batch + ): F[Vector[OFulltext.FtsItemWithTags]] = { + val fq = FtsQuery( + ftsQ.query, + collective, + Set.empty, + batch.limit, + batch.offset, + FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost) + ) + for { + ftsR <- fts.search(fq) + ftsItems = ftsR.results.groupBy(_.itemId) + select = ftsR.results.map(r => QItem.SelectedItem(r.itemId, r.score)).toSet + itemsWithTags <- + store + .transact( + QItem.findItemsWithTags( + collective, + QItem.findSelectedItems(QItem.Query.empty(collective), select) + ) + ) + .take(batch.limit.toLong) + .compile + .toVector + res = + itemsWithTags + .collect(convertFtsData(ftsR, ftsItems)) + .map({ case (li, fd) => FtsItemWithTags(li, fd) }) + } yield res + } + def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] = findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem]) .drop(batch.offset.toLong) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index 6342cd6d..21a84360 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -53,7 +53,7 @@ object SolrQuery { def query(q: QueryData): F[FtsResult] = { val req = Method.POST(q.asJson, url) - logger.debug(s"Running query: $req") + logger.trace(s"Running query: $req : ${q.asJson}") client.expect[FtsResult](req) } diff --git a/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md index a32ecd3b..1719c7fc 100644 --- a/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md +++ b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md @@ -29,23 +29,21 @@ work (just the fulltext 
search is then not working). ## Decision Outcome -If docspell is running on PostgreSQL, it would be the best option to -also use it for fulltext search. But I don't want to lock the database -to PostgreSQL *only* because of the fulltext search feature. This -would be a too large impact on the whole application. +If docspell is running on PostgreSQL, it would be nice to also use it +for fulltext search to save the cost of running another component. But +I don't want to lock the database to PostgreSQL *only* because of the +fulltext search feature. ElasticSearch and Apache SOLR are quite similiar in features. SOLR is part of Lucene and therefore lives in the Apache ecosystem. I would -choose this over ElasticSearch, which is backed by a company (the oss -version is released under the Apache License, afaiu). Regarding -features, both are great. +choose SOLR over ElasticSearch, because I used it before. The last option (supporting all) is interesting, since it would enable -to use PostgreSQL for fulltext search, when already using PostgreSQL -as the database for docspell. +to use PostgreSQL for fulltext search for those that use PostgreSQL as +the database for docspell. -So in a first step, identify what docspell needs from a fulltext -search component and create this interface and an implementation for -Apache SOLR. This enables all users to use the fulltext search -feature. As a later step, an implementation based on PostgreSQL could -be provided, too. +In a first step, identify what docspell needs from a fulltext search +component and create this interface and an implementation for Apache +SOLR. This enables all users to use the fulltext search feature. As a +later step, an implementation based on PostgreSQL and/or ElasticSearch +could be provided, too. 
diff --git a/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md b/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md deleted file mode 100644 index 4c4f0ff7..00000000 --- a/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -layout: docs -title: Fulltext Search Design ---- - -# How to integrate Fulltext Search - - - -## Context and Problem Statement - - -## Considered Options - - -## Decision Outcome diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 0b29e938..aaf52fb5 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -1027,9 +1027,13 @@ paths: summary: Search for items. description: | Search for items given a search form. The results are grouped - by month by default. Tags are *not* resolved! The results will - always contain an empty list for item tags. Use - `/searchWithTags` to also retrieve all tags of an item. + by month and are sorted by item date (newest first). Tags are + *not* resolved. The results will always contain an empty list + for item tags. Use `/searchWithTags` to also retrieve all tags + of an item. + + The `fulltext` field can be used to restrict the results by + using full-text search in the documents' contents. security: - authTokenHeader: [] requestBody: @@ -1051,7 +1055,11 @@ paths: description: | Search for items given a search form. The results are grouped by month by default. For each item, its tags are also - returned. This uses more queries and is therefore slower. + returned. This uses more queries and is therefore slower, but + returns all tags of an item. + + The `fulltext` field can be used to restrict the results by + using full-text search in the documents' contents. 
security: - authTokenHeader: [] requestBody: @@ -1066,6 +1074,37 @@ paths: application/json: schema: $ref: "#/components/schemas/ItemLightList" + /sec/item/searchIndex: + post: + tags: [ Item ] + summary: Search for items using full-text search only. + description: | + Search for items by only using the full-text search index. + + Unlike the other search routes, this one only asks the + full-text search index and returns only one group that + contains the results in the same order as returned by the + index. Most full-text search engines use an ordering that + reflects the relevance with respect to the search term. + + The other search routes always order the results by some + property (the item date) and thus the relevance ordering is + destroyed when using the full-text search. + security: + - authTokenHeader: [] + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/ItemFtsSearch" + responses: + 200: + description: Ok + content: + application/json: + schema: + $ref: "#/components/schemas/ItemLightList" + /sec/item/{id}: get: tags: [ Item ] @@ -2295,6 +2334,28 @@ paths: components: schemas: + ItemFtsSearch: + description: | + Query description for a full-text only search. + required: + - query + - offset + - limit + properties: + offset: + type: integer + format: int32 + limit: + type: integer + format: int32 + description: | + The maximum number of results to return. Note that this + limit is a soft limit; there is also a hard limit on the + server. + query: + type: string + description: | + A query searching the contents of documents. MoveAttachment: description: | Data to move an attachment to another position. 
diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index f217ca30..9cdd8be6 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -173,6 +173,11 @@ trait Conversions { ItemLightList(gs) } + def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = { + if (v.isEmpty) ItemLightList(Nil) + else ItemLightList(List(ItemLightGroup("Results", v.map(mkItemLightWithTags).toList))) + } + def mkItemLight(i: OItemSearch.ListItem): ItemLight = ItemLight( i.id, diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala index 766a33e3..03e8d7ae 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala @@ -83,6 +83,26 @@ object ItemRoutes { } } yield resp + case req @ POST -> Root / "searchIndex" => + for { + mask <- req.as[ItemFtsSearch] + resp <- mask.query match { + case q if q.length > 1 => + val ftsIn = OFulltext.FtsInput(q) + for { + items <- backend.fulltext.findIndexOnly( + ftsIn, + user.account.collective, + Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize) + ) + ok <- Ok(Conversions.mkItemListWithTagsFtsPlain(items)) + } yield ok + + case _ => + BadRequest(BasicResult(false, "Query string too short")) + } + } yield resp + case GET -> Root / Ident(id) => for { item <- backend.itemSearch.findItem(id, user.account.collective) diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 70544741..64868a6c 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala 
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -214,7 +214,7 @@ object QItem { Batch(0, c) } - private def findItemsBase(q: Query): Fragment = { + private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = { val IC = RItem.Columns val AC = RAttachment.Columns val PC = RPerson.Columns @@ -258,14 +258,17 @@ object QItem { val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++ fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")" + val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT" val query = withCTE( - "items" -> withItem, - "persons" -> withPerson, - "orgs" -> withOrgs, - "equips" -> withEquips, - "attachs" -> withAttach + (Seq( + "items" -> withItem, + "persons" -> withPerson, + "orgs" -> withOrgs, + "equips" -> withEquips, + "attachs" -> withAttach + ) ++ ctes): _* ) ++ - fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++ + selectKW ++ finalCols ++ fr" FROM items i" ++ fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++ fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++ fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++ @@ -280,7 +283,7 @@ object QItem { val OC = ROrganization.Columns val EC = REquipment.Columns - val query = findItemsBase(q) + val query = findItemsBase(q, true) // inclusive tags are AND-ed val tagSelectsIncl = q.tagsInclude @@ -374,14 +377,34 @@ object QItem { frag.query[ListItem].stream } + case class SelectedItem(itemId: Ident, weight: Double) + def findSelectedItems( + q: Query, + items: Set[SelectedItem] + ): Stream[ConnectionIO, ListItem] = + if (items.isEmpty) Stream.empty + else { + val IC = RItem.Columns + val values = items + .map(it => fr"(${it.itemId}, ${it.weight})") + .reduce((r, e) => r ++ fr"," ++ e) + + val from = findItemsBase(q, false, ("tids(item_id, weight)", fr"(VALUES" ++ values ++ fr")")) ++ + fr"INNER JOIN tids ON" 
++ IC.id.prefix("i").f ++ fr" = tids.item_id" ++ + fr"ORDER BY tids.weight DESC" + + logger.trace(s"fts query: $from") + from.query[ListItem].stream + } + case class ListItemWithTags(item: ListItem, tags: List[RTag]) /** Same as `findItems` but resolves the tags for each item. Note that * this is implemented by running an additional query per item. */ def findItemsWithTags( - collective: Ident, - search: Stream[ConnectionIO, ListItem] + collective: Ident, + search: Stream[ConnectionIO, ListItem] ): Stream[ConnectionIO, ListItemWithTags] = { def findTag( cache: Ref[ConnectionIO, Map[Ident, RTag]],