Add a route that only searches the full-text index

It returns the results in the same order as received from the index to preserve the relevance ordering.
2025-08-01 13:04:52 +00:00 · 2020-06-23 23:02:58 +02:00
parent d9f0f05613
commit d5c9923a6d
8 changed files with 178 additions and 45 deletions
--- a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala
@ -9,6 +9,7 @@ import docspell.backend.JobFactory
 import docspell.store.Store
 import docspell.store.records.RJob
 import docspell.store.queue.JobQueue
+import docspell.store.queries.QItem
 import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}

 trait OFulltext[F[_]] {
@ -26,6 +27,12 @@ trait OFulltext[F[_]] {
      batch: Batch
  ): F[Vector[OFulltext.FtsItemWithTags]]

+  /** Runs the given query against the full-text index only and
+    * returns the matching items of `collective` together with their
+    * tags. The `batch` supplies limit and offset for the index query.
+    */
+  def findIndexOnly(
+      fts: OFulltext.FtsInput,
+      collective: Ident,
+      batch: Batch
+  ): F[Vector[OFulltext.FtsItemWithTags]]
+
  /** Clears the full-text index completely and launches a task that
    * indexes all data.
    */
@ -84,6 +91,41 @@ object OFulltext {
            else queue.insertIfNew(job) *> joex.notifyAllNodes
        } yield ()

+      /** Searches the full-text index only. The items referenced by the
+        * index hits are loaded from the database together with their
+        * tags, in the order produced by `QItem.findSelectedItems`.
+        *
+        * Limit and offset of `batch` are passed on to the index query.
+        */
+      def findIndexOnly(
+          ftsQ: OFulltext.FtsInput,
+          collective: Ident,
+          batch: Batch
+      ): F[Vector[OFulltext.FtsItemWithTags]] = {
+        val fq = FtsQuery(
+          ftsQ.query,
+          collective,
+          Set.empty,
+          batch.limit,
+          batch.offset,
+          FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
+        )
+        for {
+          ftsR <- fts.search(fq)
+          ftsItems = ftsR.results.groupBy(_.itemId)
+          // One item can have several hits with different scores. Select
+          // each item exactly once, using its best score; otherwise the
+          // join on the selected ids returns the same item repeatedly.
+          // Groups created by `groupBy` are never empty, so `maxBy` is safe.
+          select =
+            ftsItems.values
+              .map(_.maxBy(_.score))
+              .map(r => QItem.SelectedItem(r.itemId, r.score))
+              .toSet
+          itemsWithTags <-
+            store
+              .transact(
+                QItem.findItemsWithTags(
+                  collective,
+                  QItem.findSelectedItems(QItem.Query.empty(collective), select)
+                )
+              )
+              .take(batch.limit.toLong)
+              .compile
+              .toVector
+          res =
+            itemsWithTags
+              .collect(convertFtsData(ftsR, ftsItems))
+              .map({ case (li, fd) => FtsItemWithTags(li, fd) })
+        } yield res
+      }
+
      def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
        findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
          .drop(batch.offset.toLong)
--- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
@ -53,7 +53,7 @@ object SolrQuery {

      def query(q: QueryData): F[FtsResult] = {
        val req = Method.POST(q.asJson, url)
-        logger.debug(s"Running query: $req")
+        logger.trace(s"Running query: $req : ${q.asJson}")
        client.expect[FtsResult](req)
      }

--- a/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md
+++ b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md
@ -29,23 +29,21 @@ work (just the fulltext search is then not working).

 ## Decision Outcome

-If docspell is running on PostgreSQL, it would be the best option to
-also use it for fulltext search. But I don't want to lock the database
-to PostgreSQL *only* because of the fulltext search feature. This
-would be a too large impact on the whole application.
+If docspell is running on PostgreSQL, it would be nice to also use it
+for fulltext search to save the cost of running another component. But
+I don't want to lock the database to PostgreSQL *only* because of the
+fulltext search feature.

 ElasticSearch and Apache SOLR are quite similar in features. SOLR is
 part of Lucene and therefore lives in the Apache ecosystem. I would
-choose this over ElasticSearch, which is backed by a company (the oss
-version is released under the Apache License, afaiu). Regarding
-features, both are great.
+choose SOLR over ElasticSearch, because I used it before.

 The last option (supporting all) is interesting, since it would enable
-to use PostgreSQL for fulltext search, when already using PostgreSQL
-as the database for docspell.
+using PostgreSQL for fulltext search for those who use PostgreSQL as
+the database for docspell.

-So in a first step, identify what docspell needs from a fulltext
-search component and create this interface and an implementation for
-Apache SOLR. This enables all users to use the fulltext search
-feature. As a later step, an implementation based on PostgreSQL could
-be provided, too.
+In a first step, identify what docspell needs from a fulltext search
+component and create this interface and an implementation for Apache
+SOLR. This enables all users to use the fulltext search feature. As a
+later step, an implementation based on PostgreSQL and/or ElasticSearch
+could be provided, too.
--- a/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md
+++ b/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md
@ -1,16 +0,0 @@
---
-layout: docs
-title: Fulltext Search Design
---
-
-# How to integrate Fulltext Search
-
-
-
-## Context and Problem Statement
-
-
-## Considered Options
-
-
-## Decision Outcome
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@ -1027,9 +1027,13 @@ paths:
      summary: Search for items.
      description: |
        Search for items given a search form. The results are grouped
-        by month by default. Tags are *not* resolved! The results will
-        always contain an empty list for item tags. Use
-        `/searchWithTags` to also retrieve all tags of an item.
+        by month and are sorted by item date (newest first). Tags are
+        *not* resolved. The results will always contain an empty list
+        for item tags. Use `/searchWithTags` to also retrieve all tags
+        of an item.
+
+        The `fulltext` field can be used to restrict the results by
+        using full-text search in the documents' contents.
      security:
        - authTokenHeader: []
      requestBody:
@ -1051,7 +1055,11 @@ paths:
      description: |
        Search for items given a search form. The results are grouped
        by month by default. For each item, its tags are also
-        returned. This uses more queries and is therefore slower.
+        returned. This uses more queries and is therefore slower, but
+        returns all tags of an item.
+
+        The `fulltext` field can be used to restrict the results by
+        using full-text search in the documents' contents.
      security:
        - authTokenHeader: []
      requestBody:
@ -1066,6 +1074,37 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ItemLightList"
+  /sec/item/searchIndex:
+    post:
+      tags: [ Item ]
+      summary: Search for items using full-text search only.
+      description: |
+        Search for items by only using the full-text search index.
+
+        Unlike the other search routes, this one only asks the
+        full-text search index and returns only one group that
+        contains the results in the same order as given from the
+        index. Most full-text search engines use an ordering that
+        reflects the relevance with respect to the search term.
+
+        The other search routes always order the results by some
+        property (the item date) and thus the relevance ordering is
+        destroyed when using the full-text search.
+      security:
+        - authTokenHeader: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/ItemFtsSearch"
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ItemLightList"
+
  /sec/item/{id}:
    get:
      tags: [ Item ]
@ -2295,6 +2334,28 @@ paths:

 components:
  schemas:
+    ItemFtsSearch:
+      description: |
+        Query description for a full-text only search.
+      required:
+        - query
+        - offset
+        - limit
+      properties:
+        offset:
+          type: integer
+          format: int32
+        limit:
+          type: integer
+          format: int32
+          description: |
+            The maximum number of results to return. Note that this
+            limit is a soft limit; the server enforces its own hard
+            limit, too.
+        query:
+          type: string
+          description: |
+            A query searching the contents of documents.
    MoveAttachment:
      description: |
        Data to move an attachment to another position.
--- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala
@ -173,6 +173,11 @@ trait Conversions {
    ItemLightList(gs)
  }

+  /** Wraps full-text results into an `ItemLightList` holding a single
+    * group named "Results". An empty input yields a list without any
+    * group. The order of `v` is kept.
+    */
+  def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = {
+    val groups =
+      if (v.nonEmpty) List(ItemLightGroup("Results", v.map(mkItemLightWithTags).toList))
+      else Nil
+    ItemLightList(groups)
+  }
+
  def mkItemLight(i: OItemSearch.ListItem): ItemLight =
    ItemLight(
      i.id,
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala
@ -83,6 +83,26 @@ object ItemRoutes {
          }
        } yield resp

+      // Searches via the full-text index only. Queries shorter than two
+      // characters are rejected with a bad request.
+      case req @ POST -> Root / "searchIndex" =>
+        for {
+          mask <- req.as[ItemFtsSearch]
+          resp <-
+            if (mask.query.length > 1) {
+              // the requested limit is capped by the configured page size
+              val batch =
+                Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize)
+              backend.fulltext
+                .findIndexOnly(OFulltext.FtsInput(mask.query), user.account.collective, batch)
+                .flatMap(items => Ok(Conversions.mkItemListWithTagsFtsPlain(items)))
+            } else
+              BadRequest(BasicResult(false, "Query string too short"))
+        } yield resp
+
      case GET -> Root / Ident(id) =>
        for {
          item <- backend.itemSearch.findItem(id, user.account.collective)
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@ -214,7 +214,7 @@ object QItem {
      Batch(0, c)
  }

-  private def findItemsBase(q: Query): Fragment = {
+  private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = {
    val IC         = RItem.Columns
    val AC         = RAttachment.Columns
    val PC         = RPerson.Columns
@ -258,14 +258,17 @@ object QItem {
    val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
      fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"

+    val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT"
    val query = withCTE(
-      "items"   -> withItem,
-      "persons" -> withPerson,
-      "orgs"    -> withOrgs,
-      "equips"  -> withEquips,
-      "attachs" -> withAttach
+      (Seq(
+        "items"   -> withItem,
+        "persons" -> withPerson,
+        "orgs"    -> withOrgs,
+        "equips"  -> withEquips,
+        "attachs" -> withAttach
+      ) ++ ctes): _*
    ) ++
-      fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++
+      selectKW ++ finalCols ++ fr" FROM items i" ++
      fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
      fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
      fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
@ -280,7 +283,7 @@ object QItem {
    val OC = ROrganization.Columns
    val EC = REquipment.Columns

-    val query = findItemsBase(q)
+    val query = findItemsBase(q, true)

    // inclusive tags are AND-ed
    val tagSelectsIncl = q.tagsInclude
@ -374,14 +377,34 @@ object QItem {
    frag.query[ListItem].stream
  }

+  /** An item id together with the weight used for ordering results. */
+  case class SelectedItem(itemId: Ident, weight: Double)
+  /** Runs the base item query restricted to the given `items`. The
+    * selection is handed to the query as a `VALUES` list named
+    * `tids`, joined on the item id, and the rows are ordered by
+    * weight, highest first. An empty selection yields an empty
+    * stream without building a query.
+    */
+  def findSelectedItems(
+      q: Query,
+      items: Set[SelectedItem]
+  ): Stream[ConnectionIO, ListItem] =
+    if (items.nonEmpty) {
+      val IC        = RItem.Columns
+      val rows      = items.map(sel => fr"(${sel.itemId}, ${sel.weight})")
+      val valueList = rows.reduce(_ ++ fr"," ++ _)
+      // additional CTE that carries the selected ids and their weights
+      val tidsCte = "tids(item_id, weight)" -> (fr"(VALUES" ++ valueList ++ fr")")
+      val sql = findItemsBase(q, false, tidsCte) ++
+        fr"INNER JOIN tids ON" ++ IC.id.prefix("i").f ++ fr" = tids.item_id" ++
+        fr"ORDER BY tids.weight DESC"
+      logger.trace(s"fts query: $sql")
+      sql.query[ListItem].stream
+    } else Stream.empty
+
  case class ListItemWithTags(item: ListItem, tags: List[RTag])

  /** Same as `findItems` but resolves the tags for each item. Note that
    * this is implemented by running an additional query per item.
    */
  def findItemsWithTags(
-    collective: Ident,
-    search: Stream[ConnectionIO, ListItem]
+      collective: Ident,
+      search: Stream[ConnectionIO, ListItem]
  ): Stream[ConnectionIO, ListItemWithTags] = {
    def findTag(
        cache: Ref[ConnectionIO, Map[Ident, RTag]],