Add a route that only searches the full-text index

It returns the results in the same order as received from the index to
preserve the relevance ordering.
This commit is contained in:
Eike Kettner 2020-06-23 23:02:58 +02:00
parent d9f0f05613
commit d5c9923a6d
8 changed files with 178 additions and 45 deletions

View File

@ -9,6 +9,7 @@ import docspell.backend.JobFactory
import docspell.store.Store import docspell.store.Store
import docspell.store.records.RJob import docspell.store.records.RJob
import docspell.store.queue.JobQueue import docspell.store.queue.JobQueue
import docspell.store.queries.QItem
import OItemSearch.{Batch, ListItem, ListItemWithTags, Query} import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
trait OFulltext[F[_]] { trait OFulltext[F[_]] {
@ -26,6 +27,12 @@ trait OFulltext[F[_]] {
batch: Batch batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]] ): F[Vector[OFulltext.FtsItemWithTags]]
/** Finds items by asking only the full-text index. The results are
 * ordered by the score reported by the index (highest first), not
 * by item date.
 */
def findIndexOnly(
fts: OFulltext.FtsInput,
collective: Ident,
batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]]
/** Clears the full-text index completely and launches a task that /** Clears the full-text index completely and launches a task that
* indexes all data. * indexes all data.
*/ */
@ -84,6 +91,41 @@ object OFulltext {
else queue.insertIfNew(job) *> joex.notifyAllNodes else queue.insertIfNew(job) *> joex.notifyAllNodes
} yield () } yield ()
/** Searches only the full-text index and loads the matching items
 * together with their tags, ordered by the index score (highest
 * first).
 *
 * The index may report several hits for the same item (results are
 * grouped by item id below). Only the best-scoring hit per item is
 * used for the selection, because `QItem.findSelectedItems` joins
 * against the selection without DISTINCT and would otherwise return
 * the same item more than once.
 */
def findIndexOnly(
    ftsQ: OFulltext.FtsInput,
    collective: Ident,
    batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]] = {
  val fq = FtsQuery(
    ftsQ.query,
    collective,
    Set.empty,
    batch.limit,
    batch.offset,
    FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
  )
  for {
    ftsR <- fts.search(fq)
    ftsItems = ftsR.results.groupBy(_.itemId)
    // groupBy never yields empty groups, so reduce is safe here; keep
    // exactly one entry (the highest score) per item id
    select =
      ftsItems.values
        .map(_.reduce((a, b) => if (a.score >= b.score) a else b))
        .map(r => QItem.SelectedItem(r.itemId, r.score))
        .toSet
    itemsWithTags <-
      store
        .transact(
          QItem.findItemsWithTags(
            collective,
            QItem.findSelectedItems(QItem.Query.empty(collective), select)
          )
        )
        .take(batch.limit.toLong)
        .compile
        .toVector
    res =
      itemsWithTags
        .collect(convertFtsData(ftsR, ftsItems))
        .map({ case (li, fd) => FtsItemWithTags(li, fd) })
  } yield res
}
def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] = def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem]) findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
.drop(batch.offset.toLong) .drop(batch.offset.toLong)

View File

@ -53,7 +53,7 @@ object SolrQuery {
def query(q: QueryData): F[FtsResult] = { def query(q: QueryData): F[FtsResult] = {
val req = Method.POST(q.asJson, url) val req = Method.POST(q.asJson, url)
logger.debug(s"Running query: $req") logger.trace(s"Running query: $req : ${q.asJson}")
client.expect[FtsResult](req) client.expect[FtsResult](req)
} }

View File

@ -29,23 +29,21 @@ work (just the fulltext search is then not working).
## Decision Outcome ## Decision Outcome
If docspell is running on PostgreSQL, it would be the best option to If docspell is running on PostgreSQL, it would be nice to also use it
also use it for fulltext search. But I don't want to lock the database for fulltext search to save the cost of running another component. But
to PostgreSQL *only* because of the fulltext search feature. This I don't want to lock the database to PostgreSQL *only* because of the
would be a too large impact on the whole application. fulltext search feature.
ElasticSearch and Apache SOLR are quite similar in features. SOLR is ElasticSearch and Apache SOLR are quite similar in features. SOLR is
part of Lucene and therefore lives in the Apache ecosystem. I would part of Lucene and therefore lives in the Apache ecosystem. I would
choose this over ElasticSearch, which is backed by a company (the oss choose SOLR over ElasticSearch, because I used it before.
version is released under the Apache License, afaiu). Regarding
features, both are great.
The last option (supporting all) is interesting, since it would enable The last option (supporting all) is interesting, since it would enable
to use PostgreSQL for fulltext search, when already using PostgreSQL to use PostgreSQL for fulltext search for those that use PostgreSQL as
as the database for docspell. the database for docspell.
So in a first step, identify what docspell needs from a fulltext In a first step, identify what docspell needs from a fulltext search
search component and create this interface and an implementation for component and create this interface and an implementation for Apache
Apache SOLR. This enables all users to use the fulltext search SOLR. This enables all users to use the fulltext search feature. As a
feature. As a later step, an implementation based on PostgreSQL could later step, an implementation based on PostgreSQL and/or ElasticSearch
be provided, too. could be provided, too.

View File

@ -1,16 +0,0 @@
---
layout: docs
title: Fulltext Search Design
---
# How to integrate Fulltext Search
## Context and Problem Statement
## Considered Options
## Decision Outcome

View File

@ -1027,9 +1027,13 @@ paths:
summary: Search for items. summary: Search for items.
description: | description: |
Search for items given a search form. The results are grouped Search for items given a search form. The results are grouped
by month by default. Tags are *not* resolved! The results will by month and are sorted by item date (newest first). Tags are
always contain an empty list for item tags. Use *not* resolved. The results will always contain an empty list
`/searchWithTags` to also retrieve all tags of an item. for item tags. Use `/searchWithTags` to also retrieve all tags
of an item.
The `fulltext` field can be used to restrict the results by
using full-text search in the documents' contents.
security: security:
- authTokenHeader: [] - authTokenHeader: []
requestBody: requestBody:
@ -1051,7 +1055,11 @@ paths:
description: | description: |
Search for items given a search form. The results are grouped Search for items given a search form. The results are grouped
by month by default. For each item, its tags are also by month by default. For each item, its tags are also
returned. This uses more queries and is therefore slower. returned. This uses more queries and is therefore slower, but
returns all tags to an item.
The `fulltext` field can be used to restrict the results by
using full-text search in the documents' contents.
security: security:
- authTokenHeader: [] - authTokenHeader: []
requestBody: requestBody:
@ -1066,6 +1074,37 @@ paths:
application/json: application/json:
schema: schema:
$ref: "#/components/schemas/ItemLightList" $ref: "#/components/schemas/ItemLightList"
/sec/item/searchIndex:
post:
tags: [ Item ]
summary: Search for items using full-text search only.
description: |
Search for items by only using the full-text search index.
Unlike the other search routes, this one only asks the
full-text search index and returns only one group that
contains the results in the same order as given from the
index. Most full-text search engines use an ordering that
reflects the relevance with respect to the search term.
The other search routes always order the results by some
property (the item date) and thus the relevance ordering is
destroyed when using the full-text search.
security:
- authTokenHeader: []
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/ItemFtsSearch"
responses:
200:
description: Ok
content:
application/json:
schema:
$ref: "#/components/schemas/ItemLightList"
/sec/item/{id}: /sec/item/{id}:
get: get:
tags: [ Item ] tags: [ Item ]
@ -2295,6 +2334,28 @@ paths:
components: components:
schemas: schemas:
ItemFtsSearch:
description: |
Query description for a full-text only search.
required:
- query
- offset
- limit
properties:
offset:
type: integer
format: int32
limit:
type: integer
format: int32
description: |
The maximum number of results to return. Note that this
limit is a soft limit, there is some hard limit on the
server, too.
query:
type: string
description: |
A query searching the contents of documents.
MoveAttachment: MoveAttachment:
description: | description: |
Data to move an attachment to another position. Data to move an attachment to another position.

View File

@ -173,6 +173,11 @@ trait Conversions {
ItemLightList(gs) ItemLightList(gs)
} }
/** Wraps full-text results into a list with a single group named
 * "Results", keeping the order of the given vector. An empty input
 * yields a list without any group.
 */
def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = {
  val groups =
    if (v.nonEmpty) List(ItemLightGroup("Results", v.map(mkItemLightWithTags).toList))
    else Nil
  ItemLightList(groups)
}
def mkItemLight(i: OItemSearch.ListItem): ItemLight = def mkItemLight(i: OItemSearch.ListItem): ItemLight =
ItemLight( ItemLight(
i.id, i.id,

View File

@ -83,6 +83,26 @@ object ItemRoutes {
} }
} yield resp } yield resp
// Full-text-only search: the query must have more than one character,
// otherwise the request is rejected.
case req @ POST -> Root / "searchIndex" =>
  for {
    mask <- req.as[ItemFtsSearch]
    resp <-
      if (mask.query.length > 1) {
        // cap the requested page size at the configured maximum
        val page = Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize)
        backend.fulltext
          .findIndexOnly(OFulltext.FtsInput(mask.query), user.account.collective, page)
          .flatMap(items => Ok(Conversions.mkItemListWithTagsFtsPlain(items)))
      } else
        BadRequest(BasicResult(false, "Query string too short"))
  } yield resp
case GET -> Root / Ident(id) => case GET -> Root / Ident(id) =>
for { for {
item <- backend.itemSearch.findItem(id, user.account.collective) item <- backend.itemSearch.findItem(id, user.account.collective)

View File

@ -214,7 +214,7 @@ object QItem {
Batch(0, c) Batch(0, c)
} }
private def findItemsBase(q: Query): Fragment = { private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = {
val IC = RItem.Columns val IC = RItem.Columns
val AC = RAttachment.Columns val AC = RAttachment.Columns
val PC = RPerson.Columns val PC = RPerson.Columns
@ -258,14 +258,17 @@ object QItem {
val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++ val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")" fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"
val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT"
val query = withCTE( val query = withCTE(
"items" -> withItem, (Seq(
"persons" -> withPerson, "items" -> withItem,
"orgs" -> withOrgs, "persons" -> withPerson,
"equips" -> withEquips, "orgs" -> withOrgs,
"attachs" -> withAttach "equips" -> withEquips,
"attachs" -> withAttach
) ++ ctes): _*
) ++ ) ++
fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++ selectKW ++ finalCols ++ fr" FROM items i" ++
fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++ fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++ fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++ fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
@ -280,7 +283,7 @@ object QItem {
val OC = ROrganization.Columns val OC = ROrganization.Columns
val EC = REquipment.Columns val EC = REquipment.Columns
val query = findItemsBase(q) val query = findItemsBase(q, true)
// inclusive tags are AND-ed // inclusive tags are AND-ed
val tagSelectsIncl = q.tagsInclude val tagSelectsIncl = q.tagsInclude
@ -374,14 +377,34 @@ object QItem {
frag.query[ListItem].stream frag.query[ListItem].stream
} }
/** An item id paired with a weight. `findSelectedItems` restricts its
 * result to these ids and orders by the weight, descending.
 */
case class SelectedItem(itemId: Ident, weight: Double)
/** Loads the given items using the base item query, joined against
 * an inline `tids(item_id, weight)` table built from `items`. Rows
 * are ordered by that weight, highest first. An empty selection
 * yields an empty stream without running a query.
 */
def findSelectedItems(
    q: Query,
    items: Set[SelectedItem]
): Stream[ConnectionIO, ListItem] =
  if (items.nonEmpty) {
    val itemCols = RItem.Columns
    // one "(id, weight)" tuple per selected item, separated by commas
    val tuples = items
      .map(sel => fr"(${sel.itemId}, ${sel.weight})")
      .reduce(_ ++ fr"," ++ _)
    val weightsCte = ("tids(item_id, weight)", fr"(VALUES" ++ tuples ++ fr")")
    val sql =
      findItemsBase(q, false, weightsCte) ++
        fr"INNER JOIN tids ON" ++ itemCols.id.prefix("i").f ++ fr" = tids.item_id" ++
        fr"ORDER BY tids.weight DESC"
    logger.trace(s"fts query: $sql")
    sql.query[ListItem].stream
  } else Stream.empty
case class ListItemWithTags(item: ListItem, tags: List[RTag]) case class ListItemWithTags(item: ListItem, tags: List[RTag])
/** Same as `findItems` but resolves the tags for each item. Note that /** Same as `findItems` but resolves the tags for each item. Note that
* this is implemented by running an additional query per item. * this is implemented by running an additional query per item.
*/ */
def findItemsWithTags( def findItemsWithTags(
collective: Ident, collective: Ident,
search: Stream[ConnectionIO, ListItem] search: Stream[ConnectionIO, ListItem]
): Stream[ConnectionIO, ListItemWithTags] = { ): Stream[ConnectionIO, ListItemWithTags] = {
def findTag( def findTag(
cache: Ref[ConnectionIO, Map[Ident, RTag]], cache: Ref[ConnectionIO, Map[Ident, RTag]],