Add a route that only searches the full-text index

It returns the results in the same order as received from the index to
preserve the relevance ordering.
This commit is contained in:
Eike Kettner 2020-06-23 23:02:58 +02:00
parent d9f0f05613
commit d5c9923a6d
8 changed files with 178 additions and 45 deletions

View File

@ -9,6 +9,7 @@ import docspell.backend.JobFactory
import docspell.store.Store
import docspell.store.records.RJob
import docspell.store.queue.JobQueue
import docspell.store.queries.QItem
import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
trait OFulltext[F[_]] {
@ -26,6 +27,12 @@ trait OFulltext[F[_]] {
batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]]
/** Searches for items by asking only the full-text index.
  *
  * The results are returned in the order received from the index, so
  * that its relevance ordering is preserved.
  *
  * @param fts the full-text query input
  * @param collective the collective whose items are searched
  * @param batch the offset and limit of the requested result page
  */
def findIndexOnly(
fts: OFulltext.FtsInput,
collective: Ident,
batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]]
/** Clears the full-text index completely and launches a task that
* indexes all data.
*/
@ -84,6 +91,41 @@ object OFulltext {
else queue.insertIfNew(job) *> joex.notifyAllNodes
} yield ()
/** Runs the query against the full-text index only and resolves the
  * hits to items including their tags.
  *
  * The items are loaded via `QItem.findSelectedItems`, which orders
  * them by the weight passed in; here the weight is the score reported
  * by the index, so the relevance ordering is kept.
  *
  * @param ftsQ the full-text query string and highlight markers
  * @param collective the collective to restrict the search to
  * @param batch offset and limit, forwarded to the index query
  * @return the matching items with tags and their full-text data
  */
def findIndexOnly(
    ftsQ: OFulltext.FtsInput,
    collective: Ident,
    batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]] = {
  val fq = FtsQuery(
    ftsQ.query,
    collective,
    Set.empty,
    batch.limit,
    batch.offset,
    FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
  )
  for {
    ftsR <- fts.search(fq)
    ftsItems = ftsR.results.groupBy(_.itemId)
    // The index may report several hits for the same item (presumably
    // one per matching attachment -- TODO confirm). Passing all of them
    // on would join the item once per hit and return it multiple times,
    // so only the best-scoring hit of each item is selected. Groups
    // created by `groupBy` are never empty, hence `maxBy` is safe.
    select = ftsItems.values
      .map(_.maxBy(_.score))
      .map(r => QItem.SelectedItem(r.itemId, r.score))
      .toSet
    itemsWithTags <-
      store
        .transact(
          QItem.findItemsWithTags(
            collective,
            QItem.findSelectedItems(QItem.Query.empty(collective), select)
          )
        )
        .take(batch.limit.toLong)
        .compile
        .toVector
    res =
      itemsWithTags
        .collect(convertFtsData(ftsR, ftsItems))
        .map({ case (li, fd) => FtsItemWithTags(li, fd) })
  } yield res
}
def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
.drop(batch.offset.toLong)

View File

@ -53,7 +53,7 @@ object SolrQuery {
/** Sends the query data JSON-encoded in a POST request to `url` and
  * decodes the response body into an `FtsResult`.
  */
def query(q: QueryData): F[FtsResult] = {
val req = Method.POST(q.asJson, url)
logger.debug(s"Running query: $req")
// Also logs the JSON payload next to the request.
logger.trace(s"Running query: $req : ${q.asJson}")
client.expect[FtsResult](req)
}

View File

@ -29,23 +29,21 @@ work (just the fulltext search is then not working).
## Decision Outcome
If docspell is running on PostgreSQL, it would be the best option to
also use it for fulltext search. But I don't want to lock the database
to PostgreSQL *only* because of the fulltext search feature. This
would be a too large impact on the whole application.
If docspell is running on PostgreSQL, it would be nice to also use it
for fulltext search to save the cost of running another component. But
I don't want to lock the database to PostgreSQL *only* because of the
fulltext search feature.
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
part of Lucene and therefore lives in the Apache ecosystem. I would
choose this over ElasticSearch, which is backed by a company (the oss
version is released under the Apache License, afaiu). Regarding
features, both are great.
choose SOLR over ElasticSearch, because I used it before.
The last option (supporting all) is interesting, since it would enable
to use PostgreSQL for fulltext search, when already using PostgreSQL
as the database for docspell.
to use PostgreSQL for fulltext search for those that use PostgreSQL as
the database for docspell.
So in a first step, identify what docspell needs from a fulltext
search component and create this interface and an implementation for
Apache SOLR. This enables all users to use the fulltext search
feature. As a later step, an implementation based on PostgreSQL could
be provided, too.
In a first step, identify what docspell needs from a fulltext search
component and create this interface and an implementation for Apache
SOLR. This enables all users to use the fulltext search feature. As a
later step, an implementation based on PostgreSQL and/or ElasticSearch
could be provided, too.

View File

@ -1,16 +0,0 @@
---
layout: docs
title: Fulltext Search Design
---
# How to integrate Fulltext Search
## Context and Problem Statement
## Considered Options
## Decision Outcome

View File

@ -1027,9 +1027,13 @@ paths:
summary: Search for items.
description: |
Search for items given a search form. The results are grouped
by month by default. Tags are *not* resolved! The results will
always contain an empty list for item tags. Use
`/searchWithTags` to also retrieve all tags of an item.
by month and are sorted by item date (newest first). Tags are
*not* resolved. The results will always contain an empty list
for item tags. Use `/searchWithTags` to also retrieve all tags
of an item.
The `fulltext` field can be used to restrict the results by
using full-text search in the documents' contents.
security:
- authTokenHeader: []
requestBody:
@ -1051,7 +1055,11 @@ paths:
description: |
Search for items given a search form. The results are grouped
by month by default. For each item, its tags are also
returned. This uses more queries and is therefore slower.
returned. This uses more queries and is therefore slower, but
returns all tags of an item.
The `fulltext` field can be used to restrict the results by
using full-text search in the documents' contents.
security:
- authTokenHeader: []
requestBody:
@ -1066,6 +1074,37 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ItemLightList"
/sec/item/searchIndex:
post:
tags: [ Item ]
summary: Search for items using full-text search only.
description: |
Search for items by only using the full-text search index.
Unlike the other search routes, this one only asks the
full-text search index and returns only one group that
contains the results in the same order as given from the
index. Most full-text search engines use an ordering that
reflects the relevance with respect to the search term.
The other search routes always order the results by some
property (the item date) and thus the relevance ordering is
destroyed when using the full-text search.
security:
- authTokenHeader: []
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/ItemFtsSearch"
responses:
200:
description: Ok
content:
application/json:
schema:
$ref: "#/components/schemas/ItemLightList"
/sec/item/{id}:
get:
tags: [ Item ]
@ -2295,6 +2334,28 @@ paths:
components:
schemas:
ItemFtsSearch:
description: |
Query description for a full-text only search.
required:
- query
- offset
- limit
properties:
offset:
type: integer
format: int32
limit:
type: integer
format: int32
description: |
The maximum number of results to return. Note that this
limit is a soft limit; the server enforces some hard limit,
too.
query:
type: string
description: |
A query searching the contents of documents.
MoveAttachment:
description: |
Data to move an attachment to another position.

View File

@ -173,6 +173,11 @@ trait Conversions {
ItemLightList(gs)
}
/** Converts full-text search results into an `ItemLightList` without
  * regrouping them: all items go, in their given order, into a single
  * group named "Results". An empty input yields a list with no groups.
  */
def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = {
  val groups: List[ItemLightGroup] =
    if (v.nonEmpty) List(ItemLightGroup("Results", v.map(mkItemLightWithTags).toList))
    else Nil
  ItemLightList(groups)
}
def mkItemLight(i: OItemSearch.ListItem): ItemLight =
ItemLight(
i.id,

View File

@ -83,6 +83,26 @@ object ItemRoutes {
}
} yield resp
// POST /searchIndex: searches the full-text index only. Query strings
// of fewer than two characters are rejected with a bad request.
case req @ POST -> Root / "searchIndex" =>
  for {
    mask <- req.as[ItemFtsSearch]
    resp <-
      if (mask.query.length > 1) {
        val ftsIn = OFulltext.FtsInput(mask.query)
        // The requested limit is capped at the configured maximum page size.
        val page  = Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize)
        for {
          items <- backend.fulltext.findIndexOnly(ftsIn, user.account.collective, page)
          ok    <- Ok(Conversions.mkItemListWithTagsFtsPlain(items))
        } yield ok
      } else
        BadRequest(BasicResult(false, "Query string too short"))
  } yield resp
case GET -> Root / Ident(id) =>
for {
item <- backend.itemSearch.findItem(id, user.account.collective)

View File

@ -214,7 +214,7 @@ object QItem {
Batch(0, c)
}
private def findItemsBase(q: Query): Fragment = {
private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = {
val IC = RItem.Columns
val AC = RAttachment.Columns
val PC = RPerson.Columns
@ -258,14 +258,17 @@ object QItem {
val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"
val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT"
val query = withCTE(
"items" -> withItem,
"persons" -> withPerson,
"orgs" -> withOrgs,
"equips" -> withEquips,
"attachs" -> withAttach
(Seq(
"items" -> withItem,
"persons" -> withPerson,
"orgs" -> withOrgs,
"equips" -> withEquips,
"attachs" -> withAttach
) ++ ctes): _*
) ++
fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++
selectKW ++ finalCols ++ fr" FROM items i" ++
fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
@ -280,7 +283,7 @@ object QItem {
val OC = ROrganization.Columns
val EC = REquipment.Columns
val query = findItemsBase(q)
val query = findItemsBase(q, true)
// inclusive tags are AND-ed
val tagSelectsIncl = q.tagsInclude
@ -374,14 +377,34 @@ object QItem {
frag.query[ListItem].stream
}
/** An item id together with a weight. `findSelectedItems` restricts
  * its results to these ids and orders them by weight, highest first.
  */
case class SelectedItem(itemId: Ident, weight: Double)
/** Loads the given items, ordered by their weight in descending order.
  *
  * The id/weight pairs are supplied to the base item query as an
  * additional CTE `tids(item_id, weight)` built from a VALUES list; the
  * items are inner-joined against it and sorted by `tids.weight`.
  *
  * @param q the query handed to the base item select
  * @param items the items to load with their weights; if empty, no
  *   query is run and an empty stream is returned
  */
def findSelectedItems(
    q: Query,
    items: Set[SelectedItem]
): Stream[ConnectionIO, ListItem] =
  if (items.nonEmpty) {
    val itemCols = RItem.Columns
    // One "(id, weight)" tuple per selected item, separated by commas.
    val rows = items
      .map(sel => fr"(${sel.itemId}, ${sel.weight})")
      .reduce(_ ++ fr"," ++ _)
    val weightsCte = ("tids(item_id, weight)", fr"(VALUES" ++ rows ++ fr")")
    val sql = findItemsBase(q, false, weightsCte) ++
      fr"INNER JOIN tids ON" ++ itemCols.id.prefix("i").f ++ fr" = tids.item_id" ++
      fr"ORDER BY tids.weight DESC"

    logger.trace(s"fts query: $sql")
    sql.query[ListItem].stream
  } else Stream.empty
case class ListItemWithTags(item: ListItem, tags: List[RTag])
/** Same as `findItems` but resolves the tags for each item. Note that
* this is implemented by running an additional query per item.
*/
def findItemsWithTags(
collective: Ident,
search: Stream[ConnectionIO, ListItem]
collective: Ident,
search: Stream[ConnectionIO, ListItem]
): Stream[ConnectionIO, ListItemWithTags] = {
def findTag(
cache: Ref[ConnectionIO, Map[Ident, RTag]],