mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-02 09:05:08 +00:00
Add a route that only searches the full-text index
It returns the results in the same order as received from the index to preserve the relevance ordering.
This commit is contained in:
parent
d9f0f05613
commit
d5c9923a6d
@ -9,6 +9,7 @@ import docspell.backend.JobFactory
|
||||
import docspell.store.Store
|
||||
import docspell.store.records.RJob
|
||||
import docspell.store.queue.JobQueue
|
||||
import docspell.store.queries.QItem
|
||||
import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
|
||||
|
||||
trait OFulltext[F[_]] {
|
||||
@ -26,6 +27,12 @@ trait OFulltext[F[_]] {
|
||||
batch: Batch
|
||||
): F[Vector[OFulltext.FtsItemWithTags]]
|
||||
|
||||
/** Searches for items by asking only the full-text index.
  *
  * @param fts the full-text query input
  * @param collective the collective to search in
  * @param batch offset and limit of the requested page
  * @return the matching items together with their tags and full-text data
  */
def findIndexOnly(
|
||||
fts: OFulltext.FtsInput,
|
||||
collective: Ident,
|
||||
batch: Batch
|
||||
): F[Vector[OFulltext.FtsItemWithTags]]
|
||||
|
||||
/** Clears the full-text index completely and launches a task that
|
||||
* indexes all data.
|
||||
*/
|
||||
@ -84,6 +91,41 @@ object OFulltext {
|
||||
else queue.insertIfNew(job) *> joex.notifyAllNodes
|
||||
} yield ()
|
||||
|
||||
/** Searches only the full-text index and resolves the hits to items
  * (including their tags).
  *
  * Paging (`batch.limit` / `batch.offset`) is delegated to the index
  * query. The hits are then loaded from the database ordered by their
  * score (highest first), so the relevance ordering is kept.
  *
  * @param ftsQ the full-text query and highlight settings
  * @param collective the collective to search in
  * @param batch offset and limit of the requested page
  * @return the found items with tags and their full-text data
  */
def findIndexOnly(
    ftsQ: OFulltext.FtsInput,
    collective: Ident,
    batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]] = {
  val fq = FtsQuery(
    ftsQ.query,
    collective,
    Set.empty,
    batch.limit,
    batch.offset,
    FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
  )
  for {
    ftsR <- fts.search(fq)
    ftsItems = ftsR.results.groupBy(_.itemId)
    // The index may return several hits for the same item. Select every
    // item exactly once, weighted by its best hit; otherwise the join in
    // `findSelectedItems` yields the same item multiple times and the
    // duplicates count against `batch.limit`. Groups created by `groupBy`
    // are never empty, so `maxBy` is safe here.
    select =
      ftsItems.values
        .map(_.maxBy(_.score))
        .map(r => QItem.SelectedItem(r.itemId, r.score))
        .toSet
    itemsWithTags <-
      store
        .transact(
          QItem.findItemsWithTags(
            collective,
            QItem.findSelectedItems(QItem.Query.empty(collective), select)
          )
        )
        .take(batch.limit.toLong)
        .compile
        .toVector
    res =
      itemsWithTags
        .collect(convertFtsData(ftsR, ftsItems))
        .map({ case (li, fd) => FtsItemWithTags(li, fd) })
  } yield res
}
|
||||
|
||||
def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
|
||||
findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
|
||||
.drop(batch.offset.toLong)
|
||||
|
@ -53,7 +53,7 @@ object SolrQuery {
|
||||
|
||||
/** Runs the given query: the query data is encoded as JSON, used as
  * the body of a POST request to `url`, and the response is decoded
  * into an `FtsResult`.
  */
def query(q: QueryData): F[FtsResult] = {
|
||||
val req = Method.POST(q.asJson, url)
|
||||
logger.debug(s"Running query: $req")
|
||||
// the trace message additionally contains the JSON payload
|
||||
logger.trace(s"Running query: $req : ${q.asJson}")
|
||||
client.expect[FtsResult](req)
|
||||
}
|
||||
|
||||
|
@ -29,23 +29,21 @@ work (just the fulltext search is then not working).
|
||||
|
||||
## Decision Outcome
|
||||
|
||||
If docspell is running on PostgreSQL, it would be the best option to
|
||||
also use it for fulltext search. But I don't want to lock the database
|
||||
to PostgreSQL *only* because of the fulltext search feature. This
|
||||
would be a too large impact on the whole application.
|
||||
If docspell is running on PostgreSQL, it would be nice to also use it
|
||||
for fulltext search to save the cost of running another component. But
|
||||
I don't want to lock the database to PostgreSQL *only* because of the
|
||||
fulltext search feature.
|
||||
|
||||
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
|
||||
part of Lucene and therefore lives in the Apache ecosystem. I would
|
||||
choose this over ElasticSearch, which is backed by a company (the oss
|
||||
version is released under the Apache License, afaiu). Regarding
|
||||
features, both are great.
|
||||
choose SOLR over ElasticSearch, because I used it before.
|
||||
|
||||
The last option (supporting all) is interesting, since it would make it possible
|
||||
to use PostgreSQL for fulltext search, when already using PostgreSQL
|
||||
as the database for docspell.
|
||||
to use PostgreSQL for fulltext search for those that use PostgreSQL as
|
||||
the database for docspell.
|
||||
|
||||
So in a first step, identify what docspell needs from a fulltext
|
||||
search component and create this interface and an implementation for
|
||||
Apache SOLR. This enables all users to use the fulltext search
|
||||
feature. As a later step, an implementation based on PostgreSQL could
|
||||
be provided, too.
|
||||
In a first step, identify what docspell needs from a fulltext search
|
||||
component and create this interface and an implementation for Apache
|
||||
SOLR. This enables all users to use the fulltext search feature. As a
|
||||
later step, an implementation based on PostgreSQL and/or ElasticSearch
|
||||
could be provided, too.
|
||||
|
@ -1,16 +0,0 @@
|
||||
---
|
||||
layout: docs
|
||||
title: Fulltext Search Design
|
||||
---
|
||||
|
||||
# How to integrate Fulltext Search
|
||||
|
||||
|
||||
|
||||
## Context and Problem Statement
|
||||
|
||||
|
||||
## Considered Options
|
||||
|
||||
|
||||
## Decision Outcome
|
@ -1027,9 +1027,13 @@ paths:
|
||||
summary: Search for items.
|
||||
description: |
|
||||
Search for items given a search form. The results are grouped
|
||||
by month by default. Tags are *not* resolved! The results will
|
||||
always contain an empty list for item tags. Use
|
||||
`/searchWithTags` to also retrieve all tags of an item.
|
||||
by month and are sorted by item date (newest first). Tags are
|
||||
*not* resolved. The results will always contain an empty list
|
||||
for item tags. Use `/searchWithTags` to also retrieve all tags
|
||||
of an item.
|
||||
|
||||
The `fulltext` field can be used to restrict the results by
|
||||
using full-text search in the documents' contents.
|
||||
security:
|
||||
- authTokenHeader: []
|
||||
requestBody:
|
||||
@ -1051,7 +1055,11 @@ paths:
|
||||
description: |
|
||||
Search for items given a search form. The results are grouped
|
||||
by month by default. For each item, its tags are also
|
||||
returned. This uses more queries and is therefore slower.
|
||||
returned. This uses more queries and is therefore slower, but
|
||||
returns all tags of an item.
|
||||
|
||||
The `fulltext` field can be used to restrict the results by
|
||||
using full-text search in the documents' contents.
|
||||
security:
|
||||
- authTokenHeader: []
|
||||
requestBody:
|
||||
@ -1066,6 +1074,37 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ItemLightList"
|
||||
/sec/item/searchIndex:
|
||||
post:
|
||||
tags: [ Item ]
|
||||
summary: Search for items using full-text search only.
|
||||
description: |
|
||||
Search for items by only using the full-text search index.
|
||||
|
||||
Unlike the other search routes, this one only asks the
|
||||
full-text search index and returns only one group that
|
||||
contains the results in the same order as given from the
|
||||
index. Most full-text search engines use an ordering that
|
||||
reflects the relevance with respect to the search term.
|
||||
|
||||
The other search routes always order the results by some
|
||||
property (the item date) and thus the relevance ordering is
|
||||
destroyed when using the full-text search.
|
||||
security:
|
||||
- authTokenHeader: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ItemFtsSearch"
|
||||
responses:
|
||||
200:
|
||||
description: Ok
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ItemLightList"
|
||||
|
||||
/sec/item/{id}:
|
||||
get:
|
||||
tags: [ Item ]
|
||||
@ -2295,6 +2334,28 @@ paths:
|
||||
|
||||
components:
|
||||
schemas:
|
||||
ItemFtsSearch:
|
||||
description: |
|
||||
Query description for a full-text only search.
|
||||
required:
|
||||
- query
|
||||
- offset
|
||||
- limit
|
||||
properties:
|
||||
offset:
|
||||
type: integer
|
||||
format: int32
|
||||
limit:
|
||||
type: integer
|
||||
format: int32
|
||||
description: |
|
||||
The maximum number of results to return. Note that this
|
||||
limit is a soft limit, there is some hard limit on the
|
||||
server, too.
|
||||
query:
|
||||
type: string
|
||||
description: |
|
||||
A query searching the contents of documents.
|
||||
MoveAttachment:
|
||||
description: |
|
||||
Data to move an attachment to another position.
|
||||
|
@ -173,6 +173,11 @@ trait Conversions {
|
||||
ItemLightList(gs)
|
||||
}
|
||||
|
||||
/** Converts full-text results into an `ItemLightList` without grouping
  * them by date: all items go into a single group named "Results", in
  * the order they are given. An empty input yields a list without any
  * group.
  */
def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList =
  v.map(mkItemLightWithTags).toList match {
    case Nil   => ItemLightList(Nil)
    case items => ItemLightList(List(ItemLightGroup("Results", items)))
  }
|
||||
|
||||
def mkItemLight(i: OItemSearch.ListItem): ItemLight =
|
||||
ItemLight(
|
||||
i.id,
|
||||
|
@ -83,6 +83,26 @@ object ItemRoutes {
|
||||
}
|
||||
} yield resp
|
||||
|
||||
case req @ POST -> Root / "searchIndex" =>
|
||||
for {
|
||||
mask <- req.as[ItemFtsSearch]
|
||||
resp <- mask.query match {
|
||||
case q if q.length > 1 =>
|
||||
val ftsIn = OFulltext.FtsInput(q)
|
||||
for {
|
||||
items <- backend.fulltext.findIndexOnly(
|
||||
ftsIn,
|
||||
user.account.collective,
|
||||
Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize)
|
||||
)
|
||||
ok <- Ok(Conversions.mkItemListWithTagsFtsPlain(items))
|
||||
} yield ok
|
||||
|
||||
case _ =>
|
||||
BadRequest(BasicResult(false, "Query string too short"))
|
||||
}
|
||||
} yield resp
|
||||
|
||||
case GET -> Root / Ident(id) =>
|
||||
for {
|
||||
item <- backend.itemSearch.findItem(id, user.account.collective)
|
||||
|
@ -214,7 +214,7 @@ object QItem {
|
||||
Batch(0, c)
|
||||
}
|
||||
|
||||
private def findItemsBase(q: Query): Fragment = {
|
||||
private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = {
|
||||
val IC = RItem.Columns
|
||||
val AC = RAttachment.Columns
|
||||
val PC = RPerson.Columns
|
||||
@ -258,14 +258,17 @@ object QItem {
|
||||
val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
|
||||
fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"
|
||||
|
||||
val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT"
|
||||
val query = withCTE(
|
||||
"items" -> withItem,
|
||||
"persons" -> withPerson,
|
||||
"orgs" -> withOrgs,
|
||||
"equips" -> withEquips,
|
||||
"attachs" -> withAttach
|
||||
(Seq(
|
||||
"items" -> withItem,
|
||||
"persons" -> withPerson,
|
||||
"orgs" -> withOrgs,
|
||||
"equips" -> withEquips,
|
||||
"attachs" -> withAttach
|
||||
) ++ ctes): _*
|
||||
) ++
|
||||
fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++
|
||||
selectKW ++ finalCols ++ fr" FROM items i" ++
|
||||
fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
|
||||
fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
|
||||
fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
|
||||
@ -280,7 +283,7 @@ object QItem {
|
||||
val OC = ROrganization.Columns
|
||||
val EC = REquipment.Columns
|
||||
|
||||
val query = findItemsBase(q)
|
||||
val query = findItemsBase(q, true)
|
||||
|
||||
// inclusive tags are AND-ed
|
||||
val tagSelectsIncl = q.tagsInclude
|
||||
@ -374,14 +377,34 @@ object QItem {
|
||||
frag.query[ListItem].stream
|
||||
}
|
||||
|
||||
/** An item id paired with a weight. `findSelectedItems` restricts its
  * result to the given ids and orders it by `weight`, highest first.
  */
case class SelectedItem(itemId: Ident, weight: Double)
|
||||
/** Runs the base item query restricted to the given selection of items.
  *
  * The selection is passed to the database as an additional CTE
  * `tids(item_id, weight)` built from a `VALUES` list. It is
  * inner-joined with the items and the result is ordered by weight,
  * highest first. An empty selection yields an empty stream without
  * building a query.
  */
def findSelectedItems(
    q: Query,
    items: Set[SelectedItem]
): Stream[ConnectionIO, ListItem] =
  if (items.nonEmpty) {
    val itemCols = RItem.Columns

    // one "(id, weight)" tuple per selected item, comma separated
    val tuples = items
      .map(sel => fr"(${sel.itemId}, ${sel.weight})")
      .reduce((acc, next) => acc ++ fr"," ++ next)

    val tidsCte = ("tids(item_id, weight)", fr"(VALUES" ++ tuples ++ fr")")

    val sql = findItemsBase(q, false, tidsCte) ++
      fr"INNER JOIN tids ON" ++ itemCols.id.prefix("i").f ++ fr" = tids.item_id" ++
      fr"ORDER BY tids.weight DESC"

    logger.trace(s"fts query: $sql")
    sql.query[ListItem].stream
  } else Stream.empty
|
||||
|
||||
case class ListItemWithTags(item: ListItem, tags: List[RTag])
|
||||
|
||||
/** Same as `findItems` but resolves the tags for each item. Note that
|
||||
* this is implemented by running an additional query per item.
|
||||
*/
|
||||
def findItemsWithTags(
|
||||
collective: Ident,
|
||||
search: Stream[ConnectionIO, ListItem]
|
||||
collective: Ident,
|
||||
search: Stream[ConnectionIO, ListItem]
|
||||
): Stream[ConnectionIO, ListItemWithTags] = {
|
||||
def findTag(
|
||||
cache: Ref[ConnectionIO, Map[Ident, RTag]],
|
||||
|
Loading…
x
Reference in New Issue
Block a user