mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Add a route that only searches the full-text index
It returns the results in the same order as received from the index to preserve the relevance ordering.
This commit is contained in:
parent
d9f0f05613
commit
d5c9923a6d
@ -9,6 +9,7 @@ import docspell.backend.JobFactory
|
|||||||
import docspell.store.Store
|
import docspell.store.Store
|
||||||
import docspell.store.records.RJob
|
import docspell.store.records.RJob
|
||||||
import docspell.store.queue.JobQueue
|
import docspell.store.queue.JobQueue
|
||||||
|
import docspell.store.queries.QItem
|
||||||
import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
|
import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
|
||||||
|
|
||||||
trait OFulltext[F[_]] {
|
trait OFulltext[F[_]] {
|
||||||
@ -26,6 +27,12 @@ trait OFulltext[F[_]] {
|
|||||||
batch: Batch
|
batch: Batch
|
||||||
): F[Vector[OFulltext.FtsItemWithTags]]
|
): F[Vector[OFulltext.FtsItemWithTags]]
|
||||||
|
|
||||||
|
def findIndexOnly(
|
||||||
|
fts: OFulltext.FtsInput,
|
||||||
|
collective: Ident,
|
||||||
|
batch: Batch
|
||||||
|
): F[Vector[OFulltext.FtsItemWithTags]]
|
||||||
|
|
||||||
/** Clears the full-text index completely and launches a task that
|
/** Clears the full-text index completely and launches a task that
|
||||||
* indexes all data.
|
* indexes all data.
|
||||||
*/
|
*/
|
||||||
@ -84,6 +91,41 @@ object OFulltext {
|
|||||||
else queue.insertIfNew(job) *> joex.notifyAllNodes
|
else queue.insertIfNew(job) *> joex.notifyAllNodes
|
||||||
} yield ()
|
} yield ()
|
||||||
|
|
||||||
|
/** Searches using the full-text index only.
  *
  * The index is queried with the given query string, restricted to
  * `collective` and to the window described by `batch`. The items
  * referenced by the hits are then loaded from the database together
  * with their tags. The database query orders the rows by the weight
  * that is passed along with each item id (see
  * `QItem.findSelectedItems`), so that the ordering derived from the
  * index scores is retained.
  *
  * @param ftsQ the full-text query and the highlight markers
  * @param collective the collective to search in
  * @param batch limit and offset that are handed to the index
  * @return the found items with their tags and full-text data
  */
def findIndexOnly(
    ftsQ: OFulltext.FtsInput,
    collective: Ident,
    batch: Batch
): F[Vector[OFulltext.FtsItemWithTags]] = {
  val fq = FtsQuery(
    ftsQ.query,
    collective,
    Set.empty,
    batch.limit,
    batch.offset,
    FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
  )
  for {
    ftsR <- fts.search(fq)
    ftsItems = ftsR.results.groupBy(_.itemId)
    // The index may return several hits for the same item. Select each
    // item only once, weighted by its best hit; otherwise the join in
    // `findSelectedItems` yields the item once per hit. `groupBy` never
    // produces empty groups, so `reduce` is safe here.
    select =
      ftsItems.values
        .map(_.reduce((a, b) => if (b.score > a.score) b else a))
        .map(r => QItem.SelectedItem(r.itemId, r.score))
        .toSet
    itemsWithTags <-
      store
        .transact(
          QItem.findItemsWithTags(
            collective,
            QItem.findSelectedItems(QItem.Query.empty(collective), select)
          )
        )
        .take(batch.limit.toLong)
        .compile
        .toVector
    res =
      itemsWithTags
        .collect(convertFtsData(ftsR, ftsItems))
        .map({ case (li, fd) => FtsItemWithTags(li, fd) })
  } yield res
}
|
||||||
|
|
||||||
def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
|
def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
|
||||||
findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
|
findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
|
||||||
.drop(batch.offset.toLong)
|
.drop(batch.offset.toLong)
|
||||||
|
@ -53,7 +53,7 @@ object SolrQuery {
|
|||||||
|
|
||||||
def query(q: QueryData): F[FtsResult] = {
|
def query(q: QueryData): F[FtsResult] = {
|
||||||
val req = Method.POST(q.asJson, url)
|
val req = Method.POST(q.asJson, url)
|
||||||
logger.debug(s"Running query: $req")
|
logger.trace(s"Running query: $req : ${q.asJson}")
|
||||||
client.expect[FtsResult](req)
|
client.expect[FtsResult](req)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -29,23 +29,21 @@ work (just the fulltext search is then not working).
|
|||||||
|
|
||||||
## Decision Outcome
|
## Decision Outcome
|
||||||
|
|
||||||
If docspell is running on PostgreSQL, it would be the best option to
|
If docspell is running on PostgreSQL, it would be nice to also use it
|
||||||
also use it for fulltext search. But I don't want to lock the database
|
for fulltext search to save the cost of running another component. But
|
||||||
to PostgreSQL *only* because of the fulltext search feature. This
|
I don't want to lock the database to PostgreSQL *only* because of the
|
||||||
would be a too large impact on the whole application.
|
fulltext search feature.
|
||||||
|
|
||||||
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
|
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
|
||||||
part of Lucene and therefore lives in the Apache ecosystem. I would
|
part of Lucene and therefore lives in the Apache ecosystem. I would
|
||||||
choose this over ElasticSearch, which is backed by a company (the oss
|
choose SOLR over ElasticSearch, because I used it before.
|
||||||
version is released under the Apache License, afaiu). Regarding
|
|
||||||
features, both are great.
|
|
||||||
|
|
||||||
The last option (supporting all) is interesting, since it would enable
|
The last option (supporting all) is interesting, since it would enable
|
||||||
to use PostgreSQL for fulltext search, when already using PostgreSQL
|
to use PostgreSQL for fulltext search for those that use PostgreSQL as
|
||||||
as the database for docspell.
|
the database for docspell.
|
||||||
|
|
||||||
So in a first step, identify what docspell needs from a fulltext
|
In a first step, identify what docspell needs from a fulltext search
|
||||||
search component and create this interface and an implementation for
|
component and create this interface and an implementation for Apache
|
||||||
Apache SOLR. This enables all users to use the fulltext search
|
SOLR. This enables all users to use the fulltext search feature. As a
|
||||||
feature. As a later step, an implementation based on PostgreSQL could
|
later step, an implementation based on PostgreSQL and/or ElasticSearch
|
||||||
be provided, too.
|
could be provided, too.
|
||||||
|
@ -1,16 +0,0 @@
|
|||||||
---
|
|
||||||
layout: docs
|
|
||||||
title: Fulltext Search Design
|
|
||||||
---
|
|
||||||
|
|
||||||
# How to integrate Fulltext Search
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Context and Problem Statement
|
|
||||||
|
|
||||||
|
|
||||||
## Considered Options
|
|
||||||
|
|
||||||
|
|
||||||
## Decision Outcome
|
|
@ -1027,9 +1027,13 @@ paths:
|
|||||||
summary: Search for items.
|
summary: Search for items.
|
||||||
description: |
|
description: |
|
||||||
Search for items given a search form. The results are grouped
|
Search for items given a search form. The results are grouped
|
||||||
by month by default. Tags are *not* resolved! The results will
|
by month and are sorted by item date (newest first). Tags are
|
||||||
always contain an empty list for item tags. Use
|
*not* resolved. The results will always contain an empty list
|
||||||
`/searchWithTags` to also retrieve all tags of an item.
|
for item tags. Use `/searchWithTags` to also retrieve all tags
|
||||||
|
of an item.
|
||||||
|
|
||||||
|
The `fulltext` field can be used to restrict the results by
|
||||||
|
using full-text search in the documents' contents.
|
||||||
security:
|
security:
|
||||||
- authTokenHeader: []
|
- authTokenHeader: []
|
||||||
requestBody:
|
requestBody:
|
||||||
@ -1051,7 +1055,11 @@ paths:
|
|||||||
description: |
|
description: |
|
||||||
Search for items given a search form. The results are grouped
|
Search for items given a search form. The results are grouped
|
||||||
by month by default. For each item, its tags are also
|
by month by default. For each item, its tags are also
|
||||||
returned. This uses more queries and is therefore slower.
|
returned. This uses more queries and is therefore slower, but
|
||||||
|
returns all tags to an item.
|
||||||
|
|
||||||
|
The `fulltext` field can be used to restrict the results by
|
||||||
|
using full-text search in the documents' contents.
|
||||||
security:
|
security:
|
||||||
- authTokenHeader: []
|
- authTokenHeader: []
|
||||||
requestBody:
|
requestBody:
|
||||||
@ -1066,6 +1074,37 @@ paths:
|
|||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/ItemLightList"
|
$ref: "#/components/schemas/ItemLightList"
|
||||||
|
/sec/item/searchIndex:
|
||||||
|
post:
|
||||||
|
tags: [ Item ]
|
||||||
|
summary: Search for items using full-text search only.
|
||||||
|
description: |
|
||||||
|
Search for items by only using the full-text search index.
|
||||||
|
|
||||||
|
Unlike the other search routes, this one only asks the
|
||||||
|
full-text search index and returns only one group that
|
||||||
|
contains the results in the same order as given from the
|
||||||
|
index. Most full-text search engines use an ordering that
|
||||||
|
reflects the relevance with respect to the search term.
|
||||||
|
|
||||||
|
The other search routes always order the results by some
|
||||||
|
property (the item date) and thus the relevance ordering is
|
||||||
|
destroyed when using the full-text search.
|
||||||
|
security:
|
||||||
|
- authTokenHeader: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ItemFtsSearch"
|
||||||
|
responses:
|
||||||
|
200:
|
||||||
|
description: Ok
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: "#/components/schemas/ItemLightList"
|
||||||
|
|
||||||
/sec/item/{id}:
|
/sec/item/{id}:
|
||||||
get:
|
get:
|
||||||
tags: [ Item ]
|
tags: [ Item ]
|
||||||
@ -2295,6 +2334,28 @@ paths:
|
|||||||
|
|
||||||
components:
|
components:
|
||||||
schemas:
|
schemas:
|
||||||
|
ItemFtsSearch:
|
||||||
|
description: |
|
||||||
|
Query description for a full-text only search.
|
||||||
|
required:
|
||||||
|
- query
|
||||||
|
- offset
|
||||||
|
- limit
|
||||||
|
properties:
|
||||||
|
offset:
|
||||||
|
type: integer
|
||||||
|
format: int32
|
||||||
|
limit:
|
||||||
|
type: integer
|
||||||
|
format: int32
|
||||||
|
description: |
|
||||||
|
The maximum number of results to return. Note that this
|
||||||
|
limit is a soft limit, there is some hard limit on the
|
||||||
|
server, too.
|
||||||
|
query:
|
||||||
|
type: string
|
||||||
|
description: |
|
||||||
|
A query searching the contents of documents.
|
||||||
MoveAttachment:
|
MoveAttachment:
|
||||||
description: |
|
description: |
|
||||||
Data to move an attachment to another position.
|
Data to move an attachment to another position.
|
||||||
|
@ -173,6 +173,11 @@ trait Conversions {
|
|||||||
ItemLightList(gs)
|
ItemLightList(gs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Converts full-text results into an `ItemLightList` without any
  * regrouping: all items go into a single group named "Results", in
  * the order given. An empty input yields a list without groups.
  */
def mkItemListWithTagsFtsPlain(v: Vector[OFulltext.FtsItemWithTags]): ItemLightList = {
  val groups =
    if (v.nonEmpty) List(ItemLightGroup("Results", v.map(mkItemLightWithTags).toList))
    else Nil
  ItemLightList(groups)
}
|
||||||
|
|
||||||
def mkItemLight(i: OItemSearch.ListItem): ItemLight =
|
def mkItemLight(i: OItemSearch.ListItem): ItemLight =
|
||||||
ItemLight(
|
ItemLight(
|
||||||
i.id,
|
i.id,
|
||||||
|
@ -83,6 +83,26 @@ object ItemRoutes {
|
|||||||
}
|
}
|
||||||
} yield resp
|
} yield resp
|
||||||
|
|
||||||
|
// Full-text only search: decodes the request body, rejects query
// strings shorter than two characters and otherwise returns the index
// results as a single, ungrouped list.
case req @ POST -> Root / "searchIndex" =>
  req.as[ItemFtsSearch].flatMap { mask =>
    if (mask.query.length > 1) {
      // the requested page size is capped by the configured maximum
      val batch = Batch(mask.offset, mask.limit).restrictLimitTo(cfg.maxItemPageSize)
      backend.fulltext
        .findIndexOnly(OFulltext.FtsInput(mask.query), user.account.collective, batch)
        .flatMap(items => Ok(Conversions.mkItemListWithTagsFtsPlain(items)))
    } else
      BadRequest(BasicResult(false, "Query string too short"))
  }
|
||||||
|
|
||||||
case GET -> Root / Ident(id) =>
|
case GET -> Root / Ident(id) =>
|
||||||
for {
|
for {
|
||||||
item <- backend.itemSearch.findItem(id, user.account.collective)
|
item <- backend.itemSearch.findItem(id, user.account.collective)
|
||||||
|
@ -214,7 +214,7 @@ object QItem {
|
|||||||
Batch(0, c)
|
Batch(0, c)
|
||||||
}
|
}
|
||||||
|
|
||||||
private def findItemsBase(q: Query): Fragment = {
|
private def findItemsBase(q: Query, distinct: Boolean, ctes: (String, Fragment)*): Fragment = {
|
||||||
val IC = RItem.Columns
|
val IC = RItem.Columns
|
||||||
val AC = RAttachment.Columns
|
val AC = RAttachment.Columns
|
||||||
val PC = RPerson.Columns
|
val PC = RPerson.Columns
|
||||||
@ -258,14 +258,17 @@ object QItem {
|
|||||||
val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
|
val withAttach = fr"SELECT COUNT(" ++ AC.id.f ++ fr") as num, " ++ AC.itemId.f ++
|
||||||
fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"
|
fr"from" ++ RAttachment.table ++ fr"GROUP BY (" ++ AC.itemId.f ++ fr")"
|
||||||
|
|
||||||
|
val selectKW = if (distinct) fr"SELECT DISTINCT" else fr"SELECT"
|
||||||
val query = withCTE(
|
val query = withCTE(
|
||||||
"items" -> withItem,
|
(Seq(
|
||||||
"persons" -> withPerson,
|
"items" -> withItem,
|
||||||
"orgs" -> withOrgs,
|
"persons" -> withPerson,
|
||||||
"equips" -> withEquips,
|
"orgs" -> withOrgs,
|
||||||
"attachs" -> withAttach
|
"equips" -> withEquips,
|
||||||
|
"attachs" -> withAttach
|
||||||
|
) ++ ctes): _*
|
||||||
) ++
|
) ++
|
||||||
fr"SELECT DISTINCT" ++ finalCols ++ fr" FROM items i" ++
|
selectKW ++ finalCols ++ fr" FROM items i" ++
|
||||||
fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
|
fr"LEFT JOIN attachs a ON" ++ IC.id.prefix("i").is(AC.itemId.prefix("a")) ++
|
||||||
fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
|
fr"LEFT JOIN persons p0 ON" ++ IC.corrPerson.prefix("i").is(PC.pid.prefix("p0")) ++
|
||||||
fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
|
fr"LEFT JOIN orgs o0 ON" ++ IC.corrOrg.prefix("i").is(OC.oid.prefix("o0")) ++
|
||||||
@ -280,7 +283,7 @@ object QItem {
|
|||||||
val OC = ROrganization.Columns
|
val OC = ROrganization.Columns
|
||||||
val EC = REquipment.Columns
|
val EC = REquipment.Columns
|
||||||
|
|
||||||
val query = findItemsBase(q)
|
val query = findItemsBase(q, true)
|
||||||
|
|
||||||
// inclusive tags are AND-ed
|
// inclusive tags are AND-ed
|
||||||
val tagSelectsIncl = q.tagsInclude
|
val tagSelectsIncl = q.tagsInclude
|
||||||
@ -374,14 +377,34 @@ object QItem {
|
|||||||
frag.query[ListItem].stream
|
frag.query[ListItem].stream
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** An item id paired with a weight. `findSelectedItems` restricts its
  * result to the given ids and sorts by `weight`, highest first.
  */
case class SelectedItem(itemId: Ident, weight: Double)
|
||||||
|
/** Loads the given items, ordered by their weight (highest first).
  *
  * The selected ids and weights are passed to the base query as an
  * additional CTE `tids(item_id, weight)` built from a VALUES list.
  * The base query is inner-joined with it and sorted by
  * `tids.weight DESC`. No `DISTINCT` is applied to the base query.
  *
  * An empty selection results in an empty stream without running a
  * query.
  */
def findSelectedItems(
    q: Query,
    items: Set[SelectedItem]
): Stream[ConnectionIO, ListItem] = {
  // one "(id, weight)" tuple of the VALUES list
  def valuesRow(sel: SelectedItem): Fragment =
    fr"(${sel.itemId}, ${sel.weight})"

  if (items.nonEmpty) {
    val itemCols = RItem.Columns
    val rows     = items.map(valuesRow).reduce(_ ++ fr"," ++ _)
    val weighted = ("tids(item_id, weight)", fr"(VALUES" ++ rows ++ fr")")

    val sql = findItemsBase(q, false, weighted) ++
      fr"INNER JOIN tids ON" ++ itemCols.id.prefix("i").f ++ fr" = tids.item_id" ++
      fr"ORDER BY tids.weight DESC"

    logger.trace(s"fts query: $sql")
    sql.query[ListItem].stream
  } else Stream.empty
}
|
||||||
|
|
||||||
case class ListItemWithTags(item: ListItem, tags: List[RTag])
|
case class ListItemWithTags(item: ListItem, tags: List[RTag])
|
||||||
|
|
||||||
/** Same as `findItems` but resolves the tags for each item. Note that
|
/** Same as `findItems` but resolves the tags for each item. Note that
|
||||||
* this is implemented by running an additional query per item.
|
* this is implemented by running an additional query per item.
|
||||||
*/
|
*/
|
||||||
def findItemsWithTags(
|
def findItemsWithTags(
|
||||||
collective: Ident,
|
collective: Ident,
|
||||||
search: Stream[ConnectionIO, ListItem]
|
search: Stream[ConnectionIO, ListItem]
|
||||||
): Stream[ConnectionIO, ListItemWithTags] = {
|
): Stream[ConnectionIO, ListItemWithTags] = {
|
||||||
def findTag(
|
def findTag(
|
||||||
cache: Ref[ConnectionIO, Map[Ident, RTag]],
|
cache: Ref[ConnectionIO, Map[Ident, RTag]],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user