mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Initial module setup
This commit is contained in:
parent
492f4d304f
commit
c7f598e3b0
24
build.sbt
24
build.sbt
@ -260,6 +260,26 @@ val analysis = project.in(file("modules/analysis")).
|
|||||||
Dependencies.stanfordNlpCore
|
Dependencies.stanfordNlpCore
|
||||||
).dependsOn(common, files % "test->test")
|
).dependsOn(common, files % "test->test")
|
||||||
|
|
||||||
|
// Build definition for the `fts-client` module. It declares no library
// dependencies of its own (empty Seq) and depends only on `common`.
val ftsclient = project
  .in(file("modules/fts-client"))
  .disablePlugins(RevolverPlugin)
  .settings(sharedSettings)
  .settings(testSettings)
  .settings(
    name := "docspell-fts-client",
    libraryDependencies ++= Seq.empty
  )
  .dependsOn(common)
|
||||||
|
|
||||||
|
// Build definition for the `fts-solr` module. It pulls in the http4s client
// and circe dependency sets and builds on `common` and `ftsclient`.
val ftssolr = project
  .in(file("modules/fts-solr"))
  .disablePlugins(RevolverPlugin)
  .settings(sharedSettings)
  .settings(testSettings)
  .settings(
    name := "docspell-fts-solr",
    libraryDependencies ++= Dependencies.http4sClient ++ Dependencies.circe
  )
  .dependsOn(common, ftsclient)
|
||||||
|
|
||||||
val restapi = project.in(file("modules/restapi")).
|
val restapi = project.in(file("modules/restapi")).
|
||||||
disablePlugins(RevolverPlugin).
|
disablePlugins(RevolverPlugin).
|
||||||
enablePlugins(OpenApiSchema).
|
enablePlugins(OpenApiSchema).
|
||||||
@ -303,7 +323,7 @@ val backend = project.in(file("modules/backend")).
|
|||||||
Dependencies.bcrypt ++
|
Dependencies.bcrypt ++
|
||||||
Dependencies.http4sClient ++
|
Dependencies.http4sClient ++
|
||||||
Dependencies.emil
|
Dependencies.emil
|
||||||
).dependsOn(store, joexapi)
|
).dependsOn(store, joexapi, ftsclient, ftssolr)
|
||||||
|
|
||||||
val webapp = project.in(file("modules/webapp")).
|
val webapp = project.in(file("modules/webapp")).
|
||||||
disablePlugins(RevolverPlugin).
|
disablePlugins(RevolverPlugin).
|
||||||
@ -472,6 +492,8 @@ val root = project.in(file(".")).
|
|||||||
, extract
|
, extract
|
||||||
, convert
|
, convert
|
||||||
, analysis
|
, analysis
|
||||||
|
, ftsclient
|
||||||
|
, ftssolr
|
||||||
, files
|
, files
|
||||||
, store
|
, store
|
||||||
, joexapi
|
, joexapi
|
||||||
|
@ -0,0 +1,19 @@
|
|||||||
|
package docspell.ftsclient
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import cats.implicits._
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
import FtsBasicResult.AttachmentMatch
|
||||||
|
|
||||||
|
/** A result element of a basic fulltext search: one item together
 * with the attachment matches reported for it.
 *
 * @param item the identifier of the item
 * @param attachments the attachment matches; a `NonEmptyList`, so
 *   there is always at least one
 */
final case class FtsBasicResult(
    item: Ident,
    attachments: NonEmptyList[AttachmentMatch]
) {

  /** The largest score among all attachment matches. */
  def score: Double =
    attachments.toList.map(_.score).max
}
|
||||||
|
|
||||||
|
object FtsBasicResult {

  /** A single attachment and the score it was given.
   *
   * Declared `final` like the other case classes of this module, so it
   * cannot be extended (extending a case class breaks its generated
   * `equals`/`hashCode`/`copy`).
   *
   * @param id the identifier of the attachment
   * @param score the score of this match
   */
  final case class AttachmentMatch(id: Ident, score: Double)

}
|
@ -0,0 +1,18 @@
|
|||||||
|
package docspell.ftsclient
|
||||||
|
|
||||||
|
import fs2.Stream
|
||||||
|
|
||||||
|
/** Interface between docspell and a fulltext search engine.
 *
 * Every operation docspell requires from such an engine is declared
 * here, expressed with docspell's own data structures and terms.
 * Implementation modules translate these to the engine that actually
 * provides the features.
 */
trait FtsClient[F[_]] {

  /** Performs a search for the given query; results are provided as
   * a stream of [[FtsBasicResult]] values.
   */
  def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult]

  /** Indexes the given text data. How and where the data ends up is
   * left to the implementation.
   */
  def indexData(data: TextData): F[Unit]
}
|
@ -0,0 +1,10 @@
|
|||||||
|
package docspell.ftsclient
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
/** A fulltext query.
 *
 * The query text is carried as a raw string; interpreting it is left
 * to each implementation and the system it uses.
 *
 * @param q the raw query string
 * @param collective identifier of a collective (presumably the one
 *   whose data is searched; confirm with implementations)
 * @param limit presumably the maximum number of results; confirm
 * @param offset presumably the number of results to skip; confirm
 */
final case class FtsQuery(
    q: String,
    collective: Ident,
    limit: Int,
    offset: Int
)
|
@ -0,0 +1,5 @@
|
|||||||
|
package docspell.ftsclient
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
/** A piece of text together with the identifiers of the item,
 * attachment and collective given alongside it.
 *
 * @param item identifier of the item
 * @param attachment identifier of the attachment
 * @param collective identifier of the collective
 * @param text the text content
 */
final case class TextData(
    item: Ident,
    attachment: Ident,
    collective: Ident,
    text: String
)
|
@ -0,0 +1,12 @@
|
|||||||
|
package docspell.ftssolr
|
||||||
|
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.ftsclient._
|
||||||
|
|
||||||
|
/** The [[FtsClient]] implementation of the `docspell.ftssolr` package.
 *
 * This is a skeleton only: both operations are still `???` and
 * therefore throw `scala.NotImplementedError` when called.
 */
final class SolrFtsClient[F[_]] extends FtsClient[F] {

  // Not implemented yet.
  def indexData(data: TextData): F[Unit] = ???

  // Not implemented yet.
  def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult] = ???
}
|
@ -0,0 +1,51 @@
|
|||||||
|
---
|
||||||
|
layout: docs
|
||||||
|
title: Fulltext Search Engine
|
||||||
|
---
|
||||||
|
|
||||||
|
# Choose a Fulltext Search Engine
|
||||||
|
|
||||||
|
It should be possible to search the contents of all documents.
|
||||||
|
|
||||||
|
## Context and Problem Statement
|
||||||
|
|
||||||
|
To allow searching the documents' contents efficiently, a separate
|
||||||
|
index is necessary. The "de facto standard" for fulltext search on the
|
||||||
|
JVM is something backed by [Lucene](https://lucene.apache.org).
|
||||||
|
Another option is to use an RDBMS that supports fulltext search.
|
||||||
|
|
||||||
|
This adds another component to the mix, which increases the complexity
|
||||||
|
of the setup and the software. Since docspell works great without this
|
||||||
|
feature, it shouldn't have a huge impact on the application, i.e. if
|
||||||
|
the fulltext search component is down or broken, docspell should still
|
||||||
|
work (just the fulltext search is then not working).
|
||||||
|
|
||||||
|
## Considered Options
|
||||||
|
|
||||||
|
* [Apache SOLR](https://lucene.apache.org/solr)
|
||||||
|
* [ElasticSearch](https://www.elastic.co/elasticsearch/)
|
||||||
|
* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html)
|
||||||
|
* All of them or a subset
|
||||||
|
|
||||||
|
## Decision Outcome
|
||||||
|
|
||||||
|
If docspell is running on PostgreSQL, it would be the best option to
|
||||||
|
also use it for fulltext search. But I don't want to lock the database
|
||||||
|
to PostgreSQL *only* because of the fulltext search feature. This
|
||||||
|
would have too large an impact on the whole application.
|
||||||
|
|
||||||
|
ElasticSearch and Apache SOLR are quite similar in features. SOLR is
|
||||||
|
part of Lucene and therefore lives in the Apache ecosystem. I would
|
||||||
|
choose this over ElasticSearch, which is backed by a company (the oss
|
||||||
|
version is released under the Apache License, as far as I understand). Regarding
|
||||||
|
features, both are great.
|
||||||
|
|
||||||
|
The last option (supporting all) is interesting, since it would make it possible
|
||||||
|
to use PostgreSQL for fulltext search, when already using PostgreSQL
|
||||||
|
as the database for docspell.
|
||||||
|
|
||||||
|
So in a first step, identify what docspell needs from a fulltext
|
||||||
|
search component and create this interface and an implementation for
|
||||||
|
Apache SOLR. This enables all users to use the fulltext search
|
||||||
|
feature. As a later step, an implementation based on PostgreSQL could
|
||||||
|
be provided, too.
|
@ -0,0 +1,16 @@
|
|||||||
|
---
|
||||||
|
layout: docs
|
||||||
|
title: Fulltext Search Design
|
||||||
|
---
|
||||||
|
|
||||||
|
# How to integrate Fulltext Search
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Context and Problem Statement
|
||||||
|
|
||||||
|
|
||||||
|
## Considered Options
|
||||||
|
|
||||||
|
|
||||||
|
## Decision Outcome
|
Loading…
x
Reference in New Issue
Block a user