From c7f598e3b0b6225c0ac96ed7a4c171a641c67f68 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 14 Jun 2020 22:53:20 +0200 Subject: [PATCH] Initial module setup --- build.sbt | 24 ++++++++- .../docspell/ftsclient/FtsBasicResult.scala | 19 +++++++ .../scala/docspell/ftsclient/FtsClient.scala | 18 +++++++ .../scala/docspell/ftsclient/FtsQuery.scala | 10 ++++ .../scala/docspell/ftsclient/TextData.scala | 5 ++ .../docspell/ftssolr/SolrFtsClient.scala | 12 +++++ .../dev/adr/0014_fulltext_search_engine.md | 51 +++++++++++++++++++ .../dev/adr/0015_fulltext_search_design.md | 16 ++++++ 8 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala create mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala create mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala create mode 100644 modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala create mode 100644 modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala create mode 100644 modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md create mode 100644 modules/microsite/docs/dev/adr/0015_fulltext_search_design.md diff --git a/build.sbt b/build.sbt index 7e97df57..70f0c10b 100644 --- a/build.sbt +++ b/build.sbt @@ -259,6 +259,26 @@ val analysis = project.in(file("modules/analysis")). Dependencies.fs2 ++ Dependencies.stanfordNlpCore ).dependsOn(common, files % "test->test") + +val ftsclient = project.in(file("modules/fts-client")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). + settings( + name := "docspell-fts-client", + libraryDependencies ++= Seq.empty + ).dependsOn(common) + +val ftssolr = project.in(file("modules/fts-solr")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). 
+ settings( + name := "docspell-fts-solr", + libraryDependencies ++= + Dependencies.http4sClient ++ + Dependencies.circe + ).dependsOn(common, ftsclient) val restapi = project.in(file("modules/restapi")). disablePlugins(RevolverPlugin). @@ -303,7 +323,7 @@ val backend = project.in(file("modules/backend")). Dependencies.bcrypt ++ Dependencies.http4sClient ++ Dependencies.emil - ).dependsOn(store, joexapi) + ).dependsOn(store, joexapi, ftsclient, ftssolr) val webapp = project.in(file("modules/webapp")). disablePlugins(RevolverPlugin). @@ -472,6 +492,8 @@ val root = project.in(file(".")). , extract , convert , analysis + , ftsclient + , ftssolr , files , store , joexapi diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala new file mode 100644 index 00000000..3e0b5e61 --- /dev/null +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsBasicResult.scala @@ -0,0 +1,19 @@ +package docspell.ftsclient + +import cats.data.NonEmptyList +import cats.implicits._ +import docspell.common._ + +import FtsBasicResult.AttachmentMatch + +final case class FtsBasicResult(item: Ident, attachments: NonEmptyList[AttachmentMatch]) { + /** The highest score among all matched attachments. */ + def score: Double = + attachments.map(_.score).reduceLeft(_ max _) +} + +object FtsBasicResult { + /** A single matching attachment together with its score. */ + final case class AttachmentMatch(id: Ident, score: Double) + +} diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala new file mode 100644 index 00000000..7664c757 --- /dev/null +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala @@ -0,0 +1,18 @@ +package docspell.ftsclient + +import fs2.Stream + +/** The fts client is the interface for docspell to a fulltext search + * engine. + * + * It defines all operations required for integration into docspell. + * It uses data structures and terms of docspell. 
Implementation + * modules need to translate it to the engine that provides the + * features. + */ +trait FtsClient[F[_]] { + /** Runs the given fulltext query; results are emitted as a stream. */ + def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult] + /** Hands the given text data to the search engine for indexing. */ + def indexData(data: TextData): F[Unit] +} diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala new file mode 100644 index 00000000..6cbee19f --- /dev/null +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala @@ -0,0 +1,10 @@ +package docspell.ftsclient + +import docspell.common._ + +/** A fulltext query. + * + * The query itself is a raw string. Each implementation may + * interpret it according to the system in use. + */ +final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala new file mode 100644 index 00000000..4b829932 --- /dev/null +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala @@ -0,0 +1,5 @@ +package docspell.ftsclient + +import docspell.common._ +/** A text, tagged with the item, attachment and collective ids. */ +final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala new file mode 100644 index 00000000..df3e60a3 --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala @@ -0,0 +1,12 @@ +package docspell.ftssolr + +import fs2.Stream +import docspell.ftsclient._ +/** FtsClient skeleton in the fts-solr module; both operations are still `???` stubs. */ +final class SolrFtsClient[F[_]] extends FtsClient[F] { + + def searchBasic(q: FtsQuery): Stream[F, FtsBasicResult] = + ??? + def indexData(data: TextData): F[Unit] = + ??? 
+} diff --git a/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md new file mode 100644 index 00000000..a32ecd3b --- /dev/null +++ b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md @@ -0,0 +1,51 @@ +--- +layout: docs +title: Fulltext Search Engine +--- + +# Choose a Fulltext Search Engine + +It should be possible to search the contents of all documents. + +## Context and Problem Statement + +To allow searching the documents' contents efficiently, a separate +index is necessary. The "de facto standard" for fulltext search on the +JVM is something backed by [Lucene](https://lucene.apache.org). +Another option is to use an RDBMS that supports fulltext search. + +This adds another component to the mix, which increases the complexity +of the setup and the software. Since docspell works great without this +feature, it shouldn't have a huge impact on the application, i.e. if +the fulltext search component is down or broken, docspell should still +work (only the fulltext search would then be unavailable). + +## Considered Options + +* [Apache SOLR](https://lucene.apache.org/solr) +* [ElasticSearch](https://www.elastic.co/elasticsearch/) +* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html) +* All of them or a subset + +## Decision Outcome + +If docspell is running on PostgreSQL, it would be the best option to +also use it for fulltext search. But I don't want to lock the database +to PostgreSQL *only* because of the fulltext search feature. This +would have too large an impact on the whole application. + +ElasticSearch and Apache SOLR are quite similar in features. SOLR is +part of Lucene and therefore lives in the Apache ecosystem. I would +choose this over ElasticSearch, which is backed by a company (the oss +version is released under the Apache License, as far as I understand). Regarding +features, both are great. 
+ +The last option (supporting all) is interesting, since it would make +it possible to use PostgreSQL for fulltext search when PostgreSQL is +already the database for docspell. + +So, as a first step, identify what docspell needs from a fulltext +search component and create this interface and an implementation for +Apache SOLR. This enables all users to use the fulltext search +feature. As a later step, an implementation based on PostgreSQL could +be provided, too. diff --git a/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md b/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md new file mode 100644 index 00000000..4c4f0ff7 --- /dev/null +++ b/modules/microsite/docs/dev/adr/0015_fulltext_search_design.md @@ -0,0 +1,16 @@ +--- +layout: docs +title: Fulltext Search Design +--- + +# How to integrate Fulltext Search + + + +## Context and Problem Statement + + +## Considered Options + + +## Decision Outcome