From b2add008ed22872ec2984eccee64b0329ad13ea2 Mon Sep 17 00:00:00 2001 From: eikek Date: Sun, 20 Mar 2022 13:14:07 +0100 Subject: [PATCH 1/8] Pass language when updating fulltext index --- .../backend/fulltext/CreateIndex.scala | 10 ++++-- .../scala/docspell/backend/ops/OItem.scala | 33 +++++++++++++++---- .../scala/docspell/ftsclient/FtsClient.scala | 12 +++++-- .../scala/docspell/ftsclient/TextData.scala | 12 ++++--- .../joex/process/TextExtraction.scala | 5 +-- .../scala/docspell/store/queries/QItem.scala | 8 +++-- .../docspell/store/records/RAttachment.scala | 14 ++++++-- 7 files changed, 72 insertions(+), 22 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala b/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala index 38c1ea67..bce169e3 100644 --- a/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala +++ b/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala @@ -8,7 +8,6 @@ package docspell.backend.fulltext import cats.data.NonEmptyList import cats.effect._ - import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftsclient.TextData @@ -62,7 +61,14 @@ object CreateIndex { val items = store .transact(QItem.allNameAndNotes(collective, itemIds, chunkSize)) .map(nn => - TextData.item(nn.id, nn.collective, nn.folder, Option(nn.name), nn.notes) + TextData.item( + nn.id, + nn.collective, + nn.folder, + Option(nn.name), + nn.notes, + nn.language + ) ) fts.indexData(logger, attachs ++ items) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index a6e3b314..391a6593 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -605,7 +605,14 @@ object OItem { .transact(RItem.updateNotes(item, collective, notes)) ) .flatTap( - 
onSuccessIgnoreError(fts.updateItemNotes(logger, item, collective, notes)) + onSuccessIgnoreError { + store + .transact(RCollective.findLanguage(collective)) + .map(_.getOrElse(Language.English)) + .flatMap(lang => + fts.updateItemNotes(logger, item, collective, lang, notes) + ) + } ) def setName(item: Ident, name: String, collective: Ident): F[UpdateResult] = @@ -615,7 +622,14 @@ object OItem { .transact(RItem.updateName(item, collective, name)) ) .flatTap( - onSuccessIgnoreError(fts.updateItemName(logger, item, collective, name)) + onSuccessIgnoreError { + store + .transact(RCollective.findLanguage(collective)) + .map(_.getOrElse(Language.English)) + .flatMap(lang => + fts.updateItemName(logger, item, collective, lang, name) + ) + } ) def setNameMultiple( @@ -733,10 +747,17 @@ object OItem { ) .flatTap( onSuccessIgnoreError( - OptionT(store.transact(RAttachment.findItemId(attachId))) - .semiflatMap(itemId => - fts.updateAttachmentName(logger, itemId, attachId, collective, name) - ) + OptionT(store.transact(RAttachment.findItemAndLanguage(attachId))) + .semiflatMap { case (itemId, lang) => + fts.updateAttachmentName( + logger, + itemId, + attachId, + collective, + lang.getOrElse(Language.English), + name + ) + } .fold(())(identity) ) ) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala index 13ee23c3..920ce450 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala @@ -68,19 +68,24 @@ trait FtsClient[F[_]] { logger: Logger[F], itemId: Ident, collective: Ident, + language: Language, name: String ): F[Unit] = - updateIndex(logger, TextData.item(itemId, collective, None, Some(name), None)) + updateIndex( + logger, + TextData.item(itemId, collective, None, Some(name), None, language) + ) def updateItemNotes( logger: Logger[F], itemId: Ident, collective: Ident, + 
language: Language, notes: Option[String] ): F[Unit] = updateIndex( logger, - TextData.item(itemId, collective, None, None, Some(notes.getOrElse(""))) + TextData.item(itemId, collective, None, None, Some(notes.getOrElse("")), language) ) def updateAttachmentName( @@ -88,6 +93,7 @@ trait FtsClient[F[_]] { itemId: Ident, attachId: Ident, collective: Ident, + language: Language, name: Option[String] ): F[Unit] = updateIndex( @@ -97,7 +103,7 @@ trait FtsClient[F[_]] { attachId, collective, None, - Language.English, + language, Some(name.getOrElse("")), None ) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala index 8d71e17a..90da688a 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala @@ -18,6 +18,8 @@ sealed trait TextData { def folder: Option[Ident] + def language: Language + final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A = this match { case a: TextData.Attachment => f(a) @@ -32,7 +34,7 @@ object TextData { attachId: Ident, collective: Ident, folder: Option[Ident], - lang: Language, + language: Language, name: Option[String], text: Option[String] ) extends TextData { @@ -57,7 +59,8 @@ object TextData { collective: Ident, folder: Option[Ident], name: Option[String], - notes: Option[String] + notes: Option[String], + language: Language ) extends TextData { val id = Ident.unsafe("item") / item @@ -69,8 +72,9 @@ object TextData { collective: Ident, folder: Option[Ident], name: Option[String], - notes: Option[String] + notes: Option[String], + lang: Language ): TextData = - Item(item, collective, folder, name, notes) + Item(item, collective, folder, name, notes, lang) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 
ee6b8939..52ce8601 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -47,9 +47,10 @@ object TextExtraction { ctx.args.meta.collective, ctx.args.meta.folderId, item.item.name.some, - None + None, + ctx.args.meta.language ) - _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*) + _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)): _*) dur <- start extractedTags = txt.flatMap(_.tags).distinct.toList _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}.") diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 41641538..5e9dacb4 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -576,7 +576,8 @@ object QItem { collective: Ident, folder: Option[Ident], name: String, - notes: Option[String] + notes: Option[String], + language: Language ) def allNameAndNotes( coll: Option[Ident], @@ -584,10 +585,11 @@ object QItem { chunkSize: Int ): Stream[ConnectionIO, NameAndNotes] = { val i = RItem.as("i") + val c = RCollective.as("c") Select( - select(i.id, i.cid, i.folder, i.name, i.notes), - from(i) + select(i.id, i.cid, i.folder, i.name, i.notes, c.language), + from(i).innerJoin(c, c.id === i.cid) ).where( i.state.in(ItemState.validStates) &&? itemIds.map(ids => i.id.in(ids)) &&? 
diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index bf99a01b..b1a9dcc9 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -236,8 +236,18 @@ object RAttachment { n3 <- DML.delete(T, T.id === attachId) } yield n0 + n1 + n2 + n3 - def findItemId(attachId: Ident): ConnectionIO[Option[Ident]] = - Select(T.itemId.s, from(T), T.id === attachId).build.query[Ident].option + def findItemAndLanguage( + attachId: Ident + ): ConnectionIO[Option[(Ident, Option[Language])]] = { + val a = RAttachment.as("a") + val m = RAttachmentMeta.as("m") + + Select( + select(a.itemId, m.language), + from(a).leftJoin(m, m.id === a.id), + a.id === attachId + ).build.query[(Ident, Option[Language])].option + } def findAll( coll: Option[Ident], From 029335e607aacda47b96468cecdae1135aa7ccfd Mon Sep 17 00:00:00 2001 From: eikek Date: Sun, 20 Mar 2022 21:44:32 +0100 Subject: [PATCH 2/8] Working poc of postgresql based fulltext search backend --- build.sbt | 17 ++ .../db/psqlfts/V2.0.0__initial_schema.sql | 27 +++ .../scala/docspell/ftspsql/DbMigration.scala | 36 ++++ .../scala/docspell/ftspsql/DoobieMeta.scala | 26 +++ .../scala/docspell/ftspsql/FtsRecord.scala | 58 ++++++ .../docspell/ftspsql/FtsRepository.scala | 178 ++++++++++++++++++ .../scala/docspell/ftspsql/PsqlConfig.scala | 5 + .../docspell/ftspsql/PsqlFtsClient.scala | 131 +++++++++++++ .../scala/docspell/ftspsql/SearchResult.scala | 47 +++++ .../docspell/ftspsql/SearchSummary.scala | 3 + .../docspell/ftspsql/MigrationTest.scala | 31 +++ .../scala/docspell/ftssolr/JsonCodec.scala | 4 +- .../main/scala/docspell/joex/JoexTasks.scala | 22 ++- .../docspell/restserver/RestAppImpl.scala | 27 ++- .../src/main/scala/docspell/store/Store.scala | 2 + .../scala/docspell/store/impl/StoreImpl.scala | 3 +- 
.../store/migrate/PostgresqlMigrateTest.scala | 2 +- 17 files changed, 601 insertions(+), 18 deletions(-) create mode 100644 modules/fts-psql/src/main/resources/db/psqlfts/V2.0.0__initial_schema.sql create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala create mode 100644 modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala diff --git a/build.sbt b/build.sbt index 7a837d9c..cba02caf 100644 --- a/build.sbt +++ b/build.sbt @@ -619,6 +619,20 @@ val ftssolr = project ) .dependsOn(common, ftsclient) +val ftspsql = project + .in(file("modules/fts-psql")) + .disablePlugins(RevolverPlugin) + .settings(sharedSettings) + .withTestSettings + .settings( + name := "docspell-fts-psql", + libraryDependencies ++= + Dependencies.doobie ++ + Dependencies.postgres ++ + Dependencies.flyway + ) + .dependsOn(common, ftsclient, store % "compile->test;test->test") + val restapi = project .in(file("modules/restapi")) .disablePlugins(RevolverPlugin) @@ -769,6 +783,7 @@ val joex = project joexapi, restapi, ftssolr, + ftspsql, pubsubNaive, notificationImpl, schedulerImpl @@ -841,6 +856,7 @@ val restserver = project backend, webapp, ftssolr, + ftspsql, oidc, pubsubNaive, notificationImpl, @@ -926,6 +942,7 @@ val root = project analysis, ftsclient, ftssolr, + ftspsql, files, store, joexapi, diff --git 
a/modules/fts-psql/src/main/resources/db/psqlfts/V2.0.0__initial_schema.sql b/modules/fts-psql/src/main/resources/db/psqlfts/V2.0.0__initial_schema.sql new file mode 100644 index 00000000..cad4ef9a --- /dev/null +++ b/modules/fts-psql/src/main/resources/db/psqlfts/V2.0.0__initial_schema.sql @@ -0,0 +1,27 @@ +create table "ftspsql_search"( + "id" varchar(254) not null primary key, + "item_id" varchar(254) not null, + "collective" varchar(254) not null, + "lang" varchar(254) not null, + "attach_id" varchar(254), + "folder_id" varchar(254), + "updated_at" timestamptz not null default current_timestamp, + --- content columns + "attach_name" text, + "attach_content" text, + "item_name" text, + "item_notes" text, + --- index column + "fts_config" regconfig not null, + "text_index" tsvector + generated always as ( + setweight(to_tsvector("fts_config", coalesce("attach_name", '')), 'B') || + setweight(to_tsvector("fts_config", coalesce("item_name", '')), 'B') || + setweight(to_tsvector("fts_config", coalesce("attach_content", '')), 'C') || + setweight(to_tsvector("fts_config", coalesce("item_notes", '')), 'C')) stored +); + +create index "ftspsql_search_ftsidx" on "ftspsql_search" using GIN ("text_index"); +create index "ftpsql_search_item_idx" on "ftspsql_search"("item_id"); +create index "ftpsql_search_attach_idx" on "ftspsql_search"("attach_id"); +create index "ftpsql_search_folder_idx" on "ftspsql_search"("folder_id"); diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala new file mode 100644 index 00000000..b5ce5fd3 --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala @@ -0,0 +1,36 @@ +package docspell.ftspsql + +import cats.effect._ +import cats.implicits._ +import org.flywaydb.core.Flyway +import org.flywaydb.core.api.output.MigrateResult + +final class DbMigration[F[_]: Sync](cfg: PsqlConfig) { + private[this] val logger = 
docspell.logging.getLogger[F] + private val location: String = "classpath:db/psqlfts" + + def run: F[MigrateResult] = + for { + fw <- createFlyway + _ <- logger.info(s"Running FTS migrations") + result <- Sync[F].blocking(fw.migrate()) + } yield result + + def createFlyway: F[Flyway] = + for { + _ <- logger.info(s"Creating Flyway for: $location") + fw = Flyway + .configure() + .table("flyway_fts_history") + .cleanDisabled(true) + .dataSource(cfg.url.asString, cfg.user, cfg.password.pass) + .locations(location) + .baselineOnMigrate(true) + .load() + } yield fw +} + +object DbMigration { + def apply[F[_]: Sync](cfg: PsqlConfig): DbMigration[F] = + new DbMigration[F](cfg) +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala new file mode 100644 index 00000000..60302f37 --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala @@ -0,0 +1,26 @@ +package docspell.ftspsql + +import docspell.common._ +import doobie._ +import doobie.util.log.Success + +trait DoobieMeta { + + implicit val sqlLogging: LogHandler = LogHandler { + case e @ Success(_, _, _, _) => + DoobieMeta.logger.trace("SQL " + e) + case e => + DoobieMeta.logger.error(s"SQL Failure: $e") + } + + implicit val metaIdent: Meta[Ident] = + Meta[String].timap(Ident.unsafe)(_.id) + + implicit val metaLanguage: Meta[Language] = + Meta[String].timap(Language.unsafe)(_.iso3) + +} + +object DoobieMeta { + private val logger = org.log4s.getLogger +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala new file mode 100644 index 00000000..2036923c --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala @@ -0,0 +1,58 @@ +package docspell.ftspsql + +import cats.syntax.all._ +import docspell.common.{Ident, Language} +import docspell.ftsclient.TextData + +final case class FtsRecord( + 
id: String, + itemId: Ident, + collective: Ident, + language: Language, + attachId: Option[Ident], + folderId: Option[Ident], + attachName: Option[String], + attachContent: Option[String], + itemName: Option[String], + itemNotes: Option[String] +) + +object FtsRecord { + def fromTextData(td: TextData): FtsRecord = + td match { + case TextData.Attachment( + item, + attachId, + collective, + folder, + language, + name, + text + ) => + FtsRecord( + td.id.id, + item, + collective, + language, + attachId.some, + folder, + name, + text, + None, + None + ) + case TextData.Item(item, collective, folder, name, notes, language) => + FtsRecord( + td.id.id, + item, + collective, + language, + None, + folder, + None, + None, + name, + notes + ) + } +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala new file mode 100644 index 00000000..251bcdc9 --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -0,0 +1,178 @@ +package docspell.ftspsql + +import cats.data.NonEmptyList +import docspell.common._ +import docspell.ftsclient.FtsQuery +import doobie._ +import doobie.implicits._ +import fs2.Chunk + +object FtsRepository extends DoobieMeta { + val table = fr"ftspsql_search" + + def searchSummary(q: FtsQuery): ConnectionIO[SearchSummary] = { + val selectRank = mkSelectRank + val query = mkQueryPart(q) + + sql"""select count(id), max($selectRank) + |from $table, $query + |where ${mkCondition(q)} AND query @@ text_index + |""".stripMargin + .query[SearchSummary] + .unique + } + + def search( + q: FtsQuery, + withHighlighting: Boolean + ): ConnectionIO[Vector[SearchResult]] = { + val selectRank = mkSelectRank + + val hlOption = + s"startsel=${q.highlight.pre},stopsel=${q.highlight.post}" + + val selectHl = + if (!withHighlighting) fr"null as highlight" + else + fr"""ts_headline( + | fts_config, + | coalesce(attach_name, '') || + | ' ' || 
coalesce(attach_content, '') || + | ' ' || coalesce(item_name, '') || + | ' ' || coalesce(item_notes, ''), query, $hlOption) as highlight""".stripMargin + + val select = + fr"id, item_id, collective, lang, attach_id, folder_id, attach_name, item_name, $selectRank as rank, $selectHl" + + val query = mkQueryPart(q) + + sql"""select $select + |from $table, $query + |where ${mkCondition(q)} AND query @@ text_index + |order by rank desc + |limit ${q.limit} + |offset ${q.offset} + |""".stripMargin + .query[SearchResult] + .to[Vector] + } + + private def mkCondition(q: FtsQuery): Fragment = { + val coll = fr"collective = ${q.collective}" + val items = + NonEmptyList.fromList(q.items.toList).map { nel => + val ids = nel.map(id => fr"$id").reduceLeft(_ ++ fr"," ++ _) + fr"item_id in ($ids)" + } + + val folders = + NonEmptyList.fromList(q.folders.toList).map { nel => + val ids = nel.map(id => fr"$id").reduceLeft(_ ++ fr"," ++ _) + fr"folder_id in ($ids)" + } + + List(items, folders).flatten.foldLeft(coll)(_ ++ fr"AND" ++ _) + } + + private def mkQueryPart(q: FtsQuery): Fragment = + fr"websearch_to_tsquery(fts_config, ${q.q}) query" + + private def mkSelectRank: Fragment = + fr"ts_rank_cd(text_index, query, 4)" + + def replaceChunk(r: Chunk[FtsRecord]): ConnectionIO[Int] = + r.traverse(replace).map(_.foldLeft(0)(_ + _)) + + def replace(r: FtsRecord): ConnectionIO[Int] = + (fr"INSERT INTO $table (id,item_id,collective,lang,attach_id,folder_id,attach_name,attach_content,item_name,item_notes,fts_config) VALUES (" ++ + commas( + sql"${r.id}", + sql"${r.itemId}", + sql"${r.collective}", + sql"${r.language}", + sql"${r.attachId}", + sql"${r.folderId}", + sql"${r.attachName}", + sql"${r.attachContent}", + sql"${r.itemName}", + sql"${r.itemNotes}", + sql"${pgConfig(r.language)}::regconfig" + ) ++ sql") on conflict (id) do update set " ++ commas( + sql"lang = ${r.language}", + sql"folder_id = ${r.folderId}", + sql"attach_name = ${r.attachName}", + sql"attach_content = 
${r.attachContent}", + sql"item_name = ${r.itemName}", + sql"item_notes = ${r.itemNotes}", + sql"fts_config = ${pgConfig(r.language)}::regconfig" + )).update.run + + def update(r: FtsRecord): ConnectionIO[Int] = + (fr"UPDATE $table SET" ++ commas( + sql"lang = ${r.language}", + sql"folder_id = ${r.folderId}", + sql"attach_name = ${r.attachName}", + sql"attach_content = ${r.attachContent}", + sql"item_name = ${r.itemName}", + sql"item_notes = ${r.itemNotes}", + sql"fts_config = ${pgConfig(r.language)}::regconfig" + ) ++ fr"WHERE id = ${r.id}").update.run + + def updateChunk(r: Chunk[FtsRecord]): ConnectionIO[Int] = + r.traverse(update).map(_.foldLeft(0)(_ + _)) + + def updateFolder( + itemId: Ident, + collective: Ident, + folder: Option[Ident] + ): ConnectionIO[Int] = + (sql"UPDATE $table" ++ + fr"SET folder_id = $folder" ++ + fr"WHERE item_id = $itemId AND collective = $collective").update.run + + def deleteByItemId(itemId: Ident): ConnectionIO[Int] = + sql"DELETE FROM $table WHERE item_id = $itemId".update.run + + def deleteByAttachId(attachId: Ident): ConnectionIO[Int] = + sql"DELETE FROM $table WHERE attach_id = $attachId".update.run + + def deleteAll: ConnectionIO[Int] = + sql"DELETE FROM $table".update.run + + def delete(collective: Ident): ConnectionIO[Int] = + sql"DELETE FROM $table WHERE collective = $collective".update.run + + def resetAll: ConnectionIO[Int] = { + val dropFlyway = sql"DROP TABLE IF EXISTS flyway_fts_history".update.run + val dropSearch = sql"DROP TABLE IF EXISTS $table".update.run + for { + a <- dropFlyway + b <- dropSearch + } yield a + b + } + + private def commas(fr: Fragment, frn: Fragment*): Fragment = + frn.foldLeft(fr)(_ ++ fr"," ++ _) + + def pgConfig(language: Language): String = + language match { + case Language.English => "english" + case Language.German => "german" + case Language.French => "french" + case Language.Italian => "italian" + case Language.Spanish => "spanish" + case Language.Hungarian => "hungarian" + case 
Language.Portuguese => "portuguese" + case Language.Czech => "simple" // ? + case Language.Danish => "danish" + case Language.Finnish => "finnish" + case Language.Norwegian => "norwegian" + case Language.Swedish => "swedish" + case Language.Russian => "russian" + case Language.Romanian => "romanian" + case Language.Dutch => "dutch" + case Language.Latvian => "lithuanian" // ? + case Language.Japanese => "simple" + case Language.Hebrew => "simple" + } +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala new file mode 100644 index 00000000..136f919f --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala @@ -0,0 +1,5 @@ +package docspell.ftspsql + +import docspell.common.{LenientUri, Password} + +case class PsqlConfig(url: LenientUri, user: String, password: Password) diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala new file mode 100644 index 00000000..f16f170d --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala @@ -0,0 +1,131 @@ +package docspell.ftspsql + +import cats.effect._ +import cats.implicits._ +import com.zaxxer.hikari.HikariDataSource +import docspell.common._ +import docspell.ftsclient._ +import docspell.logging.Logger +import doobie._ +import doobie.hikari.HikariTransactor +import doobie.implicits._ +import fs2.Stream + +import scala.concurrent.ExecutionContext + +final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) + extends FtsClient[F] { + val engine = Ident.unsafe("postgres") + + def initialize: F[List[FtsMigration[F]]] = + Sync[F].pure( + List( + FtsMigration( + 0, + engine, + "initialize", + DbMigration[F](cfg).run.as(FtsMigration.Result.WorkDone) + ) + ) + ) + + def initializeNew: List[FtsMigration[F]] = + List( + FtsMigration( + 10, + engine, + "reset", + 
FtsRepository.resetAll.transact(xa).as(FtsMigration.Result.workDone) + ), + FtsMigration( + 20, + engine, + "schema", + DbMigration[F](cfg).run.as(FtsMigration.Result.workDone) + ), + FtsMigration(20, engine, "index all", FtsMigration.Result.indexAll.pure[F]) + ) + + def search(q: FtsQuery): F[FtsResult] = + for { + startNanos <- Sync[F].delay(System.nanoTime()) + summary <- FtsRepository.searchSummary(q).transact(xa) + results <- FtsRepository.search(q, true).transact(xa) + endNanos <- Sync[F].delay(System.nanoTime()) + duration = Duration.nanos(endNanos - startNanos) + res = SearchResult + .toFtsResult(summary, results) + .copy(qtime = duration) + } yield res + + def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = + data + .map(FtsRecord.fromTextData) + .chunkN(50) + .evalMap(chunk => + logger.debug(s"Update fts index with ${chunk.size} records") *> FtsRepository + .replaceChunk(chunk) + .transact(xa) + ) + .compile + .drain + + def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = + data + .map(FtsRecord.fromTextData) + .chunkN(50) + .evalMap(chunk => FtsRepository.updateChunk(chunk).transact(xa)) + .compile + .drain + + def updateFolder( + logger: Logger[F], + itemId: Ident, + collective: Ident, + folder: Option[Ident] + ): F[Unit] = + logger.debug(s"Update folder '${folder + .map(_.id)}' in fts for collective ${collective.id} and item ${itemId.id}") *> + FtsRepository.updateFolder(itemId, collective, folder).transact(xa).void + + def removeItem(logger: Logger[F], itemId: Ident): F[Unit] = + logger.debug(s"Removing item from fts index: ${itemId.id}") *> + FtsRepository.deleteByItemId(itemId).transact(xa).void + + def removeAttachment(logger: Logger[F], attachId: Ident): F[Unit] = + logger.debug(s"Removing attachment from fts index: ${attachId.id}") *> + FtsRepository.deleteByAttachId(attachId).transact(xa).void + + def clearAll(logger: Logger[F]): F[Unit] = + logger.info(s"Deleting complete FTS index") *> + 
FtsRepository.deleteAll.transact(xa).void + + def clear(logger: Logger[F], collective: Ident): F[Unit] = + logger.info(s"Deleting index for collective ${collective.id}") *> + FtsRepository.delete(collective).transact(xa).void +} + +object PsqlFtsClient { + def apply[F[_]: Async]( + cfg: PsqlConfig, + connectEC: ExecutionContext + ): Resource[F, PsqlFtsClient[F]] = { + val acquire = Sync[F].delay(new HikariDataSource()) + val free: HikariDataSource => F[Unit] = ds => Sync[F].delay(ds.close()) + + for { + ds <- Resource.make(acquire)(free) + _ = Resource.pure { + ds.setJdbcUrl(cfg.url.asString) + ds.setUsername(cfg.user) + ds.setPassword(cfg.password.pass) + ds.setDriverClassName("org.postgresql.Driver") + } + xa = HikariTransactor[F](ds, connectEC) + + pc = new PsqlFtsClient[F](cfg, xa) + // _ <- Resource.eval(st.migrate) + } yield pc + } + +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala new file mode 100644 index 00000000..8d2fdab6 --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala @@ -0,0 +1,47 @@ +package docspell.ftspsql + +import docspell.common._ +import docspell.ftsclient.FtsResult.{ItemMatch, MatchData} +import docspell.ftsclient.FtsResult + +final case class SearchResult( + id: Ident, + itemId: Ident, + collective: Ident, + language: Language, + attachId: Option[Ident], + folderId: Option[Ident], + attachName: Option[String], + itemName: Option[String], + rank: Double, + highlight: Option[String] +) + +object SearchResult { + + def toFtsResult(summary: SearchSummary, results: Vector[SearchResult]): FtsResult = { + def mkEntry(r: SearchResult): (ItemMatch, (Ident, List[String])) = { + def create(md: MatchData) = ItemMatch(r.id, r.itemId, r.collective, r.rank, md) + + val itemMatch = + r.attachId match { + case Some(aId) => + create(FtsResult.AttachmentData(aId, r.attachName.getOrElse(""))) + case None => + 
create(FtsResult.ItemData) + } + + (itemMatch, r.id -> r.highlight.toList) + } + + val (items, hl) = results.map(mkEntry).unzip + + FtsResult( + Duration.zero, + summary.count.toInt, + summary.maxScore, + hl.toMap, + items.toList + ) + } +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala new file mode 100644 index 00000000..3e4e838c --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala @@ -0,0 +1,3 @@ +package docspell.ftspsql + +case class SearchSummary(count: Long, maxScore: Double) diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala new file mode 100644 index 00000000..b21c9368 --- /dev/null +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala @@ -0,0 +1,31 @@ +package docspell.ftspsql + +import cats.effect._ +import cats.effect.unsafe.implicits._ +import docspell.logging.{Level, LogConfig} +//import cats.implicits._ +import com.dimafeng.testcontainers.PostgreSQLContainer +import com.dimafeng.testcontainers.munit.TestContainerForAll +import docspell.common._ +import docspell.logging.TestLoggingConfig +import munit.FunSuite +import org.testcontainers.utility.DockerImageName + +class MigrationTest extends FunSuite with TestContainerForAll with TestLoggingConfig { + override val containerDef: PostgreSQLContainer.Def = + PostgreSQLContainer.Def(DockerImageName.parse("postgres:14")) + + override def docspellLogConfig: LogConfig = + LogConfig(Level.Debug, LogConfig.Format.Fancy) + + override def rootMinimumLevel = Level.Warn + + test("create schema") { + withContainers { cnt => + val jdbc = + PsqlConfig(LenientUri.unsafe(cnt.jdbcUrl), cnt.username, Password(cnt.password)) + + new DbMigration[IO](jdbc).run.void.unsafeRunSync() + } + } +} diff --git 
a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala index 31c093d9..0d972749 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala @@ -22,7 +22,7 @@ trait JsonCodec { new Encoder[TextData.Attachment] { final def apply(td: TextData.Attachment): Json = { val cnt = - (Field.contentField(td.lang).name, Json.fromString(td.text.getOrElse(""))) + (Field.contentField(td.language).name, Json.fromString(td.text.getOrElse(""))) Json.fromFields( cnt :: List( @@ -165,7 +165,7 @@ trait JsonCodec { val setter = List( td.name.map(n => (Field.attachmentName.name, Map("set" -> n.asJson).asJson)), td.text.map(txt => - (Field.contentField(td.lang).name, Map("set" -> txt.asJson).asJson) + (Field.contentField(td.language).name, Map("set" -> txt.asJson).asJson) ) ).flatten Json.fromFields( diff --git a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala index 334943f7..c6ab41f4 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala @@ -7,13 +7,13 @@ package docspell.joex import cats.effect.{Async, Resource} - import docspell.analysis.TextAnalyser import docspell.backend.fulltext.CreateIndex import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient -import docspell.ftssolr.SolrFtsClient +import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} +//import docspell.ftssolr.SolrFtsClient import docspell.joex.analysis.RegexNerFile import docspell.joex.emptytrash.EmptyTrashTask import docspell.joex.filecopy.{FileCopyTask, FileIntegrityCheckTask} @@ -33,7 +33,6 @@ import docspell.pubsub.api.PubSubT import docspell.scheduler.impl.JobStoreModuleBuilder import docspell.scheduler.{JobStoreModule, JobTask, JobTaskRegistry} import docspell.store.Store - 
import emil.Emil import org.http4s.client.Client @@ -221,7 +220,7 @@ object JoexTasks { joex <- OJoex(pubSub) store = jobStoreModule.store upload <- OUpload(store, jobStoreModule.jobs) - fts <- createFtsClient(cfg)(httpClient) + fts <- createFtsClient(cfg, store) createIndex <- CreateIndex.resource(fts, store) itemOps <- OItem(store, fts, createIndex, jobStoreModule.jobs) itemSearchOps <- OItemSearch(store) @@ -249,8 +248,17 @@ object JoexTasks { ) private def createFtsClient[F[_]: Async]( - cfg: Config - )(client: Client[F]): Resource[F, FtsClient[F]] = - if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) + cfg: Config, + store: Store[F] /*, + client: Client[F] */ + ): Resource[F, FtsClient[F]] = + // if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) + if (cfg.fullTextSearch.enabled) + Resource.pure[F, FtsClient[F]]( + new PsqlFtsClient[F]( + PsqlConfig(cfg.jdbc.url, cfg.jdbc.user, Password(cfg.jdbc.password)), + store.transactor + ) + ) else Resource.pure[F, FtsClient[F]](FtsClient.none[F]) } diff --git a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala index 7e6c7025..6016afb6 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala @@ -9,11 +9,12 @@ package docspell.restserver import cats.effect._ import fs2.Stream import fs2.concurrent.Topic - import docspell.backend.BackendApp import docspell.backend.auth.{AuthToken, ShareToken} +import docspell.common.Password import docspell.ftsclient.FtsClient -import docspell.ftssolr.SolrFtsClient +import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} +//import docspell.ftssolr.SolrFtsClient import docspell.notification.api.NotificationModule import docspell.notification.impl.NotificationModuleImpl import docspell.oidc.CodeFlowRoutes @@ -25,7 +26,6 @@ import 
docspell.restserver.webapp.{TemplateRoutes, Templates, WebjarRoutes} import docspell.restserver.ws.{OutputEvent, WebSocketRoutes} import docspell.scheduler.impl.JobStoreModuleBuilder import docspell.store.Store - import emil.javamail.JavaMailEmil import org.http4s.HttpRoutes import org.http4s.client.Client @@ -163,7 +163,7 @@ object RestAppImpl { val logger = docspell.logging.getLogger[F](s"restserver-${cfg.appId.id}") for { - ftsClient <- createFtsClient(cfg)(httpClient) + ftsClient <- createFtsClient(cfg, store) pubSubT = PubSubT(pubSub, logger) javaEmil = JavaMailEmil(cfg.backend.mailSettings) notificationMod <- Resource.eval( @@ -188,8 +188,21 @@ object RestAppImpl { } private def createFtsClient[F[_]: Async]( - cfg: Config - )(client: Client[F]): Resource[F, FtsClient[F]] = - if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) + cfg: Config, + store: Store[F] /*, client: Client[F] */ + ): Resource[F, FtsClient[F]] = + // if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) + if (cfg.fullTextSearch.enabled) + Resource.pure[F, FtsClient[F]]( + new PsqlFtsClient[F]( + PsqlConfig( + cfg.backend.jdbc.url, + cfg.backend.jdbc.user, + Password(cfg.backend.jdbc.password) + ), + store.transactor + ) + ) else Resource.pure[F, FtsClient[F]](FtsClient.none[F]) + } diff --git a/modules/store/src/main/scala/docspell/store/Store.scala b/modules/store/src/main/scala/docspell/store/Store.scala index b7f611e3..d41e83d2 100644 --- a/modules/store/src/main/scala/docspell/store/Store.scala +++ b/modules/store/src/main/scala/docspell/store/Store.scala @@ -34,6 +34,8 @@ trait Store[F[_]] { ): FileRepository[F] def add(insert: ConnectionIO[Int], exists: ConnectionIO[Boolean]): F[AddResult] + + def transactor: Transactor[F] } object Store { diff --git a/modules/store/src/main/scala/docspell/store/impl/StoreImpl.scala b/modules/store/src/main/scala/docspell/store/impl/StoreImpl.scala index 87703a8b..d68ef6e3 100644 --- 
a/modules/store/src/main/scala/docspell/store/impl/StoreImpl.scala +++ b/modules/store/src/main/scala/docspell/store/impl/StoreImpl.scala @@ -24,8 +24,9 @@ final class StoreImpl[F[_]: Async]( val fileRepo: FileRepository[F], jdbc: JdbcConfig, ds: DataSource, - xa: Transactor[F] + val transactor: Transactor[F] ) extends Store[F] { + private[this] val xa = transactor def createFileRepository( cfg: FileRepositoryConfig, diff --git a/modules/store/src/test/scala/docspell/store/migrate/PostgresqlMigrateTest.scala b/modules/store/src/test/scala/docspell/store/migrate/PostgresqlMigrateTest.scala index 1ba69f55..235b240e 100644 --- a/modules/store/src/test/scala/docspell/store/migrate/PostgresqlMigrateTest.scala +++ b/modules/store/src/test/scala/docspell/store/migrate/PostgresqlMigrateTest.scala @@ -23,7 +23,7 @@ class PostgresqlMigrateTest with TestContainerForAll with TestLoggingConfig { override val containerDef: PostgreSQLContainer.Def = - PostgreSQLContainer.Def(DockerImageName.parse("postgres:13")) + PostgreSQLContainer.Def(DockerImageName.parse("postgres:14")) test("postgres empty schema migration") { assume(Docker.existsUnsafe, "docker doesn't exist!") From fef00bdfb5de5f5be2cfdc6c98b85e3427cca03e Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 21 Mar 2022 00:10:28 +0100 Subject: [PATCH 3/8] Some basic tests and config --- .../scala/docspell/ftspsql/DoobieMeta.scala | 2 +- .../scala/docspell/ftspsql/FtsRecord.scala | 6 +- .../docspell/ftspsql/FtsRepository.scala | 51 ++++--- .../docspell/ftspsql/PgQueryParser.scala | 32 ++++ .../scala/docspell/ftspsql/PsqlConfig.scala | 24 ++- .../docspell/ftspsql/PsqlFtsClient.scala | 30 +++- .../docspell/ftspsql/RankNormalization.scala | 40 +++++ .../docspell/ftspsql/MigrationTest.scala | 25 ++- .../scala/docspell/ftspsql/PgFixtures.scala | 69 +++++++++ .../docspell/ftspsql/PsqlFtsClientTest.scala | 143 ++++++++++++++++++ .../main/scala/docspell/joex/JoexTasks.scala | 2 +- .../docspell/restserver/RestAppImpl.scala | 2 +- 12 
files changed, 385 insertions(+), 41 deletions(-) create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala create mode 100644 modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala create mode 100644 modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala create mode 100644 modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala index 60302f37..fd17df58 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala @@ -8,7 +8,7 @@ trait DoobieMeta { implicit val sqlLogging: LogHandler = LogHandler { case e @ Success(_, _, _, _) => - DoobieMeta.logger.trace("SQL " + e) + DoobieMeta.logger.debug("SQL " + e) case e => DoobieMeta.logger.error(s"SQL Failure: $e") } diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala index 2036923c..0b6f48ab 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala @@ -5,7 +5,7 @@ import docspell.common.{Ident, Language} import docspell.ftsclient.TextData final case class FtsRecord( - id: String, + id: Ident, itemId: Ident, collective: Ident, language: Language, @@ -30,7 +30,7 @@ object FtsRecord { text ) => FtsRecord( - td.id.id, + td.id, item, collective, language, @@ -43,7 +43,7 @@ object FtsRecord { ) case TextData.Item(item, collective, folder, name, notes, language) => FtsRecord( - td.id.id, + td.id, item, collective, language, diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index 251bcdc9..38515ea8 100644 --- 
a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -10,11 +10,13 @@ import fs2.Chunk object FtsRepository extends DoobieMeta { val table = fr"ftspsql_search" - def searchSummary(q: FtsQuery): ConnectionIO[SearchSummary] = { - val selectRank = mkSelectRank - val query = mkQueryPart(q) + def searchSummary(pq: PgQueryParser, rn: RankNormalization)( + q: FtsQuery + ): ConnectionIO[SearchSummary] = { + val selectRank = mkSelectRank(rn) + val query = mkQueryPart(pq, q) - sql"""select count(id), max($selectRank) + sql"""select count(id), coalesce(max($selectRank), 0) |from $table, $query |where ${mkCondition(q)} AND query @@ text_index |""".stripMargin @@ -22,11 +24,11 @@ object FtsRepository extends DoobieMeta { .unique } - def search( + def search(pq: PgQueryParser, rn: RankNormalization)( q: FtsQuery, withHighlighting: Boolean ): ConnectionIO[Vector[SearchResult]] = { - val selectRank = mkSelectRank + val selectRank = mkSelectRank(rn) val hlOption = s"startsel=${q.highlight.pre},stopsel=${q.highlight.post}" @@ -44,7 +46,7 @@ object FtsRepository extends DoobieMeta { val select = fr"id, item_id, collective, lang, attach_id, folder_id, attach_name, item_name, $selectRank as rank, $selectHl" - val query = mkQueryPart(q) + val query = mkQueryPart(pq, q) sql"""select $select |from $table, $query @@ -74,16 +76,22 @@ object FtsRepository extends DoobieMeta { List(items, folders).flatten.foldLeft(coll)(_ ++ fr"AND" ++ _) } - private def mkQueryPart(q: FtsQuery): Fragment = - fr"websearch_to_tsquery(fts_config, ${q.q}) query" + private def mkQueryPart(p: PgQueryParser, q: FtsQuery): Fragment = { + val fname = Fragment.const(p.name) + fr"$fname(fts_config, ${q.q}) query" + } - private def mkSelectRank: Fragment = - fr"ts_rank_cd(text_index, query, 4)" + private def mkSelectRank(rn: RankNormalization): Fragment = { + val bits = rn.value.toNonEmptyList.map(n => sql"$n").reduceLeft(_ 
++ sql"|" ++ _) + fr"ts_rank_cd(text_index, query, $bits)" + } - def replaceChunk(r: Chunk[FtsRecord]): ConnectionIO[Int] = - r.traverse(replace).map(_.foldLeft(0)(_ + _)) + def replaceChunk(pgConfig: Language => String)(r: Chunk[FtsRecord]): ConnectionIO[Int] = + r.traverse(replace(pgConfig)).map(_.foldLeft(0)(_ + _)) - def replace(r: FtsRecord): ConnectionIO[Int] = + def replace( + pgConfig: Language => String + )(r: FtsRecord): ConnectionIO[Int] = (fr"INSERT INTO $table (id,item_id,collective,lang,attach_id,folder_id,attach_name,attach_content,item_name,item_notes,fts_config) VALUES (" ++ commas( sql"${r.id}", @@ -107,7 +115,7 @@ object FtsRepository extends DoobieMeta { sql"fts_config = ${pgConfig(r.language)}::regconfig" )).update.run - def update(r: FtsRecord): ConnectionIO[Int] = + def update(pgConfig: Language => String)(r: FtsRecord): ConnectionIO[Int] = (fr"UPDATE $table SET" ++ commas( sql"lang = ${r.language}", sql"folder_id = ${r.folderId}", @@ -118,8 +126,8 @@ object FtsRepository extends DoobieMeta { sql"fts_config = ${pgConfig(r.language)}::regconfig" ) ++ fr"WHERE id = ${r.id}").update.run - def updateChunk(r: Chunk[FtsRecord]): ConnectionIO[Int] = - r.traverse(update).map(_.foldLeft(0)(_ + _)) + def updateChunk(pgConfig: Language => String)(r: Chunk[FtsRecord]): ConnectionIO[Int] = + r.traverse(update(pgConfig)).map(_.foldLeft(0)(_ + _)) def updateFolder( itemId: Ident, @@ -154,7 +162,10 @@ object FtsRepository extends DoobieMeta { private def commas(fr: Fragment, frn: Fragment*): Fragment = frn.foldLeft(fr)(_ ++ fr"," ++ _) - def pgConfig(language: Language): String = + def getPgConfig(select: PartialFunction[Language, String])(language: Language): String = + select.applyOrElse(language, defaultPgConfig) + + def defaultPgConfig(language: Language): String = language match { case Language.English => "english" case Language.German => "german" @@ -163,7 +174,6 @@ object FtsRepository extends DoobieMeta { case Language.Spanish => "spanish" case 
Language.Hungarian => "hungarian" case Language.Portuguese => "portuguese" - case Language.Czech => "simple" // ? case Language.Danish => "danish" case Language.Finnish => "finnish" case Language.Norwegian => "norwegian" @@ -171,7 +181,8 @@ object FtsRepository extends DoobieMeta { case Language.Russian => "russian" case Language.Romanian => "romanian" case Language.Dutch => "dutch" - case Language.Latvian => "lithuanian" // ? + case Language.Czech => "simple" + case Language.Latvian => "simple" case Language.Japanese => "simple" case Language.Hebrew => "simple" } diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala new file mode 100644 index 00000000..f189a0aa --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala @@ -0,0 +1,32 @@ +package docspell.ftspsql + +import cats.data.NonEmptyList + +sealed trait PgQueryParser { + def name: String +} + +object PgQueryParser { + + case object ToTsQuery extends PgQueryParser { + val name = "to_tsquery" + } + case object Plain extends PgQueryParser { + val name = "plainto_tsquery" + } + case object Phrase extends PgQueryParser { + val name = "phraseto_tsquery" + } + case object Websearch extends PgQueryParser { + val name = "websearch_to_tsquery" + } + + val all: NonEmptyList[PgQueryParser] = + NonEmptyList.of(ToTsQuery, Plain, Phrase, Websearch) + + def fromName(name: String): Either[String, PgQueryParser] = + all.find(_.name.equalsIgnoreCase(name)).toRight(s"Unknown pg query parser: $name") + + def unsafeFromName(name: String): PgQueryParser = + fromName(name).fold(sys.error, identity) +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala index 136f919f..41a10af7 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala +++ 
b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala @@ -1,5 +1,25 @@ package docspell.ftspsql -import docspell.common.{LenientUri, Password} +import docspell.common._ -case class PsqlConfig(url: LenientUri, user: String, password: Password) +final case class PsqlConfig( + url: LenientUri, + user: String, + password: Password, + pgConfigSelect: PartialFunction[Language, String], + pgQueryParser: PgQueryParser, + rankNormalization: RankNormalization +) + +object PsqlConfig { + + def defaults(url: LenientUri, user: String, password: Password): PsqlConfig = + PsqlConfig( + url, + user, + password, + PartialFunction.empty, + PgQueryParser.Websearch, + RankNormalization.Mhd && RankNormalization.Scale + ) +} diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala index f16f170d..b8156114 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala @@ -17,6 +17,19 @@ final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) extends FtsClient[F] { val engine = Ident.unsafe("postgres") + val config = cfg + private[ftspsql] val transactor = xa + + private[this] val searchSummary = + FtsRepository.searchSummary(cfg.pgQueryParser, cfg.rankNormalization) _ + private[this] val search = + FtsRepository.search(cfg.pgQueryParser, cfg.rankNormalization) _ + + private[this] val replaceChunk = + FtsRepository.replaceChunk(FtsRepository.getPgConfig(cfg.pgConfigSelect)) _ + private[this] val updateChunk = + FtsRepository.updateChunk(FtsRepository.getPgConfig(cfg.pgConfigSelect)) _ + def initialize: F[List[FtsMigration[F]]] = Sync[F].pure( List( @@ -49,8 +62,8 @@ final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) def search(q: FtsQuery): F[FtsResult] = for { startNanos <- Sync[F].delay(System.nanoTime()) - summary <- 
FtsRepository.searchSummary(q).transact(xa) - results <- FtsRepository.search(q, true).transact(xa) + summary <- searchSummary(q).transact(xa) + results <- search(q, true).transact(xa) endNanos <- Sync[F].delay(System.nanoTime()) duration = Duration.nanos(endNanos - startNanos) res = SearchResult @@ -63,9 +76,8 @@ final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) .map(FtsRecord.fromTextData) .chunkN(50) .evalMap(chunk => - logger.debug(s"Update fts index with ${chunk.size} records") *> FtsRepository - .replaceChunk(chunk) - .transact(xa) + logger.debug(s"Add to fts index ${chunk.size} records") *> + replaceChunk(chunk).transact(xa) ) .compile .drain @@ -74,7 +86,10 @@ final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) data .map(FtsRecord.fromTextData) .chunkN(50) - .evalMap(chunk => FtsRepository.updateChunk(chunk).transact(xa)) + .evalMap(chunk => + logger.debug(s"Update fts index with ${chunk.size} records") *> + updateChunk(chunk).transact(xa) + ) .compile .drain @@ -124,8 +139,9 @@ object PsqlFtsClient { xa = HikariTransactor[F](ds, connectEC) pc = new PsqlFtsClient[F](cfg, xa) - // _ <- Resource.eval(st.migrate) } yield pc } + def fromTransactor[F[_]: Async](cfg: PsqlConfig, xa: Transactor[F]): PsqlFtsClient[F] = + new PsqlFtsClient[F](cfg, xa) } diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala new file mode 100644 index 00000000..cc923a96 --- /dev/null +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala @@ -0,0 +1,40 @@ +package docspell.ftspsql + +import cats.Order +import cats.data.NonEmptySet + +sealed trait RankNormalization { self => + def value: NonEmptySet[Int] + + def &&(other: RankNormalization): RankNormalization = + new RankNormalization { val value = self.value ++ other.value } +} + +object RankNormalization { +// see 
https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING + + case object IgnoreDocLength extends RankNormalization { val value = NonEmptySet.one(0) } + case object LogDocLength extends RankNormalization { val value = NonEmptySet.one(1) } + case object DocLength extends RankNormalization { val value = NonEmptySet.one(2) } + case object Mhd extends RankNormalization { val value = NonEmptySet.one(4) } + case object UniqueWords extends RankNormalization { val value = NonEmptySet.one(8) } + case object LogUniqueWords extends RankNormalization { val value = NonEmptySet.one(16) } + case object Scale extends RankNormalization { val value = NonEmptySet.one(32) } + + def byNumber(n: Int): Either[String, RankNormalization] = + all.find(_.value.contains(n)).toRight(s"Unknown rank normalization number: $n") + + implicit val order: Order[RankNormalization] = + Order.by(_.value.reduce) + + val all: NonEmptySet[RankNormalization] = + NonEmptySet.of( + IgnoreDocLength, + LogDocLength, + DocLength, + Mhd, + UniqueWords, + LogUniqueWords, + Scale + ) +} diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala index b21c9368..62f1f4f9 100644 --- a/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala @@ -1,17 +1,20 @@ package docspell.ftspsql import cats.effect._ -import cats.effect.unsafe.implicits._ import docspell.logging.{Level, LogConfig} -//import cats.implicits._ +import munit.CatsEffectSuite import com.dimafeng.testcontainers.PostgreSQLContainer import com.dimafeng.testcontainers.munit.TestContainerForAll import docspell.common._ import docspell.logging.TestLoggingConfig -import munit.FunSuite import org.testcontainers.utility.DockerImageName +import doobie.implicits._ -class MigrationTest extends FunSuite with TestContainerForAll with TestLoggingConfig { +class 
MigrationTest + extends CatsEffectSuite + with PgFixtures + with TestContainerForAll + with TestLoggingConfig { override val containerDef: PostgreSQLContainer.Def = PostgreSQLContainer.Def(DockerImageName.parse("postgres:14")) @@ -23,9 +26,19 @@ class MigrationTest extends FunSuite with TestContainerForAll with TestLoggingCo test("create schema") { withContainers { cnt => val jdbc = - PsqlConfig(LenientUri.unsafe(cnt.jdbcUrl), cnt.username, Password(cnt.password)) + PsqlConfig.defaults( + LenientUri.unsafe(cnt.jdbcUrl), + cnt.username, + Password(cnt.password) + ) - new DbMigration[IO](jdbc).run.void.unsafeRunSync() + for { + _ <- DbMigration[IO](jdbc).run + n <- runQuery(cnt)( + sql"SELECT count(*) FROM ${FtsRepository.table}".query[Int].unique + ) + _ = assertEquals(n, 0) + } yield () } } } diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala new file mode 100644 index 00000000..82e15e26 --- /dev/null +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala @@ -0,0 +1,69 @@ +package docspell.ftspsql + +import cats.syntax.all._ +import com.dimafeng.testcontainers.PostgreSQLContainer +import docspell.common._ +import docspell.store.{JdbcConfig, StoreFixture} +import doobie._ +import doobie.implicits._ +import cats.effect._ +import docspell.ftsclient.TextData + +import javax.sql.DataSource + +trait PgFixtures { + def ident(n: String): Ident = Ident.unsafe(n) + + def psqlConfig(cnt: PostgreSQLContainer): PsqlConfig = + PsqlConfig.defaults( + LenientUri.unsafe(cnt.jdbcUrl), + cnt.username, + Password(cnt.password) + ) + + def jdbcConfig(cnt: PostgreSQLContainer): JdbcConfig = + JdbcConfig(LenientUri.unsafe(cnt.jdbcUrl), cnt.username, cnt.password) + + def dataSource(cnt: PostgreSQLContainer): Resource[IO, DataSource] = + StoreFixture.dataSource(jdbcConfig(cnt)) + + def transactor(cnt: PostgreSQLContainer): Resource[IO, Transactor[IO]] = + 
dataSource(cnt).flatMap(StoreFixture.makeXA) + + def psqlFtsClient(cnt: PostgreSQLContainer): Resource[IO, PsqlFtsClient[IO]] = + transactor(cnt) + .map(xa => PsqlFtsClient.fromTransactor(psqlConfig(cnt), xa)) + .evalTap(client => DbMigration[IO](client.config).run) + + def runQuery[A](cnt: PostgreSQLContainer)(q: ConnectionIO[A]): IO[A] = + transactor(cnt).use(q.transact(_)) + + implicit class QueryOps[A](self: ConnectionIO[A]) { + def exec(implicit client: PsqlFtsClient[IO]): IO[A] = + self.transact(client.transactor) + } + + val collective1 = ident("coll1") + val collective2 = ident("coll2") + + val itemData: TextData.Item = + TextData.Item( + item = ident("item-id-1"), + collective = collective1, + folder = None, + name = "mydoc.pdf".some, + notes = Some("my notes are these"), + language = Language.English + ) + + val attachData: TextData.Attachment = + TextData.Attachment( + item = ident("item-id-1"), + attachId = ident("attach-id-1"), + collective = collective1, + folder = None, + language = Language.English, + name = "mydoc.pdf".some, + text = "lorem ipsum dolores est".some + ) +} diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala new file mode 100644 index 00000000..eb611315 --- /dev/null +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala @@ -0,0 +1,143 @@ +package docspell.ftspsql + +import cats.syntax.all._ +import com.dimafeng.testcontainers.PostgreSQLContainer +import com.dimafeng.testcontainers.munit.TestContainerForAll +import docspell.logging.{Level, LogConfig, TestLoggingConfig} +import munit.CatsEffectSuite +import org.testcontainers.utility.DockerImageName +import cats.effect._ +import docspell.ftsclient.{FtsQuery, TextData} +import doobie.implicits._ + +class PsqlFtsClientTest + extends CatsEffectSuite + with PgFixtures + with TestContainerForAll + with TestLoggingConfig { + override val containerDef: 
PostgreSQLContainer.Def = + PostgreSQLContainer.Def(DockerImageName.parse("postgres:14")) + + val logger = docspell.logging.getLogger[IO] + + private val table = FtsRepository.table + + override def docspellLogConfig: LogConfig = + LogConfig(Level.Debug, LogConfig.Format.Fancy) + + override def rootMinimumLevel = Level.Warn + + test("insert data into index") { + withContainers { cnt => + psqlFtsClient(cnt).use { implicit client => + def assertions(id: TextData.Item, ad: TextData.Attachment) = + for { + n <- sql"SELECT count(*) from $table".query[Int].unique.exec + _ = assertEquals(n, 2) + itemStored <- + sql"select item_name, item_notes from $table WHERE id = ${id.id}" + .query[(Option[String], Option[String])] + .unique + .exec + _ = assertEquals(itemStored, (id.name, id.notes)) + attachStored <- + sql"select attach_name, attach_content from $table where id = ${ad.id}" + .query[(Option[String], Option[String])] + .unique + .exec + _ = assertEquals(attachStored, (ad.name, ad.text)) + } yield () + + for { + _ <- client.indexData(logger, itemData, attachData) + _ <- assertions(itemData, attachData) + _ <- client.indexData(logger, itemData, attachData) + _ <- assertions(itemData, attachData) + + _ <- client.indexData( + logger, + itemData.copy(notes = None), + attachData.copy(name = "ha.pdf".some) + ) + _ <- assertions( + itemData.copy(notes = None), + attachData.copy(name = "ha.pdf".some) + ) + } yield () + } + } + } + + test("clear index") { + withContainers { cnt => + psqlFtsClient(cnt).use { implicit client => + for { + _ <- client.indexData(logger, itemData, attachData) + _ <- client.clearAll(logger) + n <- sql"select count(*) from $table".query[Int].unique.exec + _ = assertEquals(n, 0) + } yield () + } + } + } + + test("clear index by collective") { + withContainers { cnt => + psqlFtsClient(cnt).use { implicit client => + for { + _ <- client.indexData( + logger, + itemData, + attachData, + itemData.copy(collective = collective2, item = ident("item-id-2")), + 
attachData.copy(collective = collective2, item = ident("item-id-2")) + ) + n <- sql"select count(*) from $table".query[Int].unique.exec + _ = assertEquals(n, 4) + + _ <- client.clear(logger, collective1) + n <- sql"select count(*) from $table".query[Int].unique.exec + _ = assertEquals(n, 2) + } yield () + } + } + } + + test("search by query") { + def query(s: String): FtsQuery = + FtsQuery( + q = s, + collective = collective1, + items = Set.empty, + folders = Set.empty, + limit = 10, + offset = 0, + highlight = FtsQuery.HighlightSetting.default + ) + + withContainers { cnt => + psqlFtsClient(cnt).use { implicit client => + for { + _ <- client.indexData( + logger, + itemData, + attachData, + itemData.copy(collective = collective2, item = ident("item-id-2")), + attachData.copy(collective = collective2, item = ident("item-id-2")) + ) + + res0 <- client.search(query("lorem uiaeduiae")) + _ = assertEquals(res0.count, 0) + + res1 <- client.search(query("lorem")) + _ = assertEquals(res1.count, 1) + _ = assertEquals(res1.results.head.id, attachData.id) + + res2 <- client.search(query("note")) + _ = assertEquals(res2.count, 1) + _ = assertEquals(res2.results.head.id, itemData.id) + } yield () + } + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala index c6ab41f4..59e0ff69 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala @@ -256,7 +256,7 @@ object JoexTasks { if (cfg.fullTextSearch.enabled) Resource.pure[F, FtsClient[F]]( new PsqlFtsClient[F]( - PsqlConfig(cfg.jdbc.url, cfg.jdbc.user, Password(cfg.jdbc.password)), + PsqlConfig.defaults(cfg.jdbc.url, cfg.jdbc.user, Password(cfg.jdbc.password)), store.transactor ) ) diff --git a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala index 6016afb6..c19916cb 100644 
--- a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala @@ -195,7 +195,7 @@ object RestAppImpl { if (cfg.fullTextSearch.enabled) Resource.pure[F, FtsClient[F]]( new PsqlFtsClient[F]( - PsqlConfig( + PsqlConfig.defaults( cfg.backend.jdbc.url, cfg.backend.jdbc.user, Password(cfg.backend.jdbc.password) From 1e56e832dad8e1ca332cd15b5e77f7f2b18a3c3c Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 21 Mar 2022 00:13:52 +0100 Subject: [PATCH 4/8] Reformat code --- .../backend/fulltext/CreateIndex.scala | 1 + .../scala/docspell/ftspsql/DbMigration.scala | 7 +++++++ .../scala/docspell/ftspsql/DoobieMeta.scala | 7 +++++++ .../scala/docspell/ftspsql/FtsRecord.scala | 7 +++++++ .../scala/docspell/ftspsql/FtsRepository.scala | 10 +++++++++- .../scala/docspell/ftspsql/PgQueryParser.scala | 6 ++++++ .../scala/docspell/ftspsql/PsqlConfig.scala | 6 ++++++ .../scala/docspell/ftspsql/PsqlFtsClient.scala | 16 ++++++++++++---- .../docspell/ftspsql/RankNormalization.scala | 6 ++++++ .../scala/docspell/ftspsql/SearchResult.scala | 8 +++++++- .../scala/docspell/ftspsql/SearchSummary.scala | 6 ++++++ .../scala/docspell/ftspsql/MigrationTest.scala | 18 +++++++++++++----- .../scala/docspell/ftspsql/PgFixtures.scala | 18 +++++++++++++----- .../docspell/ftspsql/PsqlFtsClientTest.scala | 16 ++++++++++++---- .../main/scala/docspell/joex/JoexTasks.scala | 3 ++- .../docspell/restserver/RestAppImpl.scala | 3 ++- 16 files changed, 116 insertions(+), 22 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala b/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala index bce169e3..a30e0922 100644 --- a/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala +++ b/modules/backend/src/main/scala/docspell/backend/fulltext/CreateIndex.scala @@ -8,6 +8,7 @@ package docspell.backend.fulltext import cats.data.NonEmptyList import 
cats.effect._ + import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftsclient.TextData diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala index b5ce5fd3..fb746587 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/DbMigration.scala @@ -1,7 +1,14 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.effect._ import cats.implicits._ + import org.flywaydb.core.Flyway import org.flywaydb.core.api.output.MigrateResult diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala index fd17df58..1a537ae5 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/DoobieMeta.scala @@ -1,6 +1,13 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import docspell.common._ + import doobie._ import doobie.util.log.Success diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala index 0b6f48ab..f868b273 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRecord.scala @@ -1,6 +1,13 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.syntax.all._ + import docspell.common.{Ident, Language} import docspell.ftsclient.TextData diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index 38515ea8..ff32acbf 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -1,11 +1,19 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.data.NonEmptyList +import fs2.Chunk + import docspell.common._ import docspell.ftsclient.FtsQuery + import doobie._ import doobie.implicits._ -import fs2.Chunk object FtsRepository extends DoobieMeta { val table = fr"ftspsql_search" diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala index f189a0aa..01f7607e 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PgQueryParser.scala @@ -1,3 +1,9 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.data.NonEmptyList diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala index 41a10af7..a6dbe6bd 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlConfig.scala @@ -1,3 +1,9 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import docspell.common._ diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala index b8156114..fefa3e55 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/PsqlFtsClient.scala @@ -1,17 +1,25 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql +import scala.concurrent.ExecutionContext + import cats.effect._ import cats.implicits._ -import com.zaxxer.hikari.HikariDataSource +import fs2.Stream + import docspell.common._ import docspell.ftsclient._ import docspell.logging.Logger + +import com.zaxxer.hikari.HikariDataSource import doobie._ import doobie.hikari.HikariTransactor import doobie.implicits._ -import fs2.Stream - -import scala.concurrent.ExecutionContext final class PsqlFtsClient[F[_]: Sync](cfg: PsqlConfig, xa: Transactor[F]) extends FtsClient[F] { diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala index cc923a96..c6b3eeb9 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/RankNormalization.scala @@ -1,3 +1,9 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.Order diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala index 8d2fdab6..faf37fe7 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchResult.scala @@ -1,8 +1,14 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import docspell.common._ -import docspell.ftsclient.FtsResult.{ItemMatch, MatchData} import docspell.ftsclient.FtsResult +import docspell.ftsclient.FtsResult.{ItemMatch, MatchData} final case class SearchResult( id: Ident, diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala index 3e4e838c..e218f698 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/SearchSummary.scala @@ -1,3 +1,9 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql case class SearchSummary(count: Long, maxScore: Double) diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala index 62f1f4f9..2335cf85 100644 --- a/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/MigrationTest.scala @@ -1,14 +1,22 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql import cats.effect._ -import docspell.logging.{Level, LogConfig} -import munit.CatsEffectSuite -import com.dimafeng.testcontainers.PostgreSQLContainer -import com.dimafeng.testcontainers.munit.TestContainerForAll + import docspell.common._ import docspell.logging.TestLoggingConfig -import org.testcontainers.utility.DockerImageName +import docspell.logging.{Level, LogConfig} + +import com.dimafeng.testcontainers.PostgreSQLContainer +import com.dimafeng.testcontainers.munit.TestContainerForAll import doobie.implicits._ +import munit.CatsEffectSuite +import org.testcontainers.utility.DockerImageName class MigrationTest extends CatsEffectSuite diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala index 82e15e26..acbbe9e1 100644 --- a/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/PgFixtures.scala @@ -1,15 +1,23 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql +import javax.sql.DataSource + +import cats.effect._ import cats.syntax.all._ -import com.dimafeng.testcontainers.PostgreSQLContainer + import docspell.common._ +import docspell.ftsclient.TextData import docspell.store.{JdbcConfig, StoreFixture} + +import com.dimafeng.testcontainers.PostgreSQLContainer import doobie._ import doobie.implicits._ -import cats.effect._ -import docspell.ftsclient.TextData - -import javax.sql.DataSource trait PgFixtures { def ident(n: String): Ident = Ident.unsafe(n) diff --git a/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala b/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala index eb611315..b5c099f3 100644 --- a/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala +++ b/modules/fts-psql/src/test/scala/docspell/ftspsql/PsqlFtsClientTest.scala @@ -1,14 +1,22 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + package docspell.ftspsql +import cats.effect._ import cats.syntax.all._ + +import docspell.ftsclient.{FtsQuery, TextData} +import docspell.logging.{Level, LogConfig, TestLoggingConfig} + import com.dimafeng.testcontainers.PostgreSQLContainer import com.dimafeng.testcontainers.munit.TestContainerForAll -import docspell.logging.{Level, LogConfig, TestLoggingConfig} +import doobie.implicits._ import munit.CatsEffectSuite import org.testcontainers.utility.DockerImageName -import cats.effect._ -import docspell.ftsclient.{FtsQuery, TextData} -import doobie.implicits._ class PsqlFtsClientTest extends CatsEffectSuite diff --git a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala index 59e0ff69..303e0e55 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala @@ -7,13 +7,13 @@ 
package docspell.joex import cats.effect.{Async, Resource} + import docspell.analysis.TextAnalyser import docspell.backend.fulltext.CreateIndex import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} -//import docspell.ftssolr.SolrFtsClient import docspell.joex.analysis.RegexNerFile import docspell.joex.emptytrash.EmptyTrashTask import docspell.joex.filecopy.{FileCopyTask, FileIntegrityCheckTask} @@ -33,6 +33,7 @@ import docspell.pubsub.api.PubSubT import docspell.scheduler.impl.JobStoreModuleBuilder import docspell.scheduler.{JobStoreModule, JobTask, JobTaskRegistry} import docspell.store.Store + import emil.Emil import org.http4s.client.Client diff --git a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala index c19916cb..484a3e23 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala @@ -9,12 +9,12 @@ package docspell.restserver import cats.effect._ import fs2.Stream import fs2.concurrent.Topic + import docspell.backend.BackendApp import docspell.backend.auth.{AuthToken, ShareToken} import docspell.common.Password import docspell.ftsclient.FtsClient import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} -//import docspell.ftssolr.SolrFtsClient import docspell.notification.api.NotificationModule import docspell.notification.impl.NotificationModuleImpl import docspell.oidc.CodeFlowRoutes @@ -26,6 +26,7 @@ import docspell.restserver.webapp.{TemplateRoutes, Templates, WebjarRoutes} import docspell.restserver.ws.{OutputEvent, WebSocketRoutes} import docspell.scheduler.impl.JobStoreModuleBuilder import docspell.store.Store + import emil.javamail.JavaMailEmil import org.http4s.HttpRoutes import org.http4s.client.Client From 21e13341e390871d7caf1544d9047fcf5d999ad9 Mon Sep 17 00:00:00 
2001 From: eikek Date: Mon, 21 Mar 2022 11:04:58 +0100 Subject: [PATCH 5/8] Configure postgres fts backend --- build.sbt | 27 +++++++------ .../main/scala/docspell/common/Banner.scala | 4 +- .../main/scala/docspell/config/FtsType.scala | 27 +++++++++++++ .../scala/docspell/config/Implicits.scala | 27 +++++++++++-- .../scala/docspell/config/PgFtsConfig.scala | 37 +++++++++++++++++ .../joex/src/main/resources/reference.conf | 40 +++++++++++++++++++ .../src/main/scala/docspell/joex/Config.scala | 21 +++++++++- .../main/scala/docspell/joex/ConfigFile.scala | 12 +++++- .../scala/docspell/joex/JoexAppImpl.scala | 4 +- .../main/scala/docspell/joex/JoexServer.scala | 2 +- .../main/scala/docspell/joex/JoexTasks.scala | 32 ++++++++++----- .../src/main/scala/docspell/joex/Main.scala | 2 +- .../src/main/resources/reference.conf | 40 +++++++++++++++++++ .../scala/docspell/restserver/Config.scala | 22 +++++++++- .../docspell/restserver/ConfigFile.scala | 13 +++++- .../main/scala/docspell/restserver/Main.scala | 2 +- .../docspell/restserver/RestAppImpl.scala | 37 ++++++++++------- .../docspell/restserver/RestServer.scala | 2 +- 18 files changed, 295 insertions(+), 56 deletions(-) create mode 100644 modules/config/src/main/scala/docspell/config/FtsType.scala create mode 100644 modules/config/src/main/scala/docspell/config/PgFtsConfig.scala diff --git a/build.sbt b/build.sbt index cba02caf..087f8d1d 100644 --- a/build.sbt +++ b/build.sbt @@ -319,19 +319,6 @@ val common = project ) .dependsOn(loggingApi) -val config = project - .in(file("modules/config")) - .disablePlugins(RevolverPlugin) - .settings(sharedSettings) - .withTestSettings - .settings( - name := "docspell-config", - libraryDependencies ++= - Dependencies.fs2 ++ - Dependencies.pureconfig - ) - .dependsOn(common, loggingApi) - val loggingScribe = project .in(file("modules/logging/scribe")) .disablePlugins(RevolverPlugin) @@ -729,6 +716,20 @@ val webapp = project ) .dependsOn(query.js) +// Config project shared among the 
two applications only +val config = project + .in(file("modules/config")) + .disablePlugins(RevolverPlugin) + .settings(sharedSettings) + .withTestSettings + .settings( + name := "docspell-config", + libraryDependencies ++= + Dependencies.fs2 ++ + Dependencies.pureconfig + ) + .dependsOn(common, loggingApi, ftspsql, store) + // --- Application(s) val joex = project diff --git a/modules/common/src/main/scala/docspell/common/Banner.scala b/modules/common/src/main/scala/docspell/common/Banner.scala index 21a7f299..2e29897d 100644 --- a/modules/common/src/main/scala/docspell/common/Banner.scala +++ b/modules/common/src/main/scala/docspell/common/Banner.scala @@ -14,7 +14,7 @@ case class Banner( configFile: Option[String], appId: Ident, baseUrl: LenientUri, - ftsUrl: Option[LenientUri], + ftsInfo: Option[String], fileStoreConfig: FileStoreConfig ) { @@ -35,7 +35,7 @@ case class Banner( s"Id: ${appId.id}", s"Base-Url: ${baseUrl.asString}", s"Database: ${jdbcUrl.asString}", - s"Fts: ${ftsUrl.map(_.asString).getOrElse("-")}", + s"Fts: ${ftsInfo.getOrElse("-")}", s"Config: ${configFile.getOrElse("")}", s"FileRepo: ${fileStoreConfig}", "" diff --git a/modules/config/src/main/scala/docspell/config/FtsType.scala b/modules/config/src/main/scala/docspell/config/FtsType.scala new file mode 100644 index 00000000..2b6aec14 --- /dev/null +++ b/modules/config/src/main/scala/docspell/config/FtsType.scala @@ -0,0 +1,27 @@ +/* + * Copyright 2020 Eike K. 
& Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.config + +import cats.data.NonEmptyList + +sealed trait FtsType { + def name: String +} + +object FtsType { + case object Solr extends FtsType { val name = "solr" } + case object PostgreSQL extends FtsType { val name = "postgresql" } + + val all: NonEmptyList[FtsType] = + NonEmptyList.of(Solr, PostgreSQL) + + def fromName(str: String): Either[String, FtsType] = + all.find(_.name.equalsIgnoreCase(str)).toRight(s"Unknown fts type: $str") + + def unsafeFromName(str: String): FtsType = + fromName(str).fold(sys.error, identity) +} diff --git a/modules/config/src/main/scala/docspell/config/Implicits.scala b/modules/config/src/main/scala/docspell/config/Implicits.scala index e9b23348..ddfc428e 100644 --- a/modules/config/src/main/scala/docspell/config/Implicits.scala +++ b/modules/config/src/main/scala/docspell/config/Implicits.scala @@ -10,9 +10,11 @@ import java.nio.file.{Path => JPath} import scala.reflect.ClassTag +import cats.syntax.all._ import fs2.io.file.Path import docspell.common._ +import docspell.ftspsql.{PgQueryParser, RankNormalization} import docspell.logging.{Level, LogConfig} import com.github.eikek.calev.CalEvent @@ -85,11 +87,28 @@ object Implicits { implicit val fileStoreTypeReader: ConfigReader[FileStoreType] = ConfigReader[String].emap(reason(FileStoreType.fromString)) - def reason[A: ClassTag]( - f: String => Either[String, A] - ): String => Either[FailureReason, A] = + implicit val pgQueryParserReader: ConfigReader[PgQueryParser] = + ConfigReader[String].emap(reason(PgQueryParser.fromName)) + + implicit val pgRankNormalizationReader: ConfigReader[RankNormalization] = + ConfigReader[List[Int]].emap( + reason(ints => ints.traverse(RankNormalization.byNumber).map(_.reduce(_ && _))) + ) + + implicit val languageReader: ConfigReader[Language] = + ConfigReader[String].emap(reason(Language.fromString)) + + implicit def languageMapReader[B: ConfigReader]: 
ConfigReader[Map[Language, B]] = + pureconfig.configurable.genericMapReader[Language, B](reason(Language.fromString)) + + implicit val ftsTypeReader: ConfigReader[FtsType] = + ConfigReader[String].emap(reason(FtsType.fromName)) + + def reason[T, A: ClassTag]( + f: T => Either[String, A] + ): T => Either[FailureReason, A] = in => f(in).left.map(str => - CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str) + CannotConvert(in.toString, implicitly[ClassTag[A]].runtimeClass.toString, str) ) } diff --git a/modules/config/src/main/scala/docspell/config/PgFtsConfig.scala b/modules/config/src/main/scala/docspell/config/PgFtsConfig.scala new file mode 100644 index 00000000..4979234a --- /dev/null +++ b/modules/config/src/main/scala/docspell/config/PgFtsConfig.scala @@ -0,0 +1,37 @@ +/* + * Copyright 2020 Eike K. & Contributors + * + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package docspell.config + +import docspell.common._ +import docspell.ftspsql._ +import docspell.store.JdbcConfig + +case class PgFtsConfig( + useDefaultConnection: Boolean, + jdbc: JdbcConfig, + pgQueryParser: PgQueryParser, + pgRankNormalization: RankNormalization, + pgConfig: Map[Language, String] +) { + + def toPsqlConfig(stdConn: JdbcConfig): PsqlConfig = { + val db = + if (useDefaultConnection) stdConn + else jdbc + + PsqlConfig( + db.url, + db.user, + Password(db.password), + pgConfig, + pgQueryParser, + pgRankNormalization + ) + } +} + +object PgFtsConfig {} diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 318bdeff..62211305 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -697,6 +697,9 @@ Docpell Update Check # Currently the SOLR search platform is supported. enabled = false + # Which backend to use, either solr or postgresql + backend = "solr" + # Configuration for the SOLR backend. 
solr = { # The URL to solr @@ -713,6 +716,43 @@ Docpell Update Check q-op = "OR" } + # Configuration for PostgreSQL backend + postgresql = { + # Whether to use the default database, only works if it is + # postgresql + use-default-connection = false + + # The database connection. + jdbc { + url = "jdbc:postgresql://server:5432/db" + user = "pguser" + password = "" + } + + # A mapping from a language to a postgres text search config. By + # default a language is mapped to a predefined config. + # PostgreSQL has predefined configs for some languages. This + # setting allows to create a custom text search config and + # define it here for some or all languages. + # + # Example: + # { german = "my-german" } + # + # See https://www.postgresql.org/docs/14/textsearch-tables.html ff. + pg-config = { + } + + # Define which query parser to use. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES + pg-query-parser = "websearch_to_tsquery" + + # Allows to define a normalization for the ranking. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING + pg-rank-normalization = [ 4 ] + } + # Settings for running the index migration tasks migration = { # Chunk size to use when indexing data from the database. 
This diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 3418a56d..de171135 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -13,6 +13,7 @@ import docspell.analysis.TextAnalysisConfig import docspell.analysis.classifier.TextClassifierConfig import docspell.backend.Config.Files import docspell.common._ +import docspell.config.{FtsType, PgFtsConfig} import docspell.convert.ConvertConfig import docspell.extract.ExtractConfig import docspell.ftssolr.SolrConfig @@ -65,9 +66,25 @@ object Config { case class FullTextSearch( enabled: Boolean, + backend: FtsType, migration: FullTextSearch.Migration, - solr: SolrConfig - ) + solr: SolrConfig, + postgresql: PgFtsConfig + ) { + + def info: String = + if (!enabled) "Disabled." + else + backend match { + case FtsType.Solr => + s"Solr(${solr.url.asString})" + case FtsType.PostgreSQL => + if (postgresql.useDefaultConnection) + "PostgreSQL(default)" + else + s"PostgreSQL(${postgresql.jdbc.url.asString})" + } + } object FullTextSearch { diff --git a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala index ec5e70da..d7f8b175 100644 --- a/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala +++ b/modules/joex/src/main/scala/docspell/joex/ConfigFile.scala @@ -9,7 +9,7 @@ package docspell.joex import cats.effect.Async import docspell.config.Implicits._ -import docspell.config.{ConfigFactory, Validation} +import docspell.config.{ConfigFactory, FtsType, Validation} import docspell.scheduler.CountingScheme import emil.MailAddress @@ -53,6 +53,14 @@ object ConfigFile { cfg => cfg.updateCheck.enabled && cfg.updateCheck.subject.els.isEmpty, "No subject given for enabled update check!" 
), - Validation(cfg => cfg.files.validate.map(_ => cfg)) + Validation(cfg => cfg.files.validate.map(_ => cfg)), + Validation.failWhen( + cfg => + cfg.fullTextSearch.enabled && + cfg.fullTextSearch.backend == FtsType.PostgreSQL && + cfg.fullTextSearch.postgresql.useDefaultConnection && + !cfg.jdbc.dbmsName.contains("postgresql"), + s"PostgreSQL defined fulltext search backend with default-connection, which is not a PostgreSQL connection!" + ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 5d8b6a40..c410cc42 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -102,7 +102,8 @@ object JoexAppImpl extends MailAddressCodec { termSignal: SignallingRef[F, Boolean], store: Store[F], httpClient: Client[F], - pubSub: PubSub[F] + pubSub: PubSub[F], + pools: Pools ): Resource[F, JoexApp[F]] = for { joexLogger <- Resource.pure(docspell.logging.getLogger[F](s"joex-${cfg.appId.id}")) @@ -120,6 +121,7 @@ object JoexAppImpl extends MailAddressCodec { tasks <- JoexTasks.resource( cfg, + pools, jobStoreModule, httpClient, pubSubT, diff --git a/modules/joex/src/main/scala/docspell/joex/JoexServer.scala b/modules/joex/src/main/scala/docspell/joex/JoexServer.scala index a13d4b1f..5bdc8f18 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexServer.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexServer.scala @@ -52,7 +52,7 @@ object JoexServer { httpClient )(Topics.all.map(_.topic)) - joexApp <- JoexAppImpl.create[F](cfg, signal, store, httpClient, pubSub) + joexApp <- JoexAppImpl.create[F](cfg, signal, store, httpClient, pubSub, pools) httpApp = Router( "/internal" -> InternalHeader(settings.internalRouteKey) { diff --git a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala index 303e0e55..3dafed16 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexTasks.scala @@ -12,8 +12,10 @@ import docspell.analysis.TextAnalyser import docspell.backend.fulltext.CreateIndex import docspell.backend.ops._ import docspell.common._ +import docspell.config.FtsType import docspell.ftsclient.FtsClient -import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} +import docspell.ftspsql.PsqlFtsClient +import docspell.ftssolr.SolrFtsClient import docspell.joex.analysis.RegexNerFile import docspell.joex.emptytrash.EmptyTrashTask import docspell.joex.filecopy.{FileCopyTask, FileIntegrityCheckTask} @@ -211,6 +213,7 @@ object JoexTasks { def resource[F[_]: Async]( cfg: Config, + pools: Pools, jobStoreModule: JobStoreModuleBuilder.Module[F], httpClient: Client[F], pubSub: PubSubT[F], @@ -221,7 +224,7 @@ object JoexTasks { joex <- OJoex(pubSub) store = jobStoreModule.store upload <- OUpload(store, jobStoreModule.jobs) - fts <- createFtsClient(cfg, store) + fts <- createFtsClient(cfg, pools, store, httpClient) createIndex <- CreateIndex.resource(fts, store) itemOps <- OItem(store, fts, createIndex, jobStoreModule.jobs) itemSearchOps <- OItemSearch(store) @@ -250,16 +253,23 @@ object JoexTasks { private def createFtsClient[F[_]: Async]( cfg: Config, - store: Store[F] /*, - client: Client[F] */ + pools: Pools, + store: Store[F], + client: Client[F] ): Resource[F, FtsClient[F]] = - // if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) if (cfg.fullTextSearch.enabled) - Resource.pure[F, FtsClient[F]]( - new PsqlFtsClient[F]( - PsqlConfig.defaults(cfg.jdbc.url, cfg.jdbc.user, Password(cfg.jdbc.password)), - store.transactor - ) - ) + cfg.fullTextSearch.backend match { + case FtsType.Solr => + SolrFtsClient(cfg.fullTextSearch.solr, client) + + case FtsType.PostgreSQL => + val psqlCfg = cfg.fullTextSearch.postgresql.toPsqlConfig(cfg.jdbc) + if (cfg.fullTextSearch.postgresql.useDefaultConnection) + Resource.pure[F, 
FtsClient[F]]( + new PsqlFtsClient[F](psqlCfg, store.transactor) + ) + else + PsqlFtsClient(psqlCfg, pools.connectEC) + } else Resource.pure[F, FtsClient[F]](FtsClient.none[F]) } diff --git a/modules/joex/src/main/scala/docspell/joex/Main.scala b/modules/joex/src/main/scala/docspell/joex/Main.scala index a7607a5f..d9e77e89 100644 --- a/modules/joex/src/main/scala/docspell/joex/Main.scala +++ b/modules/joex/src/main/scala/docspell/joex/Main.scala @@ -31,7 +31,7 @@ object Main extends IOApp { Option(System.getProperty("config.file")), cfg.appId, cfg.baseUrl, - Some(cfg.fullTextSearch.solr.url).filter(_ => cfg.fullTextSearch.enabled), + Some(cfg.fullTextSearch.info).filter(_ => cfg.fullTextSearch.enabled), cfg.files.defaultStoreConfig ) _ <- logger.info(s"\n${banner.render("***>")}") diff --git a/modules/restserver/src/main/resources/reference.conf b/modules/restserver/src/main/resources/reference.conf index ee9bd476..df269e3f 100644 --- a/modules/restserver/src/main/resources/reference.conf +++ b/modules/restserver/src/main/resources/reference.conf @@ -289,6 +289,9 @@ docspell.server { # Currently the SOLR search platform is supported. enabled = false + # Which backend to use, either solr or postgresql + backend = "solr" + # Configuration for the SOLR backend. solr = { # The URL to solr @@ -304,6 +307,43 @@ docspell.server { # The default combiner for tokens. One of {AND, OR}. q-op = "OR" } + + # Configuration for PostgreSQL backend + postgresql = { + # Whether to use the default database, only works if it is + # postgresql + use-default-connection = false + + # The database connection. + jdbc { + url = "jdbc:postgresql://server:5432/db" + user = "pguser" + password = "" + } + + # A mapping from a language to a postgres text search config. By + # default a language is mapped to a predefined config. + # PostgreSQL has predefined configs for some languages. This + # setting allows to create a custom text search config and + # define it here for some or all languages. 
+ # + # Example: + # { german = "my-german" } + # + # See https://www.postgresql.org/docs/14/textsearch-tables.html ff. + pg-config = { + } + + # Define which query parser to use. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES + pg-query-parser = "websearch_to_tsquery" + + # Allows to define a normalization for the ranking. + # + # https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING + pg-rank-normalization = [ 4 ] + } } # Configuration for the backend. diff --git a/modules/restserver/src/main/scala/docspell/restserver/Config.scala b/modules/restserver/src/main/scala/docspell/restserver/Config.scala index d0032b6f..23ac9a43 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/Config.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/Config.scala @@ -9,6 +9,7 @@ package docspell.restserver import docspell.backend.auth.Login import docspell.backend.{Config => BackendConfig} import docspell.common._ +import docspell.config.{FtsType, PgFtsConfig} import docspell.ftssolr.SolrConfig import docspell.logging.LogConfig import docspell.oidc.ProviderConfig @@ -92,7 +93,26 @@ object Config { } } - case class FullTextSearch(enabled: Boolean, solr: SolrConfig) + case class FullTextSearch( + enabled: Boolean, + backend: FtsType, + solr: SolrConfig, + postgresql: PgFtsConfig + ) { + + def info: String = + if (!enabled) "Disabled." 
+ else + backend match { + case FtsType.Solr => + s"Solr(${solr.url.asString})" + case FtsType.PostgreSQL => + if (postgresql.useDefaultConnection) + "PostgreSQL(default)" + else + s"PostgreSQL(${postgresql.jdbc.url.asString})" + } + } object FullTextSearch {} diff --git a/modules/restserver/src/main/scala/docspell/restserver/ConfigFile.scala b/modules/restserver/src/main/scala/docspell/restserver/ConfigFile.scala index 2e225cae..a3f6d222 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/ConfigFile.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/ConfigFile.scala @@ -13,7 +13,7 @@ import cats.effect.Async import docspell.backend.signup.{Config => SignupConfig} import docspell.config.Implicits._ -import docspell.config.{ConfigFactory, Validation} +import docspell.config.{ConfigFactory, FtsType, Validation} import docspell.oidc.{ProviderConfig, SignatureAlgo} import docspell.restserver.auth.OpenId @@ -106,4 +106,15 @@ object ConfigFile { def filesValidate: Validation[Config] = Validation(cfg => cfg.backend.files.validate.map(_ => cfg)) + + def postgresFtsValidate: Validation[Config] = + Validation.failWhen( + cfg => + cfg.fullTextSearch.enabled && + cfg.fullTextSearch.backend == FtsType.PostgreSQL && + cfg.fullTextSearch.postgresql.useDefaultConnection && + !cfg.backend.jdbc.dbmsName.contains("postgresql"), + s"PostgreSQL defined fulltext search backend with default-connection, which is not a PostgreSQL connection!" 
+ ) + } diff --git a/modules/restserver/src/main/scala/docspell/restserver/Main.scala b/modules/restserver/src/main/scala/docspell/restserver/Main.scala index 106052a1..66437744 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/Main.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/Main.scala @@ -28,7 +28,7 @@ object Main extends IOApp { Option(System.getProperty("config.file")), cfg.appId, cfg.baseUrl, - Some(cfg.fullTextSearch.solr.url).filter(_ => cfg.fullTextSearch.enabled), + Some(cfg.fullTextSearch.info).filter(_ => cfg.fullTextSearch.enabled), cfg.backend.files.defaultStoreConfig ) _ <- logger.info(s"\n${banner.render("***>")}") diff --git a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala index 484a3e23..b2553dee 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/RestAppImpl.scala @@ -12,9 +12,11 @@ import fs2.concurrent.Topic import docspell.backend.BackendApp import docspell.backend.auth.{AuthToken, ShareToken} -import docspell.common.Password +import docspell.common.Pools +import docspell.config.FtsType import docspell.ftsclient.FtsClient -import docspell.ftspsql.{PsqlConfig, PsqlFtsClient} +import docspell.ftspsql.PsqlFtsClient +import docspell.ftssolr.SolrFtsClient import docspell.notification.api.NotificationModule import docspell.notification.impl.NotificationModuleImpl import docspell.oidc.CodeFlowRoutes @@ -156,6 +158,7 @@ object RestAppImpl { def create[F[_]: Async]( cfg: Config, + pools: Pools, store: Store[F], httpClient: Client[F], pubSub: PubSub[F], @@ -164,7 +167,7 @@ object RestAppImpl { val logger = docspell.logging.getLogger[F](s"restserver-${cfg.appId.id}") for { - ftsClient <- createFtsClient(cfg, store) + ftsClient <- createFtsClient(cfg, pools, store, httpClient) pubSubT = PubSubT(pubSub, logger) javaEmil = 
JavaMailEmil(cfg.backend.mailSettings) notificationMod <- Resource.eval( @@ -190,20 +193,24 @@ object RestAppImpl { private def createFtsClient[F[_]: Async]( cfg: Config, - store: Store[F] /*, client: Client[F] */ + pools: Pools, + store: Store[F], + client: Client[F] ): Resource[F, FtsClient[F]] = - // if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client) if (cfg.fullTextSearch.enabled) - Resource.pure[F, FtsClient[F]]( - new PsqlFtsClient[F]( - PsqlConfig.defaults( - cfg.backend.jdbc.url, - cfg.backend.jdbc.user, - Password(cfg.backend.jdbc.password) - ), - store.transactor - ) - ) + cfg.fullTextSearch.backend match { + case FtsType.Solr => + SolrFtsClient(cfg.fullTextSearch.solr, client) + + case FtsType.PostgreSQL => + val psqlCfg = cfg.fullTextSearch.postgresql.toPsqlConfig(cfg.backend.jdbc) + if (cfg.fullTextSearch.postgresql.useDefaultConnection) + Resource.pure[F, FtsClient[F]]( + new PsqlFtsClient[F](psqlCfg, store.transactor) + ) + else + PsqlFtsClient(psqlCfg, pools.connectEC) + } else Resource.pure[F, FtsClient[F]](FtsClient.none[F]) } diff --git a/modules/restserver/src/main/scala/docspell/restserver/RestServer.scala b/modules/restserver/src/main/scala/docspell/restserver/RestServer.scala index 25d59cc0..1dfae260 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/RestServer.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/RestServer.scala @@ -88,7 +88,7 @@ object RestServer { store, httpClient )(Topics.all.map(_.topic)) - restApp <- RestAppImpl.create[F](cfg, store, httpClient, pubSub, wsTopic) + restApp <- RestAppImpl.create[F](cfg, pools, store, httpClient, pubSub, wsTopic) } yield (restApp, pubSub, setting) def createHttpApp[F[_]: Async]( From 3e87feff7b8242d0c60d6b09db7a5d20b44cbddf Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 21 Mar 2022 13:42:10 +0100 Subject: [PATCH 6/8] Add some docs for postgres fts --- website/site/content/docs/configure/_index.md | 103 +++++++++++++++++- 
.../site/content/docs/install/download_run.md | 2 +- 2 files changed, 99 insertions(+), 6 deletions(-) diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md index 4c09fce3..2fc11ed6 100644 --- a/website/site/content/docs/configure/_index.md +++ b/website/site/content/docs/configure/_index.md @@ -177,17 +177,37 @@ this account and setup the notification hooks in there - not in your normal account. -## Full-Text Search: SOLR +## Full-Text Search -[Apache SOLR](https://solr.apache.org) is used to provide the -full-text search. Both docspell components must provide the same -connection setup. This is defined in the `full-text-search.solr` +Fulltext search is optional and provided by external systems. There +are currently [Apache SOLR](https://solr.apache.org) and [PostgreSQL's +text search](https://www.postgresql.org/docs/14/textsearch.html) +available. + +You can enable and configure the fulltext search backends as described +below and then choose the wanted backend: + +```conf +full-text-search { + enabled = true + # Which backend to use, either solr or postgresql + backend = "solr" + … +} +``` + +All docspell components must provide the same fulltext search +configuration. + +### SOLR + +[Apache SOLR](https://solr.apache.org) can be used to provide the +full-text search. This is defined in the `full-text-search.solr` subsection: ``` bash ... full-text-search { - enabled = true ... solr = { url = "http://localhost:8983/solr/docspell" @@ -247,6 +267,79 @@ The solr index doesn't contain any new information, it can be regenerated any time using the above REST call. Thus it doesn't need to be backed up. +### PostgreSQL + +PostgreSQL provides many additional features, one of them is [text +search](https://www.postgresql.org/docs/14/textsearch.html). Docspell +can utilize this to provide the fulltext search feature. This is +especially useful, if PostgreSQL is used as the primary database for +docspell. 
+ +You can choose to use the same database or a separate connection. The +fulltext search will create a single table `ftspsql_search` that holds +all necessary data. When doing backups, you can exclude this table as +it can be recreated from the primary data any time. + +The configuration is placed inside `full-text-search`: + +```conf +full-text-search { + … + postgresql = { + use-default-connection = false + + jdbc { + url = "jdbc:postgresql://server:5432/db" + user = "pguser" + password = "" + } + + pg-config = { + } + pg-query-parser = "websearch_to_tsquery" + pg-rank-normalization = [ 4 ] + } +} +``` + +The flag `use-default-connection` can be set to `true` if you use +PostgreSQL as the primary db to have it also used for the fulltext +search. If set to `false`, the subsequent `jdbc` block defines the +connection to the postgres database to use. + +The following settings tune PostgreSQL's text search feature. +Please visit [their +documentation](https://www.postgresql.org/docs/14/textsearch.html) for +all the details. + +- `pg-config`: this is an optional mapping from document languages as + used in Docspell to a PostgreSQL text search configuration. Not all + languages are equally well supported out of the box. You can create + your own text search config in PostgreSQL and then define it in this + map for your language. For example: + + ```conf + pg-config = { + english = "my-english" + german = "my-german" + } + ``` + + By default, the predefined configs are used for some languages and + otherwise fall back to `simple`. + + *If you change this setting, you must re-index everything.* +- `pg-query-parser`: the parser applied to the fulltext query. By + default it is `websearch_to_tsquery`. (relevant [doc + link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES)) +- `pg-rank-normalization`: this is used to tweak rank calculation that + affects the order of the elements returned from a query.
It is an + array of numbers out of `1`, `2`, `4`, `8`, `16` or `32`. (relevant + [doc + link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING)) + + + ## Bind The host and port the http server binds to. This applies to both diff --git a/website/site/content/docs/install/download_run.md b/website/site/content/docs/install/download_run.md index bab0ee6b..e87f8ea5 100644 --- a/website/site/content/docs/install/download_run.md +++ b/website/site/content/docs/install/download_run.md @@ -110,7 +110,7 @@ Fulltext search is powered by [SOLR](https://solr.apache.org). You need to install solr and create a core for docspell. Then cange the solr url for both components (restserver and joex) accordingly. See the relevant section in the [config -page](@/docs/configure/_index.md#full-text-search-solr). +page](@/docs/configure/_index.md#full-text-search). ### Watching a directory From 5de6c8940d92ffe50fadd1319ccd658d2852c892 Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 21 Mar 2022 14:41:39 +0100 Subject: [PATCH 7/8] Reorganize docs about configuration --- website/site/content/docs/api/intro.md | 4 +- website/site/content/docs/api/upload.md | 2 +- website/site/content/docs/configure/_index.md | 895 +----------------- .../content/docs/configure/admin-endpoint.md | 39 + .../content/docs/configure/authentication.md | 124 +++ .../site/content/docs/configure/baseurl.md | 38 + website/site/content/docs/configure/bind.md | 27 + .../site/content/docs/configure/database.md | 71 ++ .../content/docs/configure/file-backends.md | 147 +++ .../content/docs/configure/file-processing.md | 122 +++ .../content/docs/configure/fulltext-search.md | 176 ++++ website/site/content/docs/configure/main.md | 192 ++++ .../content/docs/configure/registration.md | 44 + website/site/content/docs/features/_index.md | 6 +- .../site/content/docs/install/download_run.md | 9 +- website/site/content/docs/install/prereq.md | 2 +- website/site/content/docs/install/rpi.md | 8 +- 
.../site/content/docs/joex/file-processing.md | 2 +- website/site/content/docs/joex/intro.md | 2 +- website/site/content/docs/tools/cli.md | 4 +- .../content/docs/webapp/itemcard-customize.md | 6 +- .../site/content/docs/webapp/scanmailbox.md | 6 +- website/site/content/docs/webapp/totp.md | 5 +- 23 files changed, 1014 insertions(+), 917 deletions(-) create mode 100644 website/site/content/docs/configure/admin-endpoint.md create mode 100644 website/site/content/docs/configure/authentication.md create mode 100644 website/site/content/docs/configure/baseurl.md create mode 100644 website/site/content/docs/configure/bind.md create mode 100644 website/site/content/docs/configure/database.md create mode 100644 website/site/content/docs/configure/file-backends.md create mode 100644 website/site/content/docs/configure/file-processing.md create mode 100644 website/site/content/docs/configure/fulltext-search.md create mode 100644 website/site/content/docs/configure/main.md create mode 100644 website/site/content/docs/configure/registration.md diff --git a/website/site/content/docs/api/intro.md b/website/site/content/docs/api/intro.md index 23dabea1..b4248dc1 100644 --- a/website/site/content/docs/api/intro.md +++ b/website/site/content/docs/api/intro.md @@ -59,7 +59,7 @@ via the header `Docspell-Share-Auth`. Docspell can be configured to be a relying party for OpenID Connect. Please see [the config -section](@/docs/configure/_index.md#openid-connect-oauth2) for +section](@/docs/configure/authentication.md#openid-connect-oauth2) for details. @@ -80,7 +80,7 @@ $ curl -XPOST -H "Docspell-Admin-Secret: test123" http://localhost:7880/api/v1/a ``` To enable these endpoints, you must provide a secret in the -[configuration](@/docs/configure/_index.md#admin-endpoint). +[configuration](@/docs/configure/admin-endpoint.md). 
## Live Api diff --git a/website/site/content/docs/api/upload.md b/website/site/content/docs/api/upload.md index 4ff8886f..254be71c 100644 --- a/website/site/content/docs/api/upload.md +++ b/website/site/content/docs/api/upload.md @@ -163,7 +163,7 @@ on the same host or network). The endpoint is disabled by default, an admin must change the `docspell.server.integration-endpoint.enabled` flag to `true` in the -[configuration file](@/docs/configure/_index.md#rest-server). +[configuration file](@/docs/configure/main.md#rest-server). If queried by a `GET` request, it returns whether it is enabled and the collective exists. diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md index 2fc11ed6..9ac65a93 100644 --- a/website/site/content/docs/configure/_index.md +++ b/website/site/content/docs/configure/_index.md @@ -3,896 +3,9 @@ title = "Configuration" insert_anchor_links = "right" description = "Describes the configuration file and shows all default settings." weight = 40 -template = "docs.html" +template = "pages.html" +sort_by = "weight" +redirect_to = "docs/configure/main" +++ -# Configuration - -Docspell's executables (restserver and joex) can take one argument – a -configuration file. If that is not given, the defaults are used, -overriden by environment variables. A config file overrides default -values, so only values that differ from the defaults are necessary. -The complete default options and their documentation is at the end of -this page. - -Besides the config file, another way is to provide individual settings -via key-value pairs to the executable by the `-D` option. For example -to override only `base-url` you could add the argument -`-Ddocspell.server.base-url=…` to the command. Multiple options are -possible. For more than few values this is very tedious, obviously, so -the recommended way is to maintain a config file. 
If these options -*and* a file is provded, then any setting given via the `-D…` option -overrides the same setting from the config file. - -At last, it is possible to configure docspell via environment -variables if there is no config file supplied (if a config file *is* -supplied, it is always preferred). Note that this approach is limited, -as arrays are not supported. A list of environment variables can be -found at the [end of this page](#environment-variables). The -environment variable name follows the corresponding config key - where -dots are replaced by underscores and dashes are replaced by two -underscores. For example, the config key `docspell.server.app-name` -can be defined as env variable `DOCSPELL_SERVER_APP__NAME`. - -It is also possible to specify environment variables inside a config -file (to get a mix of both) - please see the [documentation of the -config library](https://github.com/lightbend/config#standard-behavior) -for more on this. - -# File Format - -The format of the configuration files can be -[HOCON](https://github.com/lightbend/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation), -JSON or what this [config -library](https://github.com/lightbend/config) understands. The default -values below are in HOCON format, which is recommended, since it -allows comments and has some [advanced -features](https://github.com/lightbend/config#features-of-hocon). -Please also see their documentation for more details. - -A short description (please check the links for better understanding): -The config consists of key-value pairs and can be written in a -JSON-like format (called HOCON). Keys are organized in trees, and a -key defines a full path into the tree. There are two ways: - -``` -a.b.c.d=15 -``` - -or - -``` -a { - b { - c { - d = 15 - } - } -} -``` - -Both are exactly the same and these forms are both used at the same -time. Usually the braces approach is used to group some more settings, -for better readability. 
- -Strings that contain "not-so-common" characters should be enclosed in -quotes. It is possible to define values at the top of the file and -reuse them on different locations via the `${full.path.to.key}` -syntax. When using these variables, they *must not* be enclosed in -quotes. - - -# Important Config Options - -The configuration of both components uses separate namespaces. The -configuration for the REST server is below `docspell.server`, while -the one for joex is below `docspell.joex`. - -You can therefore use two separate config files or one single file -containing both namespaces. - -## JDBC - -This configures the connection to the database. This has to be -specified for the rest server and joex. By default, a H2 database in -the current `/tmp` directory is configured. - -The config looks like this (both components): - -``` bash -docspell.joex.jdbc { - url = ... - user = ... - password = ... -} - -docspell.server.backend.jdbc { - url = ... - user = ... - password = ... -} -``` - -The `url` is the connection to the database. It must start with -`jdbc`, followed by name of the database. The rest is specific to the -database used: it is either a path to a file for H2 or a host/database -url for MariaDB and PostgreSQL. - -When using H2, the user and password can be chosen freely on first -start, but must stay the same on subsequent starts. Usually, the user -is `sa` and the password is left empty. Additionally, the url must -include these options: - -``` -;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE -``` - -### Examples - -PostgreSQL: -``` -url = "jdbc:postgresql://localhost:5432/docspelldb" -``` - -MariaDB: -``` -url = "jdbc:mariadb://localhost:3306/docspelldb" -``` - -H2 -``` -url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE" -``` - -## Admin Endpoint - -The admin endpoint defines some [routes](@/docs/api/intro.md#admin) -for adminstration tasks. 
This is disabled by default and can be -enabled by providing a secret: - -``` bash -... - admin-endpoint { - secret = "123" - } -``` - -This secret must be provided to all requests to a `/api/v1/admin/` -endpoint. - -The most convenient way to execute admin tasks is to use the -[cli](@/docs/tools/cli.md). You get a list of possible admin commands -via `dsc admin help`. - -To see the output of the commands, there are these ways: - -1. looking at the joex logs, which gives most details. -2. Use the job-queue page when logged in as `docspell-system` -3. setup a [webhook](@/docs/webapp/notification.md) to be notified - when a job finishes. This way you get a small message. - -All admin tasks (and also some other system tasks) are run under the -account `docspell-system` (collective and user). You need to create -this account and setup the notification hooks in there - not in your -normal account. - - -## Full-Text Search - -Fulltext search is optional and provided by external systems. There -are currently [Apache SOLR](https://solr.apache.org) and [PostgreSQL's -text search](https://www.postgresql.org/docs/14/textsearch.html) -available. - -You can enable and configure the fulltext search backends as described -below and then choose the wanted backend: - -```conf -full-text-search { - enabled = true - # Which backend to use, either solr or postgresql - backend = "solr" - … -} -``` - -All docspell components must provide the same fulltext search -configuration. - -### SOLR - -[Apache SOLR](https://solr.apache.org) can be used to provide the -full-text search. This is defined in the `full-text-search.solr` -subsection: - -``` bash -... - full-text-search { - ... - solr = { - url = "http://localhost:8983/solr/docspell" - } - } -``` - -The default configuration at the end of this page contains more -information about each setting. - -The `solr.url` is the mandatory setting that you need to change to -point to your SOLR instance. 
Then you need to set the `enabled` flag -to `true`. - -When installing docspell manually, just install solr and create a core -as described in the [solr -documentation](https://solr.apache.org/guide/8_4/installing-solr.html). -That will provide you with the connection url (the last part is the -core name). If Docspell detects an empty core it will run a schema -setup on start automatically. - -The `full-text-search.solr` options are the same for joex and the -restserver. - -There is an [admin route](@/docs/api/intro.md#admin) that allows to -re-create the entire index (for all collectives). This is possible via -a call: - -``` bash -$ curl -XPOST -H "Docspell-Admin-Secret: test123" http://localhost:7880/api/v1/admin/fts/reIndexAll -``` - -or use the [cli](@/docs/tools/cli.md): - -```bash -dsc admin -a test123 recreate-index -``` - -Here the `test123` is the key defined with `admin-endpoint.secret`. If -it is empty (the default), this call is disabled (all admin routes). -Otherwise, the POST request will submit a system task that is executed -by a joex instance eventually. - -Using this endpoint, the entire index (including the schema) will be -re-created. This is sometimes necessary, for example if you upgrade -SOLR or delete the core to provide a new one (see -[here](https://solr.apache.org/guide/8_4/reindexing.html) for -details). Another way is to restart docspell (while clearing the -index). If docspell detects an empty index at startup, it will submit -a task to build the index automatically. - -Note that a collective can also re-index their data using a similiar -endpoint; but this is only deleting their data and doesn't do a full -re-index. - -The solr index doesn't contain any new information, it can be -regenerated any time using the above REST call. Thus it doesn't need -to be backed up. - -### PostgreSQL - -PostgreSQL provides many additional features, one of them is [text -search](https://www.postgresql.org/docs/14/textsearch.html). 
Docspell -can utilize this to provide the fulltext search feature. This is -especially useful, if PostgreSQL is used as the primary database for -docspell. - -You can choose to use the same database or separate connection. The -fulltext search will create a single table `ftspsql_search` that holds -all necessary data. When doing backups, you can exclude this table as -it can be recreated from the primary data any time. - -The configuration is placed inside `full-text-search`: - -```conf -full-text-search { - … - postgresql = { - use-default-connection = false - - jdbc { - url = "jdbc:postgresql://server:5432/db" - user = "pguser" - password = "" - } - - pg-config = { - } - pg-query-parser = "websearch_to_tsquery" - pg-rank-normalization = [ 4 ] - } -} -``` - -The flag `use-default-connection` can be set to `true` if you use -PostgreSQL as the primary db to have it also used for the fulltext -search. If set to `false`, the subsequent `jdbc` block defines the -connection to the postgres database to use. - -It follows some settings to tune PostgreSQL's text search feature. -Please visit [their -documentation](https://www.postgresql.org/docs/14/textsearch.html) for -all the details. - -- `pg-config`: this is an optional mapping from document languages as - used in Docspell to a PostgreSQL text search configuration. Not all - languages are equally well supported out of the box. You can create - your own text search config in PostgreSQL and then define it in this - map for your language. For example: - - ```conf - pg-config = { - english = "my-english" - german = "my-german" - } - ``` - - By default, the predefined configs are used for some lanugages and - otherwise fallback to `simple`. - - *If you change this setting, you must re-index everything.* -- `pg-query-parser`: the parser applied to the fulltext query. By - default it is `websearch_to_tsquery`. 
(relevant [doc - link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES)) -- `pg-rank-normalization`: this is used to tweak rank calculation that - affects the order of the elements returned from a query. It is an - array of numbers out of `1`, `2`, `4`, `8`, `16` or `32`. (relevant - [doc - link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING)) - - - -## Bind - -The host and port the http server binds to. This applies to both -components. The joex component also exposes a small REST api to -inspect its state and notify the scheduler. - -``` bash -docspell.server.bind { - address = localhost - port = 7880 -} -docspell.joex.bind { - address = localhost - port = 7878 -} -``` - -By default, it binds to `localhost` and some predefined port. This -must be changed, if components are on different machines. - -## Baseurl - -The base url is an important setting that defines the http URL where -the corresponding component can be reached. It applies to both -components. For a joex component, the url must be resolvable from a -REST server component. The REST server also uses this url to create -absolute urls and to configure the authenication cookie. - -By default it is build using the information from the `bind` setting, -which is `http://localhost:7880`. - -If the default is not changed, docspell will use the request to -determine the base-url. It first inspects the `X-Forwarded-For` header -that is often used with reverse proxies. If that is not present, the -`Host` header of the request is used. However, if the `base-url` -setting is changed, then only this setting is used. - -``` -docspell.server.base-url = ... -docspell.joex.base-url = ... -``` - -If you are unsure, leave it at its default. - -### Examples - -``` -docspell.server.baseurl = "https://docspell.example.com" -docspell.joex.baseurl = "http://192.168.101.10" -``` - - -## App-id - -The `app-id` is the identifier of the corresponding instance. 
It *must -be unique* for all instances. By default the REST server uses `rest1` -and joex `joex1`. It is recommended to overwrite this setting to have -an explicit and stable identifier should multiple instances be -intended. - -``` bash -docspell.server.app-id = "rest1" -docspell.joex.app-id = "joex1" -``` - -## Registration Options - -This defines if and how new users can create accounts. There are 3 -options: - -- *closed* no new user can sign up -- *open* new users can sign up -- *invite* new users can sign up but require an invitation key - -This applies only to the REST server component. - -``` bash -docspell.server.backend.signup { - mode = "open" - - # If mode == 'invite', a password must be provided to generate - # invitation keys. It must not be empty. - new-invite-password = "" - - # If mode == 'invite', this is the period an invitation token is - # considered valid. - invite-time = "3 days" -} -``` - -The mode `invite` is intended to open the application only to some -users. The admin can create these invitation keys and distribute them -to the desired people. For this, the `new-invite-password` must be -given. The idea is that only the person who installs docspell knows -this. If it is not set, then invitation won't work. New invitation -keys can be generated from within the web application or via REST -calls (using `curl`, for example). - -``` bash -curl -X POST -d '{"password":"blabla"}' "http://localhost:7880/api/v1/open/signup/newinvite" -``` - -## Authentication - -Authentication works in two ways: - -- with an account-name / password pair -- with an authentication token - -The initial authentication must occur with an accountname/password -pair. This will generate an authentication token which is valid for -some time. Subsequent calls to secured routes can use this token. The -token can be given as a normal http header or via a cookie header. - -These settings apply only to the REST server.
- -``` bash -docspell.server.auth { - server-secret = "hex:caffee" # or "b64:Y2FmZmVlCg==" - session-valid = "5 minutes" -} -``` - -The `server-secret` is used to sign the token. If multiple REST -servers are deployed, all must share the same server secret. Otherwise -tokens from one instance are not valid on another instance. The secret -can be given as Base64 encoded string or in hex form. Use the prefix -`hex:` and `b64:`, respectively. If no prefix is given, the UTF8 bytes -of the string are used. - -The `session-valid` determines how long a token is valid. This can be -just some minutes, the web application obtains new ones -periodically. So a rather short time is recommended. - -## OpenID Connect / OAuth2 - -You can integrate Docspell into your SSO solution via [OpenID -Connect](https://openid.net/connect/) (OIDC). This requires to set up -an OpenID Provider (OP) somewhere and to configure Docspell -accordingly to act as the relying party. - -You can define multiple OPs to use. For some examples, please see the -default configuration file [below](#rest-server). - -The configuration of a provider highly depends on how it is setup. -Here is an example for a setup using -[keycloak](https://www.keycloak.org): - -``` conf -provider = { - provider-id = "keycloak", - client-id = "docspell", - client-secret = "example-secret-439e-bf06-911e4cdd56a6", - scope = "profile", # scope is required for OIDC - authorize-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/auth", - token-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/token", - #User URL is not used when signature key is set. - #user-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/userinfo", - sign-key = "b64:MII…ZYL09vAwLn8EAcSkCAwEAAQ==", - sig-algo = "RS512" -} -``` - -The `provider-id` is some identifier that is used in the URL to -distinguish between possibly multiple providers. 
The `client-id` and -`client-secret` define the two parameters required for a "confidential -client". The different URLs are best explained at the [keycloak -docs](https://www.keycloak.org/docs/latest/server_admin/). -They are available for all OPs in some way. The `user-url` is not -required, if the access token is already containing the necessary -data. If not, then docspell performs another request to the -`user-url`, which must be the user-info endpoint, to obtain the -required user data. - -If the data is taken from the token directly and not via a request to -the user-info endpoint, then the token must be validated using the -given `sign-key` and `sig-algo`. These two values are then required to -specify! However, if the user-info endpoint should be used, then leave -the `sign-key` empty and specify the correct url in `user-url`. When -specifying the `sign-key` use a prefix of `b64:` if it is Base64 -encoded or `hex:` if it is hex encoded. Otherwise the unicode bytes -are used, which is most probably not wanted for this setting. - -Once the user is authenticated, docspell tries to setup an account and -does some checks. For this it must get to the username and collective -name somehow. How it does this, can be specified by the `user-key` and -`collective-key` settings: - -``` conf -# The collective of the user is given in the access token as -# property `docspell_collective`. -collective-key = "lookup:docspell_collective", -# The username to use for the docspell account -user-key = "preferred_username" -``` - -The `user-key` is some string that is used to search the JSON response -from the OP for an object with that key. The search happens -recursively, so the field can be in a nested object. The found value -is used as the user name. Keycloak transmits the `preferred_username` -when asking for the `profile` scope. This can be used as the user -name. - -The collective name can be obtained by different ways. 
For example, -you can instruct your OP (like keycloak) to provide a collective name -in the token and/or user-info responses. If you do this, then use the -`lookup:` prefix as in the example above. This instructs docspell to -search for a value the same way as the `user-key`. You can also set a -fixed collective, using `fixed:` prefix; in this case all users are in -the same collective! A third option is to prefix it with `account:` - -then the value that is looked up is interpreted as the full account -name, like `collective/user` and the `user-key` setting is ignored. If -you want to put each user in its own collective, you can just use the -same value as in `user-key`, only prefixed with `lookup:`. In the -example it would be `lookup:preferred_username`. - -If you find that these methods do not suffice for your case, please -open an issue. - - -## File Backends - -Docspell allows to choose from different storage backends for binary -files. You can choose between: - -1. *Database (the recommended default)* - - The database can be used to store the files as well. It is the - default. It doesn't require any other configuration and works well - with multiple instances of restservers and joex nodes. -2. *S3* - - The S3 backend allows to store files in an S3 compatible storage. - It was tested with MinIO, which is possible to self host. - -3. *Filesystem* - - The filesystem can also be used directly, by specifying a - directory. Be aware that _all_ nodes must have read and write - access into this directory! When running multiple nodes over a - network, consider using one of the above instead. Docspell uses a - fixed structure for storing the files below the given directory, it - cannot be configured. - -When using S3 or filesystem, remember to backup the database *and* the -files! - -Note that Docspell not only stores the file that are uploaded, but -also some other files for internal use. 
- -### Configuring - -{% warningbubble(title="Note") %} - -Each node must have the same config for its file backend! When using -the filesystem, make sure all processes can access the directory with -read and write permissions. - -{% end %} - -The file storage backend can be configured inside the `files` section -(see the default configs below): - -```conf -files { - … - default-store = "database" - - stores = { - database = - { enabled = true - type = "default-database" - } - - filesystem = - { enabled = false - type = "file-system" - directory = "/some/directory" - } - - minio = - { enabled = false - type = "s3" - endpoint = "http://localhost:9000" - access-key = "username" - secret-key = "password" - bucket = "docspell" - } - } -} -``` - -The `stores` object defines a set of stores and the `default-store` -selects the one that should be used. All disabled store configurations -are removed from the list. Thus the `default-store` must be enabled. -Other enabled stores can be used as the target when copying files (see -below). - -A store configuration requires a `enabled` and `type` property. -Depending on the `type` property, other properties are required, they -are presented above. The available storage types are -`default-database`, `file-system` and `s3`. - -If you use the docker setup, you can find the corresponding -environment variables to the above config snippet -[below](#environment-variables). - -### Change Backends - -It is possible to change backends with a bit of manual effort. When -doing this, please make sure that the application is not used. It is -important that no file is uploaded during the following steps. - -The [cli](@/docs/tools/cli.md) will be used, please set it up first -and you need to enable the [admin endpoint](#admin-endpoint). Config -changes mentioned here must be applied to all nodes - joex and -restserver! - -1. In the config, enable a second file backend (besides the default) - you want to change to and start docspell as normal. 
Don't change - `default-store` yet. -2. Run the file integrity check in order to see whether all files are - ok as they are in the current store. This can be done using the - [cli](@/docs/tools/cli.md) by running: - - ```bash - dsc admin file-integrity-check - ``` -3. Run the copy files admin command which will copy all files from the - current `default-store` to all other enabled stores. - - ```bash - dsc admin clone-file-repository - ``` - - And wait until it's done :-). You can see the progress in the jobs - page when logged in as `docspell-system` or just look at the logs. -4. In the config, change the `default-store` to the one you just - copied all the files to and restart docspell. -5. Login and do some smoke tests. Then run the file integrity check - again: - - ```bash - dsc admin file-integrity-check - ``` - -If all is fine, then you are done and are now using the new file -backend. If the second integrity check fails, please open an issue. -You need then to revert the config change of step 4 to use the -previous `default-store` again. - -If you want to delete the files from the database, you can do so by -running the following SQL against the database: - -```sql -DELETE FROM filechunk -``` - -You can copy them back into the database using the steps above. - - -## File Processing - -Files are being processed by the joex component. So all the respective -configuration is in this config only. - -File processing involves several stages, detailed information can be -found [here](@/docs/joex/file-processing.md#text-analysis) and in the -corresponding sections in [joex default config](#joex). - -Configuration allows to define the external tools and set some -limitations to control memory usage. The sections are: - -- `docspell.joex.extraction` -- `docspell.joex.text-analysis` -- `docspell.joex.convert` - -Options to external commands can use variables that are replaced by -values at runtime. Variables are enclosed in double braces `{{…}}`. 
-Please see the default configuration for what variables exist per -command. - -### Classification - -In `text-analysis.classification` you can define how many documents at -most should be used for learning. The default settings should work -well for most cases. However, it always depends on the amount of data -and the machine that runs joex. For example, by default the documents -to learn from are limited to 600 (`classification.item-count`) and -every text is cut after 5000 characters (`text-analysis.max-length`). -This is fine if *most* of your documents are small and only a few are -near 5000 characters. But if *all* your documents are very large, you -probably need to either assign more heap memory or go down with the -limits. - -Classification can be disabled, too, for when it's not needed. - -### NLP - -This setting defines which NLP mode to use. It defaults to `full`, -which requires more memory for certain languages (with the advantage -of better results). Other values are `basic`, `regexonly` and -`disabled`. The modes `full` and `basic` use pre-defined language -models for processing documents of the languages German, English, French and -Spanish. These require some amount of memory (see below). - -The mode `basic` is like the "light" variant to `full`. It doesn't use -all NLP features, which makes memory consumption much lower, but comes -with the compromise of less accurate results. - -The mode `regexonly` doesn't use pre-defined language models, even if -available. It checks your address book against a document to find -metadata. That means, it is language independent. Also, when using -`full` or `basic` with languages where no pre-defined models exist, it -will degrade to `regexonly` for these. - -The mode `disabled` skips NLP processing completely. This has the least -impact on memory consumption, obviously, but then only the classifier -is used to find metadata (unless it is disabled, too).
- -You might want to try different modes and see what combination suits -best your usage pattern and machine running joex. If a powerful -machine is used, simply leave the defaults. When running on an -raspberry pi, for example, you might need to adjust things. - -### Memory Usage - -The memory requirements for the joex component depends on the document -language and the enabled features for text-analysis. The `nlp.mode` -setting has significant impact, especially when your documents are in -German. Here are some rough numbers on jvm heap usage (the same file -was used for all tries): - - - - - - - - - - - - - - - - -
nlp.modeEnglishGermanFrench
full420M950M490M
basic170M380M390M
- -Note that these are only rough numbers and they show the maximum used -heap memory while processing a file. - -When using `mode=full`, a heap setting of at least `-Xmx1400M` is -recommended. For `mode=basic` a heap setting of at least `-Xmx500M` is -recommended. - -Other languages can't use these two modes, and so don't require this -amount of memory (but don't have as good results). Then you can go -with less heap. For these languages, the nlp mode is the same as -`regexonly`. - -Training the classifier is also memory intensive, which solely depends -on the size and number of documents that are being trained. However, -training the classifier is done periodically and can happen maybe -every two weeks. When classifying new documents, memory requirements -are lower, since the model already exists. - -More details about these modes can be found -[here](@/docs/joex/file-processing.md#text-analysis). - - -The restserver component is very lightweight, here you can use -defaults. - - -# JVM Options - -The start scripts support some options to configure the JVM. One often -used setting is the maximum heap size of the JVM. By default, java -determines it based on properties of the current machine. You can -specify it by given java startup options to the command: - -``` -$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -- /path/to/server-config.conf -``` - -This would limit the maximum heap to 1GB. The double slash separates -internal options and the arguments to the program. Another frequently -used option is to change the default temp directory. Usually it is -`/tmp`, but it may be desired to have a dedicated temp directory, -which can be configured: - -``` -$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -Djava.io.tmpdir=/path/to/othertemp -- /path/to/server-config.conf -``` - -The command: - -``` -$ ./docspell-restserver*/bin/docspell-restserver -h -``` - -gives an overview of supported options. 
- -It is recommended to run joex with the G1GC enabled. If you use java8, -you need to add an option to use G1GC (`-XX:+UseG1GC`), for java11 -this is not necessary (but doesn't hurt either). This could look like -this: - -``` -./docspell-joex-{{version()}}/bin/docspell-joex -J-Xmx1596M -J-XX:+UseG1GC -- /path/to/joex.conf -``` - -Using these options you can define how much memory the JVM process is -able to use. This might be necessary to adopt depending on the usage -scenario and configured text analysis features. - -Please have a look at the corresponding [section](@/docs/configure/_index.md#memory-usage). - - - -# Logging - -By default, docspell logs to stdout. This works well, when managed by -systemd or other inits. Logging can be configured in the configuration -file or via environment variables. There are only two settings: - -- `minimum-level` specifies the log level to control the verbosity. - Levels are ordered from: *Trace*, *Debug*, *Info*, *Warn* and - *Error* -- `format` this defines how the logs are formatted. There are two - formats for humans: *Plain* and *Fancy*. And two more suited for - machine consumption: *Json* and *Logfmt*. The *Json* format contains - all details, while the others may omit some for readability - -These settings are the same for joex and the restserver component. - -# Default Config -## Rest Server - -{{ incl_conf(path="templates/shortcodes/server.conf") }} - - -## Joex - - -{{ incl_conf(path="templates/shortcodes/joex.conf") }} - -## Environment Variables - -Environment variables can be used when there is no config file -supplied. The listing below shows all possible variables and their -default values. - -{{ incl_conf(path="templates/shortcodes/config.env.txt") }} +No content here. 
diff --git a/website/site/content/docs/configure/admin-endpoint.md b/website/site/content/docs/configure/admin-endpoint.md new file mode 100644 index 00000000..f56229e2 --- /dev/null +++ b/website/site/content/docs/configure/admin-endpoint.md @@ -0,0 +1,39 @@ ++++ +title = "Admin Endpoint" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." +weight = 60 +template = "docs.html" ++++ + +# Admin Endpoint + +The admin endpoint defines some [routes](@/docs/api/intro.md#admin) +for administration tasks. This is disabled by default and can be +enabled by providing a secret: + +``` bash +... + admin-endpoint { + secret = "123" + } +``` + +This secret must be provided to all requests to a `/api/v1/admin/` +endpoint. + +The most convenient way to execute admin tasks is to use the +[cli](@/docs/tools/cli.md). You get a list of possible admin commands +via `dsc admin help`. + +To see the output of the commands, there are these ways: + +1. looking at the joex logs, which gives most details. +2. Use the job-queue page when logged in as `docspell-system` +3. setup a [webhook](@/docs/webapp/notification.md) to be notified + when a job finishes. This way you get a small message. + +All admin tasks (and also some other system tasks) are run under the +account `docspell-system` (collective and user). You need to create +this account and setup the notification hooks in there - not in your +normal account. diff --git a/website/site/content/docs/configure/authentication.md b/website/site/content/docs/configure/authentication.md new file mode 100644 index 00000000..9d3e559e --- /dev/null +++ b/website/site/content/docs/configure/authentication.md @@ -0,0 +1,124 @@ ++++ +title = "Authentication" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings."
+weight = 70 +template = "docs.html" ++++ + +## Authentication + +Authentication works in two ways: + +- with an account-name / password pair +- with an authentication token + +The initial authentication must occur with an accountname/password +pair. This will generate an authentication token which is valid for a +some time. Subsequent calls to secured routes can use this token. The +token can be given as a normal http header or via a cookie header. + +These settings apply only to the REST server. + +``` bash +docspell.server.auth { + server-secret = "hex:caffee" # or "b64:Y2FmZmVlCg==" + session-valid = "5 minutes" +} +``` + +The `server-secret` is used to sign the token. If multiple REST +servers are deployed, all must share the same server secret. Otherwise +tokens from one instance are not valid on another instance. The secret +can be given as Base64 encoded string or in hex form. Use the prefix +`hex:` and `b64:`, respectively. If no prefix is given, the UTF8 bytes +of the string are used. + +The `session-valid` determines how long a token is valid. This can be +just some minutes, the web application obtains new ones +periodically. So a rather short time is recommended. + +## OpenID Connect / OAuth2 + +You can integrate Docspell into your SSO solution via [OpenID +Connect](https://openid.net/connect/) (OIDC). This requires to set up +an OpenID Provider (OP) somewhere and to configure Docspell +accordingly to act as the relying party. + +You can define multiple OPs to use. For some examples, please see the +[default configuration](@/docs/configure/main.md#default-config). + +The configuration of a provider highly depends on how it is setup. 
+Here is an example for a setup using +[keycloak](https://www.keycloak.org): + +``` conf +provider = { + provider-id = "keycloak", + client-id = "docspell", + client-secret = "example-secret-439e-bf06-911e4cdd56a6", + scope = "profile", # scope is required for OIDC + authorize-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/auth", + token-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/token", + #User URL is not used when signature key is set. + #user-url = "http://localhost:8080/auth/realms/home/protocol/openid-connect/userinfo", + sign-key = "b64:MII…ZYL09vAwLn8EAcSkCAwEAAQ==", + sig-algo = "RS512" +} +``` + +The `provider-id` is some identifier that is used in the URL to +distinguish between possibly multiple providers. The `client-id` and +`client-secret` define the two parameters required for a "confidential +client". The different URLs are best explained at the [keycloak +docs](https://www.keycloak.org/docs/latest/server_admin/). +They are available for all OPs in some way. The `user-url` is not +required, if the access token is already containing the necessary +data. If not, then docspell performs another request to the +`user-url`, which must be the user-info endpoint, to obtain the +required user data. + +If the data is taken from the token directly and not via a request to +the user-info endpoint, then the token must be validated using the +given `sign-key` and `sig-algo`. These two values are then required to +specify! However, if the user-info endpoint should be used, then leave +the `sign-key` empty and specify the correct url in `user-url`. When +specifying the `sign-key` use a prefix of `b64:` if it is Base64 +encoded or `hex:` if it is hex encoded. Otherwise the unicode bytes +are used, which is most probably not wanted for this setting. + +Once the user is authenticated, docspell tries to setup an account and +does some checks. For this it must get to the username and collective +name somehow. 
How it does this, can be specified by the `user-key` and +`collective-key` settings: + +``` conf +# The collective of the user is given in the access token as +# property `docspell_collective`. +collective-key = "lookup:docspell_collective", +# The username to use for the docspell account +user-key = "preferred_username" +``` + +The `user-key` is some string that is used to search the JSON response +from the OP for an object with that key. The search happens +recursively, so the field can be in a nested object. The found value +is used as the user name. Keycloak transmits the `preferred_username` +when asking for the `profile` scope. This can be used as the user +name. + +The collective name can be obtained by different ways. For example, +you can instruct your OP (like keycloak) to provide a collective name +in the token and/or user-info responses. If you do this, then use the +`lookup:` prefix as in the example above. This instructs docspell to +search for a value the same way as the `user-key`. You can also set a +fixed collective, using `fixed:` prefix; in this case all users are in +the same collective! A third option is to prefix it with `account:` - +then the value that is looked up is interpreted as the full account +name, like `collective/user` and the `user-key` setting is ignored. If +you want to put each user in its own collective, you can just use the +same value as in `user-key`, only prefixed with `lookup:`. In the +example it would be `lookup:preferred_username`. + +If you find that these methods do not suffice for your case, please +open an issue. diff --git a/website/site/content/docs/configure/baseurl.md b/website/site/content/docs/configure/baseurl.md new file mode 100644 index 00000000..c355960d --- /dev/null +++ b/website/site/content/docs/configure/baseurl.md @@ -0,0 +1,38 @@ ++++ +title = "Base URL" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." 
+weight = 90 +template = "docs.html" ++++ + +## Baseurl + +The base url is an important setting that defines the http URL where +the corresponding component can be reached. It applies to both +components. For a joex component, the url must be resolvable from a +REST server component. The REST server also uses this url to create +absolute urls and to configure the authentication cookie. + +By default it is built using the information from the `bind` setting, +which is `http://localhost:7880`. + +If the default is not changed, docspell will use the request to +determine the base-url. It first inspects the `X-Forwarded-For` header +that is often used with reverse proxies. If that is not present, the +`Host` header of the request is used. However, if the `base-url` +setting is changed, then only this setting is used. + +``` +docspell.server.base-url = ... +docspell.joex.base-url = ... +``` + +If you are unsure, leave it at its default. + +### Examples + +``` +docspell.server.base-url = "https://docspell.example.com" +docspell.joex.base-url = "http://192.168.101.10" +``` diff --git a/website/site/content/docs/configure/bind.md b/website/site/content/docs/configure/bind.md new file mode 100644 index 00000000..cabe2115 --- /dev/null +++ b/website/site/content/docs/configure/bind.md @@ -0,0 +1,27 @@ ++++ +title = "Bind" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." +weight = 12 +template = "docs.html" ++++ + +## Bind + +The host and port the http server binds to. This applies to both +components. The joex component also exposes a small REST api to +inspect its state and notify the scheduler. + +``` bash +docspell.server.bind { + address = localhost + port = 7880 +} +docspell.joex.bind { + address = localhost + port = 7878 +} +``` + +By default, it binds to `localhost` and some predefined port. This +must be changed, if components are on different machines.
diff --git a/website/site/content/docs/configure/database.md b/website/site/content/docs/configure/database.md new file mode 100644 index 00000000..37122f03 --- /dev/null +++ b/website/site/content/docs/configure/database.md @@ -0,0 +1,71 @@ ++++ +title = "Database" +insert_anchor_links = "right" +description = "Details about configuring the database." +weight = 20 +template = "docs.html" ++++ + + +# Database + +The database holds by default all the data and must be configured +exactly the same on all nodes. + +The following are supported DBs: + +- PostgreSQL (recommended) +- MariaDB +- H2 + +This has to be specified for the rest server and joex. By default, a +H2 database in the current `/tmp` directory is configured. + +## Options + +The config looks like this (both components): + +``` bash +docspell.joex.jdbc { + url = ... + user = ... + password = ... +} + +docspell.server.backend.jdbc { + url = ... + user = ... + password = ... +} +``` + +The `url` is the connection to the database. It must start with +`jdbc`, followed by name of the database. The rest is specific to the +database used: it is either a path to a file for H2 or a host/database +url for MariaDB and PostgreSQL. + +When using H2, the user and password can be chosen freely on first +start, but must stay the same on subsequent starts. Usually, the user +is `sa` and the password is left empty. 
Additionally, the url must +include these options: + +``` +;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE +``` + +## Examples + +PostgreSQL: +``` +url = "jdbc:postgresql://localhost:5432/docspelldb" +``` + +MariaDB: +``` +url = "jdbc:mariadb://localhost:3306/docspelldb" +``` + +H2 +``` +url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE" +``` diff --git a/website/site/content/docs/configure/file-backends.md b/website/site/content/docs/configure/file-backends.md new file mode 100644 index 00000000..caa37492 --- /dev/null +++ b/website/site/content/docs/configure/file-backends.md @@ -0,0 +1,147 @@ ++++ +title = "File Backends" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." +weight = 30 +template = "docs.html" ++++ + +## File Backends + +Docspell allows to choose from different storage backends for binary +files. You can choose between: + +1. *Database (the recommended default)* + + The database can be used to store the files as well. It is the + default. It doesn't require any other configuration and works well + with multiple instances of restservers and joex nodes. +2. *S3* + + The S3 backend allows to store files in an S3 compatible storage. + It was tested with MinIO, which is possible to self host. + +3. *Filesystem* + + The filesystem can also be used directly, by specifying a + directory. Be aware that _all_ nodes must have read and write + access into this directory! When running multiple nodes over a + network, consider using one of the above instead. Docspell uses a + fixed structure for storing the files below the given directory, it + cannot be configured. + +When using S3 or filesystem, remember to backup the database *and* the +files! + +Note that Docspell not only stores the file that are uploaded, but +also some other files for internal use. 
+ +### Configuring + +{% warningbubble(title="Note") %} + +Each node must have the same config for its file backend! When using +the filesystem, make sure all processes can access the directory with +read and write permissions. + +{% end %} + +The file storage backend can be configured inside the `files` section +(see the [default configs](@/docs/configure/main.md#default-config)): + +```conf +files { + … + default-store = "database" + + stores = { + database = + { enabled = true + type = "default-database" + } + + filesystem = + { enabled = false + type = "file-system" + directory = "/some/directory" + } + + minio = + { enabled = false + type = "s3" + endpoint = "http://localhost:9000" + access-key = "username" + secret-key = "password" + bucket = "docspell" + } + } +} +``` + +The `stores` object defines a set of stores and the `default-store` +selects the one that should be used. All disabled store configurations +are removed from the list. Thus the `default-store` must be enabled. +Other enabled stores can be used as the target when copying files (see +below). + +A store configuration requires a `enabled` and `type` property. +Depending on the `type` property, other properties are required, they +are presented above. The available storage types are +`default-database`, `file-system` and `s3`. + +If you use the docker setup, you can find the corresponding +environment variables to the above config snippet +[below](#environment-variables). + +### Change Backends + +It is possible to change backends with a bit of manual effort. When +doing this, please make sure that the application is not used. It is +important that no file is uploaded during the following steps. + +The [cli](@/docs/tools/cli.md) will be used, please set it up first +and you need to enable the [admin endpoint](#admin-endpoint). Config +changes mentioned here must be applied to all nodes - joex and +restserver! + +1. 
In the config, enable a second file backend (besides the default) + you want to change to and start docspell as normal. Don't change + `default-store` yet. +2. Run the file integrity check in order to see whether all files are + ok as they are in the current store. This can be done using the + [cli](@/docs/tools/cli.md) by running: + + ```bash + dsc admin file-integrity-check + ``` +3. Run the copy files admin command which will copy all files from the + current `default-store` to all other enabled stores. + + ```bash + dsc admin clone-file-repository + ``` + + And wait until it's done :-). You can see the progress in the jobs + page when logged in as `docspell-system` or just look at the logs. +4. In the config, change the `default-store` to the one you just + copied all the files to and restart docspell. +5. Login and do some smoke tests. Then run the file integrity check + again: + + ```bash + dsc admin file-integrity-check + ``` + +If all is fine, then you are done and are now using the new file +backend. If the second integrity check fails, please open an issue. +You need then to revert the config change of step 4 to use the +previous `default-store` again. + +If you want to delete the files from the database, you can do so by +running the following SQL against the database: + +```sql +DELETE FROM filechunk +``` + +You can copy them back into the database using the steps above. diff --git a/website/site/content/docs/configure/file-processing.md b/website/site/content/docs/configure/file-processing.md new file mode 100644 index 00000000..d30db8ce --- /dev/null +++ b/website/site/content/docs/configure/file-processing.md @@ -0,0 +1,122 @@ ++++ +title = "File Processing" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." +weight = 40 +template = "docs.html" ++++ + +## File Processing + +Files are being processed by the joex component. So all the respective +configuration is in this config only. 
+ +File processing involves several stages, detailed information can be +found [here](@/docs/joex/file-processing.md#text-analysis) and in the +corresponding sections in [joex default +config](@/docs/configure/main.md#joex). + +Configuration allows to define the external tools and set some +limitations to control memory usage. The sections are: + +- `docspell.joex.extraction` +- `docspell.joex.text-analysis` +- `docspell.joex.convert` + +Options to external commands can use variables that are replaced by +values at runtime. Variables are enclosed in double braces `{{…}}`. +Please see the default configuration for what variables exist per +command. + +### Classification + +In `text-analysis.classification` you can define how many documents at +most should be used for learning. The default settings should work +well for most cases. However, it always depends on the amount of data +and the machine that runs joex. For example, by default the documents +to learn from are limited to 600 (`classification.item-count`) and +every text is cut after 5000 characters (`text-analysis.max-length`). +This is fine if *most* of your documents are small and only a few are +near 5000 characters. But if *all* your documents are very large, you +probably need to either assign more heap memory or go down with the +limits. + +Classification can be disabled, too, for when it's not needed. + +### NLP + +This setting defines which NLP mode to use. It defaults to `full`, +which requires more memory for certain languages (with the advantage +of better results). Other values are `basic`, `regexonly` and +`disabled`. The modes `full` and `basic` use pre-defined language +models for processing documents in German, English, French and +Spanish. These require some amount of memory (see below). + +The mode `basic` is like the "light" variant to `full`. It doesn't use +all NLP features, which makes memory consumption much lower, but comes +with the compromise of less accurate results.
+ +The mode `regexonly` doesn't use pre-defined language models, even if +available. It checks your address book against a document to find +metadata. That means, it is language independent. Also, when using +`full` or `basic` with languages where no pre-defined models exist, it +will degrade to `regexonly` for these. + +The mode `disabled` skips NLP processing completely. This has least +impact in memory consumption, obviously, but then only the classifier +is used to find metadata (unless it is disabled, too). + +You might want to try different modes and see what combination suits +best your usage pattern and machine running joex. If a powerful +machine is used, simply leave the defaults. When running on a +Raspberry Pi, for example, you might need to adjust things. + +### Memory Usage + +The memory requirements for the joex component depend on the document +language and the enabled features for text-analysis. The `nlp.mode` +setting has significant impact, especially when your documents are in +German. Here are some rough numbers on jvm heap usage (the same file +was used for all tries): + + + + + + + + + + + + + + + + +
nlp.modeEnglishGermanFrench
full420M950M490M
basic170M380M390M
+ +Note that these are only rough numbers and they show the maximum used +heap memory while processing a file. + +When using `mode=full`, a heap setting of at least `-Xmx1400M` is +recommended. For `mode=basic` a heap setting of at least `-Xmx500M` is +recommended. + +Other languages can't use these two modes, and so don't require this +amount of memory (but don't have as good results). Then you can go +with less heap. For these languages, the nlp mode is the same as +`regexonly`. + +Training the classifier is also memory intensive, which solely depends +on the size and number of documents that are being trained. However, +training the classifier is done periodically and can happen maybe +every two weeks. When classifying new documents, memory requirements +are lower, since the model already exists. + +More details about these modes can be found +[here](@/docs/joex/file-processing.md#text-analysis). + + +The restserver component is very lightweight, here you can use +defaults. diff --git a/website/site/content/docs/configure/fulltext-search.md b/website/site/content/docs/configure/fulltext-search.md new file mode 100644 index 00000000..f67dfc74 --- /dev/null +++ b/website/site/content/docs/configure/fulltext-search.md @@ -0,0 +1,176 @@ ++++ +title = "Full-Text Search" +insert_anchor_links = "right" +description = "Details about configuring the fulltext search." +weight = 50 +template = "docs.html" ++++ + + +# Full-Text Search + +Fulltext search is optional and provided by external systems. There +are currently [Apache SOLR](https://solr.apache.org) and [PostgreSQL's +text search](https://www.postgresql.org/docs/14/textsearch.html) +available. + +You can enable and configure the fulltext search backends as described +below and then choose the backend: + +```conf +full-text-search { + enabled = true + # Which backend to use, either solr or postgresql + backend = "solr" + … +} +``` + +All docspell components must provide the same fulltext search +configuration. 
+ + +## SOLR + +[Apache SOLR](https://solr.apache.org) can be used to provide the +full-text search. This is defined in the `full-text-search.solr` +subsection: + +``` bash +... + full-text-search { + ... + solr = { + url = "http://localhost:8983/solr/docspell" + } + } +``` + +The default configuration at the end of this page contains more +information about each setting. + +The `solr.url` is the mandatory setting that you need to change to +point to your SOLR instance. Then you need to set the `enabled` flag +to `true`. + +When installing docspell manually, just install solr and create a core +as described in the [solr +documentation](https://solr.apache.org/guide/8_4/installing-solr.html). +That will provide you with the connection url (the last part is the +core name). If Docspell detects an empty core it will run a schema +setup on start automatically. + +The `full-text-search.solr` options are the same for joex and the +restserver. + +Sometimes it is necessary to re-create the entire index, for example +if you upgrade SOLR or delete the core to provide a new one (see +[here](https://solr.apache.org/guide/8_4/reindexing.html) for +details). Another way is to restart docspell (while clearing the +index). If docspell detects an empty index at startup, it will submit +a task to build the index automatically. + +Note that a collective can also re-index their data using a similar +endpoint; but this is only deleting their data and doesn't do a full +re-index. + +The solr index doesn't contain any new information, it can be +regenerated any time using the REST call shown below. Thus it doesn't need +to be backed up. + + +## PostgreSQL + +PostgreSQL provides many additional features, one of them is [text +search](https://www.postgresql.org/docs/14/textsearch.html). Docspell +can utilize this to provide the fulltext search feature. This is +especially useful, if PostgreSQL is used as the primary database for +docspell.
+ +You can choose to use the same database or a separate connection. The +fulltext search will create a single table `ftspsql_search` that holds +all necessary data. When doing backups, you can exclude this table as +it can be recreated from the primary data any time. + +The configuration is placed inside `full-text-search`: + +```conf +full-text-search { + … + postgresql = { + use-default-connection = false + + jdbc { + url = "jdbc:postgresql://server:5432/db" + user = "pguser" + password = "" + } + + pg-config = { + } + pg-query-parser = "websearch_to_tsquery" + pg-rank-normalization = [ 4 ] + } +} +``` + +The flag `use-default-connection` can be set to `true` if you use +PostgreSQL as the primary db to have it also used for the fulltext +search. If set to `false`, the subsequent `jdbc` block defines the +connection to the postgres database to use. + +What follows are some settings to tune PostgreSQL's text search feature. +Please visit [their +documentation](https://www.postgresql.org/docs/14/textsearch.html) for +all the details. + +- `pg-config`: this is an optional mapping from document languages as + used in Docspell to a PostgreSQL text search configuration. Not all + languages are equally well supported out of the box. You can create + your own text search config in PostgreSQL and then define it in this + map for your language. For example: + + ```conf + pg-config = { + english = "my-english" + german = "my-german" + } + ``` + + By default, the predefined configs are used for some languages and + otherwise fall back to `simple`. + + *If you change this setting, you must re-index everything.* +- `pg-query-parser`: the parser applied to the fulltext query. By + default it is `websearch_to_tsquery`. (relevant [doc + link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES)) +- `pg-rank-normalization`: this is used to tweak rank calculation that + affects the order of the elements returned from a query.
It is an + array of numbers out of `1`, `2`, `4`, `8`, `16` or `32`. (relevant + [doc + link](https://www.postgresql.org/docs/14/textsearch-controls.html#TEXTSEARCH-RANKING)) + + +# Re-create the index + +There is an [admin route](@/docs/api/intro.md#admin) that allows to +re-create the entire index (for all collectives). This is possible via +a call: + +``` bash +$ curl -XPOST -H "Docspell-Admin-Secret: test123" http://localhost:7880/api/v1/admin/fts/reIndexAll +``` + +or use the [cli](@/docs/tools/cli.md): + +```bash +dsc admin -a test123 recreate-index +``` + +Here the `test123` is the key defined with `admin-endpoint.secret`. If +it is empty (the default), this call is disabled (all admin routes). +Otherwise, the POST request will submit a system task that is executed +by a joex instance eventually. + +Using this endpoint, the entire index (including the schema) will be +re-created. diff --git a/website/site/content/docs/configure/main.md b/website/site/content/docs/configure/main.md new file mode 100644 index 00000000..3044437e --- /dev/null +++ b/website/site/content/docs/configure/main.md @@ -0,0 +1,192 @@ ++++ +title = "Main" +insert_anchor_links = "right" +description = "Describes the configuration file and shows all default settings." +weight = 10 +template = "docs.html" ++++ + +# Configuration + +Docspell's executables (restserver and joex) can take one argument – a +configuration file. If that is not given, the defaults are used, +overridden by environment variables. A config file overrides default +values, so only values that differ from the defaults are necessary. +The complete default options and their documentation are at the end of +this page. + +Besides the config file, another way is to provide individual settings +via key-value pairs to the executable by the `-D` option. For example +to override only `base-url` you could add the argument +`-Ddocspell.server.base-url=…` to the command. Multiple options are +possible.
For more than a few values this is very tedious, obviously, so +the recommended way is to maintain a config file. If these options +*and* a file are provided, then any setting given via the `-D…` option +overrides the same setting from the config file. + +Finally, it is possible to configure docspell via environment +variables if there is no config file supplied (if a config file *is* +supplied, it is always preferred). Note that this approach is limited, +as arrays are not supported. A list of environment variables can be +found at the [end of this page](#environment-variables). The +environment variable name follows the corresponding config key - where +dots are replaced by underscores and dashes are replaced by two +underscores. For example, the config key `docspell.server.app-name` +can be defined as env variable `DOCSPELL_SERVER_APP__NAME`. + +It is also possible to specify environment variables inside a config +file (to get a mix of both) - please see the [documentation of the +config library](https://github.com/lightbend/config#standard-behavior) +for more on this. + +# File Format + +The format of the configuration files can be +[HOCON](https://github.com/lightbend/config/blob/master/HOCON.md#hocon-human-optimized-config-object-notation), +JSON or what this [config +library](https://github.com/lightbend/config) understands. The default +values below are in HOCON format, which is recommended, since it +allows comments and has some [advanced +features](https://github.com/lightbend/config#features-of-hocon). +Please also see their documentation for more details. + +A short description (please check the links for better understanding): +The config consists of key-value pairs and can be written in a +JSON-like format (called HOCON). Keys are organized in trees, and a +key defines a full path into the tree. 
There are two ways: + +``` +a.b.c.d=15 +``` + +or + +``` +a { + b { + c { + d = 15 + } + } +} +``` + +Both are exactly the same and these forms are both used at the same +time. Usually the braces approach is used to group some more settings, +for better readability. + +Strings that contain "not-so-common" characters should be enclosed in +quotes. It is possible to define values at the top of the file and +reuse them in different locations via the `${full.path.to.key}` +syntax. When using these variables, they *must not* be enclosed in +quotes. + + +# Config Options + +The configuration of both components uses separate namespaces. The +configuration for the REST server is below `docspell.server`, while +the one for joex is below `docspell.joex`. + +You can therefore use two separate config files or one single file +containing both namespaces. + +## App-id + +The `app-id` is the identifier of the corresponding instance. It *must +be unique* for all instances. By default the REST server uses `rest1` +and joex `joex1`. It is recommended to overwrite this setting to have +an explicit and stable identifier should multiple instances be +intended. + +``` bash +docspell.server.app-id = "rest1" +docspell.joex.app-id = "joex1" +``` + +## Other options + +Please see the menu on the left for details about specific +configuration options. + +# JVM Options + +The start scripts support some options to configure the JVM. One often +used setting is the maximum heap size of the JVM. By default, java +determines it based on properties of the current machine. You can +specify it by giving java startup options to the command: + +``` +$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -- /path/to/server-config.conf +``` + +This would limit the maximum heap to 1GB. The double dash separates +internal options and the arguments to the program. Another frequently +used option is to change the default temp directory. 
Usually it is +`/tmp`, but it may be desired to have a dedicated temp directory, +which can be configured: + +``` +$ ./docspell-restserver*/bin/docspell-restserver -J-Xmx1G -Djava.io.tmpdir=/path/to/othertemp -- /path/to/server-config.conf +``` + +The command: + +``` +$ ./docspell-restserver*/bin/docspell-restserver -h +``` + +gives an overview of supported options. + +It is recommended to run joex with the G1GC enabled. If you use java8, +you need to add an option to use G1GC (`-XX:+UseG1GC`), for java11 +this is not necessary (but doesn't hurt either). This could look like +this: + +``` +./docspell-joex-{{version()}}/bin/docspell-joex -J-Xmx1596M -J-XX:+UseG1GC -- /path/to/joex.conf +``` + +Using these options you can define how much memory the JVM process is +able to use. It might be necessary to adapt this depending on the usage +scenario and configured text analysis features. + +Please have a look at the corresponding +[section](@/docs/configure/file-processing.md#memory-usage). + + + +# Logging + +By default, docspell logs to stdout. This works well, when managed by +systemd or other inits. Logging can be configured in the configuration +file or via environment variables. There are only two settings: + +- `minimum-level` specifies the log level to control the verbosity. + Levels are ordered from: *Trace*, *Debug*, *Info*, *Warn* and + *Error* +- `format` this defines how the logs are formatted. There are two + formats for humans: *Plain* and *Fancy*. And two more suited for + machine consumption: *Json* and *Logfmt*. The *Json* format contains + all details, while the others may omit some for readability + +These settings are the same for joex and the restserver component. + +# Default Config +## Rest Server + +{{ incl_conf(path="templates/shortcodes/server.conf") }} + + +## Joex + + +{{ incl_conf(path="templates/shortcodes/joex.conf") }} + +## Environment Variables + +Environment variables can be used when there is no config file +supplied. 
The listing below shows all possible variables and their +default values. + +{{ incl_conf(path="templates/shortcodes/config.env.txt") }} diff --git a/website/site/content/docs/configure/registration.md b/website/site/content/docs/configure/registration.md new file mode 100644 index 00000000..3b0da15c --- /dev/null +++ b/website/site/content/docs/configure/registration.md @@ -0,0 +1,44 @@ ++++ +title = "Registration" +insert_anchor_links = "right" +description = "Describes if and how new users can sign up." +weight = 80 +template = "docs.html" ++++ + +# Registration Options + +This defines if and how new users can create accounts. There are 3 +options: + +- *closed* no new user can sign up +- *open* new users can sign up +- *invite* new users can sign up but require an invitation key + +This applies only to the REST server component. + +``` bash +docspell.server.backend.signup { + mode = "open" + + # If mode == 'invite', a password must be provided to generate + # invitation keys. It must not be empty. + new-invite-password = "" + + # If mode == 'invite', this is the period an invitation token is + # considered valid. + invite-time = "3 days" +} +``` + +The mode `invite` is intended to open the application only to some +users. The admin can create these invitation keys and distribute them +to the desired people. For this, the `new-invite-password` must be +given. The idea is that only the person who installs docspell knows +this. If it is not set, then invitations won't work. New invitation +keys can be generated from within the web application or via REST +calls (using `curl`, for example). 
+ +``` bash +curl -X POST -d '{"password":"blabla"}' "http://localhost:7880/api/v1/open/signup/newinvite" +``` diff --git a/website/site/content/docs/features/_index.md b/website/site/content/docs/features/_index.md index 892883dd..272aec9f 100644 --- a/website/site/content/docs/features/_index.md +++ b/website/site/content/docs/features/_index.md @@ -14,7 +14,8 @@ template = "docs.html" - Handle multiple documents as one unit - OCR using [tesseract](https://github.com/tesseract-ocr/tesseract) - [Full-Text Search](@/docs/webapp/finding.md#full-text-search) based - on [Apache SOLR](https://solr.apache.org) + on [Apache SOLR](https://solr.apache.org) or [PostgreSQL's text + search](https://www.postgresql.org/docs/14/textsearch.html) - Conversion to PDF: all files are converted into a PDF file. PDFs with only images (as often returned from scanners) are converted into searchable PDF/A pdfs. @@ -36,7 +37,8 @@ template = "docs.html" [REST Api](@/docs/api/_index.md); allows to [generate clients](https://openapi-generator.tech/docs/generators) for many languages -- [OpenID Connect](@/docs/configure/_index.md#openid-connect-oauth2) +- [OpenID + Connect](@/docs/configure/authentication.md#openid-connect-oauth2) support allows Docspell to integrate into your SSO setup, for example with keycloak. - Two-Factor Authentication using [TOTP](@/docs/webapp/totp.md) built diff --git a/website/site/content/docs/install/download_run.md b/website/site/content/docs/install/download_run.md index e87f8ea5..a4954bcb 100644 --- a/website/site/content/docs/install/download_run.md +++ b/website/site/content/docs/install/download_run.md @@ -74,9 +74,10 @@ $ ./docspell-joex*/bin/docspell-joex ``` This will startup both components using the default configuration. -Please refer to the [configuration page](@/docs/configure/_index.md) -for how to create a custom config file. 
Once you have your config -file, simply pass it as argument to the command: +Please refer to the [configuration +page](@/docs/configure/main.md) for how to create a custom +config file. Once you have your config file, simply pass it as +argument to the command: ``` $ ./docspell-restserver*/bin/docspell-restserver /path/to/server-config.conf @@ -110,7 +111,7 @@ Fulltext search is powered by [SOLR](https://solr.apache.org). You need to install solr and create a core for docspell. Then cange the solr url for both components (restserver and joex) accordingly. See the relevant section in the [config -page](@/docs/configure/_index.md#full-text-search). +page](@/docs/configure/fulltext-search.md). ### Watching a directory diff --git a/website/site/content/docs/install/prereq.md b/website/site/content/docs/install/prereq.md index 9b4fe10c..d7a70fc7 100644 --- a/website/site/content/docs/install/prereq.md +++ b/website/site/content/docs/install/prereq.md @@ -102,7 +102,7 @@ When using H2, make sure that all components access the same database – the jdbc url must point to the same file. Then, it is important to add the options `;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE` at the end -of the url. See the [config page](@/docs/configure/_index.md#jdbc) for +of the url. See the [config page](@/docs/configure/database.md) for an example. For large installations, PostgreSQL or MariaDB is recommended. Create diff --git a/website/site/content/docs/install/rpi.md b/website/site/content/docs/install/rpi.md index f932592d..b59817d0 100644 --- a/website/site/content/docs/install/rpi.md +++ b/website/site/content/docs/install/rpi.md @@ -30,10 +30,10 @@ result in long processing times for OCR and text analysis. The board should provide 4G of RAM (like the current RPi4), especially if also a database and solr are running next to it. The memory required by joex depends on the config and document language. 
Please pick a value that -suits your setup from [here](@/docs/configure/_index.md#memory-usage). -For boards like the RPi, it might be necessary to use -`nlp.mode=basic`, rather than `nlp.mode=full`. You should also set the -joex pool size to 1. +suits your setup from +[here](@/docs/configure/file-processing.md#memory-usage). For boards +like the RPi, it might be necessary to use `nlp.mode=basic`, rather +than `nlp.mode=full`. You should also set the joex pool size to 1. An example: on this [UP board](https://up-board.org/up/specifications/) with an Intel Atom diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md index c3b0fcf7..0c59e8b4 100644 --- a/website/site/content/docs/joex/file-processing.md +++ b/website/site/content/docs/joex/file-processing.md @@ -80,7 +80,7 @@ line are required. As you see for `wkhtmltopdf` the page size is fixed to DIN A4. Other commands are configured like this as well. For the default values, please see the [configuration -page](@/docs/configure/_index.md#joex). +page](@/docs/configure/main.md#joex). ## Duplicate Check diff --git a/website/site/content/docs/joex/intro.md b/website/site/content/docs/joex/intro.md index 634e9104..050982db 100644 --- a/website/site/content/docs/joex/intro.md +++ b/website/site/content/docs/joex/intro.md @@ -23,7 +23,7 @@ For larger installations, it is probably better to run several joex components on different machines. That works out of the box, as long as all components point to the same database and use different `app-id`s (see [configuring -docspell](@/docs/configure/_index.md#app-id)). +docspell](@/docs/configure/main.md#app-id)). When files are submitted to docspell, they are stored in the database and all known joex components are notified about new work. 
Then they diff --git a/website/site/content/docs/tools/cli.md b/website/site/content/docs/tools/cli.md index 4e9cfaa6..e12c7121 100644 --- a/website/site/content/docs/tools/cli.md +++ b/website/site/content/docs/tools/cli.md @@ -323,8 +323,8 @@ full detail. These are a set of commands that simply call a route at the server to submit a maintenance task or to reset the password of some user. These commands require the [admin -secret](@/docs/configure/_index.md#admin-endpoint) either in the -config file or as an argument. +secret](@/docs/configure/admin-endpoint.md) either in the config file +or as an argument. ### Reset user password diff --git a/website/site/content/docs/webapp/itemcard-customize.md b/website/site/content/docs/webapp/itemcard-customize.md index 5b43bcc1..132b47e3 100644 --- a/website/site/content/docs/webapp/itemcard-customize.md +++ b/website/site/content/docs/webapp/itemcard-customize.md @@ -19,15 +19,15 @@ _UI Settings_. Among other things, there is a _Item Cards_ section: This defines how many of the item notes to display in the card. You can set it to `0` to not show any notes at all. This is only a "soft limit", there is also a "hard limit" in [docspell's -configuration](@/docs/configure/_index.md#rest-server) (see `max-note-length`), -that is an upper limit to this value. +configuration](@/docs/configure/main.md#rest-server) (see +`max-note-length`), that is an upper limit to this value. ### Size of item preview The item preview is an image of the first page of the first attachment. You can change the order of attachments in the item detail view. This image has a predefined size, which is specified [docspell's -configuration](@/docs/configure/_index.md#joex) (see +configuration](@/docs/configure/main.md#joex) (see `extraction.preview.dpi`). The size for displaying it, can be specified via this setting. 
A _small_ preview uses about 80px width, a _medium_ one 160px and _large_ means to use the available space in the diff --git a/website/site/content/docs/webapp/scanmailbox.md b/website/site/content/docs/webapp/scanmailbox.md index 772a6b54..68570961 100644 --- a/website/site/content/docs/webapp/scanmailbox.md +++ b/website/site/content/docs/webapp/scanmailbox.md @@ -191,9 +191,9 @@ file to look for duplicates, too. Docspell will go through all folders and download mails in “batches”. This size can be set by the admin in the [configuration -file](@/docs/configure/_index.md#joex) and applies to all these tasks -(same for all users). This batch only contains the mail headers and -not the complete mail. +file](@/docs/configure/main.md#joex) and applies to all these +tasks (same for all users). This batch only contains the mail headers +and not the complete mail. Then each mail is downloaded completely one by one and converted into an [eml](https://en.wikipedia.org/wiki/Email#Filename_extensions) file diff --git a/website/site/content/docs/webapp/totp.md b/website/site/content/docs/webapp/totp.md index 3bacc47a..747aaebb 100644 --- a/website/site/content/docs/webapp/totp.md +++ b/website/site/content/docs/webapp/totp.md @@ -11,7 +11,8 @@ Docspell has built-in support for two-factor (2FA) authentication using [TOTP](https://en.wikipedia.org/wiki/Time-based_One-Time_Password)s. For anything more, consider a dedicated account management tool and -[OpenID Connect](@/docs/configure/_index.md#openid-connect-oauth2). +[OpenID +Connect](@/docs/configure/authentication.md#openid-connect-oauth2). ## Setup @@ -65,7 +66,7 @@ client](@/docs/tools/cli.md) to execute an admin command that removes 2FA for a given user. For this to work, you need to [enable the admin -endpoint](@/docs/configure/_index.md#admin-endpoint). Then execute the +endpoint](@/docs/configure/admin-endpoint.md). Then execute the `disable-2fa` admin command and specify the complete account. 
``` From f0b652d142390c88dbee6d214d8d2a27ea73955e Mon Sep 17 00:00:00 2001 From: eikek Date: Mon, 21 Mar 2022 14:58:15 +0100 Subject: [PATCH 8/8] Extend nix modules for new config options --- nix/configuration-test.nix | 5 +++ nix/module-joex.nix | 73 ++++++++++++++++++++++++++++++++++++++ nix/module-server.nix | 65 +++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) diff --git a/nix/configuration-test.nix b/nix/configuration-test.nix index 94236770..4d5c2172 100644 --- a/nix/configuration-test.nix +++ b/nix/configuration-test.nix @@ -4,6 +4,11 @@ let full-text-search = { enabled = true; solr.url = "http://localhost:${toString config.services.solr.port}/solr/docspell"; + postgresql = { + pg-config = { + "german" = "my-germam"; + }; + }; }; in { diff --git a/nix/module-joex.nix b/nix/module-joex.nix index ca7bbfc4..0663a699 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -213,6 +213,7 @@ Docpell Update Check }; full-text-search = { enabled = false; + backend = "solr"; solr = { url = "http://localhost:8983/solr/docspell"; commit-within = 1000; @@ -220,6 +221,17 @@ Docpell Update Check def-type = "lucene"; q-op = "OR"; }; + postgresql = { + use-default-connection = false; + jdbc = { + url = "jdbc:postgresql://server:5432/db"; + user = "pguser"; + password = ""; + }; + pg-config = {}; + pg-query-parser = "websearch_to_tsquery"; + pg-rank-normalization = [ 4 ]; + }; migration = { index-all-chunk = 10; }; @@ -1371,6 +1383,12 @@ in { Currently the SOLR search platform is supported. 
''; }; + backend = mkOption { + type = types.str; + default = defaults.full-text-search.backend; + description = "The backend to use, either solr or postgresql"; + }; + solr = mkOption { type = types.submodule({ options = { @@ -1408,6 +1426,61 @@ in { default = defaults.full-text-search.solr; description = "Configuration for the SOLR backend."; }; + + postgresql = mkOption { + type = types.submodule({ + options = { + use-default-connection = mkOption { + type = types.bool; + default = defaults.full-text-search.postgresql.use-default-connection; + description = "Whether to use the primary db connection."; + }; + jdbc = mkOption { + type = types.submodule ({ + options = { + url = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.url; + description = '' + The URL to the database. + ''; + }; + user = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.user; + description = "The user name to connect to the database."; + }; + password = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.password; + description = "The password to connect to the database."; + }; + }; + }); + default = defaults.full-text-search.postgresql.jdbc; + description = "Database connection settings"; + }; + pg-config = mkOption { + type = types.attrs; + default = defaults.full-text-search.postgresql.pg-config; + description = "Optional mapping from a document language to the PostgreSQL text search configuration to use for it."; + }; + pg-query-parser = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.pg-query-parser; + description = "The parser applied to the fulltext query."; + }; + pg-rank-normalization = mkOption { + type = types.listOf types.int; + default = defaults.full-text-search.postgresql.pg-rank-normalization; + description = "Tweaks the rank calculation that orders query results; a list of numbers out of 1, 2, 4, 8, 16 or 32."; + }; + }; + }); + default = defaults.full-text-search.postgresql; + description = "PostgreSQL for fulltext search"; + }; + migration = mkOption { type = types.submodule({ options = { diff --git a/nix/module-server.nix b/nix/module-server.nix index c7c0a2ca..0e794290 100644 --- a/nix/module-server.nix +++ b/nix/module-server.nix @@ -62,6 +62,17 @@ let 
def-type = "lucene"; q-op = "OR"; }; + postgresql = { + use-default-connection = false; + jdbc = { + url = "jdbc:postgresql://server:5432/db"; + user = "pguser"; + password = ""; + }; + pg-config = {}; + pg-query-parser = "websearch_to_tsquery"; + pg-rank-normalization = [ 4 ]; + }; }; auth = { server-secret = "hex:caffee"; @@ -575,6 +586,60 @@ in { default = defaults.full-text-search.solr; description = "Configuration for the SOLR backend."; }; + + postgresql = mkOption { + type = types.submodule({ + options = { + use-default-connection = mkOption { + type = types.bool; + default = defaults.full-text-search.postgresql.use-default-connection; + description = "Whether to use the primary db connection."; + }; + jdbc = mkOption { + type = types.submodule ({ + options = { + url = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.url; + description = '' + The URL to the database. + ''; + }; + user = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.user; + description = "The user name to connect to the database."; + }; + password = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.jdbc.password; + description = "The password to connect to the database."; + }; + }; + }); + default = defaults.full-text-search.postgresql.jdbc; + description = "Database connection settings"; + }; + pg-config = mkOption { + type = types.attrs; + default = defaults.full-text-search.postgresql.pg-config; + description = "Optional mapping from a document language to the PostgreSQL text search configuration to use for it."; + }; + pg-query-parser = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.pg-query-parser; + description = "The parser applied to the fulltext query."; + }; + pg-rank-normalization = mkOption { + type = types.listOf types.int; + default = defaults.full-text-search.postgresql.pg-rank-normalization; + description = "Tweaks the rank calculation that orders query results; a list of numbers out of 1, 2, 4, 8, 16 or 32."; + }; + }; + }); + default = defaults.full-text-search.postgresql; + description = "PostgreSQL for fulltext search"; + }; }; }); default = defaults.full-text-search;