diff --git a/build.sbt b/build.sbt
index 7e97df57..051a767b 100644
--- a/build.sbt
+++ b/build.sbt
@@ -259,6 +259,28 @@ val analysis = project.in(file("modules/analysis")).
Dependencies.fs2 ++
Dependencies.stanfordNlpCore
).dependsOn(common, files % "test->test")
+
+val ftsclient = project.in(file("modules/fts-client")).
+ disablePlugins(RevolverPlugin).
+ settings(sharedSettings).
+ settings(testSettings).
+ settings(
+ name := "docspell-fts-client",
+ libraryDependencies ++= Seq.empty
+ ).dependsOn(common)
+
+val ftssolr = project.in(file("modules/fts-solr")).
+ disablePlugins(RevolverPlugin).
+ settings(sharedSettings).
+ settings(testSettings).
+ settings(
+ name := "docspell-fts-solr",
+ libraryDependencies ++=
+ Dependencies.http4sClient ++
+ Dependencies.http4sCirce ++
+ Dependencies.http4sDsl ++
+ Dependencies.circe
+ ).dependsOn(common, ftsclient)
val restapi = project.in(file("modules/restapi")).
disablePlugins(RevolverPlugin).
@@ -303,7 +325,7 @@ val backend = project.in(file("modules/backend")).
Dependencies.bcrypt ++
Dependencies.http4sClient ++
Dependencies.emil
- ).dependsOn(store, joexapi)
+ ).dependsOn(store, joexapi, ftsclient)
val webapp = project.in(file("modules/webapp")).
disablePlugins(RevolverPlugin).
@@ -336,7 +358,9 @@ val joex = project.in(file("modules/joex")).
name := "docspell-joex",
libraryDependencies ++=
Dependencies.fs2 ++
- Dependencies.http4s ++
+ Dependencies.http4sServer ++
+ Dependencies.http4sCirce ++
+ Dependencies.http4sDsl ++
Dependencies.circe ++
Dependencies.pureconfig ++
Dependencies.emilTnef ++
@@ -350,7 +374,7 @@ val joex = project.in(file("modules/joex")).
addCompilerPlugin(Dependencies.betterMonadicFor),
buildInfoPackage := "docspell.joex",
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
- ).dependsOn(store, backend, extract, convert, analysis, joexapi, restapi)
+ ).dependsOn(store, backend, extract, convert, analysis, joexapi, restapi, ftssolr)
val restserver = project.in(file("modules/restserver")).
enablePlugins(BuildInfoPlugin
@@ -364,7 +388,9 @@ val restserver = project.in(file("modules/restserver")).
settings(
name := "docspell-restserver",
libraryDependencies ++=
- Dependencies.http4s ++
+ Dependencies.http4sServer ++
+ Dependencies.http4sCirce ++
+ Dependencies.http4sDsl ++
Dependencies.circe ++
Dependencies.pureconfig ++
Dependencies.yamusca ++
@@ -386,7 +412,7 @@ val restserver = project.in(file("modules/restserver")).
}.taskValue,
Compile/unmanagedResourceDirectories ++= Seq((Compile/resourceDirectory).value.getParentFile/"templates"),
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
- ).dependsOn(restapi, joexapi, backend, webapp)
+ ).dependsOn(restapi, joexapi, backend, webapp, ftssolr)
@@ -472,6 +498,8 @@ val root = project.in(file(".")).
, extract
, convert
, analysis
+ , ftsclient
+ , ftssolr
, files
, store
, joexapi
diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
index 18ab38c8..acb4e08d 100644
--- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
+++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
@@ -1,12 +1,17 @@
package docspell.backend
import cats.effect.{Blocker, ConcurrentEffect, ContextShift, Resource}
+import org.http4s.client.Client
+import org.http4s.client.blaze.BlazeClientBuilder
+
import docspell.backend.auth.Login
import docspell.backend.ops._
import docspell.backend.signup.OSignup
+import docspell.joexapi.client.JoexClient
import docspell.store.Store
import docspell.store.queue.JobQueue
import docspell.store.usertask.UserTaskStore
+import docspell.ftsclient.FtsClient
import scala.concurrent.ExecutionContext
import emil.javamail.{JavaMailEmil, Settings}
@@ -25,6 +30,7 @@ trait BackendApp[F[_]] {
def job: OJob[F]
def item: OItem[F]
def itemSearch: OItemSearch[F]
+ def fulltext: OFulltext[F]
def mail: OMail[F]
def joex: OJoex[F]
def userTask: OUserTask[F]
@@ -35,7 +41,8 @@ object BackendApp {
def create[F[_]: ConcurrentEffect: ContextShift](
cfg: Config,
store: Store[F],
- httpClientEc: ExecutionContext,
+ httpClient: Client[F],
+ ftsClient: FtsClient[F],
blocker: Blocker
): Resource[F, BackendApp[F]] =
for {
@@ -48,12 +55,13 @@ object BackendApp {
tagImpl <- OTag[F](store)
equipImpl <- OEquipment[F](store)
orgImpl <- OOrganization(store)
- joexImpl <- OJoex.create(httpClientEc, store)
+ joexImpl <- OJoex(JoexClient(httpClient), store)
uploadImpl <- OUpload(store, queue, cfg.files, joexImpl)
nodeImpl <- ONode(store)
jobImpl <- OJob(store, joexImpl)
- itemImpl <- OItem(store)
+ itemImpl <- OItem(store, ftsClient)
itemSearchImpl <- OItemSearch(store)
+ fulltextImpl <- OFulltext(itemSearchImpl, ftsClient, store, queue, joexImpl)
javaEmil =
JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
mailImpl <- OMail(store, javaEmil)
@@ -71,6 +79,7 @@ object BackendApp {
val job = jobImpl
val item = itemImpl
val itemSearch = itemSearchImpl
+ val fulltext = fulltextImpl
val mail = mailImpl
val joex = joexImpl
val userTask = userTaskImpl
@@ -81,9 +90,11 @@ object BackendApp {
connectEC: ExecutionContext,
httpClientEc: ExecutionContext,
blocker: Blocker
- ): Resource[F, BackendApp[F]] =
+ )(ftsFactory: Client[F] => Resource[F, FtsClient[F]]): Resource[F, BackendApp[F]] =
for {
- store <- Store.create(cfg.jdbc, connectEC, blocker)
- backend <- create(cfg, store, httpClientEc, blocker)
+ store <- Store.create(cfg.jdbc, connectEC, blocker)
+ httpClient <- BlazeClientBuilder[F](httpClientEc).resource
+ ftsClient <- ftsFactory(httpClient)
+ backend <- create(cfg, store, httpClient, ftsClient, blocker)
} yield backend
}
diff --git a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala
new file mode 100644
index 00000000..42d6a654
--- /dev/null
+++ b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala
@@ -0,0 +1,90 @@
+package docspell.backend
+
+import cats.effect._
+import cats.implicits._
+import docspell.common._
+import docspell.store.records.RJob
+
+object JobFactory {
+
+ def processItem[F[_]: Sync](
+ args: ProcessItemArgs,
+ account: AccountId,
+ prio: Priority,
+ tracker: Option[Ident]
+ ): F[RJob] =
+ for {
+ id <- Ident.randomId[F]
+ now <- Timestamp.current[F]
+ job = RJob.newJob(
+ id,
+ ProcessItemArgs.taskName,
+ account.collective,
+ args,
+ args.makeSubject,
+ now,
+ account.user,
+ prio,
+ tracker
+ )
+ } yield job
+
+ def processItems[F[_]: Sync](
+ args: Vector[ProcessItemArgs],
+ account: AccountId,
+ prio: Priority,
+ tracker: Option[Ident]
+ ): F[Vector[RJob]] = {
+ def create(id: Ident, now: Timestamp, arg: ProcessItemArgs): RJob =
+ RJob.newJob(
+ id,
+ ProcessItemArgs.taskName,
+ account.collective,
+ arg,
+ arg.makeSubject,
+ now,
+ account.user,
+ prio,
+ tracker
+ )
+
+ for {
+ id <- Ident.randomId[F]
+ now <- Timestamp.current[F]
+ jobs = args.map(a => create(id, now, a))
+ } yield jobs
+ }
+
+ def reIndexAll[F[_]: Sync]: F[RJob] =
+ for {
+ id <- Ident.randomId[F]
+ now <- Timestamp.current[F]
+ } yield RJob.newJob(
+ id,
+ ReIndexTaskArgs.taskName,
+ DocspellSystem.taskGroup,
+ ReIndexTaskArgs(None),
+      "Recreate full-text index",
+ now,
+ DocspellSystem.taskGroup,
+ Priority.Low,
+ Some(DocspellSystem.migrationTaskTracker)
+ )
+
+ def reIndex[F[_]: Sync](account: AccountId): F[RJob] =
+ for {
+ id <- Ident.randomId[F]
+ now <- Timestamp.current[F]
+ args = ReIndexTaskArgs(Some(account.collective))
+ } yield RJob.newJob(
+ id,
+ ReIndexTaskArgs.taskName,
+ account.collective,
+ args,
+      "Recreate full-text index",
+ now,
+ account.user,
+ Priority.Low,
+ Some(ReIndexTaskArgs.tracker(args))
+ )
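+
+  // Usage sketch: `JobFactory.reIndex[F](account).flatMap(queue.insertIfNew)`
+  // submits a collective re-index job; `OFulltext.reindexCollective` does
+  // this and then notifies the joex nodes.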
+}
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala
new file mode 100644
index 00000000..33bedffc
--- /dev/null
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala
@@ -0,0 +1,243 @@
+package docspell.backend.ops
+
+import cats.effect._
+import cats.implicits._
+import fs2.Stream
+import docspell.common._
+import docspell.ftsclient._
+import docspell.backend.JobFactory
+import docspell.store.Store
+import docspell.store.records.RJob
+import docspell.store.queue.JobQueue
+import docspell.store.queries.QItem
+import OItemSearch.{Batch, ListItem, ListItemWithTags, Query}
+
+trait OFulltext[F[_]] {
+
+ def findItems(
+ q: Query,
+ fts: OFulltext.FtsInput,
+ batch: Batch
+ ): F[Vector[OFulltext.FtsItem]]
+
+ /** Same as `findItems` but does more queries per item to find all tags. */
+ def findItemsWithTags(
+ q: Query,
+ fts: OFulltext.FtsInput,
+ batch: Batch
+ ): F[Vector[OFulltext.FtsItemWithTags]]
+
+ def findIndexOnly(
+ fts: OFulltext.FtsInput,
+ collective: Ident,
+ batch: Batch
+ ): F[Vector[OFulltext.FtsItemWithTags]]
+
+ /** Clears the full-text index completely and launches a task that
+ * indexes all data.
+ */
+ def reindexAll: F[Unit]
+
+ /** Clears the full-text index for the given collective and starts a
+ * task indexing all their data.
+ */
+ def reindexCollective(account: AccountId): F[Unit]
+}
+
+object OFulltext {
+
+ case class FtsInput(
+ query: String,
+ highlightPre: String = "***",
+ highlightPost: String = "***"
+ )
+
+ case class FtsDataItem(
+ score: Double,
+ matchData: FtsResult.MatchData,
+ context: List[String]
+ )
+ case class FtsData(
+ maxScore: Double,
+ count: Int,
+ qtime: Duration,
+ items: List[FtsDataItem]
+ )
+ case class FtsItem(item: ListItem, ftsData: FtsData)
+ case class FtsItemWithTags(item: ListItemWithTags, ftsData: FtsData)
+
+ def apply[F[_]: Effect](
+ itemSearch: OItemSearch[F],
+ fts: FtsClient[F],
+ store: Store[F],
+ queue: JobQueue[F],
+ joex: OJoex[F]
+ ): Resource[F, OFulltext[F]] =
+ Resource.pure[F, OFulltext[F]](new OFulltext[F] {
+ def reindexAll: F[Unit] =
+ for {
+ job <- JobFactory.reIndexAll[F]
+ _ <- queue.insertIfNew(job) *> joex.notifyAllNodes
+ } yield ()
+
+ def reindexCollective(account: AccountId): F[Unit] =
+ for {
+ exist <- store.transact(
+ RJob.findNonFinalByTracker(DocspellSystem.migrationTaskTracker)
+ )
+ job <- JobFactory.reIndex(account)
+ _ <-
+ if (exist.isDefined) ().pure[F]
+ else queue.insertIfNew(job) *> joex.notifyAllNodes
+ } yield ()
+
+ def findIndexOnly(
+ ftsQ: OFulltext.FtsInput,
+ collective: Ident,
+ batch: Batch
+ ): F[Vector[OFulltext.FtsItemWithTags]] = {
+ val fq = FtsQuery(
+ ftsQ.query,
+ collective,
+ Set.empty,
+ batch.limit,
+ batch.offset,
+ FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
+ )
+ for {
+ ftsR <- fts.search(fq)
+ ftsItems = ftsR.results.groupBy(_.itemId)
+ select = ftsR.results.map(r => QItem.SelectedItem(r.itemId, r.score)).toSet
+ itemsWithTags <-
+ store
+ .transact(
+ QItem.findItemsWithTags(
+ collective,
+ QItem.findSelectedItems(QItem.Query.empty(collective), select)
+ )
+ )
+ .take(batch.limit.toLong)
+ .compile
+ .toVector
+ res =
+ itemsWithTags
+ .collect(convertFtsData(ftsR, ftsItems))
+ .map({ case (li, fd) => FtsItemWithTags(li, fd) })
+ } yield res
+ }
+
+ def findItems(q: Query, ftsQ: FtsInput, batch: Batch): F[Vector[FtsItem]] =
+ findItemsFts(q, ftsQ, batch.first, itemSearch.findItems, convertFtsData[ListItem])
+ .drop(batch.offset.toLong)
+ .take(batch.limit.toLong)
+ .map({ case (li, fd) => FtsItem(li, fd) })
+ .compile
+ .toVector
+
+ def findItemsWithTags(
+ q: Query,
+ ftsQ: FtsInput,
+ batch: Batch
+ ): F[Vector[FtsItemWithTags]] =
+ findItemsFts(
+ q,
+ ftsQ,
+ batch.first,
+ itemSearch.findItemsWithTags,
+ convertFtsData[ListItemWithTags]
+ )
+ .drop(batch.offset.toLong)
+ .take(batch.limit.toLong)
+ .map({ case (li, fd) => FtsItemWithTags(li, fd) })
+ .compile
+ .toVector
+
+ // Helper
+
+      private def findItemsFts[A: ItemId](
+ q: Query,
+ ftsQ: FtsInput,
+ batch: Batch,
+ search: (Query, Batch) => F[Vector[A]],
+ convert: (
+ FtsResult,
+ Map[Ident, List[FtsResult.ItemMatch]]
+ ) => PartialFunction[A, (A, FtsData)]
+ ): Stream[F, (A, FtsData)] =
+ findItemsFts0(q, ftsQ, batch, search, convert)
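+        // keep requesting pages while the SQL query returns full batches;
+        // takeThrough also emits the first short (i.e. final) page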
+ .takeThrough(_._1 >= batch.limit)
+ .flatMap(x => Stream.emits(x._2))
+
+      private def findItemsFts0[A: ItemId](
+ q: Query,
+ ftsQ: FtsInput,
+ batch: Batch,
+ search: (Query, Batch) => F[Vector[A]],
+ convert: (
+ FtsResult,
+ Map[Ident, List[FtsResult.ItemMatch]]
+ ) => PartialFunction[A, (A, FtsData)]
+ ): Stream[F, (Int, Vector[(A, FtsData)])] = {
+ val sqlResult = search(q, batch)
+ val fq = FtsQuery(
+ ftsQ.query,
+ q.collective,
+ Set.empty,
+ 0,
+ 0,
+ FtsQuery.HighlightSetting(ftsQ.highlightPre, ftsQ.highlightPost)
+ )
+
+ val qres =
+ for {
+ items <- sqlResult
+ ids = items.map(a => ItemId[A].itemId(a))
+ // must find all index results involving the items.
+ // Currently there is one result per item + one result per
+ // attachment
+ limit = items.map(a => ItemId[A].fileCount(a)).sum + items.size
+ ftsQ = fq.copy(items = ids.toSet, limit = limit)
+ ftsR <- fts.search(ftsQ)
+ ftsItems = ftsR.results.groupBy(_.itemId)
+ res = items.collect(convert(ftsR, ftsItems))
+ } yield (items.size, res)
+
+ Stream.eval(qres) ++ findItemsFts0(q, ftsQ, batch.next, search, convert)
+ }
+
+ private def convertFtsData[A: ItemId](
+ ftr: FtsResult,
+ ftrItems: Map[Ident, List[FtsResult.ItemMatch]]
+ ): PartialFunction[A, (A, FtsData)] = {
+ case a if ftrItems.contains(ItemId[A].itemId(a)) =>
+ val ftsDataItems = ftrItems
+ .get(ItemId[A].itemId(a))
+ .getOrElse(Nil)
+ .map(im =>
+ FtsDataItem(im.score, im.data, ftr.highlight.get(im.id).getOrElse(Nil))
+ )
+ (a, FtsData(ftr.maxScore, ftr.count, ftr.qtime, ftsDataItems))
+ }
+ })
+
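+  /** Typeclass to extract the item id and the attachment count from a
+    * search result row.
+    */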
+ trait ItemId[A] {
+ def itemId(a: A): Ident
+
+ def fileCount(a: A): Int
+ }
+ object ItemId {
+ def apply[A](implicit ev: ItemId[A]): ItemId[A] = ev
+
+ def from[A](f: A => Ident, g: A => Int): ItemId[A] =
+ new ItemId[A] {
+ def itemId(a: A) = f(a)
+ def fileCount(a: A) = g(a)
+ }
+
+ implicit val listItemId: ItemId[ListItem] =
+ ItemId.from(_.id, _.fileCount)
+
+ implicit val listItemWithTagsId: ItemId[ListItemWithTags] =
+ ItemId.from(_.item.id, _.item.fileCount)
+ }
+}
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
index 1c503266..9a9cd646 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
@@ -5,10 +5,12 @@ import cats.implicits._
import cats.effect.{Effect, Resource}
import doobie._
import doobie.implicits._
+import org.log4s.getLogger
import docspell.store.{AddResult, Store}
import docspell.store.queries.{QAttachment, QItem}
-import docspell.common.{Direction, Ident, ItemState, MetaProposalList, Timestamp}
+import docspell.common._
import docspell.store.records._
+import docspell.ftsclient.FtsClient
trait OItem[F[_]] {
@@ -38,7 +40,7 @@ trait OItem[F[_]] {
def setNotes(item: Ident, notes: Option[String], collective: Ident): F[AddResult]
- def setName(item: Ident, notes: String, collective: Ident): F[AddResult]
+ def setName(item: Ident, name: String, collective: Ident): F[AddResult]
def setState(item: Ident, state: ItemState, collective: Ident): F[AddResult]
@@ -67,11 +69,12 @@ trait OItem[F[_]] {
object OItem {
- def apply[F[_]: Effect](store: Store[F]): Resource[F, OItem[F]] =
+ def apply[F[_]: Effect](store: Store[F], fts: FtsClient[F]): Resource[F, OItem[F]] =
for {
otag <- OTag(store)
oorg <- OOrganization(store)
oequip <- OEquipment(store)
+ logger <- Resource.pure[F, Logger[F]](Logger.log4s(getLogger))
oitem <- Resource.pure[F, OItem[F]](new OItem[F] {
def moveAttachmentBefore(
itemId: Ident,
@@ -259,12 +262,18 @@ object OItem {
.transact(RItem.updateNotes(item, collective, notes))
.attempt
.map(AddResult.fromUpdate)
+ .flatTap(
+ onSuccessIgnoreError(fts.updateItemNotes(logger, item, collective, notes))
+ )
def setName(item: Ident, name: String, collective: Ident): F[AddResult] =
store
.transact(RItem.updateName(item, collective, name))
.attempt
.map(AddResult.fromUpdate)
+ .flatTap(
+ onSuccessIgnoreError(fts.updateItemName(logger, item, collective, name))
+ )
def setState(item: Ident, state: ItemState, collective: Ident): F[AddResult] =
store
@@ -293,13 +302,17 @@ object OItem {
.map(AddResult.fromUpdate)
def deleteItem(itemId: Ident, collective: Ident): F[Int] =
- QItem.delete(store)(itemId, collective)
+ QItem
+ .delete(store)(itemId, collective)
+ .flatTap(_ => fts.removeItem(logger, itemId))
def getProposals(item: Ident, collective: Ident): F[MetaProposalList] =
store.transact(QAttachment.getMetaProposals(item, collective))
def deleteAttachment(id: Ident, collective: Ident): F[Int] =
- QAttachment.deleteSingleAttachment(store)(id, collective)
+ QAttachment
+ .deleteSingleAttachment(store)(id, collective)
+ .flatTap(_ => fts.removeAttachment(logger, id))
def setAttachmentName(
attachId: Ident,
@@ -310,6 +323,29 @@ object OItem {
.transact(RAttachment.updateName(attachId, collective, name))
.attempt
.map(AddResult.fromUpdate)
+ .flatTap(
+ onSuccessIgnoreError(
+ OptionT(store.transact(RAttachment.findItemId(attachId)))
+ .semiflatMap(itemId =>
+ fts.updateAttachmentName(logger, itemId, attachId, collective, name)
+ )
+ .fold(())(identity)
+ )
+ )
+
+ private def onSuccessIgnoreError(update: F[Unit])(ar: AddResult): F[Unit] =
+ ar match {
+ case AddResult.Success =>
+ update.attempt.flatMap {
+ case Right(()) => ().pure[F]
+ case Left(ex) =>
+ logger.warn(s"Error updating full-text index: ${ex.getMessage}")
+ }
+ case AddResult.Failure(_) =>
+ ().pure[F]
+ case AddResult.EntityExists(_) =>
+ ().pure[F]
+ }
})
} yield oitem
}
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItemSearch.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItemSearch.scala
index a1478aad..d6bf64ee 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItemSearch.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItemSearch.scala
@@ -110,11 +110,15 @@ object OItemSearch {
.compile
.toVector
- def findItemsWithTags(q: Query, batch: Batch): F[Vector[ListItemWithTags]] =
+ def findItemsWithTags(q: Query, batch: Batch): F[Vector[ListItemWithTags]] = {
+ val search = QItem.findItems(q, batch)
store
- .transact(QItem.findItemsWithTags(q, batch).take(batch.limit.toLong))
+ .transact(
+ QItem.findItemsWithTags(q.collective, search).take(batch.limit.toLong)
+ )
.compile
.toVector
+ }
def findAttachment(id: Ident, collective: Ident): F[Option[AttachmentData[F]]] =
store
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
index 70fda316..44f89a9f 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OUpload.scala
@@ -5,7 +5,7 @@ import cats.Functor
import cats.data.{EitherT, OptionT}
import cats.effect._
import cats.implicits._
-import docspell.backend.Config
+import docspell.backend.{Config, JobFactory}
import fs2.Stream
import docspell.common._
import docspell.common.syntax.all._
@@ -204,26 +204,7 @@ object OUpload {
account: AccountId,
prio: Priority,
tracker: Option[Ident]
- ): F[Vector[RJob]] = {
- def create(id: Ident, now: Timestamp, arg: ProcessItemArgs): RJob =
- RJob.newJob(
- id,
- ProcessItemArgs.taskName,
- account.collective,
- arg,
- arg.makeSubject,
- now,
- account.user,
- prio,
- tracker
- )
-
- for {
- id <- Ident.randomId[F]
- now <- Timestamp.current[F]
- jobs = args.map(a => create(id, now, a))
- } yield jobs
-
- }
+ ): F[Vector[RJob]] =
+ JobFactory.processItems[F](args, account, prio, tracker)
})
}
diff --git a/modules/common/src/main/scala/docspell/common/DocspellSystem.scala b/modules/common/src/main/scala/docspell/common/DocspellSystem.scala
new file mode 100644
index 00000000..21247829
--- /dev/null
+++ b/modules/common/src/main/scala/docspell/common/DocspellSystem.scala
@@ -0,0 +1,8 @@
+package docspell.common
+
+object DocspellSystem {
+
+ val taskGroup = Ident.unsafe("docspell-system")
+ val migrationTaskTracker = Ident.unsafe("full-text-index-tracker")
+
+}
diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala
index bb47059e..dfda4652 100644
--- a/modules/common/src/main/scala/docspell/common/Duration.scala
+++ b/modules/common/src/main/scala/docspell/common/Duration.scala
@@ -25,6 +25,9 @@ case class Duration(nanos: Long) {
def formatExact: String =
s"$millis ms"
+
+ override def toString(): String =
+ s"Duration(${millis}ms)"
}
object Duration {
diff --git a/modules/common/src/main/scala/docspell/common/Ident.scala b/modules/common/src/main/scala/docspell/common/Ident.scala
index a1d6cb8a..3d752053 100644
--- a/modules/common/src/main/scala/docspell/common/Ident.scala
+++ b/modules/common/src/main/scala/docspell/common/Ident.scala
@@ -15,6 +15,9 @@ case class Ident(id: String) {
def nonEmpty: Boolean =
!isEmpty
+
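+  /** Joins two identifiers with a dot: `Ident("a") / Ident("b")` yields
+    * `Ident("a.b")`.
+    */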
+ def /(next: Ident): Ident =
+ new Ident(id + "." + next.id)
}
object Ident {
diff --git a/modules/common/src/main/scala/docspell/common/ReIndexTaskArgs.scala b/modules/common/src/main/scala/docspell/common/ReIndexTaskArgs.scala
new file mode 100644
index 00000000..d7624048
--- /dev/null
+++ b/modules/common/src/main/scala/docspell/common/ReIndexTaskArgs.scala
@@ -0,0 +1,24 @@
+package docspell.common
+
+import io.circe._
+import io.circe.generic.semiauto._
+
+final case class ReIndexTaskArgs(collective: Option[Ident])
+
+object ReIndexTaskArgs {
+ val taskName = Ident.unsafe("full-text-reindex")
+
+ def tracker(args: ReIndexTaskArgs): Ident =
+ args.collective match {
+ case Some(cid) =>
+ cid / DocspellSystem.migrationTaskTracker
+ case None =>
+ DocspellSystem.migrationTaskTracker
+ }
+
+ implicit val jsonEncoder: Encoder[ReIndexTaskArgs] =
+ deriveEncoder[ReIndexTaskArgs]
+
+ implicit val jsonDecoder: Decoder[ReIndexTaskArgs] =
+ deriveDecoder[ReIndexTaskArgs]
+}
diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala
new file mode 100644
index 00000000..703162cc
--- /dev/null
+++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala
@@ -0,0 +1,132 @@
+package docspell.ftsclient
+
+import fs2.Stream
+import cats.implicits._
+import cats.effect._
+import org.log4s.getLogger
+import docspell.common._
+
+/** The fts client is docspell's interface to a full-text search
+  * engine.
+  *
+  * It defines all operations docspell requires for integration and
+  * uses docspell's own data structures. Implementation modules
+  * translate these to the engine that provides the features.
+  */
+trait FtsClient[F[_]] {
+
+  /** Initialization tasks. This is called exactly once and then never
+    * again (except when re-indexing everything). It may be used to
+    * set up the database.
+    */
+ def initialize: F[Unit]
+
+ /** Run a full-text search. */
+ def search(q: FtsQuery): F[FtsResult]
+
+  /** Runs the query repeatedly, paging through all results until a
+    * page returns fewer results than the requested limit.
+    */
+ def searchAll(q: FtsQuery): Stream[F, FtsResult] =
+ Stream.eval(search(q)).flatMap { result =>
+ if (result.results.size < q.limit) Stream.emit(result)
+ else Stream.emit(result) ++ searchAll(q.nextPage)
+ }
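+
+  // Usage sketch: `searchAll(fq).flatMap(r => Stream.emits(r.results))`
+  // streams every match across all result pages, for some `fq: FtsQuery`.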
+
+  /** Push all data to the index. Data with the same `id' is replaced.
+ * Values that are `None' are removed from the index (or set to an
+ * empty string).
+ */
+ def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
+
+ def indexData(logger: Logger[F], data: TextData*): F[Unit] =
+ indexData(logger, Stream.emits(data))
+
+ /** Push all data to the index, but only update existing entries. No
+ * new entries are created and values that are given as `None' are
+ * skipped.
+ */
+ def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
+
+ def updateIndex(logger: Logger[F], data: TextData*): F[Unit] =
+ updateIndex(logger, Stream.emits(data))
+
+ def updateItemName(
+ logger: Logger[F],
+ itemId: Ident,
+ collective: Ident,
+ name: String
+ ): F[Unit] =
+ updateIndex(logger, TextData.item(itemId, collective, Some(name), None))
+
+ def updateItemNotes(
+ logger: Logger[F],
+ itemId: Ident,
+ collective: Ident,
+ notes: Option[String]
+ ): F[Unit] =
+ updateIndex(
+ logger,
+ TextData.item(itemId, collective, None, Some(notes.getOrElse("")))
+ )
+
+ def updateAttachmentName(
+ logger: Logger[F],
+ itemId: Ident,
+ attachId: Ident,
+ collective: Ident,
+ name: Option[String]
+ ): F[Unit] =
+ updateIndex(
+ logger,
+ TextData.attachment(
+ itemId,
+ attachId,
+ collective,
+ Language.English,
+ Some(name.getOrElse("")),
+ None
+ )
+ )
+
+ def removeItem(logger: Logger[F], itemId: Ident): F[Unit]
+
+ def removeAttachment(logger: Logger[F], attachId: Ident): F[Unit]
+
+ /** Clears the index – removes everything. */
+ def clearAll(logger: Logger[F]): F[Unit]
+
+ /** Clears the index from all data belonging to the given collective. */
+ def clear(logger: Logger[F], collective: Ident): F[Unit]
+
+}
+
+object FtsClient {
+
+  def none[F[_]: Sync]: FtsClient[F] =
+ new FtsClient[F] {
+ private[this] val logger = Logger.log4s[F](getLogger)
+
+ def initialize: F[Unit] =
+ logger.info("Full-text search is disabled!")
+
+ def search(q: FtsQuery): F[FtsResult] =
+ logger.warn("Full-text search is disabled!") *> FtsResult.empty.pure[F]
+
+ def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+
+ def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+
+ def removeItem(logger: Logger[F], itemId: Ident): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+
+ def removeAttachment(logger: Logger[F], attachId: Ident): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+
+ def clearAll(logger: Logger[F]): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+
+ def clear(logger: Logger[F], collective: Ident): F[Unit] =
+ logger.warn("Full-text search is disabled!")
+ }
+}
diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala
new file mode 100644
index 00000000..785d2e20
--- /dev/null
+++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala
@@ -0,0 +1,34 @@
+package docspell.ftsclient
+
+import docspell.common._
+
+/** A fulltext query.
+ *
+ * The query itself is a raw string. Each implementation may
+ * interpret it according to the system in use.
+ *
+  * Searches must be restricted to the given collective and, if the
+  * item set is non-empty, to the given item ids. An empty item set
+  * means results are not restricted to specific items.
+ */
+final case class FtsQuery(
+ q: String,
+ collective: Ident,
+ items: Set[Ident],
+ limit: Int,
+ offset: Int,
+ highlight: FtsQuery.HighlightSetting
+) {
+
+ def nextPage: FtsQuery =
+ copy(offset = limit + offset)
+}
+
+object FtsQuery {
+
+ case class HighlightSetting(pre: String, post: String)
+
+ object HighlightSetting {
+ val default = HighlightSetting("**", "**")
+ }
+}
diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala
new file mode 100644
index 00000000..df1c43ac
--- /dev/null
+++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsResult.scala
@@ -0,0 +1,31 @@
+package docspell.ftsclient
+
+import docspell.common._
+
+import FtsResult.ItemMatch
+
+final case class FtsResult(
+ qtime: Duration,
+ count: Int,
+ maxScore: Double,
+ highlight: Map[Ident, List[String]],
+ results: List[ItemMatch]
+)
+
+object FtsResult {
+
+ val empty =
+ FtsResult(Duration.millis(0), 0, 0.0, Map.empty, Nil)
+
+ sealed trait MatchData
+ case class AttachmentData(attachId: Ident, attachName: String) extends MatchData
+ case object ItemData extends MatchData
+
+ case class ItemMatch(
+ id: Ident,
+ itemId: Ident,
+ collectiveId: Ident,
+ score: Double,
+ data: MatchData
+ )
+}
diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala
new file mode 100644
index 00000000..625411ad
--- /dev/null
+++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala
@@ -0,0 +1,63 @@
+package docspell.ftsclient
+
+import docspell.common._
+
+sealed trait TextData {
+
+ def id: Ident
+
+ def item: Ident
+
+ def collective: Ident
+
+ final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A =
+ this match {
+ case a: TextData.Attachment => f(a)
+ case a: TextData.Item => g(a)
+ }
+}
+
+object TextData {
+
+ final case class Attachment(
+ item: Ident,
+ attachId: Ident,
+ collective: Ident,
+ lang: Language,
+ name: Option[String],
+ text: Option[String]
+ ) extends TextData {
+
+ val id = item / attachId
+
+ }
+
+ def attachment(
+ item: Ident,
+ attachId: Ident,
+ collective: Ident,
+ lang: Language,
+ name: Option[String],
+ text: Option[String]
+ ): TextData =
+ Attachment(item, attachId, collective, lang, name, text)
+
+ final case class Item(
+ item: Ident,
+ collective: Ident,
+ name: Option[String],
+ notes: Option[String]
+ ) extends TextData {
+
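+    // prefix with "item" so item documents cannot collide with
+    // attachment ids, which are built as `item / attachId`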
+ val id = Ident.unsafe("item") / item
+
+ }
+
+ def item(
+ item: Ident,
+ collective: Ident,
+ name: Option[String],
+ notes: Option[String]
+ ): TextData =
+ Item(item, collective, name, notes)
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala
new file mode 100644
index 00000000..256844ee
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala
@@ -0,0 +1,38 @@
+package docspell.ftssolr
+
+import io.circe._
+import docspell.common._
+
+final class Field(val name: String) extends AnyVal {
+
+ override def toString(): String = s"Field($name)"
+}
+
+object Field {
+
+ def apply(name: String): Field =
+ new Field(name)
+
+ val id = Field("id")
+ val itemId = Field("itemId")
+ val collectiveId = Field("collectiveId")
+ val attachmentId = Field("attachmentId")
+ val discriminator = Field("discriminator")
+ val attachmentName = Field("attachmentName")
+ val content = Field("content")
+ val content_de = Field("content_de")
+ val content_en = Field("content_en")
+ val itemName = Field("itemName")
+ val itemNotes = Field("itemNotes")
+
+ def contentField(lang: Language): Field =
+ lang match {
+ case Language.German =>
+ Field.content_de
+ case Language.English =>
+ Field.content_en
+ }
+
+ implicit val jsonEncoder: Encoder[Field] =
+ Encoder.encodeString.contramap(_.name)
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala
new file mode 100644
index 00000000..96ee969d
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala
@@ -0,0 +1,136 @@
+package docspell.ftssolr
+
+import io.circe._
+import io.circe.syntax._
+import docspell.common._
+import docspell.ftsclient._
+
+trait JsonCodec {
+
+ implicit def attachmentEncoder(implicit
+ enc: Encoder[Ident]
+ ): Encoder[TextData.Attachment] =
+ new Encoder[TextData.Attachment] {
+ final def apply(td: TextData.Attachment): Json = {
+ val cnt =
+ (Field.contentField(td.lang).name, Json.fromString(td.text.getOrElse("")))
+
+ Json.fromFields(
+ cnt :: List(
+ (Field.id.name, enc(td.id)),
+ (Field.itemId.name, enc(td.item)),
+ (Field.collectiveId.name, enc(td.collective)),
+ (Field.attachmentId.name, enc(td.attachId)),
+ (Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))),
+ (Field.discriminator.name, Json.fromString("attachment"))
+ )
+ )
+
+ }
+ }
+
+ implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] =
+ new Encoder[TextData.Item] {
+ final def apply(td: TextData.Item): Json =
+ Json.obj(
+ (Field.id.name, enc(td.id)),
+ (Field.itemId.name, enc(td.item)),
+ (Field.collectiveId.name, enc(td.collective)),
+ (Field.itemName.name, Json.fromString(td.name.getOrElse(""))),
+ (Field.itemNotes.name, Json.fromString(td.notes.getOrElse(""))),
+ (Field.discriminator.name, Json.fromString("item"))
+ )
+ }
+
+ implicit def textDataEncoder(implicit
+ ae: Encoder[TextData.Attachment],
+ ie: Encoder[TextData.Item]
+ ): Encoder[TextData] =
+ Encoder(_.fold(ae.apply, ie.apply))
+
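+  // Decodes Solr's standard select response; the parts used here look
+  // roughly like:
+  //   { "responseHeader": { "QTime": 3 },
+  //     "response": { "numFound": 2, "maxScore": 1.4, "docs": [...] },
+  //     "highlighting": { "<docId>": { "<field>": ["snippet"] } } }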
+ implicit def ftsResultDecoder: Decoder[FtsResult] =
+ new Decoder[FtsResult] {
+ final def apply(c: HCursor): Decoder.Result[FtsResult] =
+ for {
+ qtime <- c.downField("responseHeader").get[Duration]("QTime")
+ count <- c.downField("response").get[Int]("numFound")
+ maxScore <- c.downField("response").get[Double]("maxScore")
+ results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs")
+          highlighting <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting")
+          highlight = highlighting.map(kv => kv._1 -> kv._2.values.flatten.toList)
+ } yield FtsResult(qtime, count, maxScore, highlight, results)
+ }
+
+ implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] =
+ new Decoder[FtsResult.ItemMatch] {
+ final def apply(c: HCursor): Decoder.Result[FtsResult.ItemMatch] =
+ for {
+ itemId <- c.get[Ident](Field.itemId.name)
+ id <- c.get[Ident](Field.id.name)
+ coll <- c.get[Ident](Field.collectiveId.name)
+ score <- c.get[Double]("score")
+ md <- decodeMatchData(c)
+ } yield FtsResult.ItemMatch(id, itemId, coll, score, md)
+ }
+
+ def decodeMatchData: Decoder[FtsResult.MatchData] =
+ new Decoder[FtsResult.MatchData] {
+ final def apply(c: HCursor): Decoder.Result[FtsResult.MatchData] =
+ for {
+ disc <- c.get[String]("discriminator")
+ md <-
+ if ("attachment" == disc)
+ for {
+ aId <- c.get[Ident](Field.attachmentId.name)
+ aName <- c.get[String](Field.attachmentName.name)
+ } yield FtsResult.AttachmentData(aId, aName)
+ else Right(FtsResult.ItemData)
+ } yield md
+ }
+
+ implicit def identKeyEncoder: KeyEncoder[Ident] =
+ new KeyEncoder[Ident] {
+ override def apply(ident: Ident): String = ident.id
+ }
+ implicit def identKeyDecoder: KeyDecoder[Ident] =
+ new KeyDecoder[Ident] {
+ override def apply(ident: String): Option[Ident] = Ident(ident).toOption
+ }
+
+ def setAttachmentEncoder(implicit
+ enc: Encoder[Ident]
+ ): Encoder[TextData.Attachment] =
+ new Encoder[TextData.Attachment] {
+ final def apply(td: TextData.Attachment): Json = {
+ val setter = List(
+ td.name.map(n => (Field.attachmentName.name, Map("set" -> n.asJson).asJson)),
+ td.text.map(txt =>
+ (Field.contentField(td.lang).name, Map("set" -> txt.asJson).asJson)
+ )
+ ).flatten
+ Json.fromFields(
+ (Field.id.name, enc(td.id)) :: setter
+ )
+ }
+ }
+
+ def setItemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] =
+ new Encoder[TextData.Item] {
+ final def apply(td: TextData.Item): Json = {
+ val setter = List(
+ td.name.map(n => (Field.itemName.name, Map("set" -> n.asJson).asJson)),
+ td.notes.map(n => (Field.itemNotes.name, Map("set" -> n.asJson).asJson))
+ ).flatten
+
+ Json.fromFields(
+ (Field.id.name, enc(td.id)) :: setter
+ )
+ }
+ }
+
+  implicit def setFieldsEncoder: Encoder[SetFields] =
+ Encoder(_.td.fold(setAttachmentEncoder.apply, setItemEncoder.apply))
+
+}
+
+object JsonCodec extends JsonCodec
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala
new file mode 100644
index 00000000..063f3048
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/QueryData.scala
@@ -0,0 +1,68 @@
+package docspell.ftssolr
+
+import io.circe._
+import io.circe.generic.semiauto._
+import docspell.ftsclient.FtsQuery
+
+final case class QueryData(
+ query: String,
+ filter: String,
+ limit: Int,
+ offset: Int,
+ fields: List[Field],
+ params: Map[String, String]
+) {
+
+ def nextPage: QueryData =
+ copy(offset = offset + limit)
+
+ def withHighLight(fields: List[Field], pre: String, post: String): QueryData =
+ copy(params =
+ params ++ Map(
+ "hl" -> "on",
+ "hl.requireFieldMatch" -> "true",
+ "hl.fl" -> fields.map(_.name).mkString(","),
+ "hl.simple.pre" -> pre,
+ "hl.simple.post" -> post
+ )
+ )
+}
+
+object QueryData {
+
+ implicit val jsonEncoder: Encoder[QueryData] =
+ deriveEncoder[QueryData]
+
+ def apply(
+ cfg: SolrConfig,
+ search: List[Field],
+ fields: List[Field],
+ fq: FtsQuery
+ ): QueryData = {
+ val q = sanitize(fq.q)
+ val extQ = search.map(f => s"${f.name}:($q)").mkString(" OR ")
+ val items = fq.items.map(_.id).mkString(" ")
+ val collQ = s"""${Field.collectiveId.name}:"${fq.collective.id}""""
+ val filterQ = fq.items match {
+ case s if s.isEmpty =>
+ collQ
+ case _ =>
+ (collQ :: List(s"""${Field.itemId.name}:($items)""")).mkString(" AND ")
+ }
+ QueryData(
+ extQ,
+ filterQ,
+ fq.limit,
+ fq.offset,
+ fields,
+ Map("defType" -> cfg.defType, "q.op" -> cfg.qOp)
+ ).withHighLight(
+ search,
+ fq.highlight.pre,
+ fq.highlight.post
+ )
+ }
+
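+  // Runs of parentheses and commas would break the hand-built query
+  // string, so they collapse to a single space, e.g.
+  //   sanitize("name:(test)") == "name: test "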
+ private def sanitize(q: String): String =
+ q.replaceAll("[\\(,\\)]+", " ")
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SetFields.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SetFields.scala
new file mode 100644
index 00000000..af9b370e
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SetFields.scala
@@ -0,0 +1,5 @@
+package docspell.ftssolr
+
+import docspell.ftsclient._
+
+final case class SetFields(td: TextData)
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrConfig.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrConfig.scala
new file mode 100644
index 00000000..639ae424
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrConfig.scala
@@ -0,0 +1,13 @@
+package docspell.ftssolr
+
+import docspell.common._
+
+final case class SolrConfig(
+ url: LenientUri,
+ commitWithin: Int,
+ logVerbose: Boolean,
+ defType: String,
+ qOp: String
+)
+
+object SolrConfig {}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala
new file mode 100644
index 00000000..4bdf3fda
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala
@@ -0,0 +1,89 @@
+package docspell.ftssolr
+
+import fs2.Stream
+import cats.effect._
+import cats.implicits._
+import org.http4s.client.Client
+import org.http4s.client.middleware.Logger
+import org.log4s.getLogger
+
+import docspell.common._
+import docspell.ftsclient._
+
+final class SolrFtsClient[F[_]: Effect](
+ solrUpdate: SolrUpdate[F],
+ solrSetup: SolrSetup[F],
+ solrQuery: SolrQuery[F]
+) extends FtsClient[F] {
+
+ def initialize: F[Unit] =
+ solrSetup.setupSchema
+
+ def search(q: FtsQuery): F[FtsResult] =
+ solrQuery.query(q)
+
+ def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
+ modifyIndex(logger, data)(solrUpdate.add)
+
+ def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
+ modifyIndex(logger, data)(solrUpdate.update)
+
+ def modifyIndex(logger: Logger[F], data: Stream[F, TextData])(
+ f: List[TextData] => F[Unit]
+ ): F[Unit] =
+ (for {
+ _ <- Stream.eval(logger.debug("Updating SOLR index"))
+ chunks <- data.chunks
+ res <- Stream.eval(f(chunks.toList).attempt)
+ _ <- res match {
+ case Right(()) => Stream.emit(())
+ case Left(ex) =>
+ Stream.eval(logger.error(ex)("Error updating with chunk of data"))
+ }
+ } yield ()).compile.drain
+
+ def removeItem(logger: Logger[F], itemId: Ident): F[Unit] =
+ logger.debug(s"Remove item '${itemId.id}' from index") *>
+ solrUpdate.delete(s"${Field.itemId.name}:${itemId.id}", None)
+
+ def removeAttachment(logger: Logger[F], attachId: Ident): F[Unit] =
+ logger.debug(s"Remove attachment '${attachId.id}' from index") *>
+ solrUpdate.delete(s"${Field.attachmentId.name}:${attachId.id}", None)
+
+ def clearAll(logger: Logger[F]): F[Unit] =
+ logger.info("Deleting complete full-text index!") *>
+ solrUpdate.delete("*:*", Option(0))
+
+ def clear(logger: Logger[F], collective: Ident): F[Unit] =
+ logger.info(s"Deleting full-text index for collective ${collective.id}") *>
+ solrUpdate.delete(s"${Field.collectiveId.name}:${collective.id}", Option(0))
+}
+
+object SolrFtsClient {
+ private[this] val logger = getLogger
+
+ def apply[F[_]: ConcurrentEffect](
+ cfg: SolrConfig,
+ httpClient: Client[F]
+ ): Resource[F, FtsClient[F]] = {
+ val client = loggingMiddleware(cfg, httpClient)
+ Resource.pure[F, FtsClient[F]](
+ new SolrFtsClient(
+ SolrUpdate(cfg, client),
+ SolrSetup(cfg, client),
+ SolrQuery(cfg, client)
+ )
+ )
+ }
+
+ private def loggingMiddleware[F[_]: Concurrent](
+ cfg: SolrConfig,
+ client: Client[F]
+ ): Client[F] =
+ Logger(
+ logHeaders = true,
+ logBody = cfg.logVerbose,
+ logAction = Some((msg: String) => Sync[F].delay(logger.trace(msg)))
+ )(client)
+
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
new file mode 100644
index 00000000..c557344d
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala
@@ -0,0 +1,60 @@
+package docspell.ftssolr
+
+import cats.effect._
+import org.http4s._
+import org.http4s.client.Client
+import org.http4s.circe._
+import org.http4s.circe.CirceEntityDecoder._
+import org.http4s.client.dsl.Http4sClientDsl
+import _root_.io.circe.syntax._
+
+import docspell.ftsclient._
+import JsonCodec._
+
+trait SolrQuery[F[_]] {
+
+ def query(q: QueryData): F[FtsResult]
+
+ def query(q: FtsQuery): F[FtsResult]
+}
+
+object SolrQuery {
+ def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrQuery[F] = {
+ val dsl = new Http4sClientDsl[F] {}
+ import dsl._
+
+ new SolrQuery[F] {
+ val url = Uri.unsafeFromString(cfg.url.asString) / "query"
+
+ def query(q: QueryData): F[FtsResult] = {
+ val req = Method.POST(q.asJson, url)
+ client.expect[FtsResult](req)
+ }
+
+ def query(q: FtsQuery): F[FtsResult] = {
+ val fq = QueryData(
+ cfg,
+ List(
+ Field.content,
+ Field.content_de,
+ Field.content_en,
+ Field.itemName,
+ Field.itemNotes,
+ Field.attachmentName
+ ),
+ List(
+ Field.id,
+ Field.itemId,
+ Field.collectiveId,
+ Field("score"),
+ Field.attachmentId,
+ Field.attachmentName,
+ Field.discriminator
+ ),
+ q
+ )
+ query(fq)
+ }
+ }
+ }
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
new file mode 100644
index 00000000..c25431a8
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala
@@ -0,0 +1,121 @@
+package docspell.ftssolr
+
+import cats.effect._
+import org.http4s._
+import cats.implicits._
+import org.http4s.client.Client
+import org.http4s.circe._
+import org.http4s.client.dsl.Http4sClientDsl
+import _root_.io.circe.syntax._
+import _root_.io.circe._
+import _root_.io.circe.generic.semiauto._
+import docspell.common._
+
+trait SolrSetup[F[_]] {
+
+ def setupSchema: F[Unit]
+
+}
+
+object SolrSetup {
+
+ def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrSetup[F] = {
+ val dsl = new Http4sClientDsl[F] {}
+ import dsl._
+
+ new SolrSetup[F] {
+ val url = (Uri.unsafeFromString(cfg.url.asString) / "schema")
+ .withQueryParam("commitWithin", cfg.commitWithin.toString)
+
+ def setupSchema: F[Unit] = {
+ val cmds0 =
+ List(
+ Field.id,
+ Field.itemId,
+ Field.collectiveId,
+ Field.discriminator,
+ Field.attachmentId
+ )
+ .traverse(addStringField)
+ val cmds1 = List(
+ Field.attachmentName,
+ Field.content,
+ Field.itemName,
+ Field.itemNotes
+ )
+ .traverse(addTextField(None))
+
+ val cntLang = Language.all.traverse {
+ case l @ Language.German =>
+ addTextField(l.some)(Field.content_de)
+ case l @ Language.English =>
+ addTextField(l.some)(Field.content_en)
+ }
+
+ cmds0 *> cmds1 *> cntLang *> ().pure[F]
+ }
+
+ private def run(cmd: Json): F[Unit] = {
+ val req = Method.POST(cmd, url)
+ client.expect[Unit](req)
+ }
+
+ private def addStringField(field: Field): F[Unit] =
+ run(DeleteField.command(DeleteField(field))).attempt *>
+ run(AddField.command(AddField.string(field)))
+
+ private def addTextField(lang: Option[Language])(field: Field): F[Unit] =
+ lang match {
+ case None =>
+ run(DeleteField.command(DeleteField(field))).attempt *>
+ run(AddField.command(AddField.text(field)))
+ case Some(Language.German) =>
+ run(DeleteField.command(DeleteField(field))).attempt *>
+ run(AddField.command(AddField.textDE(field)))
+ case Some(Language.English) =>
+ run(DeleteField.command(DeleteField(field))).attempt *>
+ run(AddField.command(AddField.textEN(field)))
+ }
+ }
+ }
+
+  // Schema commands: the case classes below conveniently build the
+  // Solr schema-API JSON. All fields must be stored, because
+  // highlighting and atomic (single-field) updates only work on
+  // stored fields.
+
+ case class AddField(
+ name: Field,
+ `type`: String,
+ stored: Boolean,
+ indexed: Boolean,
+ multiValued: Boolean
+ )
+ object AddField {
+ implicit val encoder: Encoder[AddField] =
+ deriveEncoder[AddField]
+
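+    // Wraps a field definition in the schema-API envelope, e.g.
+    // {"add-field": {"name": "itemName", "type": "text_general", ...}}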
+ def command(body: AddField): Json =
+ Map("add-field" -> body.asJson).asJson
+
+ def string(field: Field): AddField =
+ AddField(field, "string", true, true, false)
+
+ def text(field: Field): AddField =
+ AddField(field, "text_general", true, true, false)
+
+ def textDE(field: Field): AddField =
+ AddField(field, "text_de", true, true, false)
+
+ def textEN(field: Field): AddField =
+ AddField(field, "text_en", true, true, false)
+ }
+
+ case class DeleteField(name: Field)
+ object DeleteField {
+ implicit val encoder: Encoder[DeleteField] =
+ deriveEncoder[DeleteField]
+
+ def command(body: DeleteField): Json =
+ Map("delete-field" -> body.asJson).asJson
+ }
+}
diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala
new file mode 100644
index 00000000..fcfe1151
--- /dev/null
+++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala
@@ -0,0 +1,78 @@
+package docspell.ftssolr
+
+import cats.effect._
+import org.http4s._
+import org.http4s.client.Client
+import org.http4s.circe._
+import org.http4s.client.dsl.Http4sClientDsl
+import _root_.io.circe._
+import _root_.io.circe.syntax._
+
+import docspell.ftsclient._
+import JsonCodec._
+
+trait SolrUpdate[F[_]] {
+
+ def add(tds: List[TextData]): F[Unit]
+
+ def update(tds: List[TextData]): F[Unit]
+
+ def delete(q: String, commitWithin: Option[Int]): F[Unit]
+}
+
+object SolrUpdate {
+
+ def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrUpdate[F] = {
+ val dsl = new Http4sClientDsl[F] {}
+ import dsl._
+
+ new SolrUpdate[F] {
+ val url = (Uri.unsafeFromString(cfg.url.asString) / "update")
+ .withQueryParam("commitWithin", cfg.commitWithin.toString)
+ .withQueryParam("overwrite", "true")
+ .withQueryParam("wt", "json")
+
+ def add(tds: List[TextData]): F[Unit] = {
+ val req = Method.POST(tds.asJson, url)
+ client.expect[Unit](req)
+ }
+
+ def update(tds: List[TextData]): F[Unit] = {
+ val req = Method.POST(tds.filter(minOneChange).map(SetFields).asJson, url)
+ client.expect[Unit](req)
+ }
+
+ def delete(q: String, commitWithin: Option[Int]): F[Unit] = {
+ val uri = commitWithin match {
+ case Some(n) =>
+ if (n <= 0)
+ url.removeQueryParam("commitWithin").withQueryParam("commit", "true")
+ else url.withQueryParam("commitWithin", n.toString)
+ case None =>
+ url
+ }
+ val req = Method.POST(Delete(q).asJson, uri)
+ client.expect[Unit](req)
+ }
+
+ private val minOneChange: TextData => Boolean =
+ _ match {
+ case td: TextData.Attachment =>
+ td.name.isDefined || td.text.isDefined
+ case td: TextData.Item =>
+ td.name.isDefined || td.notes.isDefined
+ }
+ }
+ }
+
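+  // Encodes to Solr's delete-by-query body,
+  // e.g. {"delete": {"query": "*:*"}}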
+ case class Delete(query: String)
+ object Delete {
+ implicit val jsonEncoder: Encoder[Delete] =
+ new Encoder[Delete] {
+ def apply(d: Delete): Json =
+ Json.obj(
+ ("delete", Json.obj("query" -> d.query.asJson))
+ )
+ }
+ }
+}
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 1958070b..16bc791a 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -364,4 +364,38 @@ docspell.joex {
# By default all files are allowed.
valid-mime-types = [ ]
}
+
+ # Configuration of the full-text search engine.
+ full-text-search {
+  # The full-text search feature can be disabled. It requires an
+  # additional index server, which needs additional memory and disk
+  # space. It can be enabled at any later time.
+  #
+  # Currently the Solr search platform is supported.
+ enabled = false
+
+  # Configuration for the Solr backend.
+ solr = {
+ # The URL to solr
+ url = "http://localhost:8983/solr/docspell"
+      # Used to tell solr when to commit the data (commitWithin, in milliseconds)
+ commit-within = 1000
+ # If true, logs request and response bodies
+ log-verbose = false
+      # The defType parameter that selects the query parser to use.
+      # You might want to try "edismax" or look here:
+      # https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
+ def-type = "lucene"
+ # The default combiner for tokens. One of {AND, OR}.
+ q-op = "OR"
+ }
+
+ # Settings for running the index migration tasks
+ migration = {
+ # Chunk size to use when indexing data from the database. This
+ # many attachments are loaded into memory and pushed to the
+ # full-text index.
+ index-all-chunk = 10
+ }
+ }
}
\ No newline at end of file
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index 27817d69..c9c54528 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -8,6 +8,7 @@ import docspell.convert.ConvertConfig
import docspell.extract.ExtractConfig
import docspell.joex.hk.HouseKeepingConfig
import docspell.backend.Config.Files
+import docspell.ftssolr.SolrConfig
case class Config(
appId: Ident,
@@ -23,7 +24,8 @@ case class Config(
convert: ConvertConfig,
sendMail: MailSendConfig,
files: Files,
- mailDebug: Boolean
+ mailDebug: Boolean,
+ fullTextSearch: Config.FullTextSearch
)
object Config {
@@ -34,4 +36,15 @@ object Config {
math.min(mailChunkSize, maxMails)
}
case class UserTasks(scanMailbox: ScanMailbox)
+
+ case class FullTextSearch(
+ enabled: Boolean,
+ migration: FullTextSearch.Migration,
+ solr: SolrConfig
+ )
+
+ object FullTextSearch {
+
+ final case class Migration(indexAllChunk: Int)
+ }
}
diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
index f2d3cd91..fe3fc586 100644
--- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
+++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
@@ -3,10 +3,15 @@ package docspell.joex
import cats.implicits._
import cats.effect._
import emil.javamail._
+import fs2.concurrent.SignallingRef
+import scala.concurrent.ExecutionContext
+import org.http4s.client.Client
+import org.http4s.client.blaze.BlazeClientBuilder
import docspell.common._
import docspell.backend.ops._
import docspell.joex.hk._
import docspell.joex.notify._
+import docspell.joex.fts.{MigrationTask, ReIndexTask}
import docspell.joex.scanmailbox._
import docspell.joex.process.ItemHandler
import docspell.joex.scheduler._
@@ -14,13 +19,14 @@ import docspell.joexapi.client.JoexClient
import docspell.store.Store
import docspell.store.queue._
import docspell.store.records.RJobLog
-import fs2.concurrent.SignallingRef
-import scala.concurrent.ExecutionContext
+import docspell.ftsclient.FtsClient
+import docspell.ftssolr.SolrFtsClient
final class JoexAppImpl[F[_]: ConcurrentEffect: ContextShift: Timer](
cfg: Config,
nodeOps: ONode[F],
store: Store[F],
+ queue: JobQueue[F],
pstore: PeriodicTaskStore[F],
termSignal: SignallingRef[F, Boolean],
val scheduler: Scheduler[F],
@@ -50,7 +56,10 @@ final class JoexAppImpl[F[_]: ConcurrentEffect: ContextShift: Timer](
periodicScheduler.shutdown *> scheduler.shutdown(false) *> termSignal.set(true)
private def scheduleBackgroundTasks: F[Unit] =
- HouseKeepingTask.periodicTask[F](cfg.houseKeeping.schedule).flatMap(pstore.insert)
+ HouseKeepingTask
+ .periodicTask[F](cfg.houseKeeping.schedule)
+ .flatMap(pstore.insert) *>
+ MigrationTask.job.flatMap(queue.insertIfNew)
}
object JoexAppImpl {
@@ -63,13 +72,15 @@ object JoexAppImpl {
blocker: Blocker
): Resource[F, JoexApp[F]] =
for {
- client <- JoexClient.resource(clientEC)
+ httpClient <- BlazeClientBuilder[F](clientEC).resource
+ client = JoexClient(httpClient)
store <- Store.create(cfg.jdbc, connectEC, blocker)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
+ fts <- createFtsClient(cfg)(httpClient)
javaEmil =
JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -77,7 +88,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
ProcessItemArgs.taskName,
- ItemHandler.newItem[F](cfg),
+ ItemHandler.newItem[F](cfg, fts),
ItemHandler.onCancel[F]
)
)
@@ -95,6 +106,20 @@ object JoexAppImpl {
ScanMailboxTask.onCancel[F]
)
)
+ .withTask(
+ JobTask.json(
+ MigrationTask.taskName,
+ MigrationTask[F](cfg.fullTextSearch, fts),
+ MigrationTask.onCancel[F]
+ )
+ )
+ .withTask(
+ JobTask.json(
+ ReIndexTask.taskName,
+ ReIndexTask[F](cfg.fullTextSearch, fts),
+ ReIndexTask.onCancel[F]
+ )
+ )
.withTask(
JobTask.json(
HouseKeepingTask.taskName,
@@ -111,7 +136,13 @@ object JoexAppImpl {
client,
Timer[F]
)
- app = new JoexAppImpl(cfg, nodeOps, store, pstore, termSignal, sch, psch)
+ app = new JoexAppImpl(cfg, nodeOps, store, queue, pstore, termSignal, sch, psch)
appR <- Resource.make(app.init.map(_ => app))(_.shutdown)
} yield appR
+
+ private def createFtsClient[F[_]: ConcurrentEffect: ContextShift](
+ cfg: Config
+ )(client: Client[F]): Resource[F, FtsClient[F]] =
+ if (cfg.fullTextSearch.enabled) SolrFtsClient(cfg.fullTextSearch.solr, client)
+ else Resource.pure[F, FtsClient[F]](FtsClient.none[F])
}
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/FtsContext.scala b/modules/joex/src/main/scala/docspell/joex/fts/FtsContext.scala
new file mode 100644
index 00000000..ac1267e6
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/FtsContext.scala
@@ -0,0 +1,24 @@
+package docspell.joex.fts
+
+import docspell.common.Logger
+import docspell.joex.Config
+import docspell.joex.scheduler.Context
+import docspell.store.Store
+import docspell.ftsclient.FtsClient
+
+case class FtsContext[F[_]](
+ cfg: Config.FullTextSearch,
+ store: Store[F],
+ fts: FtsClient[F],
+ logger: Logger[F]
+)
+
+object FtsContext {
+
+ def apply[F[_]](
+ cfg: Config.FullTextSearch,
+ fts: FtsClient[F],
+ ctx: Context[F, _]
+ ): FtsContext[F] =
+ FtsContext(cfg, ctx.store, fts, ctx.logger)
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala b/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala
new file mode 100644
index 00000000..5e861b8d
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/FtsWork.scala
@@ -0,0 +1,93 @@
+package docspell.joex.fts
+
+import cats.effect._
+import cats.data.{Kleisli, NonEmptyList}
+import cats.{ApplicativeError, FlatMap, Semigroup}
+import cats.implicits._
+import docspell.common._
+import docspell.ftsclient._
+import docspell.joex.scheduler.Context
+import docspell.joex.Config
+import docspell.store.queries.{QAttachment, QItem}
+
+object FtsWork {
+ def apply[F[_]](f: FtsContext[F] => F[Unit]): FtsWork[F] =
+ Kleisli(f)
+
+ def all[F[_]: FlatMap](
+ m0: FtsWork[F],
+ mn: FtsWork[F]*
+ ): FtsWork[F] =
+ NonEmptyList.of(m0, mn: _*).reduce(semigroup[F])
+
+ implicit def semigroup[F[_]: FlatMap]: Semigroup[FtsWork[F]] =
+ Semigroup.instance((mt1, mt2) => mt1.flatMap(_ => mt2))
+
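+  // FtsWork values compose sequentially; a re-index pipeline can be
+  // sketched as (using the `syntax` ops defined below):
+  //   initialize[F] ++ clearIndex[F](None) ++ insertAll[F](None)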
+ // some tasks
+
+ def log[F[_]](f: Logger[F] => F[Unit]): FtsWork[F] =
+ FtsWork(ctx => f(ctx.logger))
+
+ def initialize[F[_]]: FtsWork[F] =
+ FtsWork(_.fts.initialize)
+
+ def clearIndex[F[_]](coll: Option[Ident]): FtsWork[F] =
+ coll match {
+ case Some(cid) =>
+ FtsWork(ctx => ctx.fts.clear(ctx.logger, cid))
+ case None =>
+ FtsWork(ctx => ctx.fts.clearAll(ctx.logger))
+ }
+
+ def insertAll[F[_]: Effect](coll: Option[Ident]): FtsWork[F] =
+ FtsWork
+ .all(
+ FtsWork(ctx =>
+ ctx.fts.indexData(
+ ctx.logger,
+ ctx.store
+ .transact(
+ QAttachment
+ .allAttachmentMetaAndName(coll, ctx.cfg.migration.indexAllChunk)
+ )
+ .map(caa =>
+ TextData
+ .attachment(
+ caa.item,
+ caa.id,
+ caa.collective,
+ caa.lang,
+ caa.name,
+ caa.content
+ )
+ )
+ )
+ ),
+ FtsWork(ctx =>
+ ctx.fts.indexData(
+ ctx.logger,
+ ctx.store
+ .transact(QItem.allNameAndNotes(coll, ctx.cfg.migration.indexAllChunk * 5))
+ .map(nn => TextData.item(nn.id, nn.collective, Option(nn.name), nn.notes))
+ )
+ )
+ )
+
+ object syntax {
+ implicit final class FtsWorkOps[F[_]](mt: FtsWork[F]) {
+ def ++(mn: FtsWork[F])(implicit ev: FlatMap[F]): FtsWork[F] =
+ all(mt, mn)
+
+ def recoverWith(
+ other: FtsWork[F]
+ )(implicit ev: ApplicativeError[F, Throwable]): FtsWork[F] =
+ Kleisli(ctx => mt.run(ctx).onError({ case _ => other.run(ctx) }))
+
+ def forContext(
+ cfg: Config.FullTextSearch,
+ fts: FtsClient[F]
+ ): Kleisli[F, Context[F, _], Unit] =
+ mt.local(ctx => FtsContext(cfg, fts, ctx))
+ }
+ }
+}
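
To make the combinators concrete: an `FtsWork` is a small script over
an `FtsContext` that composes via the semigroup and the syntax ops
above. A minimal sketch, assuming cats-effect `IO`; the composed value
is illustrative, not part of this patch:

``` scala
import cats.effect.IO
import docspell.joex.fts._
import docspell.joex.fts.FtsWork.syntax._

object FtsWorkExample {
  // `++` sequences two works; `recoverWith` logs and continues
  // instead of failing the whole chain
  val work: FtsWork[IO] =
    FtsWork.log[IO](_.info("Initializing index")) ++
      FtsWork.initialize[IO].recoverWith(
        FtsWork.log[IO](_.warn("Initialize failed, continuing"))
      )
}
```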
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala
new file mode 100644
index 00000000..cfc63940
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala
@@ -0,0 +1,66 @@
+package docspell.joex.fts
+
+import cats.effect._
+import cats.implicits._
+import cats.data.{Kleisli, OptionT}
+import cats.Traverse
+import docspell.common._
+import docspell.joex.Config
+import docspell.store.{AddResult, Store}
+import docspell.store.records.RFtsMigration
+import docspell.ftsclient._
+
+case class Migration[F[_]](
+ version: Int,
+ engine: Ident,
+ description: String,
+ task: FtsWork[F]
+)
+
+object Migration {
+
+ def apply[F[_]: Effect](
+ cfg: Config.FullTextSearch,
+ fts: FtsClient[F],
+ store: Store[F],
+ logger: Logger[F]
+ ): Kleisli[F, List[Migration[F]], Unit] = {
+ val ctx = FtsContext(cfg, store, fts, logger)
+ Kleisli(migs => Traverse[List].sequence(migs.map(applySingle[F](ctx))).map(_ => ()))
+ }
+
+ def applySingle[F[_]: Effect](ctx: FtsContext[F])(m: Migration[F]): F[Unit] = {
+ val insertRecord: F[Option[RFtsMigration]] =
+ for {
+ rec <- RFtsMigration.create(m.version, m.engine, m.description)
+ res <- ctx.store.add(
+ RFtsMigration.insert(rec),
+ RFtsMigration.exists(m.version, m.engine)
+ )
+ ret <- res match {
+ case AddResult.Success => rec.some.pure[F]
+ case AddResult.EntityExists(_) => None.pure[F]
+ case AddResult.Failure(ex) => Effect[F].raiseError(ex)
+ }
+ } yield ret
+
+ (for {
+ _ <- OptionT.liftF(ctx.logger.info(s"Apply ${m.version}/${m.description}"))
+ rec <- OptionT(insertRecord)
+ res <- OptionT.liftF(m.task.run(ctx).attempt)
+ ret <- OptionT.liftF(res match {
+ case Right(()) => ().pure[F]
+ case Left(ex) =>
+ ctx.logger.error(ex)(
+ s"Applying index migration ${m.version}/${m.description} failed"
+ ) *>
+ ctx.store.transact(RFtsMigration.deleteById(rec.id)) *> Effect[F]
+ .raiseError[Unit](
+ ex
+ )
+ })
+ } yield ret).getOrElseF(
+ ctx.logger.info(s"Migration ${m.version}/${m.description} already applied.")
+ )
+ }
+}
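
To illustrate how this is wired: the migrations are applied by running
the `Kleisli` over a list of `Migration` values. A minimal sketch with
placeholder arguments (`cfg`, `fts`, `store` and `logger` would come
from the joex environment):

``` scala
import cats.effect.IO
import docspell.common.{Ident, Logger}
import docspell.ftsclient.FtsClient
import docspell.joex.Config
import docspell.joex.fts.{FtsWork, Migration}
import docspell.store.Store

object MigrationExample {
  def runAll(
      cfg: Config.FullTextSearch,
      fts: FtsClient[IO],
      store: Store[IO],
      logger: Logger[IO]
  ): IO[Unit] = {
    val migs = List(
      Migration[IO](1, Ident.unsafe("solr"), "initialize", FtsWork.initialize[IO])
    )
    Migration[IO](cfg, fts, store, logger).run(migs)
  }
}
```

Each successfully applied migration is recorded via `RFtsMigration`,
so re-running the task skips versions that are already present; a
failed migration deletes its record again and re-raises the error.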
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/MigrationTask.scala b/modules/joex/src/main/scala/docspell/joex/fts/MigrationTask.scala
new file mode 100644
index 00000000..f7d1b980
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/MigrationTask.scala
@@ -0,0 +1,53 @@
+package docspell.joex.fts
+
+import cats.effect._
+import cats.implicits._
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler.Task
+import docspell.ftsclient._
+import docspell.store.records.RJob
+
+object MigrationTask {
+ val taskName = Ident.unsafe("full-text-index")
+
+ def apply[F[_]: ConcurrentEffect](
+ cfg: Config.FullTextSearch,
+ fts: FtsClient[F]
+ ): Task[F, Unit, Unit] =
+ Task
+ .log[F, Unit](_.info(s"Running full-text-index migrations now"))
+ .flatMap(_ =>
+ Task(ctx =>
+ Migration[F](cfg, fts, ctx.store, ctx.logger)
+ .run(migrationTasks[F])
+ )
+ )
+
+ def onCancel[F[_]: Sync]: Task[F, Unit, Unit] =
+ Task.log[F, Unit](_.warn("Cancelling full-text-index task"))
+
+ def job[F[_]: Sync]: F[RJob] =
+ for {
+ id <- Ident.randomId[F]
+ now <- Timestamp.current[F]
+ } yield RJob.newJob(
+ id,
+ taskName,
+ DocspellSystem.taskGroup,
+ (),
+ "Create full-text index",
+ now,
+ DocspellSystem.taskGroup,
+ Priority.Low,
+ Some(DocspellSystem.migrationTaskTracker)
+ )
+
+ private val solrEngine = Ident.unsafe("solr")
+ def migrationTasks[F[_]: Effect]: List[Migration[F]] =
+ List(
+ Migration[F](1, solrEngine, "initialize", FtsWork.initialize[F]),
+ Migration[F](2, solrEngine, "Index all from database", FtsWork.insertAll[F](None))
+ )
+
+}
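
The migration work is submitted as a regular background job using the
`RJob` constructed above. A hedged sketch of the submission — the
`JobQueue` type and its `insertIfNew` method are assumptions based on
how the tracker id is typically used:

``` scala
import cats.effect.Sync
import cats.implicits._
import docspell.joex.fts.MigrationTask
import docspell.store.queue.JobQueue

object SubmitExample {
  // the tracker id set on the job lets `insertIfNew` (assumed API)
  // avoid submitting a second migration job while one is pending
  def submit[F[_]: Sync](queue: JobQueue[F]): F[Unit] =
    for {
      job <- MigrationTask.job[F]
      _   <- queue.insertIfNew(job)
    } yield ()
}
```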
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala b/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala
new file mode 100644
index 00000000..edb1cc0d
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/ReIndexTask.scala
@@ -0,0 +1,54 @@
+package docspell.joex.fts
+
+import cats.effect._
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler.Task
+import docspell.ftsclient._
+import FtsWork.syntax._
+
+object ReIndexTask {
+ type Args = ReIndexTaskArgs
+
+ val taskName = ReIndexTaskArgs.taskName
+ val tracker = DocspellSystem.migrationTaskTracker
+
+ def apply[F[_]: ConcurrentEffect](
+ cfg: Config.FullTextSearch,
+ fts: FtsClient[F]
+ ): Task[F, Args, Unit] =
+ Task
+ .log[F, Args](_.info(s"Running full-text re-index now"))
+ .flatMap(_ =>
+ Task(ctx =>
+ (clearData[F](ctx.args.collective) ++
+ FtsWork.log[F](_.info("Inserting data from database")) ++
+ FtsWork.insertAll[F](
+ ctx.args.collective
+ )).forContext(cfg, fts).run(ctx)
+ )
+ )
+
+ def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+ Task.log[F, Args](_.warn("Cancelling full-text re-index task"))
+
+ private def clearData[F[_]: ConcurrentEffect](collective: Option[Ident]): FtsWork[F] =
+ FtsWork.log[F](_.info("Clearing index data")) ++
+ (collective match {
+ case Some(_) =>
+ FtsWork
+ .clearIndex(collective)
+ .recoverWith(
+ FtsWork.log[F](_.info("Clearing data failed. Continue re-indexing."))
+ )
+
+ case None =>
+ FtsWork
+ .clearIndex(None)
+ .recoverWith(
+ FtsWork.log[F](_.info("Clearing data failed. Continue re-indexing."))
+ ) ++
+ FtsWork.log[F](_.info("Running index initialize")) ++
+ FtsWork.initialize[F]
+ })
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/fts/package.scala b/modules/joex/src/main/scala/docspell/joex/fts/package.scala
new file mode 100644
index 00000000..784754ab
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/fts/package.scala
@@ -0,0 +1,9 @@
+package docspell.joex
+
+import cats.data.Kleisli
+
+package object fts {
+
+ type FtsWork[F[_]] = Kleisli[F, FtsContext[F], Unit]
+
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/hk/HouseKeepingTask.scala b/modules/joex/src/main/scala/docspell/joex/hk/HouseKeepingTask.scala
index 909f14e7..1ebcaa96 100644
--- a/modules/joex/src/main/scala/docspell/joex/hk/HouseKeepingTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/hk/HouseKeepingTask.scala
@@ -11,7 +11,6 @@ import docspell.store.records._
object HouseKeepingTask {
private val periodicId = Ident.unsafe("docspell-houskeeping")
- val systemGroup: Ident = Ident.unsafe("docspell-system")
val taskName: Ident = Ident.unsafe("housekeeping")
@@ -29,10 +28,10 @@ object HouseKeepingTask {
.createJson(
true,
taskName,
- systemGroup,
+ DocspellSystem.taskGroup,
(),
"Docspell house-keeping",
- systemGroup,
+ DocspellSystem.taskGroup,
Priority.Low,
ce
)
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
index dbc0f70a..5b7e9552 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala
@@ -8,6 +8,7 @@ import docspell.joex.Config
import docspell.joex.scheduler.Task
import docspell.store.queries.QItem
import docspell.store.records.RItem
+import docspell.ftsclient.FtsClient
object ItemHandler {
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
@@ -16,11 +17,12 @@ object ItemHandler {
)
def newItem[F[_]: ConcurrentEffect: ContextShift](
- cfg: Config
+ cfg: Config,
+ fts: FtsClient[F]
): Task[F, ProcessItemArgs, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
- .flatMap(safeProcess[F](cfg))
+ .flatMap(safeProcess[F](cfg, fts))
.map(_ => ())
def itemStateTask[F[_]: Sync, A](
@@ -36,11 +38,12 @@ object ItemHandler {
Task(_.isLastRetry)
def safeProcess[F[_]: ConcurrentEffect: ContextShift](
- cfg: Config
+ cfg: Config,
+ fts: FtsClient[F]
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
isLastRetry[F].flatMap {
case true =>
- ProcessItem[F](cfg)(data).attempt.flatMap({
+ ProcessItem[F](cfg, fts)(data).attempt.flatMap({
case Right(d) =>
Task.pure(d)
case Left(ex) =>
@@ -50,7 +53,7 @@ object ItemHandler {
.andThen(_ => Sync[F].raiseError(ex))
})
case false =>
- ProcessItem[F](cfg)(data).flatMap(itemStateTask(ItemState.Created))
+ ProcessItem[F](cfg, fts)(data).flatMap(itemStateTask(ItemState.Created))
}
def deleteByFileIds[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index b667d894..de5de412 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -5,17 +5,19 @@ import docspell.common.ProcessItemArgs
import docspell.analysis.TextAnalysisConfig
import docspell.joex.scheduler.Task
import docspell.joex.Config
+import docspell.ftsclient.FtsClient
object ProcessItem {
def apply[F[_]: ConcurrentEffect: ContextShift](
- cfg: Config
+ cfg: Config,
+ fts: FtsClient[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ExtractArchive(item)
.flatMap(Task.setProgress(20))
.flatMap(ConvertPdf(cfg.convert, _))
.flatMap(Task.setProgress(40))
- .flatMap(TextExtraction(cfg.extraction, _))
+ .flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(Task.setProgress(60))
.flatMap(analysisOnly[F](cfg.textAnalysis))
.flatMap(Task.setProgress(80))
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
index e1bb4de1..8bfa250b 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -3,17 +3,17 @@ package docspell.joex.process
import bitpeace.{Mimetype, RangeDef}
import cats.data.OptionT
import cats.implicits._
-import cats.effect.{ContextShift, Sync}
+import cats.effect._
import docspell.common._
import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
import docspell.store.syntax.MimeTypes._
+import docspell.ftsclient.{FtsClient, TextData}
object TextExtraction {
- def apply[F[_]: Sync: ContextShift](
- cfg: ExtractConfig,
+ def apply[F[_]: ConcurrentEffect: ContextShift](cfg: ExtractConfig, fts: FtsClient[F])(
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
@@ -21,28 +21,52 @@ object TextExtraction {
_ <- ctx.logger.info("Starting text extraction")
start <- Duration.stopTime[F]
txt <- item.attachments.traverse(
- extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)
+ extractTextIfEmpty(
+ ctx,
+ cfg,
+ ctx.args.meta.language,
+ ctx.args.meta.collective,
+ item
+ )
)
- _ <- ctx.logger.debug("Storing extracted texts")
- _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
+ _ <- ctx.logger.debug("Storing extracted texts")
+ _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
+ idxItem =
+ TextData.item(item.item.id, ctx.args.meta.collective, item.item.name.some, None)
+ _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
- } yield item.copy(metas = txt)
+ } yield item.copy(metas = txt.map(_._1))
}
def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, _],
cfg: ExtractConfig,
lang: Language,
+ collective: Ident,
item: ItemData
- )(ra: RAttachment): F[RAttachmentMeta] = {
+ )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
+ def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
+ (
+ rm,
+ TextData.attachment(
+ item.item.id,
+ ra.id,
+ collective,
+ lang,
+ ra.name,
+ rm.content
+ )
+ )
+
val rm = item.findOrCreate(ra.id)
rm.content match {
case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
- rm.pure[F]
+ makeTextData(rm).pure[F]
case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
+ .map(makeTextData)
}
}
diff --git a/modules/microsite/docs/demo.md b/modules/microsite/docs/demo.md
index 3b8c237b..234b60f3 100644
--- a/modules/microsite/docs/demo.md
+++ b/modules/microsite/docs/demo.md
@@ -6,6 +6,17 @@ permalink: demo
# {{ page.title }}
+## Finding Items
+
+
-
+## Basic Idea (First Version)
+
+
diff --git a/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md
new file mode 100644
index 00000000..1719c7fc
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0014_fulltext_search_engine.md
@@ -0,0 +1,49 @@
+---
+layout: docs
+title: Fulltext Search Engine
+---
+
+# Choose a Fulltext Search Engine
+
+It should be possible to search the contents of all documents.
+
+## Context and Problem Statement
+
+To allow searching the documents' contents efficiently, a separate
+index is necessary. The de-facto standard for fulltext search on the
+JVM is something backed by [Lucene](https://lucene.apache.org).
+Another option is to use an RDBMS that supports fulltext search.
+
+This adds another component to the mix, which increases the complexity
+of the setup and the software. Since docspell works great without this
+feature, it shouldn't have a huge impact on the application: if the
+fulltext search component is down or broken, docspell should still
+work (only the fulltext search is then unavailable).
+
+## Considered Options
+
+* [Apache SOLR](https://lucene.apache.org/solr)
+* [ElasticSearch](https://www.elastic.co/elasticsearch/)
+* [PostgreSQL](https://www.postgresql.org/docs/12/textsearch.html)
+* All of them or a subset
+
+## Decision Outcome
+
+If docspell is running on PostgreSQL, it would be nice to also use it
+for fulltext search to save the cost of running another component. But
+I don't want to lock the database to PostgreSQL *only* because of the
+fulltext search feature.
+
+ElasticSearch and Apache SOLR are quite similar in features. SOLR is
+part of the Lucene project and therefore lives in the Apache
+ecosystem. I would choose SOLR over ElasticSearch because I have used
+it before.
+
+The last option (supporting all of them) is interesting, since it
+would enable those who use PostgreSQL as the database for docspell to
+also use it for fulltext search.
+
+In a first step, identify what docspell needs from a fulltext search
+component and create this interface and an implementation for Apache
+SOLR. This enables all users to use the fulltext search feature. As a
+later step, an implementation based on PostgreSQL and/or ElasticSearch
+could be provided, too.
diff --git a/modules/microsite/docs/doc/configure.md b/modules/microsite/docs/doc/configure.md
index ffcdc553..1f170670 100644
--- a/modules/microsite/docs/doc/configure.md
+++ b/modules/microsite/docs/doc/configure.md
@@ -72,6 +72,68 @@ H2
url = "jdbc:h2:///path/to/a/file.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
```
+
+### Full-Text Search: SOLR
+
+[Apache SOLR](https://lucene.apache.org/solr) is used to provide the
+full-text search. Both docspell components must provide the same
+connection setup. This is defined in the `full-text-search.solr`
+subsection:
+
+```
+...
+ full-text-search {
+ enabled = true
+ ...
+ solr = {
+ url = "http://localhost:8983/solr/docspell"
+ }
+ }
+```
+
+The default configuration at the end of this page contains more
+information about each setting.
+
+The `solr.url` is the mandatory setting that you need to change to
+point to your SOLR instance. Then you need to set the `enabled` flag
+to `true`.
+
+When installing docspell manually, just install SOLR and create a core
+as described in the [SOLR
+documentation](https://lucene.apache.org/solr/guide/8_4/installing-solr.html).
+That will provide you with the connection url (the last part is the
+core name).
+
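+For example, with a local SOLR installation, a suitable core can be
+created with:
+
+``` shell
+$ bin/solr create -c docspell
+```
+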
+While the `full-text-search.solr` options are the same for joex and
+the restserver, there are some settings that differ. The restserver
+has this additional setting that may be of interest:
+
+```
+full-text-search {
+ recreate-key = "test123"
+}
+```
+
+This key is required if you want docspell to drop and re-create the
+entire index. This is possible via a REST call:
+
+``` shell
+$ curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123
+```
+
+Here, `test123` is the key defined with `recreate-key`. If it is
+empty (the default), this REST call is disabled. Otherwise, the POST
+request will submit a system task that is eventually executed by a
+joex instance.
+
+This endpoint drops and re-creates the entire index. This is sometimes
+necessary, for example if you upgrade SOLR or delete the core to
+provide a new one (see
+[here](https://lucene.apache.org/solr/guide/8_4/reindexing.html) for
+details). Note that a collective can also re-index their data using a
+similar endpoint; but that only clears and re-inserts their own data
+and doesn't do a full re-index.
+
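+The joex component in turn controls how the index is filled. For
+example, the chunk size used when indexing all data lives in its
+`full-text-search.migration` subsection (a sketch; see the default
+configuration at the end of this page for the exact keys and
+defaults):
+
+```
+full-text-search {
+  migration = {
+    index-all-chunk = 10
+  }
+}
+```
+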
### Bind
The host and port the http server binds to. This applies to both
diff --git a/modules/microsite/docs/doc/curate.md b/modules/microsite/docs/doc/curate.md
index 766fde1d..feb52c9e 100644
--- a/modules/microsite/docs/doc/curate.md
+++ b/modules/microsite/docs/doc/curate.md
@@ -1,6 +1,6 @@
---
layout: docs
-title: Find and Review
+title: Curate Metadata
permalink: doc/curate
---
diff --git a/modules/microsite/docs/doc/finding.md b/modules/microsite/docs/doc/finding.md
index 99414ffa..69e4985e 100644
--- a/modules/microsite/docs/doc/finding.md
+++ b/modules/microsite/docs/doc/finding.md
@@ -6,9 +6,9 @@ permalink: doc/finding
# {{ page.title }}
-Items can be searched by their annotated meta data. The landing page
-shows a list of current items. Items are displayed sorted by their
-date, newest first.
+Items can be searched by their annotated meta data and their contents
+using full text search. The landing page shows a list of current
+items. Items are displayed sorted by their date, newest first.
Docspell has two modes for searching: a simple search bar and a search
menu with many options. Both are active at the same time, but only one
@@ -19,32 +19,51 @@ is visible. You can switch between them without affecting the results.
-By default, the search bar is shown. It searches in the name
-properties of the following meta data:
+By default, the search bar is shown. It provides a refined view of the
+search menu. The dropdown contains different options to do a quick
+search.
-- the item name
-- the notes
-- correspondent organization and person
-- concerning person and equipment
+### *All Names* and *Contents*
-A wildcard `*` can be used at the start or end of a search term to do
-a substring match. A `*` means "everything". So a term `*company`
-matches all names ending in `company` and `*company*` matches all
-names containing the word `company`. The matching is case insensitive.
+These two options correspond to the fields of the same name in the
+search menu. If you switch between search menu and search bar (by
+clicking the icon on the left), you'll see that they are the same
+fields.
+Typing in the search bar also fills the corresponding field in the
+search menu (and vice versa).
-Docspell adds a `*` to the front and end of a term automatically,
-unless one of the following is true:
+- The *All Names* option searches the item name, item notes, names
+  of correspondent organization and person, and names of concerning
+  person and equipment. It uses a simple substring search.
+- The option *Contents* searches the contents of all attachments
+  (documents), attachment names, the item name and item notes. It uses
+  full text search. However, it does not search the names of the
+  annotated meta data (like correspondents).
-- The term already has a wildcard.
-- The term is enclosed in quotes `"`.
+Searching with one of these fields active simply submits the (hidden)
+search menu. So if the menu has other fields filled out, they will
+affect the results, too. With one of these fields, the bar is just a
+reduced view of the search menu.
-You can go to the search menu by clicking the left icon in the search
-bar.
+So you can choose tags or correspondents in the search menu and
+further restrict the results using full text search. The results will
+be returned sorted by the item date, newest first.
-If the search bar shows a little blue bubble, it means that there are
-more search fields filled out in the search menu. In this case the
-results are not only restricted by the search term given in the
-search-bar, but also by what is specified in the search menu.
+If the left button in the search bar shows a little blue bubble, it
+means that there are more search fields filled out in the search menu
+that you currently can't see. In this case the results are not only
+restricted by the search term given in the search bar, but also by
+what is specified in the search menu.
+
+
+### *Contents Only*
+
+This option has no corresponding part in the search menu. With this
+option active, only a full text search is done, looking at the
+attachment contents, attachment names, item name and item notes.
+
+The results are not ordered by item date, but by relevance with
+respect to the search term. This ordering is returned from the full
+text search engine and is simply transferred unmodified.
## Search Menu
@@ -104,9 +123,61 @@ within this range. Items without a due date are not shown.
Specify whether to show only incoming, only outgoing or all items.
+## Customize Substring Search
+
+The substring search of the *All Names* and *Name* fields can be
+customized in the following way: A wildcard `*` can be used at the
+start or end of a search term to do a substring match. A `*` means
+"everything". So a term `*company` matches all names ending in
+`company` and `*company*` matches all names containing the word
+`company`. The matching is case insensitive.
+
+Docspell adds a `*` to the front and end of a term automatically,
+unless one of the following is true:
+
+- The term already has a wildcard.
+- The term is enclosed in quotes `"`.
+
+
+## Full Text Search
+
+
+### The Query
+
+The query string for full text search is very powerful. Docspell
+currently supports [Apache SOLR](https://lucene.apache.org/solr/) as
+the full text search backend, so you may want to have a look at the
+[documentation on query
+syntax](https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing)
+for an in-depth guide.
+
+- Wildcards: `?` matches any single character, `*` matches zero or
+ more characters
+- Fuzzy search: Appending a `~` to a term results in a fuzzy search
+  (searches for this term and similarly spelled ones)
+- Proximity Search: Search for terms that are "near" each other, again
+ using `~` appended to a search phrase. Example: `"cheese cake"~5`.
+- Boosting: apply more weight to a term with `^`. Example: `cheese^4
+ cake` – cheese is 4x more important.
+
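+Putting these together, some example terms (standard SOLR query
+syntax):
+
+```
+invoi*            wildcard: all words starting with "invoi"
+invoice~          fuzzy: also matches similarly spelled words
+"cheese cake"~5   proximity: both words within 5 words of each other
+cheese^4 cake     boosting: "cheese" is 4x more important
+```
+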
+Docspell will preprocess the search query to prepare a query for SOLR.
+It will by default search all indexed fields, which are: attachment
+contents, attachment names, item name and item notes.
+
+
+### The Results
+
+When using full text search, each item in the result list is annotated
+with the highlighted occurrences of the matched terms.
+
+