Add task to index current database state

This commit is contained in:
Eike Kettner
2020-06-18 22:38:45 +02:00
parent 146d1b0562
commit 60c079f664
12 changed files with 317 additions and 8 deletions

View File

@ -368,6 +368,9 @@ docspell.joex {
# Configuration of the full-text search engine.
full-text-search {
enabled = true
# Settings used when the full-text index is populated from data that
# already exists in the database.
migration = {
# Chunk size handed to the database query that loads all attachments
# for indexing. The query that loads all items uses five times this
# value.
index-all-chunk = 10
}
solr = {
url = "http://localhost:8983/solr/docspell_core"
}

View File

@ -37,5 +37,14 @@ object Config {
}
case class UserTasks(scanMailbox: ScanMailbox)
case class FullTextSearch(enabled: Boolean, solr: SolrConfig)
/** Configuration of the full-text search engine.
  *
  * @param enabled   the `enabled` flag of the full-text-search section
  *                  (presumably switches the feature on/off — confirm where
  *                  it is evaluated)
  * @param migration settings used while filling the index from existing data
  * @param solr      settings of the Solr backend
  */
case class FullTextSearch(
enabled: Boolean,
migration: FullTextSearch.Migration,
solr: SolrConfig
)
object FullTextSearch {
/** Settings for the index migrations.
  *
  * @param indexAllChunk chunk size used when loading all attachments for
  *                      indexing; the item query uses a multiple of it
  */
final case class Migration(indexAllChunk: Int)
}
}

View File

@ -7,6 +7,7 @@ import docspell.common._
import docspell.backend.ops._
import docspell.joex.hk._
import docspell.joex.notify._
import docspell.joex.fts.IndexTask
import docspell.joex.scanmailbox._
import docspell.joex.process.ItemHandler
import docspell.joex.scheduler._
@ -23,6 +24,7 @@ final class JoexAppImpl[F[_]: ConcurrentEffect: ContextShift: Timer](
cfg: Config,
nodeOps: ONode[F],
store: Store[F],
queue: JobQueue[F],
pstore: PeriodicTaskStore[F],
termSignal: SignallingRef[F, Boolean],
val scheduler: Scheduler[F],
@ -52,7 +54,9 @@ final class JoexAppImpl[F[_]: ConcurrentEffect: ContextShift: Timer](
periodicScheduler.shutdown *> scheduler.shutdown(false) *> termSignal.set(true)
private def scheduleBackgroundTasks: F[Unit] =
HouseKeepingTask.periodicTask[F](cfg.houseKeeping.schedule).flatMap(pstore.insert)
HouseKeepingTask
.periodicTask[F](cfg.houseKeeping.schedule)
.flatMap(pstore.insert) *> IndexTask.job.flatMap(queue.insert)
}
object JoexAppImpl {
@ -99,6 +103,13 @@ object JoexAppImpl {
ScanMailboxTask.onCancel[F]
)
)
.withTask(
JobTask.json(
IndexTask.taskName,
IndexTask[F](cfg.fullTextSearch, fts),
IndexTask.onCancel[F]
)
)
.withTask(
JobTask.json(
HouseKeepingTask.taskName,
@ -115,7 +126,7 @@ object JoexAppImpl {
client,
Timer[F]
)
app = new JoexAppImpl(cfg, nodeOps, store, pstore, termSignal, sch, psch)
app = new JoexAppImpl(cfg, nodeOps, store, queue, pstore, termSignal, sch, psch)
appR <- Resource.make(app.init.map(_ => app))(_.shutdown)
} yield appR
}

View File

@ -0,0 +1,48 @@
package docspell.joex.fts
import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler.Task
import docspell.ftsclient._
import docspell.store.records.RJob
import docspell.joex.hk.HouseKeepingTask
/** Job that brings the full-text index in line with the current database
  * state.
  *
  * The actual work is delegated to [[Migration]]: every entry of
  * `Migration.migrationTasks` is handed to the migration runner, which
  * executes those that have not been recorded as applied yet.
  */
object IndexTask {

  /** Name under which the task is registered and its jobs are created. */
  val taskName: Ident = Ident.unsafe("full-text-index")

  /** Group used for jobs of this task; shared with the house-keeping task. */
  val systemGroup = HouseKeepingTask.systemGroup

  /** Creates the task that runs all full-text index migrations.
    *
    * When `cfg.enabled` is `false` the task only logs that it is skipped.
    * Running the migrations in that case would either fail against a
    * backend that is not available or record migrations as applied although
    * nothing was indexed.
    *
    * @param cfg the full-text search configuration
    * @param fts client of the full-text search engine
    * @return a task that takes no arguments and yields no result
    */
  def apply[F[_]: ConcurrentEffect](
      cfg: Config.FullTextSearch,
      fts: FtsClient[F]
  ): Task[F, Unit, Unit] =
    if (cfg.enabled)
      Task
        .log[F, Unit](_.info("Running full-text-index task now"))
        .flatMap(_ =>
          Task(ctx =>
            Migration[F](cfg, ctx.store, fts, ctx.logger)
              .run(Migration.migrationTasks[F])
          )
        )
    else
      Task.log[F, Unit](
        _.info("Full-text search is disabled, skipping full-text-index task")
      )

  /** Task to run when the job is cancelled; it only logs a warning. */
  def onCancel[F[_]: Sync]: Task[F, Unit, Unit] =
    Task.log[F, Unit](_.warn("Cancelling full-text-index task"))

  /** Builds a new low-priority job for this task with a random id and the
    * current time.
    *
    * `systemGroup` is passed twice to `RJob.newJob` (presumably as group and
    * as submitter — confirm against `RJob.newJob`).
    */
  def job[F[_]: Sync]: F[RJob] =
    for {
      id  <- Ident.randomId[F]
      now <- Timestamp.current[F]
    } yield RJob.newJob(
      id,
      taskName,
      systemGroup,
      (),
      "Create full-text index",
      now,
      systemGroup,
      Priority.Low,
      None
    )
}

View File

@ -0,0 +1,110 @@
package docspell.joex.fts
import cats.effect._
import cats.implicits._
import cats.data.{Kleisli, OptionT}
import cats.Traverse
import docspell.common._
import docspell.joex.Config
import docspell.store.{AddResult, Store}
import docspell.store.records.RFtsMigration
import docspell.store.queries.{QAttachment, QItem}
import docspell.ftsclient._
/** Versioned, run-once migrations of the full-text index.
  *
  * Every migration is recorded via `RFtsMigration`, keyed by its version and
  * engine. A migration whose record already exists is skipped.
  */
object Migration {
  private val solrEngine = Ident.unsafe("solr")

  /** Items are loaded with a chunk size of this many times the configured
    * `indexAllChunk`.
    */
  private val itemChunkFactor = 5

  /** Everything a migration task gets handed when it runs. */
  case class MigrateCtx[F[_]](
      cfg: Config.FullTextSearch,
      store: Store[F],
      fts: FtsClient[F],
      logger: Logger[F]
  )

  /** A single migration step.
    *
    * @param version     version number; identifies the step together with `engine`
    * @param engine      identifier of the engine the step applies to
    * @param description text used in log messages and stored with the record
    * @param task        the work to do, given a [[MigrateCtx]]
    */
  case class Migration[F[_]](
      version: Int,
      engine: Ident,
      description: String,
      task: Kleisli[F, MigrateCtx[F], Unit]
  )

  /** Creates a runner that applies a list of migrations one after another,
    * in list order. The first failing migration aborts the run with its
    * error.
    */
  def apply[F[_]: Effect](
      cfg: Config.FullTextSearch,
      store: Store[F],
      fts: FtsClient[F],
      logger: Logger[F]
  ): Kleisli[F, List[Migration[F]], Unit] = {
    val ctx = MigrateCtx(cfg, store, fts, logger)
    Kleisli(migs => Traverse[List].traverse_(migs)(m => applySingle[F](ctx)(m)))
  }

  /** Applies one migration unless it is already recorded as applied.
    *
    * The record is inserted before the task runs. If the task fails, the
    * failure is logged, the record is deleted again (so a later run retries
    * the migration) and the original error is re-raised.
    *
    * NOTE(review): the record is only removed on a failure that is observed
    * here; if the process stops while the task is running the record stays —
    * confirm this is acceptable.
    */
  def applySingle[F[_]: Effect](ctx: MigrateCtx[F])(m: Migration[F]): F[Unit] = {
    val label = s"${m.version}/${m.description}"

    // Some(record) if the record was newly inserted, None if it existed already.
    val insertRecord: F[Option[RFtsMigration]] =
      for {
        rec <- RFtsMigration.create(m.version, m.engine, m.description)
        res <- ctx.store.add(
          RFtsMigration.insert(rec),
          RFtsMigration.exists(m.version, m.engine)
        )
        ret <- res match {
          case AddResult.Success =>
            rec.some.pure[F]
          case AddResult.EntityExists(_) =>
            Option.empty[RFtsMigration].pure[F]
          case AddResult.Failure(ex) =>
            Effect[F].raiseError[Option[RFtsMigration]](ex)
        }
      } yield ret

    // Log the failure, remove the record and fail with the original error.
    def rollback(rec: RFtsMigration, ex: Throwable): F[Unit] =
      ctx.logger.error(ex)(s"Applying index migration $label failed") *>
        ctx.store.transact(RFtsMigration.deleteById(rec.id)) *>
        Effect[F].raiseError[Unit](ex)

    val applied: OptionT[F, Unit] =
      for {
        _   <- OptionT.liftF(ctx.logger.info(s"Apply $label"))
        rec <- OptionT(insertRecord)
        _ <- OptionT.liftF(
          m.task.run(ctx).handleErrorWith(ex => rollback(rec, ex))
        )
      } yield ()

    applied.getOrElseF(ctx.logger.info(s"Migration $label already applied."))
  }

  /** The known migrations, in the order they are meant to be applied:
    *
    *  1. initialize the full-text engine,
    *  1. index all attachments found in the database,
    *  1. index all items found in the database.
    */
  def migrationTasks[F[_]]: List[Migration[F]] =
    List(
      Migration[F](1, solrEngine, "initialize", Kleisli(ctx => ctx.fts.initialize)),
      Migration[F](
        2,
        solrEngine,
        "Index all attachments from database",
        Kleisli(ctx =>
          ctx.fts.indexData(
            ctx.logger,
            ctx.store
              .transact(
                QAttachment.allAttachmentMetaAndName(ctx.cfg.migration.indexAllChunk)
              )
              .map(caa =>
                TextData
                  .attachment(caa.item, caa.id, caa.collective, caa.name, caa.content)
              )
          )
        )
      ),
      Migration[F](
        3,
        solrEngine,
        "Index all items from database",
        Kleisli(ctx =>
          ctx.fts.indexData(
            ctx.logger,
            ctx.store
              .transact(
                QItem.allNameAndNotes(ctx.cfg.migration.indexAllChunk * itemChunkFactor)
              )
              .map(nn => TextData.item(nn.id, nn.collective, Option(nn.name), nn.notes))
          )
        )
      )
    )
}

View File

@ -32,7 +32,7 @@ object TextExtraction {
)
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <- fts.indexData(Stream.emits(txt.map(_._2)))
_ <- fts.indexData(ctx.logger, Stream.emits(txt.map(_._2)))
dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1))