Make data to index more flexible and extensible

This commit is contained in:
Eike Kettner 2020-06-17 21:18:48 +02:00
parent 522daaf57e
commit 146d1b0562
3 changed files with 82 additions and 19 deletions

View File

@ -15,6 +15,9 @@ case class Ident(id: String) {
/** Negation of `isEmpty`. */
def nonEmpty: Boolean = !isEmpty
/** Builds a new identifier from this id, a `/` separator and the id of `next`.
  *
  * @param next the identifier whose id is appended after the separator
  * @return a new `Ident` holding the joined string
  */
def /(next: Ident): Ident = {
  val joined = s"$id/${next.id}"
  new Ident(joined)
}
}
object Ident {

View File

@ -2,4 +2,55 @@ package docspell.ftsclient
import docspell.common._
// Flat record carrying the identifiers of an item, an attachment and a collective plus a text.
// NOTE(review): a sealed trait with the same name is declared directly below; this one-line
// form looks like the pre-change definition shown by the diff view — confirm before relying on it.
final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String)
/** Common shape of the data handed to the full-text index.
  *
  * Being sealed, all variants are declared in this file.
  */
sealed trait TextData {

  /** Identifier of the item this data belongs to. */
  def item: Ident

  /** Identifier of the collective this data belongs to. */
  def collective: Ident

  /** Identifier of this data entry; each variant defines how it is built. */
  def id: Ident
}
/** Variants of [[TextData]] and constructors that return them typed as the base trait. */
object TextData {

  /** Data for one attachment of an item.
    *
    * `id` is the item id joined with the attachment id through `Ident./`.
    */
  final case class Attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ) extends TextData {
    override val id: Ident = item / attachId
  }

  /** Data for an item itself.
    *
    * `id` is the fixed identifier `"item"` joined with the item id through `Ident./`.
    */
  final case class Item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ) extends TextData {
    override val id: Ident = Ident.unsafe("item") / item
  }

  /** Creates an [[Attachment]], returned as a plain [[TextData]]. */
  def attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ): TextData =
    Attachment(
      item = item,
      attachId = attachId,
      collective = collective,
      name = name,
      text = text
    )

  /** Creates an [[Item]], returned as a plain [[TextData]]. */
  def item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ): TextData =
    Item(item = item, collective = collective, name = name, notes = notes)
}

View File

@ -22,40 +22,49 @@ object TextExtraction {
_ <- ctx.logger.info("Starting text extraction")
start <- Duration.stopTime[F]
txt <- item.attachments.traverse(
extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)
)
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
_ <- fts.indexData(
Stream
.emits(txt)
.map(a =>
TextData(
item.item.id,
a.id,
ctx.args.meta.collective,
a.content.getOrElse("")
)
)
extractTextIfEmpty(
ctx,
cfg,
ctx.args.meta.language,
ctx.args.meta.collective,
item
)
)
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <- fts.indexData(Stream.emits(txt.map(_._2)))
dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt)
} yield item.copy(metas = txt.map(_._1))
}
/** Obtains the meta record for attachment `ra` and pairs it with index data.
  *
  * The meta record is looked up via `item.findOrCreate(ra.id)`. When it already
  * has content, a log message is written and the record is used as is;
  * otherwise the work is delegated to `extractTextToMeta`.
  *
  * NOTE(review): this listing contains two signature-closing lines and two
  * results in the `Some` branch. They look like the before/after lines of a
  * change rendered without +/- markers — confirm which ones are current.
  *
  * @param ctx        context providing the logger used here
  * @param cfg        passed through to `extractTextToMeta`
  * @param lang       passed through to `extractTextToMeta`
  * @param collective collective identifier stored in the created `TextData`
  * @param item       source of the item id and of the meta record lookup
  * @param ra         the attachment to process
  */
def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, _],
cfg: ExtractConfig,
lang: Language,
collective: Ident,
item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] = {
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
// Pairs a meta record with the attachment TextData built from the item id,
// the attachment id and name, the collective and the record's content.
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
(
rm,
TextData.attachment(
item.item.id,
ra.id,
collective,
ra.name,
rm.content
)
)
val rm = item.findOrCreate(ra.id)
rm.content match {
// Content is already present: only log and reuse the record.
case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
rm.pure[F]
makeTextData(rm).pure[F]
// No content yet: extract it, then attach the index data.
case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData)
}
}