Make data to index more flexible and extensible

This commit is contained in:
Eike Kettner 2020-06-17 21:18:48 +02:00
parent 522daaf57e
commit 146d1b0562
3 changed files with 82 additions and 19 deletions

View File

@ -15,6 +15,9 @@ case class Ident(id: String) {
def nonEmpty: Boolean = def nonEmpty: Boolean =
!isEmpty !isEmpty
/** Joins this identifier with `next` into a single, slash-separated identifier.
  *
  * The resulting `id` is this `id`, followed by "/", followed by `next.id`.
  */
def / (next: Ident): Ident =
  new Ident(s"$id/${next.id}")
} }
object Ident { object Ident {

View File

@ -2,4 +2,55 @@ package docspell.ftsclient
import docspell.common._ import docspell.common._
final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String) sealed trait TextData {
def id: Ident
def item: Ident
def collective: Ident
}
/** Concrete variants of [[TextData]] and the constructors that return them
  * typed as the general [[TextData]].
  */
object TextData {

  /** Data to index for one attachment of an item.
    *
    * Its `id` is the item id joined with the attachment id via `Ident./`.
    */
  final case class Attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ) extends TextData {
    // Explicit type: this is a public member implementing `TextData.id`.
    val id: Ident = item / attachId
  }

  /** Creates an [[Attachment]] and returns it widened to [[TextData]]. */
  def attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ): TextData =
    Attachment(item, attachId, collective, name, text)

  /** Data to index for the item itself (its name and notes).
    *
    * Its `id` is the identifier `item` (built with `Ident.unsafe`) joined
    * with the item id via `Ident./`.
    */
  final case class Item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ) extends TextData {
    // Explicit type: this is a public member implementing `TextData.id`.
    val id: Ident = Ident.unsafe("item") / item
  }

  /** Creates an [[Item]] and returns it widened to [[TextData]]. */
  def item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ): TextData =
    Item(item, collective, name, notes)
}

View File

@ -22,40 +22,49 @@ object TextExtraction {
_ <- ctx.logger.info("Starting text extraction") _ <- ctx.logger.info("Starting text extraction")
start <- Duration.stopTime[F] start <- Duration.stopTime[F]
txt <- item.attachments.traverse( txt <- item.attachments.traverse(
extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item) extractTextIfEmpty(
) ctx,
_ <- ctx.logger.debug("Storing extracted texts") cfg,
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) ctx.args.meta.language,
_ <- fts.indexData( ctx.args.meta.collective,
Stream item
.emits(txt) )
.map(a =>
TextData(
item.item.id,
a.id,
ctx.args.meta.collective,
a.content.getOrElse("")
)
)
) )
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <- fts.indexData(Stream.emits(txt.map(_._2)))
dur <- start dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt) } yield item.copy(metas = txt.map(_._1))
} }
def extractTextIfEmpty[F[_]: Sync: ContextShift]( def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, _], ctx: Context[F, _],
cfg: ExtractConfig, cfg: ExtractConfig,
lang: Language, lang: Language,
collective: Ident,
item: ItemData item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] = { )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
// Pairs the attachment metadata with the index entry built from it: item id,
// attachment id, collective, attachment name and the metadata's content.
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) = {
  val indexEntry =
    TextData.attachment(item.item.id, ra.id, collective, ra.name, rm.content)
  (rm, indexEntry)
}
val rm = item.findOrCreate(ra.id) val rm = item.findOrCreate(ra.id)
rm.content match { rm.content match {
case Some(_) => case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *> ctx.logger.info("TextExtraction skipped, since text is already available.") *>
rm.pure[F] makeTextData(rm).pure[F]
case None => case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra) extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData)
} }
} }