mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-03 05:52:51 +00:00
Make data to index more flexible and extensible
This commit is contained in:
parent
522daaf57e
commit
146d1b0562
@ -15,6 +15,9 @@ case class Ident(id: String) {
|
||||
|
||||
/** Negation of `isEmpty`: true exactly when `isEmpty` is false. */
def nonEmpty: Boolean =
|
||||
!isEmpty
|
||||
|
||||
/** Joins this identifier with `next` into a new identifier.
  *
  * The resulting id is this id, a literal `/`, and `next.id`, concatenated in that order.
  *
  * NOTE(review): the result is built directly with `new Ident(...)`; whether the companion
  * offers validating constructors that this bypasses is not visible here — confirm.
  *
  * @param next the identifier placed after the `/` separator
  * @return a new `Ident` wrapping the concatenated id
  */
def / (next: Ident): Ident =
|
||||
new Ident(id + "/" + next.id)
|
||||
}
|
||||
|
||||
object Ident {
|
||||
|
@ -2,4 +2,55 @@ package docspell.ftsclient
|
||||
|
||||
import docspell.common._
|
||||
|
||||
// Flat record carrying an item id, an attachment id, a collective id and a text string.
// NOTE(review): a `sealed trait TextData` with the same name is declared directly below in
// this hunk, so this line is presumably the pre-change definition shown by the diff — confirm.
final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String)
|
||||
/** Common interface of the values passed to `fts.indexData`.
  *
  * The trait is sealed, so every implementation lives in this file; the visible ones are
  * `TextData.Attachment` and `TextData.Item`.
  */
sealed trait TextData {
|
||||
|
||||
/** Identifier of this entry; each implementation derives it from its own fields. */
def id: Ident
|
||||
|
||||
/** Item identifier carried by this entry. */
def item: Ident
|
||||
|
||||
/** Collective identifier carried by this entry. */
def collective: Ident
|
||||
|
||||
}
|
||||
|
||||
/** Implementations of [[TextData]] together with factory methods that return them typed as
  * the `TextData` supertype.
  */
object TextData {
|
||||
|
||||
/** Entry describing one attachment.
  *
  * `name` and `text` are optional. The entry's `id` is the item id joined with the
  * attachment id through `Ident./`.
  */
final case class Attachment(
|
||||
item: Ident,
|
||||
attachId: Ident,
|
||||
collective: Ident,
|
||||
name: Option[String],
|
||||
text: Option[String]
|
||||
) extends TextData {
|
||||
|
||||
// Combined identifier: item id followed by the attachment id.
val id = item / attachId
|
||||
|
||||
}
|
||||
|
||||
/** Builds an [[Attachment]] from the given values and returns it as a `TextData`. */
def attachment(
|
||||
item: Ident,
|
||||
attachId: Ident,
|
||||
collective: Ident,
|
||||
name: Option[String],
|
||||
text: Option[String]
|
||||
): TextData =
|
||||
Attachment(item, attachId, collective, name, text)
|
||||
|
||||
/** Entry describing an item itself, with an optional `name` and optional `notes`.
  *
  * The entry's `id` is an identifier made from the literal `"item"` joined with the item id
  * through `Ident./`.
  */
final case class Item(
|
||||
item: Ident,
|
||||
collective: Ident,
|
||||
name: Option[String],
|
||||
notes: Option[String]
|
||||
) extends TextData {
|
||||
|
||||
// NOTE(review): `Ident.unsafe` presumably skips validation of the literal — confirm.
val id = Ident.unsafe("item") / item
|
||||
|
||||
}
|
||||
|
||||
/** Builds an [[Item]] from the given values and returns it as a `TextData`. */
def item(
|
||||
item: Ident,
|
||||
collective: Ident,
|
||||
name: Option[String],
|
||||
notes: Option[String]
|
||||
): TextData =
|
||||
Item(item, collective, name, notes)
|
||||
|
||||
}
|
||||
|
@ -22,40 +22,49 @@ object TextExtraction {
|
||||
_ <- ctx.logger.info("Starting text extraction")
|
||||
start <- Duration.stopTime[F]
|
||||
txt <- item.attachments.traverse(
|
||||
extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)
|
||||
)
|
||||
_ <- ctx.logger.debug("Storing extracted texts")
|
||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
|
||||
_ <- fts.indexData(
|
||||
Stream
|
||||
.emits(txt)
|
||||
.map(a =>
|
||||
TextData(
|
||||
item.item.id,
|
||||
a.id,
|
||||
ctx.args.meta.collective,
|
||||
a.content.getOrElse("")
|
||||
)
|
||||
)
|
||||
extractTextIfEmpty(
|
||||
ctx,
|
||||
cfg,
|
||||
ctx.args.meta.language,
|
||||
ctx.args.meta.collective,
|
||||
item
|
||||
)
|
||||
)
|
||||
_ <- ctx.logger.debug("Storing extracted texts")
|
||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
|
||||
_ <- fts.indexData(Stream.emits(txt.map(_._2)))
|
||||
dur <- start
|
||||
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
||||
} yield item.copy(metas = txt)
|
||||
} yield item.copy(metas = txt.map(_._1))
|
||||
}
|
||||
|
||||
/** Produces the attachment meta record for `ra`, running text extraction only when the meta
  * record has no content yet.
  *
  * The meta record is obtained with `item.findOrCreate(ra.id)`. If its `content` is defined,
  * an info message is logged and that record is used as is; otherwise
  * `extractTextToMeta` is called for the attachment. In the tuple-returning form the record
  * is paired with a `TextData.attachment` built from the item id, `ra.id`, `collective`,
  * `ra.name` and the record's `content`.
  *
  * NOTE(review): this span comes from a diff view and contains two signature lines
  * (`F[RAttachmentMeta]` and `F[(RAttachmentMeta, TextData)]`) as well as two result lines
  * in the `Some` branch (`rm.pure[F]` and `makeTextData(rm).pure[F]`). The first of each
  * pair is presumably the removed, pre-change line — confirm against the real file.
  */
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||
ctx: Context[F, _],
|
||||
cfg: ExtractConfig,
|
||||
lang: Language,
|
||||
collective: Ident,
|
||||
item: ItemData
|
||||
)(ra: RAttachment): F[RAttachmentMeta] = {
|
||||
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
|
||||
// Pairs a meta record with the index entry describing the same attachment.
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
|
||||
(
|
||||
rm,
|
||||
TextData.attachment(
|
||||
item.item.id,
|
||||
ra.id,
|
||||
collective,
|
||||
ra.name,
|
||||
rm.content
|
||||
)
|
||||
)
|
||||
|
||||
val rm = item.findOrCreate(ra.id)
|
||||
rm.content match {
|
||||
// Content already present: log and reuse the existing record.
case Some(_) =>
|
||||
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||
rm.pure[F]
|
||||
makeTextData(rm).pure[F]
|
||||
// No content yet: run the extraction, then pair its result with the index entry.
case None =>
|
||||
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||
.map(makeTextData)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user