mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-04 22:25:58 +00:00
Make data to index more flexible and extensible
This commit is contained in:
parent
522daaf57e
commit
146d1b0562
@ -15,6 +15,9 @@ case class Ident(id: String) {
|
|||||||
|
|
||||||
/** Negation of `isEmpty`. */
def nonEmpty: Boolean =
|
def nonEmpty: Boolean =
|
||||||
!isEmpty
|
!isEmpty
|
||||||
|
|
||||||
|
/** Joins this identifier and `next` into a new `Ident`, separated by a single `/`. */
def / (next: Ident): Ident =
  new Ident(s"$id/${next.id}")
|
||||||
}
|
}
|
||||||
|
|
||||||
object Ident {
|
object Ident {
|
||||||
|
@ -2,4 +2,55 @@ package docspell.ftsclient
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String)
|
/** Data handed to the full-text index.
  *
  * Every variant exposes an `id` plus the `item` and `collective` it is
  * associated with.
  */
sealed trait TextData {

  /** Identifier of this entry. */
  def id: Ident

  /** Identifier of the item this entry refers to. */
  def item: Ident

  /** Identifier of the collective this entry refers to. */
  def collective: Ident

}
|
||||||
|
|
||||||
|
object TextData {

  /** Text belonging to one attachment of an item.
    *
    * Its `id` is the item id joined with the attachment id via `Ident./`.
    */
  final case class Attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ) extends TextData {

    val id: Ident = item / attachId

  }

  /** Creates an [[Attachment]], returned as the general [[TextData]] type. */
  def attachment(
      item: Ident,
      attachId: Ident,
      collective: Ident,
      name: Option[String],
      text: Option[String]
  ): TextData =
    Attachment(
      item = item,
      attachId = attachId,
      collective = collective,
      name = name,
      text = text
    )

  /** Name and notes of an item.
    *
    * Its `id` is the literal identifier `item` joined with the item id via `Ident./`.
    */
  final case class Item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ) extends TextData {

    val id: Ident = Ident.unsafe("item") / item

  }

  /** Creates an [[Item]], returned as the general [[TextData]] type. */
  def item(
      item: Ident,
      collective: Ident,
      name: Option[String],
      notes: Option[String]
  ): TextData =
    Item(item = item, collective = collective, name = name, notes = notes)

}
|
||||||
|
@ -22,40 +22,49 @@ object TextExtraction {
|
|||||||
_ <- ctx.logger.info("Starting text extraction")
|
_ <- ctx.logger.info("Starting text extraction")
|
||||||
start <- Duration.stopTime[F]
|
start <- Duration.stopTime[F]
|
||||||
txt <- item.attachments.traverse(
|
txt <- item.attachments.traverse(
|
||||||
extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)
|
extractTextIfEmpty(
|
||||||
)
|
ctx,
|
||||||
_ <- ctx.logger.debug("Storing extracted texts")
|
cfg,
|
||||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
|
ctx.args.meta.language,
|
||||||
_ <- fts.indexData(
|
ctx.args.meta.collective,
|
||||||
Stream
|
item
|
||||||
.emits(txt)
|
)
|
||||||
.map(a =>
|
|
||||||
TextData(
|
|
||||||
item.item.id,
|
|
||||||
a.id,
|
|
||||||
ctx.args.meta.collective,
|
|
||||||
a.content.getOrElse("")
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
_ <- ctx.logger.debug("Storing extracted texts")
|
||||||
|
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
|
||||||
|
_ <- fts.indexData(Stream.emits(txt.map(_._2)))
|
||||||
dur <- start
|
dur <- start
|
||||||
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
||||||
} yield item.copy(metas = txt)
|
} yield item.copy(metas = txt.map(_._1))
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Looks up (or creates) the attachment meta for `ra` and runs text extraction
  * only when that meta has no `content` yet.
  *
  * The interleaved lines show both versions of this method: the earlier one
  * returns `F[RAttachmentMeta]`, the later one also takes `collective` and
  * returns the meta paired with a `TextData` value.
  */
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, _],
|
||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
|
collective: Ident,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[RAttachmentMeta] = {
|
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
|
||||||
|
// Pairs the meta record with TextData built from the item id, attachment id,
// collective, attachment name and the meta's content.
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
|
||||||
|
(
|
||||||
|
rm,
|
||||||
|
TextData.attachment(
|
||||||
|
item.item.id,
|
||||||
|
ra.id,
|
||||||
|
collective,
|
||||||
|
ra.name,
|
||||||
|
rm.content
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
val rm = item.findOrCreate(ra.id)
|
val rm = item.findOrCreate(ra.id)
|
||||||
rm.content match {
|
rm.content match {
|
||||||
// Content is already present: log and reuse the existing meta.
case Some(_) =>
|
case Some(_) =>
|
||||||
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||||
rm.pure[F]
|
makeTextData(rm).pure[F]
|
||||||
// No content yet: delegate to extractTextToMeta.
case None =>
|
case None =>
|
||||||
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||||
|
.map(makeTextData)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user