diff --git a/modules/common/src/main/scala/docspell/common/Ident.scala b/modules/common/src/main/scala/docspell/common/Ident.scala
index a1d6cb8a..b9bfa95c 100644
--- a/modules/common/src/main/scala/docspell/common/Ident.scala
+++ b/modules/common/src/main/scala/docspell/common/Ident.scala
@@ -15,6 +15,10 @@ case class Ident(id: String) {
 
   def nonEmpty: Boolean =
     !isEmpty
+
+  /** Appends `next` to this identifier, separated by a single `/`. */
+  def / (next: Ident): Ident =
+    new Ident(id + "/" + next.id)
 }
 
 object Ident {
diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala
index 4b829932..84a07920 100644
--- a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala
+++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala
@@ -2,4 +2,67 @@ package docspell.ftsclient
 
 import docspell.common._
 
-final case class TextData(item: Ident, attachment: Ident, collective: Ident, text: String)
+/** Text data that is handed to the fts client for indexing.
+  *
+  * Every variant exposes its own `id` together with the `item` and
+  * `collective` it is associated with.
+  */
+sealed trait TextData {
+
+  /** Identifier of this entry; each variant derives it from its fields. */
+  def id: Ident
+
+  /** The item this entry is associated with. */
+  def item: Ident
+
+  /** The collective this entry is associated with. */
+  def collective: Ident
+
+}
+
+object TextData {
+
+  /** Name and text of a single attachment; its `id` is `item / attachId`. */
+  final case class Attachment(
+      item: Ident,
+      attachId: Ident,
+      collective: Ident,
+      name: Option[String],
+      text: Option[String]
+  ) extends TextData {
+
+    val id: Ident = item / attachId
+
+  }
+
+  /** Builds an [[Attachment]], typed as the general [[TextData]]. */
+  def attachment(
+      item: Ident,
+      attachId: Ident,
+      collective: Ident,
+      name: Option[String],
+      text: Option[String]
+  ): TextData =
+    Attachment(item, attachId, collective, name, text)
+
+  /** Name and notes of an item; its `id` is the item id prefixed with `item/`. */
+  final case class Item(
+      item: Ident,
+      collective: Ident,
+      name: Option[String],
+      notes: Option[String]
+  ) extends TextData {
+
+    val id: Ident = Ident.unsafe("item") / item
+
+  }
+
+  /** Builds an [[Item]], typed as the general [[TextData]]. */
+  def item(
+      item: Ident,
+      collective: Ident,
+      name: Option[String],
+      notes: Option[String]
+  ): TextData =
+    Item(item, collective, name, notes)
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
index 8affe716..5f87e891 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -22,40 +22,50 @@ object TextExtraction {
         _ <- ctx.logger.info("Starting text extraction")
         start <- Duration.stopTime[F]
         txt <- item.attachments.traverse(
-          extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)
-        )
-        _ <- ctx.logger.debug("Storing extracted texts")
-        _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
-        _ <- fts.indexData(
-          Stream
-            .emits(txt)
-            .map(a =>
-              TextData(
-                item.item.id,
-                a.id,
-                ctx.args.meta.collective,
-                a.content.getOrElse("")
-              )
-            )
+          extractTextIfEmpty(
+            ctx,
+            cfg,
+            ctx.args.meta.language,
+            ctx.args.meta.collective,
+            item
+          )
         )
+        _ <- ctx.logger.debug("Storing extracted texts")
+        _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
+        _ <- fts.indexData(Stream.emits(txt.map(_._2)))
         dur <- start
         _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
-      } yield item.copy(metas = txt)
+      } yield item.copy(metas = txt.map(_._1))
     }
 
   def extractTextIfEmpty[F[_]: Sync: ContextShift](
       ctx: Context[F, _],
       cfg: ExtractConfig,
       lang: Language,
+      collective: Ident,
       item: ItemData
-  )(ra: RAttachment): F[RAttachmentMeta] = {
+  )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
+    // Pairs the meta record with the attachment text data built from it.
+    def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
+      (
+        rm,
+        TextData.attachment(
+          item.item.id,
+          ra.id,
+          collective,
+          ra.name,
+          rm.content
+        )
+      )
+
     val rm = item.findOrCreate(ra.id)
     rm.content match {
       case Some(_) =>
         ctx.logger.info("TextExtraction skipped, since text is already available.") *>
-          rm.pure[F]
+          makeTextData(rm).pure[F]
       case None =>
         extractTextToMeta[F](ctx, cfg, lang, item)(ra)
+          .map(makeTextData)
     }
   }
 