diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala index c58213a5..41669cbb 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OFulltext.scala @@ -48,17 +48,14 @@ object OFulltext { batch: Batch, search: (Query, Batch) => F[Vector[A]] ): Stream[F, A] = { - val fq = FtsQuery(ftsQ, q.collective, batch.limit, batch.offset, Nil) + val fq = FtsQuery(ftsQ, q.collective, Nil, batch.limit, batch.offset) val qres = for { items <- fts - .searchBasic(fq) - .flatMap(r => Stream.emits(r.results)) - .map(_.itemId) - .compile - .toVector + .search(fq) + .map(_.results.map(_.itemId)) .map(_.toSet) sq = q.copy(itemIds = Some(items)) res <- search(sq, batch) diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala index 94d45f1c..20d92892 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsClient.scala @@ -7,19 +7,41 @@ import docspell.common._ * engine. * * It defines all operations required for integration into docspell. - * It uses data structures and terms of docspell. Implementation - * modules need to translate it to the engine that provides the - * features. + * It uses data structures from docspell. Implementation modules need + * to translate it to the engine that provides the features. */ trait FtsClient[F[_]] { - /** Optional operation to do some initialization tasks. This is called - * exactly once and then never again. It may be used to setup the - * database. + /** Initialization tasks. This is called exactly once and then never + * again (except when re-indexing everything). It may be used to + * setup the database. */ def initialize: F[Unit] - def searchBasic(q: FtsQuery): Stream[F, FtsResult] + def search(q: FtsQuery): F[FtsResult] + def searchAll(q: FtsQuery): Stream[F, FtsResult] = + Stream.eval(search(q)).flatMap { result => + if (result.results.size < q.limit) Stream.emit(result) + else Stream.emit(result) ++ searchAll(q.nextPage) + } + + /** Push all data to the index. Data with same `id' is replaced. + * Values that are `None' are removed from the index (or set to an + * empty string). + */ def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] + + def indexData(logger: Logger[F], data: TextData*): F[Unit] = + indexData(logger, Stream.emits(data)) + + /** Push all data to the index, but only update existing entries. No + * new entries are created and values that are given as `None' are + * skipped. + */ + def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] + + def updateIndex(logger: Logger[F], data: TextData*): F[Unit] = + updateIndex(logger, Stream.emits(data)) + } diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala index 93dff968..ca0d68a7 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/FtsQuery.scala @@ -6,5 +6,18 @@ import docspell.common._ * * The query itself is a raw string. Each implementation may * interpret it according to the system in use. + * + * Searches must only look for given collective and in the given list + * of item ids. */ -final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int, items: List[Ident]) +final case class FtsQuery( + q: String, + collective: Ident, + items: List[Ident], + limit: Int, + offset: Int +) { + + def nextPage: FtsQuery = + copy(offset = limit + offset) +} diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala index a6fe4e21..625411ad 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala @@ -23,6 +23,7 @@ object TextData { item: Ident, attachId: Ident, collective: Ident, + lang: Language, name: Option[String], text: Option[String] ) extends TextData { @@ -35,10 +36,11 @@ object TextData { item: Ident, attachId: Ident, collective: Ident, + lang: Language, name: Option[String], text: Option[String] ): TextData = - Attachment(item, attachId, collective, name, text) + Attachment(item, attachId, collective, lang, name, text) final case class Item( item: Ident, diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala index be6cd0eb..1bf78304 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/Field.scala @@ -13,13 +13,15 @@ object Field { new Field(name) - val discriminator = Field("discriminator") val id = Field("id") val itemId = Field("itemId") val collectiveId = Field("collectiveId") val attachmentId = Field("attachmentId") + val discriminator = Field("discriminator") val attachmentName = Field("attachmentName") val content = Field("content") + val content_de = Field("content_de") + val content_en = Field("content_en") val itemName = Field("itemName") val itemNotes = Field("itemNotes") diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala index 6b55efb3..42a0bd5c 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala @@ -1,6 +1,5 @@ package docspell.ftssolr -//import cats.implicits._ import io.circe._ import docspell.common._ import docspell.ftsclient._ @@ -11,16 +10,30 @@ trait JsonCodec { enc: Encoder[Ident] ): Encoder[TextData.Attachment] = new Encoder[TextData.Attachment] { - final def apply(td: TextData.Attachment): Json = - Json.obj( - (Field.id.name, enc(td.id)), - (Field.itemId.name, enc(td.item)), - (Field.collectiveId.name, enc(td.collective)), - (Field.attachmentId.name, enc(td.attachId)), - (Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))), - (Field.content.name, Json.fromString(td.text.getOrElse(""))), - (Field.discriminator.name, Json.fromString("attachment")) + final def apply(td: TextData.Attachment): Json = { + val cnt = + ( + td.lang match { + case Language.German => + Field.content_de.name + case Language.English => + Field.content_en.name + }, + Json.fromString(td.text.getOrElse("")) + ) + + Json.fromFields( + cnt :: List( + (Field.id.name, enc(td.id)), + (Field.itemId.name, enc(td.item)), + (Field.collectiveId.name, enc(td.collective)), + (Field.attachmentId.name, enc(td.attachId)), + (Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))), + (Field.discriminator.name, Json.fromString("attachment")) + ) ) + + } } implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] = @@ -46,13 +59,13 @@ trait JsonCodec { new Decoder[FtsResult] { final def apply(c: HCursor): Decoder.Result[FtsResult] = for { - qtime <- c.downField("responseHeader").get[Duration]("QTime") - count <- c.downField("response").get[Int]("numFound") - maxScore <- c.downField("response").get[Double]("maxScore") - results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs") - highligh <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting") - highline = highligh.map(kv => kv._1 -> kv._2.values.flatten.toList) - } yield FtsResult(qtime, count, maxScore, highline, results) + qtime <- c.downField("responseHeader").get[Duration]("QTime") + count <- c.downField("response").get[Int]("numFound") + maxScore <- c.downField("response").get[Double]("maxScore") + results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs") + highlightng <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting") + highlight = highlightng.map(kv => kv._1 -> kv._2.values.flatten.toList) + } yield FtsResult(qtime, count, maxScore, highlight, results) } implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] = diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala index cc25726a..75e14ca8 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala @@ -17,8 +17,8 @@ final class SolrFtsClient[F[_]: Effect]( def initialize: F[Unit] = solrSetup.setupSchema - def searchBasic(q: FtsQuery): Stream[F, FtsResult] = - Stream.eval(solrQuery.query(q)) + def search(q: FtsQuery): F[FtsResult] = + solrQuery.query(q) def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = (for { @@ -32,6 +32,8 @@ final class SolrFtsClient[F[_]: Effect]( } } yield ()).compile.drain + def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = ??? + } object SolrFtsClient { diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala index 435402e2..7348d4fe 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrQuery.scala @@ -18,7 +18,14 @@ trait SolrQuery[F[_]] { def query(q: FtsQuery): F[FtsResult] = { val fq = QueryData( - List(Field.content, Field.itemName, Field.itemNotes, Field.attachmentName), + List( + Field.content, + Field.content_de, + Field.content_en, + Field.itemName, + Field.itemNotes, + Field.attachmentName + ), List( Field.id, Field.itemId, diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 13d43d17..275f61d8 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -10,6 +10,7 @@ import org.log4s.getLogger import _root_.io.circe.syntax._ import _root_.io.circe._ import _root_.io.circe.generic.semiauto._ +import docspell.common._ trait SolrSetup[F[_]] { @@ -44,9 +45,16 @@ object SolrSetup { Field.itemName, Field.itemNotes ) - .traverse(addTextField) + .traverse(addTextField(None)) - cmds0 *> cmds1 *> ().pure[F] + val cntLang = Language.all.traverse { + case l @ Language.German => + addTextField(l.some)(Field.content_de) + case l @ Language.English => + addTextField(l.some)(Field.content_en) + } + + cmds0 *> cmds1 *> cntLang *> ().pure[F] } private def run(cmd: Json): F[Unit] = { @@ -59,10 +67,18 @@ object SolrSetup { run(DeleteField.command(DeleteField(field))).attempt *> run(AddField.command(AddField.string(field))) - private def addTextField(field: Field): F[Unit] = - run(DeleteField.command(DeleteField(field))).attempt *> - run(AddField.command(AddField.text(field))) - + private def addTextField(lang: Option[Language])(field: Field): F[Unit] = + lang match { + case None => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.text(field))) + case Some(Language.German) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textDE(field))) + case Some(Language.English) => + run(DeleteField.command(DeleteField(field))).attempt *> + run(AddField.command(AddField.textEN(field))) + } } } @@ -87,6 +103,12 @@ object SolrSetup { def text(field: Field): AddField = AddField(field, "text_general", true, true, false) + + def textDE(field: Field): AddField = + AddField(field, "text_de", true, true, false) + + def textEN(field: Field): AddField = + AddField(field, "text_en", true, true, false) } case class DeleteField(name: Field) diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala index 3b772cd2..59c21eba 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala @@ -35,14 +35,12 @@ object SolrUpdate { def single(td: TextData): F[Unit] = { val req = Method.POST(td.asJson, url) - logger.debug(s"Running request $req") - client.expect[String](req).map(r => logger.debug(s"Response: $r")) + client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r")) } def many(tds: List[TextData]): F[Unit] = { val req = Method.POST(tds.asJson, url) - logger.debug(s"Running request $req") - client.expect[String](req).map(r => logger.debug(s"Response: $r")) + client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r")) } } } diff --git a/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala index 88af2ef4..c546e690 100644 --- a/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala +++ b/modules/joex/src/main/scala/docspell/joex/fts/Migration.scala @@ -89,7 +89,14 @@ object Migration { ) .map(caa => TextData - .attachment(caa.item, caa.id, caa.collective, caa.name, caa.content) + .attachment( + caa.item, + caa.id, + caa.collective, + caa.lang, + caa.name, + caa.content + ) ) ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 543da0ee..8bfa250b 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -1,6 +1,5 @@ package docspell.joex.process -import fs2.Stream import bitpeace.{Mimetype, RangeDef} import cats.data.OptionT import cats.implicits._ @@ -30,9 +29,11 @@ object TextExtraction { item ) ) - _ <- ctx.logger.debug("Storing extracted texts") - _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) - _ <- fts.indexData(ctx.logger, Stream.emits(txt.map(_._2))) + _ <- ctx.logger.debug("Storing extracted texts") + _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) + idxItem = + TextData.item(item.item.id, ctx.args.meta.collective, item.item.name.some, None) + _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*) dur <- start _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") } yield item.copy(metas = txt.map(_._1)) @@ -52,6 +53,7 @@ object TextExtraction { item.item.id, ra.id, collective, + lang, ra.name, rm.content ) diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index a09f58ff..b2fd2db1 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -6,7 +6,7 @@ import cats.effect.Sync import cats.data.OptionT import doobie._ import doobie.implicits._ -import docspell.common.{Ident, MetaProposalList} +import docspell.common._ import docspell.store.Store import docspell.store.impl.Implicits._ import docspell.store.records._ @@ -143,6 +143,7 @@ object QAttachment { id: Ident, item: Ident, collective: Ident, + lang: Language, name: Option[String], content: Option[String] ) @@ -154,11 +155,14 @@ object QAttachment { val mContent = RAttachmentMeta.Columns.content.prefix("m") val iId = RItem.Columns.id.prefix("i") val iColl = RItem.Columns.cid.prefix("i") + val cId = RCollective.Columns.id.prefix("c") + val cLang = RCollective.Columns.language.prefix("c") - val cols = Seq(aId, aItem, iColl, aName, mContent) + val cols = Seq(aId, aItem, iColl, cLang, aName, mContent) val from = RAttachment.table ++ fr"a INNER JOIN" ++ RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ - fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) + fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) ++ + fr"INNER JOIN" ++ RCollective.table ++ fr"c ON" ++ cId.is(iColl) selectSimple(cols, from, Fragment.empty) .query[ContentAndName]