Add language to schema, extend fts-client

This commit is contained in:
Eike Kettner 2020-06-20 22:27:26 +02:00
parent 3576c45d1a
commit 1f4ff0d4c4
13 changed files with 145 additions and 54 deletions

View File

@ -48,17 +48,14 @@ object OFulltext {
batch: Batch, batch: Batch,
search: (Query, Batch) => F[Vector[A]] search: (Query, Batch) => F[Vector[A]]
): Stream[F, A] = { ): Stream[F, A] = {
val fq = FtsQuery(ftsQ, q.collective, batch.limit, batch.offset, Nil) val fq = FtsQuery(ftsQ, q.collective, Nil, batch.limit, batch.offset)
val qres = val qres =
for { for {
items <- items <-
fts fts
.searchBasic(fq) .search(fq)
.flatMap(r => Stream.emits(r.results)) .map(_.results.map(_.itemId))
.map(_.itemId)
.compile
.toVector
.map(_.toSet) .map(_.toSet)
sq = q.copy(itemIds = Some(items)) sq = q.copy(itemIds = Some(items))
res <- search(sq, batch) res <- search(sq, batch)

View File

@ -7,19 +7,41 @@ import docspell.common._
* engine. * engine.
* *
* It defines all operations required for integration into docspell. * It defines all operations required for integration into docspell.
* It uses data structures and terms of docspell. Implementation * It uses data structures from docspell. Implementation modules need
* modules need to translate it to the engine that provides the * to translate it to the engine that provides the features.
* features.
*/ */
trait FtsClient[F[_]] { trait FtsClient[F[_]] {
/** Optional operation to do some initialization tasks. This is called /** Initialization tasks. This is called exactly once and then never
* exactly once and then never again. It may be used to setup the * again (except when re-indexing everything). It may be used to
* database. * setup the database.
*/ */
def initialize: F[Unit] def initialize: F[Unit]
def searchBasic(q: FtsQuery): Stream[F, FtsResult] def search(q: FtsQuery): F[FtsResult]
def searchAll(q: FtsQuery): Stream[F, FtsResult] =
Stream.eval(search(q)).flatMap { result =>
if (result.results.size < q.limit) Stream.emit(result)
else Stream.emit(result) ++ searchAll(q.nextPage)
}
/** Push all data to the index. Data with same `id' is replaced.
* Values that are `None' are removed from the index (or set to an
* empty string).
*/
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
def indexData(logger: Logger[F], data: TextData*): F[Unit] =
indexData(logger, Stream.emits(data))
/** Push all data to the index, but only update existing entries. No
* new entries are created and values that are given as `None' are
* skipped.
*/
def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
def updateIndex(logger: Logger[F], data: TextData*): F[Unit] =
updateIndex(logger, Stream.emits(data))
} }

View File

@ -6,5 +6,18 @@ import docspell.common._
* *
* The query itself is a raw string. Each implementation may * The query itself is a raw string. Each implementation may
* interpret it according to the system in use. * interpret it according to the system in use.
*
* Searches must only look for given collective and in the given list
* of item ids.
*/ */
final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int, items: List[Ident]) final case class FtsQuery(
q: String,
collective: Ident,
items: List[Ident],
limit: Int,
offset: Int
) {
def nextPage: FtsQuery =
copy(offset = limit + offset)
}

View File

@ -23,6 +23,7 @@ object TextData {
item: Ident, item: Ident,
attachId: Ident, attachId: Ident,
collective: Ident, collective: Ident,
lang: Language,
name: Option[String], name: Option[String],
text: Option[String] text: Option[String]
) extends TextData { ) extends TextData {
@ -35,10 +36,11 @@ object TextData {
item: Ident, item: Ident,
attachId: Ident, attachId: Ident,
collective: Ident, collective: Ident,
lang: Language,
name: Option[String], name: Option[String],
text: Option[String] text: Option[String]
): TextData = ): TextData =
Attachment(item, attachId, collective, name, text) Attachment(item, attachId, collective, lang, name, text)
final case class Item( final case class Item(
item: Ident, item: Ident,

View File

@ -13,13 +13,15 @@ object Field {
new Field(name) new Field(name)
val discriminator = Field("discriminator")
val id = Field("id") val id = Field("id")
val itemId = Field("itemId") val itemId = Field("itemId")
val collectiveId = Field("collectiveId") val collectiveId = Field("collectiveId")
val attachmentId = Field("attachmentId") val attachmentId = Field("attachmentId")
val discriminator = Field("discriminator")
val attachmentName = Field("attachmentName") val attachmentName = Field("attachmentName")
val content = Field("content") val content = Field("content")
val content_de = Field("content_de")
val content_en = Field("content_en")
val itemName = Field("itemName") val itemName = Field("itemName")
val itemNotes = Field("itemNotes") val itemNotes = Field("itemNotes")

View File

@ -1,6 +1,5 @@
package docspell.ftssolr package docspell.ftssolr
//import cats.implicits._
import io.circe._ import io.circe._
import docspell.common._ import docspell.common._
import docspell.ftsclient._ import docspell.ftsclient._
@ -11,16 +10,30 @@ trait JsonCodec {
enc: Encoder[Ident] enc: Encoder[Ident]
): Encoder[TextData.Attachment] = ): Encoder[TextData.Attachment] =
new Encoder[TextData.Attachment] { new Encoder[TextData.Attachment] {
final def apply(td: TextData.Attachment): Json = final def apply(td: TextData.Attachment): Json = {
Json.obj( val cnt =
(Field.id.name, enc(td.id)), (
(Field.itemId.name, enc(td.item)), td.lang match {
(Field.collectiveId.name, enc(td.collective)), case Language.German =>
(Field.attachmentId.name, enc(td.attachId)), Field.content_de.name
(Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))), case Language.English =>
(Field.content.name, Json.fromString(td.text.getOrElse(""))), Field.content_en.name
(Field.discriminator.name, Json.fromString("attachment")) },
Json.fromString(td.text.getOrElse(""))
)
Json.fromFields(
cnt :: List(
(Field.id.name, enc(td.id)),
(Field.itemId.name, enc(td.item)),
(Field.collectiveId.name, enc(td.collective)),
(Field.attachmentId.name, enc(td.attachId)),
(Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))),
(Field.discriminator.name, Json.fromString("attachment"))
)
) )
}
} }
implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] = implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] =
@ -46,13 +59,13 @@ trait JsonCodec {
new Decoder[FtsResult] { new Decoder[FtsResult] {
final def apply(c: HCursor): Decoder.Result[FtsResult] = final def apply(c: HCursor): Decoder.Result[FtsResult] =
for { for {
qtime <- c.downField("responseHeader").get[Duration]("QTime") qtime <- c.downField("responseHeader").get[Duration]("QTime")
count <- c.downField("response").get[Int]("numFound") count <- c.downField("response").get[Int]("numFound")
maxScore <- c.downField("response").get[Double]("maxScore") maxScore <- c.downField("response").get[Double]("maxScore")
results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs") results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs")
highligh <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting") highlightng <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting")
highline = highligh.map(kv => kv._1 -> kv._2.values.flatten.toList) highlight = highlightng.map(kv => kv._1 -> kv._2.values.flatten.toList)
} yield FtsResult(qtime, count, maxScore, highline, results) } yield FtsResult(qtime, count, maxScore, highlight, results)
} }
implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] = implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] =

View File

@ -17,8 +17,8 @@ final class SolrFtsClient[F[_]: Effect](
def initialize: F[Unit] = def initialize: F[Unit] =
solrSetup.setupSchema solrSetup.setupSchema
def searchBasic(q: FtsQuery): Stream[F, FtsResult] = def search(q: FtsQuery): F[FtsResult] =
Stream.eval(solrQuery.query(q)) solrQuery.query(q)
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
(for { (for {
@ -32,6 +32,8 @@ final class SolrFtsClient[F[_]: Effect](
} }
} yield ()).compile.drain } yield ()).compile.drain
def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = ???
} }
object SolrFtsClient { object SolrFtsClient {

View File

@ -18,7 +18,14 @@ trait SolrQuery[F[_]] {
def query(q: FtsQuery): F[FtsResult] = { def query(q: FtsQuery): F[FtsResult] = {
val fq = QueryData( val fq = QueryData(
List(Field.content, Field.itemName, Field.itemNotes, Field.attachmentName), List(
Field.content,
Field.content_de,
Field.content_en,
Field.itemName,
Field.itemNotes,
Field.attachmentName
),
List( List(
Field.id, Field.id,
Field.itemId, Field.itemId,

View File

@ -10,6 +10,7 @@ import org.log4s.getLogger
import _root_.io.circe.syntax._ import _root_.io.circe.syntax._
import _root_.io.circe._ import _root_.io.circe._
import _root_.io.circe.generic.semiauto._ import _root_.io.circe.generic.semiauto._
import docspell.common._
trait SolrSetup[F[_]] { trait SolrSetup[F[_]] {
@ -44,9 +45,16 @@ object SolrSetup {
Field.itemName, Field.itemName,
Field.itemNotes Field.itemNotes
) )
.traverse(addTextField) .traverse(addTextField(None))
cmds0 *> cmds1 *> ().pure[F] val cntLang = Language.all.traverse {
case l @ Language.German =>
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
} }
private def run(cmd: Json): F[Unit] = { private def run(cmd: Json): F[Unit] = {
@ -59,10 +67,18 @@ object SolrSetup {
run(DeleteField.command(DeleteField(field))).attempt *> run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.string(field))) run(AddField.command(AddField.string(field)))
private def addTextField(field: Field): F[Unit] = private def addTextField(lang: Option[Language])(field: Field): F[Unit] =
run(DeleteField.command(DeleteField(field))).attempt *> lang match {
run(AddField.command(AddField.text(field))) case None =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.text(field)))
case Some(Language.German) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textDE(field)))
case Some(Language.English) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textEN(field)))
}
} }
} }
@ -87,6 +103,12 @@ object SolrSetup {
def text(field: Field): AddField = def text(field: Field): AddField =
AddField(field, "text_general", true, true, false) AddField(field, "text_general", true, true, false)
def textDE(field: Field): AddField =
AddField(field, "text_de", true, true, false)
def textEN(field: Field): AddField =
AddField(field, "text_en", true, true, false)
} }
case class DeleteField(name: Field) case class DeleteField(name: Field)

View File

@ -35,14 +35,12 @@ object SolrUpdate {
def single(td: TextData): F[Unit] = { def single(td: TextData): F[Unit] = {
val req = Method.POST(td.asJson, url) val req = Method.POST(td.asJson, url)
logger.debug(s"Running request $req") client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r"))
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
} }
def many(tds: List[TextData]): F[Unit] = { def many(tds: List[TextData]): F[Unit] = {
val req = Method.POST(tds.asJson, url) val req = Method.POST(tds.asJson, url)
logger.debug(s"Running request $req") client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r"))
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
} }
} }
} }

View File

@ -89,7 +89,14 @@ object Migration {
) )
.map(caa => .map(caa =>
TextData TextData
.attachment(caa.item, caa.id, caa.collective, caa.name, caa.content) .attachment(
caa.item,
caa.id,
caa.collective,
caa.lang,
caa.name,
caa.content
)
) )
) )
) )

View File

@ -1,6 +1,5 @@
package docspell.joex.process package docspell.joex.process
import fs2.Stream
import bitpeace.{Mimetype, RangeDef} import bitpeace.{Mimetype, RangeDef}
import cats.data.OptionT import cats.data.OptionT
import cats.implicits._ import cats.implicits._
@ -30,9 +29,11 @@ object TextExtraction {
item item
) )
) )
_ <- ctx.logger.debug("Storing extracted texts") _ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <- fts.indexData(ctx.logger, Stream.emits(txt.map(_._2))) idxItem =
TextData.item(item.item.id, ctx.args.meta.collective, item.item.name.some, None)
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
dur <- start dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1)) } yield item.copy(metas = txt.map(_._1))
@ -52,6 +53,7 @@ object TextExtraction {
item.item.id, item.item.id,
ra.id, ra.id,
collective, collective,
lang,
ra.name, ra.name,
rm.content rm.content
) )

View File

@ -6,7 +6,7 @@ import cats.effect.Sync
import cats.data.OptionT import cats.data.OptionT
import doobie._ import doobie._
import doobie.implicits._ import doobie.implicits._
import docspell.common.{Ident, MetaProposalList} import docspell.common._
import docspell.store.Store import docspell.store.Store
import docspell.store.impl.Implicits._ import docspell.store.impl.Implicits._
import docspell.store.records._ import docspell.store.records._
@ -143,6 +143,7 @@ object QAttachment {
id: Ident, id: Ident,
item: Ident, item: Ident,
collective: Ident, collective: Ident,
lang: Language,
name: Option[String], name: Option[String],
content: Option[String] content: Option[String]
) )
@ -154,11 +155,14 @@ object QAttachment {
val mContent = RAttachmentMeta.Columns.content.prefix("m") val mContent = RAttachmentMeta.Columns.content.prefix("m")
val iId = RItem.Columns.id.prefix("i") val iId = RItem.Columns.id.prefix("i")
val iColl = RItem.Columns.cid.prefix("i") val iColl = RItem.Columns.cid.prefix("i")
val cId = RCollective.Columns.id.prefix("c")
val cLang = RCollective.Columns.language.prefix("c")
val cols = Seq(aId, aItem, iColl, aName, mContent) val cols = Seq(aId, aItem, iColl, cLang, aName, mContent)
val from = RAttachment.table ++ fr"a INNER JOIN" ++ val from = RAttachment.table ++ fr"a INNER JOIN" ++
RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++
fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) ++
fr"INNER JOIN" ++ RCollective.table ++ fr"c ON" ++ cId.is(iColl)
selectSimple(cols, from, Fragment.empty) selectSimple(cols, from, Fragment.empty)
.query[ContentAndName] .query[ContentAndName]