Add language to schema, extend fts-client

Eike Kettner 2020-06-20 22:27:26 +02:00
parent 3576c45d1a
commit 1f4ff0d4c4
13 changed files with 145 additions and 54 deletions

View File

@ -48,17 +48,14 @@ object OFulltext {
batch: Batch,
search: (Query, Batch) => F[Vector[A]]
): Stream[F, A] = {
val fq = FtsQuery(ftsQ, q.collective, batch.limit, batch.offset, Nil)
val fq = FtsQuery(ftsQ, q.collective, Nil, batch.limit, batch.offset)
val qres =
for {
items <-
fts
.searchBasic(fq)
.flatMap(r => Stream.emits(r.results))
.map(_.itemId)
.compile
.toVector
.search(fq)
.map(_.results.map(_.itemId))
.map(_.toSet)
sq = q.copy(itemIds = Some(items))
res <- search(sq, batch)

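The new flow in one piece, reassembled from the hunk above as a sketch (no additional code, only comments added):

val fq = FtsQuery(ftsQ, q.collective, Nil, batch.limit, batch.offset)
val qres =
  for {
    // run the full-text query first and keep only the ids of matching items
    items <- fts.search(fq).map(_.results.map(_.itemId).toSet)
    // then restrict the regular item query to exactly those ids
    sq = q.copy(itemIds = Some(items))
    res <- search(sq, batch)
  } yield res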
View File

@ -7,19 +7,41 @@ import docspell.common._
* engine.
*
* It defines all operations required for integration into docspell.
* It uses data structures and terms of docspell. Implementation
* modules need to translate it to the engine that provides the
* features.
* It uses data structures from docspell. Implementation modules need
* to translate it to the engine that provides the features.
*/
trait FtsClient[F[_]] {
/** Optional operation to do some initialization tasks. This is called
* exactly once and then never again. It may be used to setup the
* database.
/** Initialization tasks. This is called exactly once and then never
* again (except when re-indexing everything). It may be used to
* set up the database.
*/
def initialize: F[Unit]
def searchBasic(q: FtsQuery): Stream[F, FtsResult]
def search(q: FtsQuery): F[FtsResult]
def searchAll(q: FtsQuery): Stream[F, FtsResult] =
Stream.eval(search(q)).flatMap { result =>
if (result.results.size < q.limit) Stream.emit(result)
else Stream.emit(result) ++ searchAll(q.nextPage)
}
/** Push all data to the index. Data with the same `id' is replaced.
* Values that are `None' are removed from the index (or set to an
* empty string).
*/
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
def indexData(logger: Logger[F], data: TextData*): F[Unit] =
indexData(logger, Stream.emits(data))
/** Push all data to the index, but only update existing entries. No
* new entries are created and values that are given as `None' are
* skipped.
*/
def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit]
def updateIndex(logger: Logger[F], data: TextData*): F[Unit] =
updateIndex(logger, Stream.emits(data))
}
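A brief usage sketch of the extended client API; client, logger, coll and td are placeholders and not part of this commit:

def usage[F[_]](client: FtsClient[F], logger: Logger[F], coll: Ident, td: TextData) = {
  val q = FtsQuery("invoice 2020", coll, Nil, 20, 0)
  val firstPage = client.search(q)    // a single page of results
  val allPages  = client.searchAll(q) // keeps requesting q.nextPage until a page is shorter than the limit
  val reindex   = client.indexData(logger, td) // vararg overload, wraps the docs in Stream.emits
  (firstPage, allPages, reindex)
}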

View File

@ -6,5 +6,18 @@ import docspell.common._
*
* The query itself is a raw string. Each implementation may
* interpret it according to the system in use.
*
* Searches must be restricted to the given collective and to the
* given list of item ids.
*/
final case class FtsQuery(q: String, collective: Ident, limit: Int, offset: Int, items: List[Ident])
final case class FtsQuery(
q: String,
collective: Ident,
items: List[Ident],
limit: Int,
offset: Int
) {
def nextPage: FtsQuery =
copy(offset = limit + offset)
}
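For illustration (the query string and coll are made up): nextPage only advances the offset by the page size, which is what FtsClient.searchAll relies on to walk through all results.

val page0 = FtsQuery("invoice", coll, Nil, 20, 0)
val page1 = page0.nextPage // FtsQuery("invoice", coll, Nil, 20, 20)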

View File

@ -23,6 +23,7 @@ object TextData {
item: Ident,
attachId: Ident,
collective: Ident,
lang: Language,
name: Option[String],
text: Option[String]
) extends TextData {
@ -35,10 +36,11 @@ object TextData {
item: Ident,
attachId: Ident,
collective: Ident,
lang: Language,
name: Option[String],
text: Option[String]
): TextData =
Attachment(item, attachId, collective, name, text)
Attachment(item, attachId, collective, lang, name, text)
final case class Item(
item: Ident,

View File

@ -13,13 +13,15 @@ object Field {
new Field(name)
val discriminator = Field("discriminator")
val id = Field("id")
val itemId = Field("itemId")
val collectiveId = Field("collectiveId")
val attachmentId = Field("attachmentId")
val discriminator = Field("discriminator")
val attachmentName = Field("attachmentName")
val content = Field("content")
val content_de = Field("content_de")
val content_en = Field("content_en")
val itemName = Field("itemName")
val itemNotes = Field("itemNotes")

View File

@ -1,6 +1,5 @@
package docspell.ftssolr
//import cats.implicits._
import io.circe._
import docspell.common._
import docspell.ftsclient._
@ -11,16 +10,30 @@ trait JsonCodec {
enc: Encoder[Ident]
): Encoder[TextData.Attachment] =
new Encoder[TextData.Attachment] {
final def apply(td: TextData.Attachment): Json =
Json.obj(
final def apply(td: TextData.Attachment): Json = {
val cnt =
(
td.lang match {
case Language.German =>
Field.content_de.name
case Language.English =>
Field.content_en.name
},
Json.fromString(td.text.getOrElse(""))
)
Json.fromFields(
cnt :: List(
(Field.id.name, enc(td.id)),
(Field.itemId.name, enc(td.item)),
(Field.collectiveId.name, enc(td.collective)),
(Field.attachmentId.name, enc(td.attachId)),
(Field.attachmentName.name, Json.fromString(td.name.getOrElse(""))),
(Field.content.name, Json.fromString(td.text.getOrElse(""))),
(Field.discriminator.name, Json.fromString("attachment"))
)
)
}
}
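As an illustration only (values invented, remaining fields as listed above): a German attachment now carries its text in content_de, an English one in content_en.

// {
//   "content_de": "Rechnung Nr. 42 ...",
//   "id": "...", "itemId": "...", "collectiveId": "...", "attachmentId": "...",
//   "attachmentName": "rechnung.pdf",
//   "discriminator": "attachment"
// }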
implicit def itemEncoder(implicit enc: Encoder[Ident]): Encoder[TextData.Item] =
@ -50,9 +63,9 @@ trait JsonCodec {
count <- c.downField("response").get[Int]("numFound")
maxScore <- c.downField("response").get[Double]("maxScore")
results <- c.downField("response").get[List[FtsResult.ItemMatch]]("docs")
highligh <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting")
highline = highligh.map(kv => kv._1 -> kv._2.values.flatten.toList)
} yield FtsResult(qtime, count, maxScore, highline, results)
highlighting <- c.get[Map[Ident, Map[String, List[String]]]]("highlighting")
highlight = highlighting.map(kv => kv._1 -> kv._2.values.flatten.toList)
} yield FtsResult(qtime, count, maxScore, highlight, results)
}
implicit def decodeItemMatch: Decoder[FtsResult.ItemMatch] =

View File

@ -17,8 +17,8 @@ final class SolrFtsClient[F[_]: Effect](
def initialize: F[Unit] =
solrSetup.setupSchema
def searchBasic(q: FtsQuery): Stream[F, FtsResult] =
Stream.eval(solrQuery.query(q))
def search(q: FtsQuery): F[FtsResult] =
solrQuery.query(q)
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
(for {
@ -32,6 +32,8 @@ final class SolrFtsClient[F[_]: Effect](
}
} yield ()).compile.drain
def updateIndex(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = ???
}
object SolrFtsClient {

View File

@ -18,7 +18,14 @@ trait SolrQuery[F[_]] {
def query(q: FtsQuery): F[FtsResult] = {
val fq = QueryData(
List(Field.content, Field.itemName, Field.itemNotes, Field.attachmentName),
List(
Field.content,
Field.content_de,
Field.content_en,
Field.itemName,
Field.itemNotes,
Field.attachmentName
),
List(
Field.id,
Field.itemId,

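Presumably the first list names the fields the query is matched against and the second the fields returned per hit. A rough request sketch (parameter names are an assumption and not taken from QueryData):

// q             = raw query string from FtsQuery.q
// search fields = content content_de content_en itemName itemNotes attachmentName
// return fields = id itemId collectiveId ...
// filtered by collectiveId (and by item ids when FtsQuery.items is non-empty)
// rows = limit, start = offset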
View File

@ -10,6 +10,7 @@ import org.log4s.getLogger
import _root_.io.circe.syntax._
import _root_.io.circe._
import _root_.io.circe.generic.semiauto._
import docspell.common._
trait SolrSetup[F[_]] {
@ -44,9 +45,16 @@ object SolrSetup {
Field.itemName,
Field.itemNotes
)
.traverse(addTextField)
.traverse(addTextField(None))
cmds0 *> cmds1 *> ().pure[F]
val cntLang = Language.all.traverse {
case l @ Language.German =>
addTextField(l.some)(Field.content_de)
case l @ Language.English =>
addTextField(l.some)(Field.content_en)
}
cmds0 *> cmds1 *> cntLang *> ().pure[F]
}
private def run(cmd: Json): F[Unit] = {
@ -59,10 +67,18 @@ object SolrSetup {
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.string(field)))
private def addTextField(field: Field): F[Unit] =
private def addTextField(lang: Option[Language])(field: Field): F[Unit] =
lang match {
case None =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.text(field)))
case Some(Language.German) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textDE(field)))
case Some(Language.English) =>
run(DeleteField.command(DeleteField(field))).attempt *>
run(AddField.command(AddField.textEN(field)))
}
}
}
@ -87,6 +103,12 @@ object SolrSetup {
def text(field: Field): AddField =
AddField(field, "text_general", true, true, false)
def textDE(field: Field): AddField =
AddField(field, "text_de", true, true, false)
def textEN(field: Field): AddField =
AddField(field, "text_en", true, true, false)
}
case class DeleteField(name: Field)

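A sketch of the Schema API payload that run(AddField.command(AddField.textDE(field))) presumably sends; the JSON shape of command is not part of this diff and the meaning of the three booleans (stored, indexed, multiValued) is an assumption:

// {
//   "add-field": {
//     "name": "content_de",
//     "type": "text_de",
//     "stored": true,
//     "indexed": true,
//     "multiValued": false
//   }
// }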
View File

@ -35,14 +35,12 @@ object SolrUpdate {
def single(td: TextData): F[Unit] = {
val req = Method.POST(td.asJson, url)
logger.debug(s"Running request $req")
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r"))
}
def many(tds: List[TextData]): F[Unit] = {
val req = Method.POST(tds.asJson, url)
logger.debug(s"Running request $req")
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
client.expect[String](req).map(r => logger.debug(s"Req: $req Response: $r"))
}
}
}
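Both helpers serialize the documents with circe and POST them to the configured update url. With Solr's JSON update handler, the body for many is simply an array of documents (sketch, values invented):

// POST <update url>
// [ { "id": "...", "itemId": "...", "content_de": "..." },
//   { "id": "...", "itemId": "...", "content_en": "..." } ]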

View File

@ -89,7 +89,14 @@ object Migration {
)
.map(caa =>
TextData
.attachment(caa.item, caa.id, caa.collective, caa.name, caa.content)
.attachment(
caa.item,
caa.id,
caa.collective,
caa.lang,
caa.name,
caa.content
)
)
)
)

View File

@ -1,6 +1,5 @@
package docspell.joex.process
import fs2.Stream
import bitpeace.{Mimetype, RangeDef}
import cats.data.OptionT
import cats.implicits._
@ -32,7 +31,9 @@ object TextExtraction {
)
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <- fts.indexData(ctx.logger, Stream.emits(txt.map(_._2)))
idxItem =
TextData.item(item.item.id, ctx.args.meta.collective, item.item.name.some, None)
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1))
@ -52,6 +53,7 @@ object TextExtraction {
item.item.id,
ra.id,
collective,
lang,
ra.name,
rm.content
)

View File

@ -6,7 +6,7 @@ import cats.effect.Sync
import cats.data.OptionT
import doobie._
import doobie.implicits._
import docspell.common.{Ident, MetaProposalList}
import docspell.common._
import docspell.store.Store
import docspell.store.impl.Implicits._
import docspell.store.records._
@ -143,6 +143,7 @@ object QAttachment {
id: Ident,
item: Ident,
collective: Ident,
lang: Language,
name: Option[String],
content: Option[String]
)
@ -154,11 +155,14 @@ object QAttachment {
val mContent = RAttachmentMeta.Columns.content.prefix("m")
val iId = RItem.Columns.id.prefix("i")
val iColl = RItem.Columns.cid.prefix("i")
val cId = RCollective.Columns.id.prefix("c")
val cLang = RCollective.Columns.language.prefix("c")
val cols = Seq(aId, aItem, iColl, aName, mContent)
val cols = Seq(aId, aItem, iColl, cLang, aName, mContent)
val from = RAttachment.table ++ fr"a INNER JOIN" ++
RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++
fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem)
fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ iId.is(aItem) ++
fr"INNER JOIN" ++ RCollective.table ++ fr"c ON" ++ cId.is(iColl)
selectSimple(cols, from, Fragment.empty)
.query[ContentAndName]
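The fragment should roughly expand to SQL of this shape (table and column names are indicative only, derived from the record prefixes above and not verified against the schema):

// SELECT a.id, a.itemid, i.cid, c.language, a.name, m.content
// FROM attachment a
// INNER JOIN attachmentmeta m ON a.id = m.id
// INNER JOIN item i ON i.id = a.itemid
// INNER JOIN collective c ON c.id = i.cid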