Index existing data in Solr

This commit is contained in:
Eike Kettner 2020-06-19 00:43:35 +02:00
parent 60c079f664
commit 1f4220eccb
7 changed files with 202 additions and 80 deletions

View File

@ -277,6 +277,8 @@ val ftssolr = project.in(file("modules/fts-solr")).
name := "docspell-fts-solr",
libraryDependencies ++=
Dependencies.http4sClient ++
Dependencies.http4sCirce ++
Dependencies.http4sDsl ++
Dependencies.circe
).dependsOn(common, ftsclient)
@ -356,7 +358,9 @@ val joex = project.in(file("modules/joex")).
name := "docspell-joex",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.http4s ++
Dependencies.http4sServer ++
Dependencies.http4sCirce ++
Dependencies.http4sDsl ++
Dependencies.circe ++
Dependencies.pureconfig ++
Dependencies.emilTnef ++
@ -384,7 +388,9 @@ val restserver = project.in(file("modules/restserver")).
settings(
name := "docspell-restserver",
libraryDependencies ++=
Dependencies.http4s ++
Dependencies.http4sServer ++
Dependencies.http4sCirce ++
Dependencies.http4sDsl ++
Dependencies.circe ++
Dependencies.pureconfig ++
Dependencies.yamusca ++

View File

@ -10,6 +10,11 @@ sealed trait TextData {
def collective: Ident
/** Eliminates this value: applies `f` when it is a [[TextData.Attachment]]
  * and `g` when it is a [[TextData.Item]].
  */
final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A =
  this match {
    case attachment: TextData.Attachment => f(attachment)
    case item: TextData.Item             => g(item)
  }
}
object TextData {

View File

@ -0,0 +1,41 @@
package docspell.ftssolr
import docspell.common._
import docspell.ftsclient._
import io.circe._
/** circe encoders for [[TextData]] and its two variants.
  *
  * Every encoded object carries `id`, `item`, `collective` and a
  * `discriminator` field naming the variant. Optional text fields that are
  * absent are written as empty strings.
  */
trait JsonCodec {

  /** Renders an optional text as a JSON string, using `""` when absent. */
  private def textOrEmpty(value: Option[String]): Json =
    Json.fromString(value.getOrElse(""))

  /** Encodes an attachment with its name and text content. */
  implicit def attachmentEncoder: Encoder[TextData.Attachment] =
    Encoder.instance { att =>
      Json.obj(
        "id"             -> Ident.encodeIdent(att.id),
        "item"           -> Ident.encodeIdent(att.item),
        "collective"     -> Ident.encodeIdent(att.collective),
        "attachmentName" -> textOrEmpty(att.name),
        "content"        -> textOrEmpty(att.text),
        "discriminator"  -> Json.fromString("attachment")
      )
    }

  /** Encodes an item with its name and notes. */
  implicit def itemEncoder: Encoder[TextData.Item] =
    Encoder.instance { it =>
      Json.obj(
        "id"            -> Ident.encodeIdent(it.id),
        "item"          -> Ident.encodeIdent(it.item),
        "collective"    -> Ident.encodeIdent(it.collective),
        "itemName"      -> textOrEmpty(it.name),
        "itemNotes"     -> textOrEmpty(it.notes),
        "discriminator" -> Json.fromString("item")
      )
    }

  /** Encodes any [[TextData]] by delegating to the encoder of its variant. */
  implicit def textDataEncoder(implicit
      ae: Encoder[TextData.Attachment],
      ie: Encoder[TextData.Item]
  ): Encoder[TextData] =
    Encoder.instance(td => td.fold(ae.apply, ie.apply))
}

/** Import `JsonCodec._` to bring the encoders into implicit scope. */
object JsonCodec extends JsonCodec

View File

@ -11,10 +11,8 @@ import docspell.ftsclient._
import docspell.ftsclient.FtsBasicResult._
final class SolrFtsClient[F[_]: Effect](
cfg: SolrConfig,
client: Client[F]
solrUpdate: SolrUpdate[F]
) extends FtsClient[F] {
println(s"$client $cfg")
def initialize: F[Unit] =
().pure[F]
@ -34,7 +32,16 @@ final class SolrFtsClient[F[_]: Effect](
)
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
logger.info("Inserting lots of data into index")
(for {
_ <- Stream.eval(logger.debug("Inserting data into index"))
chunks <- data.chunks
res <- Stream.eval(solrUpdate.many(chunks.toList).attempt)
_ <- res match {
case Right(()) => Stream.emit(())
case Left(ex) =>
Stream.eval(logger.error(ex)("Error inserting chunk of data into index"))
}
} yield ()).compile.drain
}
@ -44,6 +51,8 @@ object SolrFtsClient {
cfg: SolrConfig,
httpClient: Client[F]
): Resource[F, FtsClient[F]] =
Resource.pure[F, FtsClient[F]](new SolrFtsClient(cfg, httpClient))
Resource.pure[F, FtsClient[F]](
new SolrFtsClient(SolrUpdate(cfg, httpClient))
)
}

View File

@ -0,0 +1,49 @@
package docspell.ftssolr
import cats.effect._
import org.http4s._
import cats.implicits._
import org.http4s.client.Client
import org.http4s.circe._
import org.http4s.client.dsl.Http4sClientDsl
import _root_.io.circe.syntax._
import org.log4s.getLogger
import docspell.ftsclient._
import JsonCodec._
/** Sends [[TextData]] documents as JSON to an `update` endpoint. */
trait SolrUpdate[F[_]] {

  /** Posts a single document. */
  def single(td: TextData): F[Unit]

  /** Posts all given documents in one request, encoded as a JSON array. */
  def many(tds: List[TextData]): F[Unit]
}

object SolrUpdate {
  private[this] val logger = getLogger

  /** Creates a [[SolrUpdate]] that POSTs to `<cfg.url>/update` using `client`.
    *
    * Note: the url is parsed with `Uri.unsafeFromString`, so an invalid
    * `cfg.url` makes this constructor throw.
    */
  def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrUpdate[F] = {
    val dsl = new Http4sClientDsl[F] {}
    import dsl._

    new SolrUpdate[F] {
      val url = (Uri.unsafeFromString(cfg.url.asString) / "update")
        .withQueryParam("commitWithin", "1000")
        .withQueryParam("overwrite", "true")
        .withQueryParam("wt", "json")

      def single(td: TextData): F[Unit] =
        post(td.asJson)

      def many(tds: List[TextData]): F[Unit] =
        post(tds.asJson)

      /** Builds the POST request for `body`, runs it and logs request and
        * response. All logging is suspended in `F`, so nothing happens until
        * the returned effect is run, and the log shows the request itself
        * rather than the effect that builds it.
        */
      private def post(body: _root_.io.circe.Json): F[Unit] =
        for {
          req  <- Method.POST(body, url)
          _    <- debug(s"Running request $req")
          resp <- client.expect[String](req)
          _    <- debug(s"Response: $resp")
        } yield ()

      /** Suspends a debug log statement in `F`. */
      private def debug(msg: => String): F[Unit] =
        Sync[F].delay(logger.debug(msg))
    }
  }
}

View File

@ -6,5 +6,5 @@ CREATE TABLE "fts_migration" (
"created" timestamp not null
);
CREATE UNIQE INDEX "fts_migration_version_engine_idx"
CREATE UNIQUE INDEX "fts_migration_version_engine_idx"
ON "fts_migration"("version", "fts_engine");

View File

@ -4,40 +4,39 @@ import sbt._
object Dependencies {
val BcryptVersion = "0.4"
val BcryptVersion = "0.4"
val BetterMonadicForVersion = "0.3.1"
val BitpeaceVersion = "0.5.0"
val CalevVersion = "0.3.1"
val CirceVersion = "0.13.0"
val DoobieVersion = "0.9.0"
val EmilVersion = "0.6.1"
val FastparseVersion = "2.1.3"
val FlexmarkVersion = "0.62.2"
val FlywayVersion = "6.4.4"
val Fs2Version = "2.4.2"
val H2Version = "1.4.200"
val Http4sVersion = "0.21.4"
val Icu4jVersion = "67.1"
val JsoupVersion = "1.13.1"
val KindProjectorVersion = "0.10.3"
val Log4sVersion = "1.8.2"
val LogbackVersion = "1.2.3"
val MariaDbVersion = "2.6.0"
val MiniTestVersion = "2.8.2"
val PdfboxVersion = "2.0.20"
val PoiVersion = "4.1.2"
val PostgresVersion = "42.2.14"
val PureConfigVersion = "0.12.3"
val Slf4jVersion = "1.7.30"
val StanfordNlpVersion = "3.9.2"
val TikaVersion = "1.24.1"
val YamuscaVersion = "0.6.2"
val SwaggerUIVersion = "3.26.1"
val SemanticUIVersion = "2.4.1"
val TwelveMonkeysVersion = "3.5"
val JQueryVersion = "3.5.1"
val ViewerJSVersion = "0.5.8"
val BitpeaceVersion = "0.5.0"
val CalevVersion = "0.3.1"
val CirceVersion = "0.13.0"
val DoobieVersion = "0.9.0"
val EmilVersion = "0.6.1"
val FastparseVersion = "2.1.3"
val FlexmarkVersion = "0.62.2"
val FlywayVersion = "6.4.4"
val Fs2Version = "2.4.2"
val H2Version = "1.4.200"
val Http4sVersion = "0.21.4"
val Icu4jVersion = "67.1"
val JsoupVersion = "1.13.1"
val KindProjectorVersion = "0.10.3"
val Log4sVersion = "1.8.2"
val LogbackVersion = "1.2.3"
val MariaDbVersion = "2.6.0"
val MiniTestVersion = "2.8.2"
val PdfboxVersion = "2.0.20"
val PoiVersion = "4.1.2"
val PostgresVersion = "42.2.14"
val PureConfigVersion = "0.12.3"
val Slf4jVersion = "1.7.30"
val StanfordNlpVersion = "3.9.2"
val TikaVersion = "1.24.1"
val YamuscaVersion = "0.6.2"
val SwaggerUIVersion = "3.26.1"
val SemanticUIVersion = "2.4.1"
val TwelveMonkeysVersion = "3.5"
val JQueryVersion = "3.5.1"
val ViewerJSVersion = "0.5.8"
val calevCore = Seq(
"com.github.eikek" %% "calev-core" % CalevVersion
@ -57,23 +56,27 @@ object Dependencies {
)
val poi = Seq(
"org.apache.poi" % "poi" % PoiVersion,
"org.apache.poi" % "poi-ooxml" % PoiVersion,
"org.apache.poi" % "poi-scratchpad" % PoiVersion,
).map(_.excludeAll(
ExclusionRule("commons-logging")
)) ++ jclOverSlf4j
"org.apache.poi" % "poi" % PoiVersion,
"org.apache.poi" % "poi-ooxml" % PoiVersion,
"org.apache.poi" % "poi-scratchpad" % PoiVersion
).map(
_.excludeAll(
ExclusionRule("commons-logging")
)
) ++ jclOverSlf4j
// https://github.com/vsch/flexmark-java
// BSD 2-Clause
val flexmark = Seq(
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
"com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
).map(_.excludeAll(
ExclusionRule("junit"),
ExclusionRule("hamcrest-core")
))
).map(
_.excludeAll(
ExclusionRule("junit"),
ExclusionRule("hamcrest-core")
)
)
val twelvemonkeys = Seq(
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
@ -81,30 +84,30 @@ object Dependencies {
)
val pdfbox = Seq(
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
("org.apache.pdfbox" % "pdfbox" % PdfboxVersion).excludeAll(
ExclusionRule("org.bouncycastle"),
ExclusionRule("commons-logging")
)
) ++ jclOverSlf4j
val emilCommon = Seq(
"com.github.eikek" %% "emil-common" % EmilVersion,
"com.github.eikek" %% "emil-common" % EmilVersion
)
val emil = Seq(
"com.github.eikek" %% "emil-common" % EmilVersion,
"com.github.eikek" %% "emil-javamail" % EmilVersion
"com.github.eikek" %% "emil-common" % EmilVersion,
"com.github.eikek" %% "emil-javamail" % EmilVersion
)
val emilDoobie = Seq(
"com.github.eikek" %% "emil-doobie" % EmilVersion,
"com.github.eikek" %% "emil-doobie" % EmilVersion
)
val emilTnef = Seq(
"com.github.eikek" %% "emil-tnef" % EmilVersion,
"com.github.eikek" %% "emil-tnef" % EmilVersion
)
val emilMarkdown = Seq(
"com.github.eikek" %% "emil-markdown" % EmilVersion,
"com.github.eikek" %% "emil-markdown" % EmilVersion
)
val emilJsoup = Seq(
"com.github.eikek" %% "emil-jsoup" % EmilVersion,
"com.github.eikek" %% "emil-jsoup" % EmilVersion
)
val jsoup = Seq(
@ -112,7 +115,7 @@ object Dependencies {
)
val stanfordNlpCore = Seq(
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).excludeAll(
ExclusionRule("com.io7m.xom", "xom"),
ExclusionRule("javax.servlet", "javax.servlet-api"),
ExclusionRule("org.apache.lucene", "lucene-queryparser"),
@ -130,8 +133,11 @@ object Dependencies {
)
val stanfordNlpModels = Seq(
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-german",
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-english"
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
"models-english"
)
)
val tika = Seq(
@ -150,22 +156,28 @@ object Dependencies {
val fs2 = Seq(
"co.fs2" %% "fs2-core" % Fs2Version,
"co.fs2" %% "fs2-io" % Fs2Version
"co.fs2" %% "fs2-io" % Fs2Version
)
val http4s = Seq(
"org.http4s" %% "http4s-blaze-server" % Http4sVersion,
"org.http4s" %% "http4s-circe" % Http4sVersion,
"org.http4s" %% "http4s-dsl" % Http4sVersion,
)
val http4sClient = Seq(
"org.http4s" %% "http4s-blaze-client" % Http4sVersion
)
val http4sCirce = Seq(
"org.http4s" %% "http4s-circe" % Http4sVersion
)
val http4sDsl = Seq(
"org.http4s" %% "http4s-dsl" % Http4sVersion
)
val http4sServer = Seq(
"org.http4s" %% "http4s-blaze-server" % Http4sVersion
)
val circe = Seq(
"io.circe" %% "circe-generic" % CirceVersion,
"io.circe" %% "circe-parser" % CirceVersion
"io.circe" %% "circe-parser" % CirceVersion
)
// https://github.com/Log4s/log4s;ASL 2.0
@ -203,7 +215,7 @@ object Dependencies {
// https://github.com/tpolecat/doobie
// MIT
val doobie = Seq(
"org.tpolecat" %% "doobie-core" % DoobieVersion,
"org.tpolecat" %% "doobie-core" % DoobieVersion,
"org.tpolecat" %% "doobie-hikari" % DoobieVersion
)
@ -224,18 +236,18 @@ object Dependencies {
val miniTest = Seq(
// https://github.com/monix/minitest
// Apache 2.0
"io.monix" %% "minitest" % MiniTestVersion,
"io.monix" %% "minitest" % MiniTestVersion,
"io.monix" %% "minitest-laws" % MiniTestVersion
).map(_ % Test)
val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion
val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion
val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion
val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion
val webjars = Seq(
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
"org.webjars" % "Semantic-UI"% SemanticUIVersion,
"org.webjars" % "jquery" % JQueryVersion,
"org.webjars" % "viewerjs" % ViewerJSVersion
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
"org.webjars" % "Semantic-UI" % SemanticUIVersion,
"org.webjars" % "jquery" % JQueryVersion,
"org.webjars" % "viewerjs" % ViewerJSVersion
)
val icu4j = Seq(