mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 13:32:51 +00:00
Index existing data in Solr
This commit is contained in:
parent
60c079f664
commit
1f4220eccb
10
build.sbt
10
build.sbt
@ -277,6 +277,8 @@ val ftssolr = project.in(file("modules/fts-solr")).
|
||||
name := "docspell-fts-solr",
|
||||
libraryDependencies ++=
|
||||
Dependencies.http4sClient ++
|
||||
Dependencies.http4sCirce ++
|
||||
Dependencies.http4sDsl ++
|
||||
Dependencies.circe
|
||||
).dependsOn(common, ftsclient)
|
||||
|
||||
@ -356,7 +358,9 @@ val joex = project.in(file("modules/joex")).
|
||||
name := "docspell-joex",
|
||||
libraryDependencies ++=
|
||||
Dependencies.fs2 ++
|
||||
Dependencies.http4s ++
|
||||
Dependencies.http4sServer ++
|
||||
Dependencies.http4sCirce ++
|
||||
Dependencies.http4sDsl ++
|
||||
Dependencies.circe ++
|
||||
Dependencies.pureconfig ++
|
||||
Dependencies.emilTnef ++
|
||||
@ -384,7 +388,9 @@ val restserver = project.in(file("modules/restserver")).
|
||||
settings(
|
||||
name := "docspell-restserver",
|
||||
libraryDependencies ++=
|
||||
Dependencies.http4s ++
|
||||
Dependencies.http4sServer ++
|
||||
Dependencies.http4sCirce ++
|
||||
Dependencies.http4sDsl ++
|
||||
Dependencies.circe ++
|
||||
Dependencies.pureconfig ++
|
||||
Dependencies.yamusca ++
|
||||
|
@ -10,6 +10,11 @@ sealed trait TextData {
|
||||
|
||||
def collective: Ident
|
||||
|
||||
/** Collapses this value into an `A` by dispatching on its concrete case.
  *
  * @param f applied when this value is a [[TextData.Attachment]]
  * @param g applied when this value is a [[TextData.Item]]
  * @return the result of whichever function was applied
  */
final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A =
  this match {
    case attachment: TextData.Attachment => f(attachment)
    case item: TextData.Item             => g(item)
  }
|
||||
}
|
||||
|
||||
object TextData {
|
||||
|
@ -0,0 +1,41 @@
|
||||
package docspell.ftssolr
|
||||
|
||||
import docspell.common._
|
||||
import docspell.ftsclient._
|
||||
import io.circe._
|
||||
|
||||
/** Circe encoders for [[TextData]] and its two cases.
  *
  * Each concrete encoder writes a flat JSON object and tags it with a
  * `discriminator` field ("attachment" or "item") so the two document kinds
  * can be told apart.
  */
trait JsonCodec {

  /** Encodes an attachment. A missing name or text is written as the empty string. */
  implicit def attachmentEncoder: Encoder[TextData.Attachment] =
    Encoder.instance { att =>
      Json.obj(
        "id"             -> Ident.encodeIdent(att.id),
        "item"           -> Ident.encodeIdent(att.item),
        "collective"     -> Ident.encodeIdent(att.collective),
        "attachmentName" -> Json.fromString(att.name.getOrElse("")),
        "content"        -> Json.fromString(att.text.getOrElse("")),
        "discriminator"  -> Json.fromString("attachment")
      )
    }

  /** Encodes an item. A missing name or notes value is written as the empty string. */
  implicit def itemEncoder: Encoder[TextData.Item] =
    Encoder.instance { it =>
      Json.obj(
        "id"            -> Ident.encodeIdent(it.id),
        "item"          -> Ident.encodeIdent(it.item),
        "collective"    -> Ident.encodeIdent(it.collective),
        "itemName"      -> Json.fromString(it.name.getOrElse("")),
        "itemNotes"     -> Json.fromString(it.notes.getOrElse("")),
        "discriminator" -> Json.fromString("item")
      )
    }

  /** Encodes any [[TextData]] by delegating to the encoder of its concrete case.
    *
    * @param ae encoder used for attachments
    * @param ie encoder used for items
    */
  implicit def textDataEncoder(implicit
      ae: Encoder[TextData.Attachment],
      ie: Encoder[TextData.Item]
  ): Encoder[TextData] =
    Encoder.instance[TextData](td => td.fold(ae.apply, ie.apply))
}
|
||||
|
||||
/** Importable instance of the [[JsonCodec]] trait: `import JsonCodec._` brings its implicit encoders into scope. */
object JsonCodec extends JsonCodec
|
@ -11,10 +11,8 @@ import docspell.ftsclient._
|
||||
import docspell.ftsclient.FtsBasicResult._
|
||||
|
||||
final class SolrFtsClient[F[_]: Effect](
|
||||
cfg: SolrConfig,
|
||||
client: Client[F]
|
||||
solrUpdate: SolrUpdate[F]
|
||||
) extends FtsClient[F] {
|
||||
println(s"$client $cfg")
|
||||
|
||||
def initialize: F[Unit] =
|
||||
().pure[F]
|
||||
@ -34,7 +32,16 @@ final class SolrFtsClient[F[_]: Effect](
|
||||
)
|
||||
|
||||
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
|
||||
logger.info("Inserting lots of data into index")
|
||||
(for {
|
||||
_ <- Stream.eval(logger.debug("Inserting data into index"))
|
||||
chunks <- data.chunks
|
||||
res <- Stream.eval(solrUpdate.many(chunks.toList).attempt)
|
||||
_ <- res match {
|
||||
case Right(()) => Stream.emit(())
|
||||
case Left(ex) =>
|
||||
Stream.eval(logger.error(ex)("Error inserting chunk of data into index"))
|
||||
}
|
||||
} yield ()).compile.drain
|
||||
|
||||
}
|
||||
|
||||
@ -44,6 +51,8 @@ object SolrFtsClient {
|
||||
cfg: SolrConfig,
|
||||
httpClient: Client[F]
|
||||
): Resource[F, FtsClient[F]] =
|
||||
Resource.pure[F, FtsClient[F]](new SolrFtsClient(cfg, httpClient))
|
||||
Resource.pure[F, FtsClient[F]](
|
||||
new SolrFtsClient(SolrUpdate(cfg, httpClient))
|
||||
)
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,49 @@
|
||||
package docspell.ftssolr
|
||||
|
||||
import cats.effect._
|
||||
import org.http4s._
|
||||
import cats.implicits._
|
||||
import org.http4s.client.Client
|
||||
import org.http4s.circe._
|
||||
import org.http4s.client.dsl.Http4sClientDsl
|
||||
import _root_.io.circe.syntax._
|
||||
import org.log4s.getLogger
|
||||
|
||||
import docspell.ftsclient._
|
||||
import JsonCodec._
|
||||
|
||||
/** Operations for submitting [[TextData]] documents, expressed in the effect `F`. */
trait SolrUpdate[F[_]] {

  /** Submits one document.
    *
    * @param td the document to submit
    */
  def single(td: TextData): F[Unit]

  /** Submits a batch of documents.
    *
    * @param tds the documents to submit
    */
  def many(tds: List[TextData]): F[Unit]
}
|
||||
|
||||
object SolrUpdate {
  private[this] val logger = getLogger

  /** Creates a [[SolrUpdate]] that POSTs JSON documents to the `update` path
    * below the configured Solr url.
    *
    * The request uri carries the query parameters `commitWithin=1000`,
    * `overwrite=true` and `wt=json`.
    *
    * @param cfg    configuration providing the base url
    * @param client http client used to run the requests
    * @return an updater whose effects perform one POST per call
    */
  def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrUpdate[F] = {
    val dsl = new Http4sClientDsl[F] {}
    import dsl._

    new SolrUpdate[F] {
      // Note: `unsafeFromString` throws when the configured url cannot be parsed.
      val url = (Uri.unsafeFromString(cfg.url.asString) / "update")
        .withQueryParam("commitWithin", "1000")
        .withQueryParam("overwrite", "true")
        .withQueryParam("wt", "json")

      def single(td: TextData): F[Unit] =
        send(td.asJson)

      def many(tds: List[TextData]): F[Unit] =
        // Nothing to index: avoid a pointless round trip with an empty array.
        if (tds.isEmpty) ().pure[F]
        else send(tds.asJson)

      /** Posts the given JSON body to the update uri and logs request and response.
        *
        * Logging is suspended in `F` so it happens when the effect runs (not when
        * it is constructed) and so the actual request is logged rather than the
        * effect value that builds it.
        */
      private def send(body: _root_.io.circe.Json): F[Unit] =
        for {
          req  <- Method.POST(body, url)
          _    <- Sync[F].delay(logger.debug(s"Running request $req"))
          resp <- client.expect[String](req)
          _    <- Sync[F].delay(logger.debug(s"Response: $resp"))
        } yield ()
    }
  }
}
|
@ -6,5 +6,5 @@ CREATE TABLE "fts_migration" (
|
||||
"created" timestamp not null
|
||||
);
|
||||
|
||||
CREATE UNIQE INDEX "fts_migration_version_engine_idx"
|
||||
CREATE UNIQUE INDEX "fts_migration_version_engine_idx"
|
||||
ON "fts_migration"("version", "fts_engine");
|
||||
|
@ -4,40 +4,39 @@ import sbt._
|
||||
|
||||
object Dependencies {
|
||||
|
||||
val BcryptVersion = "0.4"
|
||||
val BcryptVersion = "0.4"
|
||||
val BetterMonadicForVersion = "0.3.1"
|
||||
val BitpeaceVersion = "0.5.0"
|
||||
val CalevVersion = "0.3.1"
|
||||
val CirceVersion = "0.13.0"
|
||||
val DoobieVersion = "0.9.0"
|
||||
val EmilVersion = "0.6.1"
|
||||
val FastparseVersion = "2.1.3"
|
||||
val FlexmarkVersion = "0.62.2"
|
||||
val FlywayVersion = "6.4.4"
|
||||
val Fs2Version = "2.4.2"
|
||||
val H2Version = "1.4.200"
|
||||
val Http4sVersion = "0.21.4"
|
||||
val Icu4jVersion = "67.1"
|
||||
val JsoupVersion = "1.13.1"
|
||||
val KindProjectorVersion = "0.10.3"
|
||||
val Log4sVersion = "1.8.2"
|
||||
val LogbackVersion = "1.2.3"
|
||||
val MariaDbVersion = "2.6.0"
|
||||
val MiniTestVersion = "2.8.2"
|
||||
val PdfboxVersion = "2.0.20"
|
||||
val PoiVersion = "4.1.2"
|
||||
val PostgresVersion = "42.2.14"
|
||||
val PureConfigVersion = "0.12.3"
|
||||
val Slf4jVersion = "1.7.30"
|
||||
val StanfordNlpVersion = "3.9.2"
|
||||
val TikaVersion = "1.24.1"
|
||||
val YamuscaVersion = "0.6.2"
|
||||
val SwaggerUIVersion = "3.26.1"
|
||||
val SemanticUIVersion = "2.4.1"
|
||||
val TwelveMonkeysVersion = "3.5"
|
||||
val JQueryVersion = "3.5.1"
|
||||
val ViewerJSVersion = "0.5.8"
|
||||
|
||||
val BitpeaceVersion = "0.5.0"
|
||||
val CalevVersion = "0.3.1"
|
||||
val CirceVersion = "0.13.0"
|
||||
val DoobieVersion = "0.9.0"
|
||||
val EmilVersion = "0.6.1"
|
||||
val FastparseVersion = "2.1.3"
|
||||
val FlexmarkVersion = "0.62.2"
|
||||
val FlywayVersion = "6.4.4"
|
||||
val Fs2Version = "2.4.2"
|
||||
val H2Version = "1.4.200"
|
||||
val Http4sVersion = "0.21.4"
|
||||
val Icu4jVersion = "67.1"
|
||||
val JsoupVersion = "1.13.1"
|
||||
val KindProjectorVersion = "0.10.3"
|
||||
val Log4sVersion = "1.8.2"
|
||||
val LogbackVersion = "1.2.3"
|
||||
val MariaDbVersion = "2.6.0"
|
||||
val MiniTestVersion = "2.8.2"
|
||||
val PdfboxVersion = "2.0.20"
|
||||
val PoiVersion = "4.1.2"
|
||||
val PostgresVersion = "42.2.14"
|
||||
val PureConfigVersion = "0.12.3"
|
||||
val Slf4jVersion = "1.7.30"
|
||||
val StanfordNlpVersion = "3.9.2"
|
||||
val TikaVersion = "1.24.1"
|
||||
val YamuscaVersion = "0.6.2"
|
||||
val SwaggerUIVersion = "3.26.1"
|
||||
val SemanticUIVersion = "2.4.1"
|
||||
val TwelveMonkeysVersion = "3.5"
|
||||
val JQueryVersion = "3.5.1"
|
||||
val ViewerJSVersion = "0.5.8"
|
||||
|
||||
val calevCore = Seq(
|
||||
"com.github.eikek" %% "calev-core" % CalevVersion
|
||||
@ -57,23 +56,27 @@ object Dependencies {
|
||||
)
|
||||
|
||||
val poi = Seq(
|
||||
"org.apache.poi" % "poi" % PoiVersion,
|
||||
"org.apache.poi" % "poi-ooxml" % PoiVersion,
|
||||
"org.apache.poi" % "poi-scratchpad" % PoiVersion,
|
||||
).map(_.excludeAll(
|
||||
ExclusionRule("commons-logging")
|
||||
)) ++ jclOverSlf4j
|
||||
"org.apache.poi" % "poi" % PoiVersion,
|
||||
"org.apache.poi" % "poi-ooxml" % PoiVersion,
|
||||
"org.apache.poi" % "poi-scratchpad" % PoiVersion
|
||||
).map(
|
||||
_.excludeAll(
|
||||
ExclusionRule("commons-logging")
|
||||
)
|
||||
) ++ jclOverSlf4j
|
||||
|
||||
// https://github.com/vsch/flexmark-java
|
||||
// BSD 2-Clause
|
||||
val flexmark = Seq(
|
||||
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
|
||||
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
|
||||
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
|
||||
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
|
||||
"com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
|
||||
).map(_.excludeAll(
|
||||
ExclusionRule("junit"),
|
||||
ExclusionRule("hamcrest-core")
|
||||
))
|
||||
).map(
|
||||
_.excludeAll(
|
||||
ExclusionRule("junit"),
|
||||
ExclusionRule("hamcrest-core")
|
||||
)
|
||||
)
|
||||
|
||||
val twelvemonkeys = Seq(
|
||||
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
|
||||
@ -81,30 +84,30 @@ object Dependencies {
|
||||
)
|
||||
|
||||
val pdfbox = Seq(
|
||||
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
|
||||
("org.apache.pdfbox" % "pdfbox" % PdfboxVersion).excludeAll(
|
||||
ExclusionRule("org.bouncycastle"),
|
||||
ExclusionRule("commons-logging")
|
||||
)
|
||||
) ++ jclOverSlf4j
|
||||
|
||||
val emilCommon = Seq(
|
||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-common" % EmilVersion
|
||||
)
|
||||
val emil = Seq(
|
||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
||||
)
|
||||
val emilDoobie = Seq(
|
||||
"com.github.eikek" %% "emil-doobie" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-doobie" % EmilVersion
|
||||
)
|
||||
val emilTnef = Seq(
|
||||
"com.github.eikek" %% "emil-tnef" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-tnef" % EmilVersion
|
||||
)
|
||||
val emilMarkdown = Seq(
|
||||
"com.github.eikek" %% "emil-markdown" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-markdown" % EmilVersion
|
||||
)
|
||||
val emilJsoup = Seq(
|
||||
"com.github.eikek" %% "emil-jsoup" % EmilVersion,
|
||||
"com.github.eikek" %% "emil-jsoup" % EmilVersion
|
||||
)
|
||||
|
||||
val jsoup = Seq(
|
||||
@ -112,7 +115,7 @@ object Dependencies {
|
||||
)
|
||||
|
||||
val stanfordNlpCore = Seq(
|
||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).excludeAll(
|
||||
ExclusionRule("com.io7m.xom", "xom"),
|
||||
ExclusionRule("javax.servlet", "javax.servlet-api"),
|
||||
ExclusionRule("org.apache.lucene", "lucene-queryparser"),
|
||||
@ -130,8 +133,11 @@ object Dependencies {
|
||||
)
|
||||
|
||||
val stanfordNlpModels = Seq(
|
||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-german",
|
||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-english"
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models-german"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
|
||||
"models-english"
|
||||
)
|
||||
)
|
||||
|
||||
val tika = Seq(
|
||||
@ -150,22 +156,28 @@ object Dependencies {
|
||||
|
||||
val fs2 = Seq(
|
||||
"co.fs2" %% "fs2-core" % Fs2Version,
|
||||
"co.fs2" %% "fs2-io" % Fs2Version
|
||||
"co.fs2" %% "fs2-io" % Fs2Version
|
||||
)
|
||||
|
||||
val http4s = Seq(
|
||||
"org.http4s" %% "http4s-blaze-server" % Http4sVersion,
|
||||
"org.http4s" %% "http4s-circe" % Http4sVersion,
|
||||
"org.http4s" %% "http4s-dsl" % Http4sVersion,
|
||||
)
|
||||
|
||||
val http4sClient = Seq(
|
||||
"org.http4s" %% "http4s-blaze-client" % Http4sVersion
|
||||
)
|
||||
|
||||
val http4sCirce = Seq(
|
||||
"org.http4s" %% "http4s-circe" % Http4sVersion
|
||||
)
|
||||
|
||||
val http4sDsl = Seq(
|
||||
"org.http4s" %% "http4s-dsl" % Http4sVersion
|
||||
)
|
||||
|
||||
val http4sServer = Seq(
|
||||
"org.http4s" %% "http4s-blaze-server" % Http4sVersion
|
||||
)
|
||||
|
||||
val circe = Seq(
|
||||
"io.circe" %% "circe-generic" % CirceVersion,
|
||||
"io.circe" %% "circe-parser" % CirceVersion
|
||||
"io.circe" %% "circe-parser" % CirceVersion
|
||||
)
|
||||
|
||||
// https://github.com/Log4s/log4s;ASL 2.0
|
||||
@ -203,7 +215,7 @@ object Dependencies {
|
||||
// https://github.com/tpolecat/doobie
|
||||
// MIT
|
||||
val doobie = Seq(
|
||||
"org.tpolecat" %% "doobie-core" % DoobieVersion,
|
||||
"org.tpolecat" %% "doobie-core" % DoobieVersion,
|
||||
"org.tpolecat" %% "doobie-hikari" % DoobieVersion
|
||||
)
|
||||
|
||||
@ -224,18 +236,18 @@ object Dependencies {
|
||||
val miniTest = Seq(
|
||||
// https://github.com/monix/minitest
|
||||
// Apache 2.0
|
||||
"io.monix" %% "minitest" % MiniTestVersion,
|
||||
"io.monix" %% "minitest" % MiniTestVersion,
|
||||
"io.monix" %% "minitest-laws" % MiniTestVersion
|
||||
).map(_ % Test)
|
||||
|
||||
val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion
|
||||
val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion
|
||||
val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion
|
||||
val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion
|
||||
|
||||
val webjars = Seq(
|
||||
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
|
||||
"org.webjars" % "Semantic-UI"% SemanticUIVersion,
|
||||
"org.webjars" % "jquery" % JQueryVersion,
|
||||
"org.webjars" % "viewerjs" % ViewerJSVersion
|
||||
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
|
||||
"org.webjars" % "Semantic-UI" % SemanticUIVersion,
|
||||
"org.webjars" % "jquery" % JQueryVersion,
|
||||
"org.webjars" % "viewerjs" % ViewerJSVersion
|
||||
)
|
||||
|
||||
val icu4j = Seq(
|
||||
|
Loading…
x
Reference in New Issue
Block a user