mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Index existing data in solr
This commit is contained in:
parent
60c079f664
commit
1f4220eccb
10
build.sbt
10
build.sbt
@ -277,6 +277,8 @@ val ftssolr = project.in(file("modules/fts-solr")).
|
|||||||
name := "docspell-fts-solr",
|
name := "docspell-fts-solr",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.http4sClient ++
|
Dependencies.http4sClient ++
|
||||||
|
Dependencies.http4sCirce ++
|
||||||
|
Dependencies.http4sDsl ++
|
||||||
Dependencies.circe
|
Dependencies.circe
|
||||||
).dependsOn(common, ftsclient)
|
).dependsOn(common, ftsclient)
|
||||||
|
|
||||||
@ -356,7 +358,9 @@ val joex = project.in(file("modules/joex")).
|
|||||||
name := "docspell-joex",
|
name := "docspell-joex",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.fs2 ++
|
Dependencies.fs2 ++
|
||||||
Dependencies.http4s ++
|
Dependencies.http4sServer ++
|
||||||
|
Dependencies.http4sCirce ++
|
||||||
|
Dependencies.http4sDsl ++
|
||||||
Dependencies.circe ++
|
Dependencies.circe ++
|
||||||
Dependencies.pureconfig ++
|
Dependencies.pureconfig ++
|
||||||
Dependencies.emilTnef ++
|
Dependencies.emilTnef ++
|
||||||
@ -384,7 +388,9 @@ val restserver = project.in(file("modules/restserver")).
|
|||||||
settings(
|
settings(
|
||||||
name := "docspell-restserver",
|
name := "docspell-restserver",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.http4s ++
|
Dependencies.http4sServer ++
|
||||||
|
Dependencies.http4sCirce ++
|
||||||
|
Dependencies.http4sDsl ++
|
||||||
Dependencies.circe ++
|
Dependencies.circe ++
|
||||||
Dependencies.pureconfig ++
|
Dependencies.pureconfig ++
|
||||||
Dependencies.yamusca ++
|
Dependencies.yamusca ++
|
||||||
|
@ -10,6 +10,11 @@ sealed trait TextData {
|
|||||||
|
|
||||||
def collective: Ident
|
def collective: Ident
|
||||||
|
|
||||||
|
final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A =
|
||||||
|
this match {
|
||||||
|
case a: TextData.Attachment => f(a)
|
||||||
|
case a: TextData.Item => g(a)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object TextData {
|
object TextData {
|
||||||
|
@ -0,0 +1,41 @@
|
|||||||
|
package docspell.ftssolr
|
||||||
|
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.ftsclient._
|
||||||
|
import io.circe._
|
||||||
|
|
||||||
|
trait JsonCodec {
|
||||||
|
|
||||||
|
implicit def attachmentEncoder: Encoder[TextData.Attachment] =
|
||||||
|
new Encoder[TextData.Attachment] {
|
||||||
|
final def apply(td: TextData.Attachment): Json = Json.obj(
|
||||||
|
("id", Ident.encodeIdent(td.id)),
|
||||||
|
("item", Ident.encodeIdent(td.item)),
|
||||||
|
("collective", Ident.encodeIdent(td.collective)),
|
||||||
|
("attachmentName", Json.fromString(td.name.getOrElse(""))),
|
||||||
|
("content", Json.fromString(td.text.getOrElse(""))),
|
||||||
|
("discriminator", Json.fromString("attachment"))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
implicit def itemEncoder: Encoder[TextData.Item] =
|
||||||
|
new Encoder[TextData.Item] {
|
||||||
|
final def apply(td: TextData.Item): Json = Json.obj(
|
||||||
|
("id", Ident.encodeIdent(td.id)),
|
||||||
|
("item", Ident.encodeIdent(td.item)),
|
||||||
|
("collective", Ident.encodeIdent(td.collective)),
|
||||||
|
("itemName", Json.fromString(td.name.getOrElse(""))),
|
||||||
|
("itemNotes", Json.fromString(td.notes.getOrElse(""))),
|
||||||
|
("discriminator", Json.fromString("item"))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
implicit def textDataEncoder(implicit
|
||||||
|
ae: Encoder[TextData.Attachment],
|
||||||
|
ie: Encoder[TextData.Item]
|
||||||
|
): Encoder[TextData] =
|
||||||
|
Encoder(_.fold(ae.apply, ie.apply))
|
||||||
|
}
|
||||||
|
|
||||||
|
object JsonCodec extends JsonCodec
|
@ -11,10 +11,8 @@ import docspell.ftsclient._
|
|||||||
import docspell.ftsclient.FtsBasicResult._
|
import docspell.ftsclient.FtsBasicResult._
|
||||||
|
|
||||||
final class SolrFtsClient[F[_]: Effect](
|
final class SolrFtsClient[F[_]: Effect](
|
||||||
cfg: SolrConfig,
|
solrUpdate: SolrUpdate[F]
|
||||||
client: Client[F]
|
|
||||||
) extends FtsClient[F] {
|
) extends FtsClient[F] {
|
||||||
println(s"$client $cfg")
|
|
||||||
|
|
||||||
def initialize: F[Unit] =
|
def initialize: F[Unit] =
|
||||||
().pure[F]
|
().pure[F]
|
||||||
@ -34,7 +32,16 @@ final class SolrFtsClient[F[_]: Effect](
|
|||||||
)
|
)
|
||||||
|
|
||||||
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
|
def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] =
|
||||||
logger.info("Inserting lots of data into index")
|
(for {
|
||||||
|
_ <- Stream.eval(logger.debug("Inserting data into index"))
|
||||||
|
chunks <- data.chunks
|
||||||
|
res <- Stream.eval(solrUpdate.many(chunks.toList).attempt)
|
||||||
|
_ <- res match {
|
||||||
|
case Right(()) => Stream.emit(())
|
||||||
|
case Left(ex) =>
|
||||||
|
Stream.eval(logger.error(ex)("Error inserting chunk of data into index"))
|
||||||
|
}
|
||||||
|
} yield ()).compile.drain
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,6 +51,8 @@ object SolrFtsClient {
|
|||||||
cfg: SolrConfig,
|
cfg: SolrConfig,
|
||||||
httpClient: Client[F]
|
httpClient: Client[F]
|
||||||
): Resource[F, FtsClient[F]] =
|
): Resource[F, FtsClient[F]] =
|
||||||
Resource.pure[F, FtsClient[F]](new SolrFtsClient(cfg, httpClient))
|
Resource.pure[F, FtsClient[F]](
|
||||||
|
new SolrFtsClient(SolrUpdate(cfg, httpClient))
|
||||||
|
)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,49 @@
|
|||||||
|
package docspell.ftssolr
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import org.http4s._
|
||||||
|
import cats.implicits._
|
||||||
|
import org.http4s.client.Client
|
||||||
|
import org.http4s.circe._
|
||||||
|
import org.http4s.client.dsl.Http4sClientDsl
|
||||||
|
import _root_.io.circe.syntax._
|
||||||
|
import org.log4s.getLogger
|
||||||
|
|
||||||
|
import docspell.ftsclient._
|
||||||
|
import JsonCodec._
|
||||||
|
|
||||||
|
trait SolrUpdate[F[_]] {
|
||||||
|
|
||||||
|
def single(td: TextData): F[Unit]
|
||||||
|
|
||||||
|
def many(tds: List[TextData]): F[Unit]
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
object SolrUpdate {
|
||||||
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
|
def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrUpdate[F] = {
|
||||||
|
val dsl = new Http4sClientDsl[F] {}
|
||||||
|
import dsl._
|
||||||
|
|
||||||
|
new SolrUpdate[F] {
|
||||||
|
val url = (Uri.unsafeFromString(cfg.url.asString) / "update")
|
||||||
|
.withQueryParam("commitWithin", "1000")
|
||||||
|
.withQueryParam("overwrite", "true")
|
||||||
|
.withQueryParam("wt", "json")
|
||||||
|
|
||||||
|
def single(td: TextData): F[Unit] = {
|
||||||
|
val req = Method.POST(td.asJson, url)
|
||||||
|
logger.debug(s"Running request $req")
|
||||||
|
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
|
||||||
|
}
|
||||||
|
|
||||||
|
def many(tds: List[TextData]): F[Unit] = {
|
||||||
|
val req = Method.POST(tds.asJson, url)
|
||||||
|
logger.debug(s"Running request $req")
|
||||||
|
client.expect[String](req).map(r => logger.debug(s"Response: $r"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -6,5 +6,5 @@ CREATE TABLE "fts_migration" (
|
|||||||
"created" timestamp not null
|
"created" timestamp not null
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE UNIQE INDEX "fts_migration_version_engine_idx"
|
CREATE UNIQUE INDEX "fts_migration_version_engine_idx"
|
||||||
ON "fts_migration"("version", "fts_engine");
|
ON "fts_migration"("version", "fts_engine");
|
||||||
|
@ -38,7 +38,6 @@ object Dependencies {
|
|||||||
val JQueryVersion = "3.5.1"
|
val JQueryVersion = "3.5.1"
|
||||||
val ViewerJSVersion = "0.5.8"
|
val ViewerJSVersion = "0.5.8"
|
||||||
|
|
||||||
|
|
||||||
val calevCore = Seq(
|
val calevCore = Seq(
|
||||||
"com.github.eikek" %% "calev-core" % CalevVersion
|
"com.github.eikek" %% "calev-core" % CalevVersion
|
||||||
)
|
)
|
||||||
@ -59,10 +58,12 @@ object Dependencies {
|
|||||||
val poi = Seq(
|
val poi = Seq(
|
||||||
"org.apache.poi" % "poi" % PoiVersion,
|
"org.apache.poi" % "poi" % PoiVersion,
|
||||||
"org.apache.poi" % "poi-ooxml" % PoiVersion,
|
"org.apache.poi" % "poi-ooxml" % PoiVersion,
|
||||||
"org.apache.poi" % "poi-scratchpad" % PoiVersion,
|
"org.apache.poi" % "poi-scratchpad" % PoiVersion
|
||||||
).map(_.excludeAll(
|
).map(
|
||||||
|
_.excludeAll(
|
||||||
ExclusionRule("commons-logging")
|
ExclusionRule("commons-logging")
|
||||||
)) ++ jclOverSlf4j
|
)
|
||||||
|
) ++ jclOverSlf4j
|
||||||
|
|
||||||
// https://github.com/vsch/flexmark-java
|
// https://github.com/vsch/flexmark-java
|
||||||
// BSD 2-Clause
|
// BSD 2-Clause
|
||||||
@ -70,10 +71,12 @@ object Dependencies {
|
|||||||
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
|
"com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
|
||||||
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
|
"com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
|
||||||
"com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
|
"com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
|
||||||
).map(_.excludeAll(
|
).map(
|
||||||
|
_.excludeAll(
|
||||||
ExclusionRule("junit"),
|
ExclusionRule("junit"),
|
||||||
ExclusionRule("hamcrest-core")
|
ExclusionRule("hamcrest-core")
|
||||||
))
|
)
|
||||||
|
)
|
||||||
|
|
||||||
val twelvemonkeys = Seq(
|
val twelvemonkeys = Seq(
|
||||||
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
|
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
|
||||||
@ -81,30 +84,30 @@ object Dependencies {
|
|||||||
)
|
)
|
||||||
|
|
||||||
val pdfbox = Seq(
|
val pdfbox = Seq(
|
||||||
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
|
("org.apache.pdfbox" % "pdfbox" % PdfboxVersion).excludeAll(
|
||||||
ExclusionRule("org.bouncycastle"),
|
ExclusionRule("org.bouncycastle"),
|
||||||
ExclusionRule("commons-logging")
|
ExclusionRule("commons-logging")
|
||||||
)
|
)
|
||||||
) ++ jclOverSlf4j
|
) ++ jclOverSlf4j
|
||||||
|
|
||||||
val emilCommon = Seq(
|
val emilCommon = Seq(
|
||||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
"com.github.eikek" %% "emil-common" % EmilVersion
|
||||||
)
|
)
|
||||||
val emil = Seq(
|
val emil = Seq(
|
||||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||||
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
||||||
)
|
)
|
||||||
val emilDoobie = Seq(
|
val emilDoobie = Seq(
|
||||||
"com.github.eikek" %% "emil-doobie" % EmilVersion,
|
"com.github.eikek" %% "emil-doobie" % EmilVersion
|
||||||
)
|
)
|
||||||
val emilTnef = Seq(
|
val emilTnef = Seq(
|
||||||
"com.github.eikek" %% "emil-tnef" % EmilVersion,
|
"com.github.eikek" %% "emil-tnef" % EmilVersion
|
||||||
)
|
)
|
||||||
val emilMarkdown = Seq(
|
val emilMarkdown = Seq(
|
||||||
"com.github.eikek" %% "emil-markdown" % EmilVersion,
|
"com.github.eikek" %% "emil-markdown" % EmilVersion
|
||||||
)
|
)
|
||||||
val emilJsoup = Seq(
|
val emilJsoup = Seq(
|
||||||
"com.github.eikek" %% "emil-jsoup" % EmilVersion,
|
"com.github.eikek" %% "emil-jsoup" % EmilVersion
|
||||||
)
|
)
|
||||||
|
|
||||||
val jsoup = Seq(
|
val jsoup = Seq(
|
||||||
@ -112,7 +115,7 @@ object Dependencies {
|
|||||||
)
|
)
|
||||||
|
|
||||||
val stanfordNlpCore = Seq(
|
val stanfordNlpCore = Seq(
|
||||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll(
|
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).excludeAll(
|
||||||
ExclusionRule("com.io7m.xom", "xom"),
|
ExclusionRule("com.io7m.xom", "xom"),
|
||||||
ExclusionRule("javax.servlet", "javax.servlet-api"),
|
ExclusionRule("javax.servlet", "javax.servlet-api"),
|
||||||
ExclusionRule("org.apache.lucene", "lucene-queryparser"),
|
ExclusionRule("org.apache.lucene", "lucene-queryparser"),
|
||||||
@ -130,8 +133,11 @@ object Dependencies {
|
|||||||
)
|
)
|
||||||
|
|
||||||
val stanfordNlpModels = Seq(
|
val stanfordNlpModels = Seq(
|
||||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-german",
|
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||||
"edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-english"
|
.classifier("models-german"),
|
||||||
|
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier(
|
||||||
|
"models-english"
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
val tika = Seq(
|
val tika = Seq(
|
||||||
@ -153,16 +159,22 @@ object Dependencies {
|
|||||||
"co.fs2" %% "fs2-io" % Fs2Version
|
"co.fs2" %% "fs2-io" % Fs2Version
|
||||||
)
|
)
|
||||||
|
|
||||||
val http4s = Seq(
|
|
||||||
"org.http4s" %% "http4s-blaze-server" % Http4sVersion,
|
|
||||||
"org.http4s" %% "http4s-circe" % Http4sVersion,
|
|
||||||
"org.http4s" %% "http4s-dsl" % Http4sVersion,
|
|
||||||
)
|
|
||||||
|
|
||||||
val http4sClient = Seq(
|
val http4sClient = Seq(
|
||||||
"org.http4s" %% "http4s-blaze-client" % Http4sVersion
|
"org.http4s" %% "http4s-blaze-client" % Http4sVersion
|
||||||
)
|
)
|
||||||
|
|
||||||
|
val http4sCirce = Seq(
|
||||||
|
"org.http4s" %% "http4s-circe" % Http4sVersion
|
||||||
|
)
|
||||||
|
|
||||||
|
val http4sDsl = Seq(
|
||||||
|
"org.http4s" %% "http4s-dsl" % Http4sVersion
|
||||||
|
)
|
||||||
|
|
||||||
|
val http4sServer = Seq(
|
||||||
|
"org.http4s" %% "http4s-blaze-server" % Http4sVersion
|
||||||
|
)
|
||||||
|
|
||||||
val circe = Seq(
|
val circe = Seq(
|
||||||
"io.circe" %% "circe-generic" % CirceVersion,
|
"io.circe" %% "circe-generic" % CirceVersion,
|
||||||
"io.circe" %% "circe-parser" % CirceVersion
|
"io.circe" %% "circe-parser" % CirceVersion
|
||||||
@ -233,7 +245,7 @@ object Dependencies {
|
|||||||
|
|
||||||
val webjars = Seq(
|
val webjars = Seq(
|
||||||
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
|
"org.webjars" % "swagger-ui" % SwaggerUIVersion,
|
||||||
"org.webjars" % "Semantic-UI"% SemanticUIVersion,
|
"org.webjars" % "Semantic-UI" % SemanticUIVersion,
|
||||||
"org.webjars" % "jquery" % JQueryVersion,
|
"org.webjars" % "jquery" % JQueryVersion,
|
||||||
"org.webjars" % "viewerjs" % ViewerJSVersion
|
"org.webjars" % "viewerjs" % ViewerJSVersion
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user