diff --git a/build.sbt b/build.sbt index 70f0c10b..56602502 100644 --- a/build.sbt +++ b/build.sbt @@ -277,6 +277,8 @@ val ftssolr = project.in(file("modules/fts-solr")). name := "docspell-fts-solr", libraryDependencies ++= Dependencies.http4sClient ++ + Dependencies.http4sCirce ++ + Dependencies.http4sDsl ++ Dependencies.circe ).dependsOn(common, ftsclient) @@ -356,7 +358,9 @@ val joex = project.in(file("modules/joex")). name := "docspell-joex", libraryDependencies ++= Dependencies.fs2 ++ - Dependencies.http4s ++ + Dependencies.http4sServer ++ + Dependencies.http4sCirce ++ + Dependencies.http4sDsl ++ Dependencies.circe ++ Dependencies.pureconfig ++ Dependencies.emilTnef ++ @@ -384,7 +388,9 @@ val restserver = project.in(file("modules/restserver")). settings( name := "docspell-restserver", libraryDependencies ++= - Dependencies.http4s ++ + Dependencies.http4sServer ++ + Dependencies.http4sCirce ++ + Dependencies.http4sDsl ++ Dependencies.circe ++ Dependencies.pureconfig ++ Dependencies.yamusca ++ diff --git a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala index 84a07920..a6fe4e21 100644 --- a/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala +++ b/modules/fts-client/src/main/scala/docspell/ftsclient/TextData.scala @@ -10,6 +10,11 @@ sealed trait TextData { def collective: Ident + final def fold[A](f: TextData.Attachment => A, g: TextData.Item => A): A = + this match { + case a: TextData.Attachment => f(a) + case a: TextData.Item => g(a) + } } object TextData { diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala new file mode 100644 index 00000000..d240e8e1 --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/JsonCodec.scala @@ -0,0 +1,41 @@ +package docspell.ftssolr + +import docspell.common._ +import docspell.ftsclient._ +import io.circe._ + +trait JsonCodec { + + implicit def attachmentEncoder: Encoder[TextData.Attachment] = + new Encoder[TextData.Attachment] { + final def apply(td: TextData.Attachment): Json = Json.obj( + ("id", Ident.encodeIdent(td.id)), + ("item", Ident.encodeIdent(td.item)), + ("collective", Ident.encodeIdent(td.collective)), + ("attachmentName", Json.fromString(td.name.getOrElse(""))), + ("content", Json.fromString(td.text.getOrElse(""))), + ("discriminator", Json.fromString("attachment")) + ) + } + + implicit def itemEncoder: Encoder[TextData.Item] = + new Encoder[TextData.Item] { + final def apply(td: TextData.Item): Json = Json.obj( + ("id", Ident.encodeIdent(td.id)), + ("item", Ident.encodeIdent(td.item)), + ("collective", Ident.encodeIdent(td.collective)), + ("itemName", Json.fromString(td.name.getOrElse(""))), + ("itemNotes", Json.fromString(td.notes.getOrElse(""))), + ("discriminator", Json.fromString("item")) + ) + } + + + implicit def textDataEncoder(implicit + ae: Encoder[TextData.Attachment], + ie: Encoder[TextData.Item] + ): Encoder[TextData] = + Encoder(_.fold(ae.apply, ie.apply)) +} + +object JsonCodec extends JsonCodec diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala index 6fc50c3e..f6281f1f 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrFtsClient.scala @@ -11,10 +11,8 @@ import docspell.ftsclient._ import docspell.ftsclient.FtsBasicResult._ final class SolrFtsClient[F[_]: Effect]( - cfg: SolrConfig, - client: Client[F] + solrUpdate: SolrUpdate[F] ) extends FtsClient[F] { - println(s"$client $cfg") def initialize: F[Unit] = ().pure[F] @@ -34,7 +32,16 @@ final class SolrFtsClient[F[_]: Effect]( ) def indexData(logger: Logger[F], data: Stream[F, TextData]): F[Unit] = - logger.info("Inserting lots of data into index") + (for { + _ <- Stream.eval(logger.debug("Inserting data into index")) + chunks <- data.chunks + res <- Stream.eval(solrUpdate.many(chunks.toList).attempt) + _ <- res match { + case Right(()) => Stream.emit(()) + case Left(ex) => + Stream.eval(logger.error(ex)("Error inserting chunk of data into index")) + } + } yield ()).compile.drain } @@ -44,6 +51,8 @@ object SolrFtsClient { cfg: SolrConfig, httpClient: Client[F] ): Resource[F, FtsClient[F]] = - Resource.pure[F, FtsClient[F]](new SolrFtsClient(cfg, httpClient)) + Resource.pure[F, FtsClient[F]]( + new SolrFtsClient(SolrUpdate(cfg, httpClient)) + ) } diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala new file mode 100644 index 00000000..7a73d45c --- /dev/null +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrUpdate.scala @@ -0,0 +1,49 @@ +package docspell.ftssolr + +import cats.effect._ +import org.http4s._ +import cats.implicits._ +import org.http4s.client.Client +import org.http4s.circe._ +import org.http4s.client.dsl.Http4sClientDsl +import _root_.io.circe.syntax._ +import org.log4s.getLogger + +import docspell.ftsclient._ +import JsonCodec._ + +trait SolrUpdate[F[_]] { + + def single(td: TextData): F[Unit] + + def many(tds: List[TextData]): F[Unit] + +} + +object SolrUpdate { + private[this] val logger = getLogger + + def apply[F[_]: ConcurrentEffect](cfg: SolrConfig, client: Client[F]): SolrUpdate[F] = { + val dsl = new Http4sClientDsl[F] {} + import dsl._ + + new SolrUpdate[F] { + val url = (Uri.unsafeFromString(cfg.url.asString) / "update") + .withQueryParam("commitWithin", "1000") + .withQueryParam("overwrite", "true") + .withQueryParam("wt", "json") + + def single(td: TextData): F[Unit] = { + val req = Method.POST(td.asJson, url) + logger.debug(s"Running request $req") + client.expect[String](req).map(r => logger.debug(s"Response: $r")) + } + + def many(tds: List[TextData]): F[Unit] = { + val req = Method.POST(tds.asJson, url) + logger.debug(s"Running request $req") + client.expect[String](req).map(r => logger.debug(s"Response: $r")) + } + } + } +} diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.7.0__fts-migration.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.7.0__fts-migration.sql index 993a14ca..f78fd7eb 100644 --- a/modules/store/src/main/resources/db/migration/postgresql/V1.7.0__fts-migration.sql +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.7.0__fts-migration.sql @@ -6,5 +6,5 @@ CREATE TABLE "fts_migration" ( "created" timestamp not null ); -CREATE UNIQE INDEX "fts_migration_version_engine_idx" +CREATE UNIQUE INDEX "fts_migration_version_engine_idx" ON "fts_migration"("version", "fts_engine"); diff --git a/project/Dependencies.scala b/project/Dependencies.scala index edab1ff3..cc704c7c 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -4,40 +4,39 @@ import sbt._ object Dependencies { - val BcryptVersion = "0.4" + val BcryptVersion = "0.4" val BetterMonadicForVersion = "0.3.1" - val BitpeaceVersion = "0.5.0" - val CalevVersion = "0.3.1" - val CirceVersion = "0.13.0" - val DoobieVersion = "0.9.0" - val EmilVersion = "0.6.1" - val FastparseVersion = "2.1.3" - val FlexmarkVersion = "0.62.2" - val FlywayVersion = "6.4.4" - val Fs2Version = "2.4.2" - val H2Version = "1.4.200" - val Http4sVersion = "0.21.4" - val Icu4jVersion = "67.1" - val JsoupVersion = "1.13.1" - val KindProjectorVersion = "0.10.3" - val Log4sVersion = "1.8.2" - val LogbackVersion = "1.2.3" - val MariaDbVersion = "2.6.0" - val MiniTestVersion = "2.8.2" - val PdfboxVersion = "2.0.20" - val PoiVersion = "4.1.2" - val PostgresVersion = "42.2.14" - val PureConfigVersion = "0.12.3" - val Slf4jVersion = "1.7.30" - val StanfordNlpVersion = "3.9.2" - val TikaVersion = "1.24.1" - val YamuscaVersion = "0.6.2" - val SwaggerUIVersion = "3.26.1" - val SemanticUIVersion = "2.4.1" - val TwelveMonkeysVersion = "3.5" - val JQueryVersion = "3.5.1" - val ViewerJSVersion = "0.5.8" - + val BitpeaceVersion = "0.5.0" + val CalevVersion = "0.3.1" + val CirceVersion = "0.13.0" + val DoobieVersion = "0.9.0" + val EmilVersion = "0.6.1" + val FastparseVersion = "2.1.3" + val FlexmarkVersion = "0.62.2" + val FlywayVersion = "6.4.4" + val Fs2Version = "2.4.2" + val H2Version = "1.4.200" + val Http4sVersion = "0.21.4" + val Icu4jVersion = "67.1" + val JsoupVersion = "1.13.1" + val KindProjectorVersion = "0.10.3" + val Log4sVersion = "1.8.2" + val LogbackVersion = "1.2.3" + val MariaDbVersion = "2.6.0" + val MiniTestVersion = "2.8.2" + val PdfboxVersion = "2.0.20" + val PoiVersion = "4.1.2" + val PostgresVersion = "42.2.14" + val PureConfigVersion = "0.12.3" + val Slf4jVersion = "1.7.30" + val StanfordNlpVersion = "3.9.2" + val TikaVersion = "1.24.1" + val YamuscaVersion = "0.6.2" + val SwaggerUIVersion = "3.26.1" + val SemanticUIVersion = "2.4.1" + val TwelveMonkeysVersion = "3.5" + val JQueryVersion = "3.5.1" + val ViewerJSVersion = "0.5.8" val calevCore = Seq( "com.github.eikek" %% "calev-core" % CalevVersion @@ -57,23 +56,27 @@ object Dependencies { ) val poi = Seq( - "org.apache.poi" % "poi" % PoiVersion, - "org.apache.poi" % "poi-ooxml" % PoiVersion, - "org.apache.poi" % "poi-scratchpad" % PoiVersion, - ).map(_.excludeAll( - ExclusionRule("commons-logging") - )) ++ jclOverSlf4j + "org.apache.poi" % "poi" % PoiVersion, + "org.apache.poi" % "poi-ooxml" % PoiVersion, + "org.apache.poi" % "poi-scratchpad" % PoiVersion + ).map( + _.excludeAll( + ExclusionRule("commons-logging") + ) + ) ++ jclOverSlf4j // https://github.com/vsch/flexmark-java // BSD 2-Clause val flexmark = Seq( - "com.vladsch.flexmark" % "flexmark" % FlexmarkVersion, - "com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion, + "com.vladsch.flexmark" % "flexmark" % FlexmarkVersion, + "com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion, "com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion - ).map(_.excludeAll( - ExclusionRule("junit"), - ExclusionRule("hamcrest-core") - )) + ).map( + _.excludeAll( + ExclusionRule("junit"), + ExclusionRule("hamcrest-core") + ) + ) val twelvemonkeys = Seq( "com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion, @@ -81,30 +84,30 @@ object Dependencies { ) val pdfbox = Seq( - "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll ( + ("org.apache.pdfbox" % "pdfbox" % PdfboxVersion).excludeAll( ExclusionRule("org.bouncycastle"), ExclusionRule("commons-logging") ) ) ++ jclOverSlf4j val emilCommon = Seq( - "com.github.eikek" %% "emil-common" % EmilVersion, + "com.github.eikek" %% "emil-common" % EmilVersion ) val emil = Seq( - "com.github.eikek" %% "emil-common" % EmilVersion, - "com.github.eikek" %% "emil-javamail" % EmilVersion + "com.github.eikek" %% "emil-common" % EmilVersion, + "com.github.eikek" %% "emil-javamail" % EmilVersion ) val emilDoobie = Seq( - "com.github.eikek" %% "emil-doobie" % EmilVersion, + "com.github.eikek" %% "emil-doobie" % EmilVersion ) val emilTnef = Seq( - "com.github.eikek" %% "emil-tnef" % EmilVersion, + "com.github.eikek" %% "emil-tnef" % EmilVersion ) val emilMarkdown = Seq( - "com.github.eikek" %% "emil-markdown" % EmilVersion, + "com.github.eikek" %% "emil-markdown" % EmilVersion ) val emilJsoup = Seq( - "com.github.eikek" %% "emil-jsoup" % EmilVersion, + "com.github.eikek" %% "emil-jsoup" % EmilVersion ) val jsoup = Seq( @@ -112,7 +115,7 @@ object Dependencies { ) val stanfordNlpCore = Seq( - "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion excludeAll( + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).excludeAll( ExclusionRule("com.io7m.xom", "xom"), ExclusionRule("javax.servlet", "javax.servlet-api"), ExclusionRule("org.apache.lucene", "lucene-queryparser"), @@ -130,8 +133,11 @@ object Dependencies { ) val stanfordNlpModels = Seq( - "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-german", - "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion classifier "models-english" + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) + .classifier("models-german"), + ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion).classifier( + "models-english" + ) ) val tika = Seq( @@ -150,22 +156,28 @@ object Dependencies { val fs2 = Seq( "co.fs2" %% "fs2-core" % Fs2Version, - "co.fs2" %% "fs2-io" % Fs2Version + "co.fs2" %% "fs2-io" % Fs2Version ) - val http4s = Seq( - "org.http4s" %% "http4s-blaze-server" % Http4sVersion, - "org.http4s" %% "http4s-circe" % Http4sVersion, - "org.http4s" %% "http4s-dsl" % Http4sVersion, - ) - val http4sClient = Seq( "org.http4s" %% "http4s-blaze-client" % Http4sVersion ) + val http4sCirce = Seq( + "org.http4s" %% "http4s-circe" % Http4sVersion + ) + + val http4sDsl = Seq( + "org.http4s" %% "http4s-dsl" % Http4sVersion + ) + + val http4sServer = Seq( + "org.http4s" %% "http4s-blaze-server" % Http4sVersion + ) + val circe = Seq( "io.circe" %% "circe-generic" % CirceVersion, - "io.circe" %% "circe-parser" % CirceVersion + "io.circe" %% "circe-parser" % CirceVersion ) // https://github.com/Log4s/log4s;ASL 2.0 @@ -203,7 +215,7 @@ object Dependencies { // https://github.com/tpolecat/doobie // MIT val doobie = Seq( - "org.tpolecat" %% "doobie-core" % DoobieVersion, + "org.tpolecat" %% "doobie-core" % DoobieVersion, "org.tpolecat" %% "doobie-hikari" % DoobieVersion ) @@ -224,18 +236,18 @@ object Dependencies { val miniTest = Seq( // https://github.com/monix/minitest // Apache 2.0 - "io.monix" %% "minitest" % MiniTestVersion, + "io.monix" %% "minitest" % MiniTestVersion, "io.monix" %% "minitest-laws" % MiniTestVersion ).map(_ % Test) - val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion - val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion + val kindProjectorPlugin = "org.typelevel" %% "kind-projector" % KindProjectorVersion + val betterMonadicFor = "com.olegpy" %% "better-monadic-for" % BetterMonadicForVersion val webjars = Seq( - "org.webjars" % "swagger-ui" % SwaggerUIVersion, - "org.webjars" % "Semantic-UI"% SemanticUIVersion, - "org.webjars" % "jquery" % JQueryVersion, - "org.webjars" % "viewerjs" % ViewerJSVersion + "org.webjars" % "swagger-ui" % SwaggerUIVersion, + "org.webjars" % "Semantic-UI" % SemanticUIVersion, + "org.webjars" % "jquery" % JQueryVersion, + "org.webjars" % "viewerjs" % ViewerJSVersion ) val icu4j = Seq(