mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
PoC for clearing the Stanford NLP pipeline cache after some idle time
This commit is contained in:
parent
b08e88cd69
commit
73a9572835
@ -31,7 +31,7 @@ object TextAnalyser {
|
||||
labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
|
||||
}
|
||||
|
||||
def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
|
||||
def create[F[_]: Concurrent: Timer](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
|
||||
Resource
|
||||
.liftF(PipelineCache[F]())
|
||||
.map(cache =>
|
||||
|
@ -9,6 +9,8 @@ import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
import org.log4s.getLogger
|
||||
import scala.concurrent.duration._
|
||||
import cats.data.OptionT
|
||||
|
||||
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
|
||||
* involves IO and initializing large objects.
|
||||
@ -32,18 +34,64 @@ object PipelineCache {
|
||||
makeClassifier(settings).pure[F]
|
||||
}
|
||||
|
||||
// NOTE(review): diff rendering — the first `apply` (Sync-based, single Ref) is the
// pre-change version; the second (Concurrent + Timer) replaces it in this commit.
def apply[F[_]: Sync](): F[PipelineCache[F]] =
|
||||
Ref.of(Map.empty[String, Entry]).map(data => (new Impl[F](data): PipelineCache[F]))
|
||||
// New constructor: besides the entry map it allocates a usage counter and a
// "cleaning in progress" flag, both needed by the idle-clear scheduling below.
def apply[F[_]: Concurrent: Timer](): F[PipelineCache[F]] =
|
||||
for {
|
||||
// cache of pipeline entries keyed by a String id
data <- Ref.of(Map.empty[String, Entry])
|
||||
// usage counter; starts at Long.MinValue so it can be bumped for a long time
// before wrapping — incremented on every `obtain` (see scheduleClearPipeline)
counter <- Ref.of(Long.MinValue)
|
||||
// guards against scheduling more than one concurrent clear task
cleaning <- Ref.of(false)
|
||||
} yield new Impl[F](data, counter, cleaning): PipelineCache[F]
|
||||
|
||||
// NOTE(review): diff rendering — the Sync-only constructor below is the old
// version; the Concurrent/Timer one replaces it in this commit.
final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
|
||||
// Cache implementation holding the pipeline entries plus the bookkeeping state
// used to clear the cache after a period of no use.
final private class Impl[F[_]](
|
||||
// pipeline entries, keyed by a String id
data: Ref[F, Map[String, Entry]],
|
||||
// bumped on every `obtain`; a later snapshot comparison detects recent use
counter: Ref[F, Long],
|
||||
// true while a clear task is already scheduled/running
cleaningProgress: Ref[F, Boolean]
|
||||
// Timer is needed for the delayed clear, Concurrent for starting the fiber
)(implicit T: Timer[F], F: Concurrent[F])
|
||||
extends PipelineCache[F] {
|
||||
|
||||
// idle period to wait before attempting to clear the cache
private[this] val clearInterval = 1.minute
|
||||
private[this] val log = Logger.log4s(logger)
|
||||
|
||||
/** Returns the pipeline for `key`, creating and caching it if absent
  * (via `getOrCreate`, defined further down), and schedules a deferred
  * clearing of the whole cache after the idle interval.
  */
def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
|
||||
for {
|
||||
// presumably a fingerprint of the settings, used to detect stale entries — TODO confirm
id <- makeSettingsId(settings)
|
||||
// atomic read-modify-write on the cache map; returns the (possibly new) pipeline
nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
|
||||
// every access re-arms the idle-clear logic below
_ <- scheduleClearPipeline
|
||||
} yield nlp
|
||||
|
||||
/** Bumps the usage counter and, if no clear task is pending, starts a fiber
  * that waits `clearInterval` and then tries to clear the cache (clearStale).
  * Runs inside OptionT so a failed `tryModify` makes the whole action a no-op.
  */
private def scheduleClearPipeline: F[Unit] =
|
||||
(for {
|
||||
// tryModify may fail under contention → None → entire action short-circuits
// to the `getOrElse(())` at the bottom; `cnt` is the post-increment snapshot
cnt <- OptionT(counter.tryModify(n => (n + 1, n + 1)))
|
||||
// atomically flip the cleaning flag false→true via Ref.access; `free` is true
// only for the caller that wins the flip (setter returns false on a race)
free <- OptionT.liftF(cleaningProgress.access.flatMap { case (b, setter) =>
|
||||
if (b) false.pure[F]
|
||||
else setter(true)
|
||||
})
|
||||
_ <- OptionT.liftF(
|
||||
if (free)
|
||||
// fire-and-forget fiber; the returned Fiber handle is discarded
F.start(
|
||||
// NOTE(review): the flag is reset *before* clearStale runs, so a new
// clear task can be scheduled while clearStale is still in flight —
// appears intentional for this PoC, but worth confirming
T.sleep(clearInterval) *> cleaningProgress.set(false) *> clearStale(cnt)
|
||||
)
|
||||
else ().pure[F]
|
||||
)
|
||||
} yield ()).getOrElse(())
|
||||
|
||||
/** Clears the cache iff the usage counter still equals the snapshot `n`
  * taken when this task was scheduled, i.e. the cache was not touched while
  * waiting; otherwise re-arms the idle timer instead of clearing.
  */
private def clearStale(n: Long): F[Unit] =
|
||||
log.debug("Attempting to clear stanford nlp pipeline cache to free memory") *>
|
||||
counter.get.flatMap(x =>
|
||||
// unchanged counter ⇒ no obtain() happened during the sleep ⇒ safe to clear
if (x == n) clearAll
|
||||
else
|
||||
log.debug(
|
||||
"Don't clear yet, as it has been used in between"
|
||||
// reschedule: bumps the counter again and starts a fresh idle window
) *> scheduleClearPipeline
|
||||
)
|
||||
|
||||
/** Drops all cached entries and asks CoreNLP to release its own statically
  * cached annotators, then suggests a GC run to actually free the memory.
  */
private def clearAll: F[Unit] =
|
||||
log.info("Clearing stanford nlp pipeline cache now!") *>
|
||||
// emptying our Ref alone is not enough, hence the delay block below
data.set(Map.empty) *> Sync[F].delay {
|
||||
// turns out that everything is cached in a static map
StanfordCoreNLP.clearAnnotatorPool()
|
||||
// hint only — the JVM may ignore it, but the pipelines are large
System.gc();
|
||||
}
|
||||
|
||||
private def getOrCreate(
|
||||
key: String,
|
||||
id: String,
|
||||
@ -81,6 +129,8 @@ object PipelineCache {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
|
||||
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
|
||||
new StanfordCoreNLP(Properties.forSettings(settings))
|
||||
|
Loading…
x
Reference in New Issue
Block a user