PoC for clearing the Stanford pipeline after some idle time

This commit is contained in:
Eike Kettner 2021-01-05 23:56:20 +01:00
parent b08e88cd69
commit 73a9572835
2 changed files with 54 additions and 4 deletions

View File

@ -31,7 +31,7 @@ object TextAnalyser {
labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
}
def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
def create[F[_]: Concurrent: Timer](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
Resource
.liftF(PipelineCache[F]())
.map(cache =>

View File

@ -9,6 +9,8 @@ import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
import org.log4s.getLogger
import scala.concurrent.duration._
import cats.data.OptionT
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
* involves IO and initializing large objects.
@ -32,18 +34,64 @@ object PipelineCache {
makeClassifier(settings).pure[F]
}
/** Creates a pipeline cache backed by a `Ref` that starts out with an empty map. */
def apply[F[_]: Sync](): F[PipelineCache[F]] =
  for {
    pipelines <- Ref.of[F, Map[String, Entry]](Map.empty)
  } yield new Impl[F](pipelines): PipelineCache[F]
/** Creates a pipeline cache.
  *
  * Three pieces of state are allocated up front and handed to `Impl`: the
  * (initially empty) map of cached entries, a usage counter starting at
  * `Long.MinValue`, and a flag (initially `false`) that tells whether a
  * clearing task is currently pending.
  */
def apply[F[_]: Concurrent: Timer](): F[PipelineCache[F]] =
  Ref.of[F, Map[String, Entry]](Map.empty).flatMap { pipelines =>
    Ref.of[F, Long](Long.MinValue).flatMap { usageCounter =>
      Ref.of[F, Boolean](false).map { cleaningFlag =>
        new Impl[F](pipelines, usageCounter, cleaningFlag): PipelineCache[F]
      }
    }
  }
final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
final private class Impl[F[_]](
data: Ref[F, Map[String, Entry]],
counter: Ref[F, Long],
cleaningProgress: Ref[F, Boolean]
)(implicit T: Timer[F], F: Concurrent[F])
extends PipelineCache[F] {
// How long the background task sleeps before it attempts to clear the cache.
private[this] val clearInterval = 1.minute
// Logger used by this class, built from the enclosing object's log4s `logger`.
private[this] val log = Logger.log4s(logger)
/** Returns the pipeline for the given key and settings.
  *
  * The settings are first reduced to an id, then the cached map is consulted
  * (and possibly updated) through `getOrCreate`. After the lookup, a
  * background clearing of the cache is scheduled via `scheduleClearPipeline`.
  *
  * @param key      cache key of the requested pipeline
  * @param settings settings used to identify and create the pipeline
  * @return the pipeline produced by `getOrCreate`
  */
def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
  makeSettingsId(settings)
    .flatMap(settingsId =>
      data.modify(current => getOrCreate(key, settingsId, current, settings))
    )
    .flatMap(pipeline => scheduleClearPipeline.map(_ => pipeline))
/** Records one usage of the cache and, unless a clearing task is already
  * pending, forks a background task that sleeps for `clearInterval` and then
  * runs `clearStale` with the usage count observed by this call.
  *
  * Nothing is scheduled when the counter update via `tryModify` yields no
  * value, or when the pending flag could not be claimed by this call.
  */
private def scheduleClearPipeline: F[Unit] =
  counter.tryModify(n => (n + 1, n + 1)).flatMap {
    case None =>
      F.unit
    case Some(cnt) =>
      // Try to flip the "clearing pending" flag from false to true; the result
      // is true only if this call performed the flip.
      val claimFlag: F[Boolean] =
        cleaningProgress.access.flatMap { case (pending, setter) =>
          if (pending) F.pure(false) else setter(true)
        }
      claimFlag.flatMap { claimed =>
        if (claimed)
          // The flag is reset before clearStale runs, so clearStale is able to
          // schedule a new attempt if the cache was used in the meantime.
          F.start(
            T.sleep(clearInterval) *> cleaningProgress.set(false) *> clearStale(cnt)
          ).map(_ => ())
        else F.unit
      }
  }
/** Clears the cache if it has not been used since the usage count `n` was
  * observed; otherwise schedules another clearing attempt.
  *
  * @param n usage count seen when the clearing task was scheduled
  */
private def clearStale(n: Long): F[Unit] =
  for {
    _ <- log.debug("Attempting to clear stanford nlp pipeline cache to free memory")
    current <- counter.get
    _ <-
      if (current == n) clearAll
      else
        // The counter moved on, so the cache was used while we were sleeping.
        log.debug("Don't clear yet, as it has been used in between") *>
          scheduleClearPipeline
  } yield ()
/** Empties the cache map, clears the annotator pool of `StanfordCoreNLP`
  * and requests a garbage collection.
  */
private def clearAll: F[Unit] =
  for {
    _ <- log.info("Clearing stanford nlp pipeline cache now!")
    _ <- data.set(Map.empty)
    _ <- F.delay {
      // Dropping our own map is not enough: StanfordCoreNLP caches
      // everything in a static map as well.
      StanfordCoreNLP.clearAnnotatorPool()
      System.gc()
    }
  } yield ()
private def getOrCreate(
key: String,
id: String,
@ -81,6 +129,8 @@ object PipelineCache {
}
}
private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
new StanfordCoreNLP(Properties.forSettings(settings))