Upgrade code base to CE3
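Cats Effect 3 collapses the CE2 Concurrent, Timer and ContextShift constraints into a single type class: Async[F] now provides concurrency, timers and thread management, so most signatures in this diff trade three context bounds for one. A minimal sketch of the pattern, with illustrative names not taken from this code base:

    import scala.concurrent.duration._
    import cats.effect._

    // CE2 shape: def tick[F[_]: Concurrent: Timer: ContextShift]: F[Unit]
    // CE3 shape: Async extends Temporal, GenConcurrent and Sync, so one bound suffices
    def tick[F[_]: Async]: F[Unit] =
      Async[F].sleep(1.second)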
@@ -32,10 +32,7 @@ object TextAnalyser {
       labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
   }
 
-  def create[F[_]: Concurrent: Timer: ContextShift](
-      cfg: TextAnalysisConfig,
-      blocker: Blocker
-  ): Resource[F, TextAnalyser[F]] =
+  def create[F[_]: Async](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
     Resource
       .eval(Nlp(cfg.nlpConfig))
       .map(stanfordNer =>
@@ -56,7 +53,7 @@ object TextAnalyser {
             } yield Result(spans ++ list, dates)
 
           def classifier: TextClassifier[F] =
-            new StanfordTextClassifier[F](cfg.classifier, blocker)
+            new StanfordTextClassifier[F](cfg.classifier)
 
           private def textLimit(logger: Logger[F], text: String): F[String] =
             if (cfg.maxLength <= 0)
@@ -82,7 +79,7 @@ object TextAnalyser {
 
   /** Provides the nlp pipeline based on the configuration. */
   private object Nlp {
-    def apply[F[_]: Concurrent: Timer](
+    def apply[F[_]: Async](
         cfg: TextAnalysisConfig.NlpConfig
     ): F[Input[F] => F[Vector[NerLabel]]] =
       cfg.mode match {
@@ -104,7 +101,7 @@ object TextAnalyser {
         text: String
     )
 
-    def annotate[F[_]: BracketThrow](
+    def annotate[F[_]: Async](
         cache: PipelineCache[F]
     )(input: Input[F]): F[Vector[NerLabel]] =
       cache
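The annotate signature above also drops BracketThrow, which no longer exists in CE3; bracketing now lives on MonadCancel, and MonadCancelThrow is the closest analogue when only safe acquire/release is needed. A hedged sketch (withResource is an illustrative name, not part of this code base):

    import cats.effect.MonadCancelThrow

    // bracket in CE3: acquire, use, then release even on error or cancellation
    def withResource[F[_], A, B](acquire: F[A])(release: A => F[Unit])(use: A => F[B])(implicit
        F: MonadCancelThrow[F]
    ): F[B] =
      F.bracket(acquire)(use)(release)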
@@ -2,10 +2,11 @@ package docspell.analysis.classifier
 
 import java.nio.file.Path
 
+import cats.effect.Ref
 import cats.effect._
-import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream
+import fs2.io.file.Files
 
 import docspell.analysis.classifier
 import docspell.analysis.classifier.TextClassifier._
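Ref moved from cats.effect.concurrent.Ref to cats.effect.Ref (it now lives in the cats-effect kernel), which explains the import swap in this and the following files. Usage is unchanged; a minimal sketch:

    import cats.effect.{IO, Ref}

    // same Ref API as before, only the import changed
    val count: IO[Long] =
      Ref.of[IO, Long](0L).flatMap(ref => ref.update(_ + 1) *> ref.get)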
@@ -15,10 +16,8 @@ import docspell.common.syntax.FileSyntax._
 
 import edu.stanford.nlp.classify.ColumnDataClassifier
 
-final class StanfordTextClassifier[F[_]: Sync: ContextShift](
-    cfg: TextClassifierConfig,
-    blocker: Blocker
-) extends TextClassifier[F] {
+final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
+    extends TextClassifier[F] {
 
   def trainClassifier[A](
       logger: Logger[F],
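Blocker is gone in CE3: blocking sections are requested directly from Sync, and the runtime schedules them on its built-in blocking pool, which is why the blocker constructor parameter disappears here. A sketch of the replacement pattern (fileSize is an illustrative name):

    import java.nio.file.{Files => JFiles, Path}
    import cats.effect.Sync

    // CE2: blocker.delay[F, Long](JFiles.size(path))
    // CE3: Sync[F].blocking shifts the thunk to the runtime's blocking pool
    def fileSize[F[_]: Sync](path: Path): F[Long] =
      Sync[F].blocking(JFiles.size(path))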
@@ -28,7 +27,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       .withTempDir(cfg.workingDir, "trainclassifier")
       .use { dir =>
         for {
-          rawData <- writeDataFile(blocker, dir, data)
+          rawData <- writeDataFile(dir, data)
           _ <- logger.debug(s"Learning from ${rawData.count} items.")
           trainData <- splitData(logger, rawData)
           scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
@@ -81,8 +80,8 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
 
     val fileLines =
-      fs2.io.file
-        .readAll(in.file, blocker, 4096)
+      File
+        .readAll[F](in.file, 4096)
         .through(fs2.text.utf8Decode)
         .through(fs2.text.lines)
 
@@ -95,7 +94,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
           .take(nTest)
           .intersperse("\n")
          .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(td.test, blocker))
+          .through(Files[F].writeAll(td.test))
           .compile
           .drain
       _ <-
@@ -103,13 +102,13 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
           .drop(nTest)
           .intersperse("\n")
           .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(td.train, blocker))
+          .through(Files[F].writeAll(td.train))
           .compile
           .drain
     } yield td
   }
 
-  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
+  def writeDataFile(dir: Path, data: Stream[F, Data]): F[RawData] = {
     val target = dir.resolve("rawdata")
     for {
       counter <- Ref.of[F, Long](0L)
@@ -120,7 +119,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
           .evalTap(_ => counter.update(_ + 1))
           .intersperse("\r\n")
           .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(target, blocker))
+          .through(Files[F].writeAll(target))
           .compile
           .drain
       lines <- counter.get
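The fs2.io.file.readAll/writeAll pipes that took a Blocker are replaced by the fs2 3 Files[F] capability throughout this file. A minimal copy sketch in the same style, assuming fs2 3.0.x where Files[F] still accepts java.nio.file.Path:

    import java.nio.file.Path
    import cats.effect.Async
    import fs2.io.file.Files

    // read and rewrite a file through Files[F]; no Blocker threading required
    def copyBytes[F[_]: Async](src: Path, dest: Path): F[Unit] =
      Files[F].readAll(src, 4096).through(Files[F].writeAll(dest)).compile.drain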
@@ -19,7 +19,7 @@ object DateFind {
       .splitToken(text, " \t.,\n\r/".toSet)
       .filter(w => lang != Language.Latvian || w.value != "gada")
       .sliding(3)
-      .filter(_.length == 3)
+      .filter(_.size == 3)
       .flatMap(q =>
         Stream.emits(
           SimpleDate
@@ -28,9 +28,9 @@ object DateFind {
             NerDateLabel(
               sd.toLocalDate,
               NerLabel(
-                text.substring(q.head.begin, q(2).end),
+                text.substring(q.head.get.begin, q(2).end),
                 NerTag.Date,
-                q.head.begin,
+                q.head.get.begin,
                 q(2).end
               )
             )
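The DateFind changes follow from an fs2 API change rather than CE3 itself: in fs2 2.x, sliding(n) emitted scala.collection.immutable.Queue[O] (with length and a total head), while in fs2 3.x it emits fs2.Chunk[O], whose head returns an Option, hence the _.size and q.head.get rewrites. A tiny pure-stream illustration:

    import fs2.Stream

    // fs2 3.x: sliding emits Chunk[Int]; head is Option[Int], size replaces length
    val firsts: List[Int] =
      Stream.emits(List(1, 2, 3, 4)).sliding(3).filter(_.size == 3).map(_.head.get).toList
    // firsts == List(1, 2)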
@@ -2,9 +2,8 @@ package docspell.analysis.nlp
 
 import scala.concurrent.duration.{Duration => _, _}
 
-import cats.Applicative
+import cats.effect.Ref
 import cats.effect._
-import cats.effect.concurrent.Ref
 import cats.implicits._
 
 import docspell.analysis.NlpSettings
@@ -28,7 +27,7 @@ trait PipelineCache[F[_]] {
 object PipelineCache {
   private[this] val logger = getLogger
 
-  def apply[F[_]: Concurrent: Timer](clearInterval: Duration)(
+  def apply[F[_]: Async](clearInterval: Duration)(
       creator: NlpSettings => Annotator[F],
       release: F[Unit]
   ): F[PipelineCache[F]] =
@@ -38,7 +37,7 @@ object PipelineCache {
       _ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
     } yield new Impl[F](data, creator, cacheClear)
 
-  final private class Impl[F[_]: Sync](
+  final private class Impl[F[_]: Async](
       data: Ref[F, Map[String, Entry[Annotator[F]]]],
       creator: NlpSettings => Annotator[F],
       cacheClear: CacheClearing[F]
@@ -97,20 +96,20 @@ object PipelineCache {
   }
 
   object CacheClearing {
-    def none[F[_]: Applicative]: CacheClearing[F] =
+    def none[F[_]]: CacheClearing[F] =
       new CacheClearing[F] {
         def withCache: Resource[F, Unit] =
           Resource.pure[F, Unit](())
       }
 
-    def create[F[_]: Concurrent: Timer, A](
+    def create[F[_]: Async, A](
        data: Ref[F, Map[String, Entry[A]]],
        interval: Duration,
        release: F[Unit]
    ): F[CacheClearing[F]] =
      for {
        counter <- Ref.of(0L)
-        cleaning <- Ref.of(None: Option[Fiber[F, Unit]])
+        cleaning <- Ref.of(None: Option[Fiber[F, Throwable, Unit]])
        log = Logger.log4s(logger)
        result <-
          if (interval.millis <= 0)
@@ -135,10 +134,10 @@ object PipelineCache {
   final private class CacheClearingImpl[F[_], A](
       data: Ref[F, Map[String, Entry[A]]],
       counter: Ref[F, Long],
-      cleaningFiber: Ref[F, Option[Fiber[F, Unit]]],
+      cleaningFiber: Ref[F, Option[Fiber[F, Throwable, Unit]]],
       clearInterval: FiniteDuration,
       release: F[Unit]
-  )(implicit T: Timer[F], F: Concurrent[F])
+  )(implicit F: Async[F])
       extends CacheClearing[F] {
     private[this] val log = Logger.log4s[F](logger)
 
@@ -157,8 +156,8 @@ object PipelineCache {
         case None => ().pure[F]
       }
 
-    private def clearAllLater: F[Fiber[F, Unit]] =
-      F.start(T.sleep(clearInterval) *> clearAll)
+    private def clearAllLater: F[Fiber[F, Throwable, Unit]] =
+      F.start(F.sleep(clearInterval) *> clearAll)
 
     private def logDontClear: F[Unit] =
       log.info("Cancel stanford cache clearing, as it has been used in between.")
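Two more CE3 renames surface in this file: Fiber gained an error type parameter (Fiber[F, E, A]), and Timer[F].sleep became sleep on Temporal, which Async extends, so the single Async[F] instance replaces the old Timer/Concurrent pair. A sketch of the start-after-delay pattern used above (delayed is an illustrative name):

    import scala.concurrent.duration.FiniteDuration
    import cats.effect.{Async, Fiber}
    import cats.syntax.all._

    // CE2: F.start(T.sleep(d) *> task): F[Fiber[F, Unit]]
    // CE3: the fiber type now carries the error channel explicitly
    def delayed[F[_]: Async](d: FiniteDuration, task: F[Unit]): F[Fiber[F, Throwable, Unit]] =
      Async[F].start(Async[F].sleep(d) *> task)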
@@ -2,12 +2,12 @@ package docspell.analysis.classifier
 
 import java.nio.file.Paths
 
-import scala.concurrent.ExecutionContext
-
 import cats.data.Kleisli
 import cats.data.NonEmptyList
 import cats.effect._
+import cats.effect.unsafe.implicits.global
 import fs2.Stream
+import fs2.io.file.Files
 
 import docspell.analysis.classifier.TextClassifier.Data
 import docspell.common._
@@ -17,8 +17,6 @@ import munit._
 class StanfordTextClassifierSuite extends FunSuite {
   val logger = Logger.log4s[IO](org.log4s.getLogger)
 
-  implicit val CS = IO.contextShift(ExecutionContext.global)
-
   test("learn from data") {
     val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
 
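In the tests, the CE2 implicit IO.contextShift is obsolete: CE3's unsafeRunSync() instead requires an implicit IORuntime, and cats.effect.unsafe.implicits.global supplies the default one, matching the new import above. A minimal sketch:

    import cats.effect.IO
    import cats.effect.unsafe.implicits.global // default IORuntime for unsafeRunSync()

    val four: Int = IO.pure(2).map(_ * 2).unsafeRunSync() // four == 4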
@@ -38,34 +36,30 @@ class StanfordTextClassifierSuite extends FunSuite {
       })
       .covary[IO]
 
-    val modelExists =
-      Blocker[IO].use { blocker =>
-        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
-        classifier.trainClassifier[Boolean](logger, data)(
-          Kleisli(result => File.existsNonEmpty[IO](result.model))
-        )
-      }
+    val modelExists = {
+      val classifier = new StanfordTextClassifier[IO](cfg)
+      classifier.trainClassifier[Boolean](logger, data)(
+        Kleisli(result => File.existsNonEmpty[IO](result.model))
+      )
+    }
     assertEquals(modelExists.unsafeRunSync(), true)
   }
 
   test("run classifier") {
-    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
-    val things = for {
-      dir <- File.withTempDir[IO](Paths.get("target"), "testcls")
-      blocker <- Blocker[IO]
-    } yield (dir, blocker)
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+    val things = File.withTempDir[IO](Paths.get("target"), "testcls")
 
     things
-      .use { case (dir, blocker) =>
-        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+      .use { dir =>
+        val classifier = new StanfordTextClassifier[IO](cfg)
 
         val modelFile = dir.resolve("test.ser.gz")
         for {
           _ <-
             LenientUri
               .fromJava(getClass.getResource("/test.ser.gz"))
-              .readURL[IO](4096, blocker)
-              .through(fs2.io.file.writeAll(modelFile, blocker))
+              .readURL[IO](4096)
+              .through(Files[IO].writeAll(modelFile))
               .compile
               .drain
           model = ClassifierModel(modelFile)
@@ -3,6 +3,7 @@ package docspell.analysis.nlp
 import java.nio.file.Paths
 
 import cats.effect.IO
+import cats.effect.unsafe.implicits.global
 
 import docspell.analysis.Env
 import docspell.common._