Upgrade code base to CE3

eikek
2021-06-21 21:33:54 +02:00
parent 903ec26e54
commit bd791b4593
146 changed files with 638 additions and 758 deletions
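
The change repeated across these files is the cats-effect 3 typeclass
consolidation: the CE2 constraints Concurrent, Timer, ContextShift, and
BracketThrow collapse into the single Async constraint, and the explicit
Blocker handle disappears because the CE3 runtime manages its own blocking
pool. A minimal before/after sketch (the file-size example is illustrative,
not taken from this commit):

    import cats.effect.{Async, IO, Sync}
    import cats.effect.unsafe.implicits.global

    object Ce3Sketch {
      // CE2 needed Sync + ContextShift plus a Blocker argument:
      //   def size[F[_]: Sync: ContextShift](blocker: Blocker, p: Path): F[Long] =
      //     blocker.delay(Files.size(p))
      // CE3 needs one constraint; Sync[F].blocking runs on the runtime's blocking pool.
      def size[F[_]: Async](p: java.nio.file.Path): F[Long] =
        Sync[F].blocking(java.nio.file.Files.size(p))

      def main(args: Array[String]): Unit =
        println(size[IO](java.nio.file.Paths.get("build.sbt")).unsafeRunSync())
    }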


@@ -32,10 +32,7 @@ object TextAnalyser {
     labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
   }

-  def create[F[_]: Concurrent: Timer: ContextShift](
-      cfg: TextAnalysisConfig,
-      blocker: Blocker
-  ): Resource[F, TextAnalyser[F]] =
+  def create[F[_]: Async](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
     Resource
       .eval(Nlp(cfg.nlpConfig))
       .map(stanfordNer =>
@@ -56,7 +53,7 @@ object TextAnalyser {
       } yield Result(spans ++ list, dates)

       def classifier: TextClassifier[F] =
-        new StanfordTextClassifier[F](cfg.classifier, blocker)
+        new StanfordTextClassifier[F](cfg.classifier)

       private def textLimit(logger: Logger[F], text: String): F[String] =
         if (cfg.maxLength <= 0)
@@ -82,7 +79,7 @@ object TextAnalyser {

   /** Provides the nlp pipeline based on the configuration. */
   private object Nlp {
-    def apply[F[_]: Concurrent: Timer](
+    def apply[F[_]: Async](
        cfg: TextAnalysisConfig.NlpConfig
    ): F[Input[F] => F[Vector[NerLabel]]] =
      cfg.mode match {
@@ -104,7 +101,7 @@ object TextAnalyser {
        text: String
    )

-    def annotate[F[_]: BracketThrow](
+    def annotate[F[_]: Async](
        cache: PipelineCache[F]
    )(input: Input[F]): F[Vector[NerLabel]] =
      cache

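TextAnalyser.create drops both the Blocker parameter and the three-way
constraint because CE3's Async subsumes them: it extends Temporal (the old
Timer) and Sync (delay and blocking, the old ContextShift/Blocker pairing).
A standalone sketch of what the single constraint provides (not docspell
code; the file-reading body is an assumption for illustration):

    import cats.effect.Async
    import cats.syntax.all._
    import scala.concurrent.duration._

    object AsyncSubsumes {
      // sleep was Timer[F].sleep in CE2; blocking work needed ContextShift + Blocker.
      def waitThenRead[F[_]: Async](p: java.nio.file.Path): F[String] =
        Async[F].sleep(100.millis) *>
          Async[F].blocking(new String(java.nio.file.Files.readAllBytes(p)))
    }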

@@ -2,10 +2,11 @@ package docspell.analysis.classifier

 import java.nio.file.Path

+import cats.effect.Ref
 import cats.effect._
-import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream
+import fs2.io.file.Files

 import docspell.analysis.classifier
 import docspell.analysis.classifier.TextClassifier._
@@ -15,10 +16,8 @@ import docspell.common.syntax.FileSyntax._

 import edu.stanford.nlp.classify.ColumnDataClassifier

-final class StanfordTextClassifier[F[_]: Sync: ContextShift](
-    cfg: TextClassifierConfig,
-    blocker: Blocker
-) extends TextClassifier[F] {
+final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
+    extends TextClassifier[F] {

   def trainClassifier[A](
       logger: Logger[F],
@@ -28,7 +27,7 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
       .withTempDir(cfg.workingDir, "trainclassifier")
       .use { dir =>
         for {
-          rawData <- writeDataFile(blocker, dir, data)
+          rawData <- writeDataFile(dir, data)
           _ <- logger.debug(s"Learning from ${rawData.count} items.")
           trainData <- splitData(logger, rawData)
           scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
@@ -81,8 +80,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
       TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))

     val fileLines =
-      fs2.io.file
-        .readAll(in.file, blocker, 4096)
+      File
+        .readAll[F](in.file, 4096)
         .through(fs2.text.utf8Decode)
         .through(fs2.text.lines)
@@ -95,7 +94,7 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
           .take(nTest)
           .intersperse("\n")
           .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(td.test, blocker))
+          .through(Files[F].writeAll(td.test))
           .compile
           .drain
       _ <-
@@ -103,13 +102,13 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
           .drop(nTest)
           .intersperse("\n")
           .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(td.train, blocker))
+          .through(Files[F].writeAll(td.train))
           .compile
           .drain
     } yield td
   }

-  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
+  def writeDataFile(dir: Path, data: Stream[F, Data]): F[RawData] = {
     val target = dir.resolve("rawdata")
     for {
       counter <- Ref.of[F, Long](0L)
@@ -120,7 +119,7 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
           .evalTap(_ => counter.update(_ + 1))
           .intersperse("\r\n")
           .through(fs2.text.utf8Encode)
-          .through(fs2.io.file.writeAll(target, blocker))
+          .through(Files[F].writeAll(target))
           .compile
           .drain
       lines <- counter.get

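The same consolidation on the fs2 side: in fs2 3.x file operations live
behind the Files[F] capability (derived from Async), so
fs2.io.file.readAll(path, blocker, chunkSize) becomes
Files[F].readAll(path, chunkSize), and writeAll likewise loses its blocker
argument. A standalone sketch of the read/transform/write pattern used in
splitData and writeDataFile above (paths and chunk size are illustrative):

    import cats.effect.Async
    import fs2.io.file.Files

    object CopyLines {
      // No Blocker: Files[F] schedules I/O on the runtime's blocking pool.
      def copy[F[_]: Async](in: java.nio.file.Path, out: java.nio.file.Path): F[Unit] =
        Files[F]
          .readAll(in, 4096)
          .through(fs2.text.utf8Decode)
          .through(fs2.text.lines)
          .intersperse("\n")
          .through(fs2.text.utf8Encode)
          .through(Files[F].writeAll(out))
          .compile
          .drain
    }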

@@ -19,7 +19,7 @@ object DateFind {
       .splitToken(text, " \t.,\n\r/".toSet)
       .filter(w => lang != Language.Latvian || w.value != "gada")
       .sliding(3)
-      .filter(_.length == 3)
+      .filter(_.size == 3)
       .flatMap(q =>
         Stream.emits(
           SimpleDate
@@ -28,9 +28,9 @@
             NerDateLabel(
               sd.toLocalDate,
               NerLabel(
-                text.substring(q.head.begin, q(2).end),
+                text.substring(q.head.get.begin, q(2).end),
                 NerTag.Date,
-                q.head.begin,
+                q.head.get.begin,
                 q(2).end
               )
             )

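The head.get calls are safe despite appearances: fs2 3.x sliding emits
Chunk[A] (fs2 2.x emitted a collection whose head returned the element
directly), and Chunk#head returns an Option; the preceding
.filter(_.size == 3) guarantees every chunk that reaches head.get is
non-empty. A pure-stream illustration:

    import fs2.Stream

    object SlidingDemo {
      // Only full three-element windows pass the filter, so head.get cannot fail.
      val firsts: List[Int] =
        Stream
          .emits(List(1, 2, 3, 4))
          .sliding(3)
          .filter(_.size == 3)
          .map(_.head.get) // windows: Chunk(1,2,3), Chunk(2,3,4)
          .toList          // List(1, 2)
    }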

@@ -2,9 +2,8 @@ package docspell.analysis.nlp

 import scala.concurrent.duration.{Duration => _, _}

 import cats.Applicative
+import cats.effect.Ref
 import cats.effect._
-import cats.effect.concurrent.Ref
 import cats.implicits._

 import docspell.analysis.NlpSettings
@@ -28,7 +27,7 @@ trait PipelineCache[F[_]] {
 object PipelineCache {
   private[this] val logger = getLogger

-  def apply[F[_]: Concurrent: Timer](clearInterval: Duration)(
+  def apply[F[_]: Async](clearInterval: Duration)(
       creator: NlpSettings => Annotator[F],
       release: F[Unit]
   ): F[PipelineCache[F]] =
@@ -38,7 +37,7 @@ object PipelineCache {
       _ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
     } yield new Impl[F](data, creator, cacheClear)

-  final private class Impl[F[_]: Sync](
+  final private class Impl[F[_]: Async](
       data: Ref[F, Map[String, Entry[Annotator[F]]]],
       creator: NlpSettings => Annotator[F],
       cacheClear: CacheClearing[F]
@@ -97,20 +96,20 @@
   }

   object CacheClearing {
-    def none[F[_]: Applicative]: CacheClearing[F] =
+    def none[F[_]]: CacheClearing[F] =
       new CacheClearing[F] {
         def withCache: Resource[F, Unit] =
           Resource.pure[F, Unit](())
       }

-    def create[F[_]: Concurrent: Timer, A](
+    def create[F[_]: Async, A](
         data: Ref[F, Map[String, Entry[A]]],
         interval: Duration,
         release: F[Unit]
     ): F[CacheClearing[F]] =
       for {
         counter <- Ref.of(0L)
-        cleaning <- Ref.of(None: Option[Fiber[F, Unit]])
+        cleaning <- Ref.of(None: Option[Fiber[F, Throwable, Unit]])
         log = Logger.log4s(logger)
         result <-
           if (interval.millis <= 0)
@@ -135,10 +134,10 @@
   final private class CacheClearingImpl[F[_], A](
       data: Ref[F, Map[String, Entry[A]]],
       counter: Ref[F, Long],
-      cleaningFiber: Ref[F, Option[Fiber[F, Unit]]],
+      cleaningFiber: Ref[F, Option[Fiber[F, Throwable, Unit]]],
       clearInterval: FiniteDuration,
       release: F[Unit]
-  )(implicit T: Timer[F], F: Concurrent[F])
+  )(implicit F: Async[F])
       extends CacheClearing[F] {

     private[this] val log = Logger.log4s[F](logger)
@@ -157,8 +156,8 @@
         case None => ().pure[F]
       }

-    private def clearAllLater: F[Fiber[F, Unit]] =
-      F.start(T.sleep(clearInterval) *> clearAll)
+    private def clearAllLater: F[Fiber[F, Throwable, Unit]] =
+      F.start(F.sleep(clearInterval) *> clearAll)

     private def logDontClear: F[Unit] =
       log.info("Cancel stanford cache clearing, as it has been used in between.")

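CE3 fibers carry an explicit error type, hence Fiber[F, Unit] becomes
Fiber[F, Throwable, Unit], and sleeping no longer needs a separate Timer:
it comes from the Async instance itself. A standalone sketch of the
start/sleep/cancel shape behind clearAllLater (names are illustrative):

    import cats.effect.{Async, Fiber, IO}
    import cats.effect.unsafe.implicits.global
    import cats.syntax.all._
    import scala.concurrent.duration._

    object DelayedStart {
      // F.start now yields Fiber[F, Throwable, A]; F.sleep replaces T.sleep.
      def delayed[F[_]](work: F[Unit])(implicit F: Async[F]): F[Fiber[F, Throwable, Unit]] =
        F.start(F.sleep(1.second) *> work)

      def main(args: Array[String]): Unit =
        delayed[IO](IO.println("never printed"))
          .flatMap(_.cancel) // cancelled before the sleep elapses
          .unsafeRunSync()
    }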

@@ -2,12 +2,12 @@ package docspell.analysis.classifier

 import java.nio.file.Paths

-import scala.concurrent.ExecutionContext
-
 import cats.data.Kleisli
 import cats.data.NonEmptyList
 import cats.effect._
+import cats.effect.unsafe.implicits.global
 import fs2.Stream
+import fs2.io.file.Files

 import docspell.analysis.classifier.TextClassifier.Data
 import docspell.common._
@@ -17,8 +17,6 @@ import munit._
 class StanfordTextClassifierSuite extends FunSuite {
   val logger = Logger.log4s[IO](org.log4s.getLogger)

-  implicit val CS = IO.contextShift(ExecutionContext.global)
-
   test("learn from data") {
     val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
@@ -38,34 +36,30 @@
       })
       .covary[IO]

-    val modelExists =
-      Blocker[IO].use { blocker =>
-        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
-        classifier.trainClassifier[Boolean](logger, data)(
-          Kleisli(result => File.existsNonEmpty[IO](result.model))
-        )
-      }
+    val modelExists = {
+      val classifier = new StanfordTextClassifier[IO](cfg)
+      classifier.trainClassifier[Boolean](logger, data)(
+        Kleisli(result => File.existsNonEmpty[IO](result.model))
+      )
+    }

     assertEquals(modelExists.unsafeRunSync(), true)
   }

   test("run classifier") {
-    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
-    val things = for {
-      dir <- File.withTempDir[IO](Paths.get("target"), "testcls")
-      blocker <- Blocker[IO]
-    } yield (dir, blocker)
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+    val things = File.withTempDir[IO](Paths.get("target"), "testcls")

     things
-      .use { case (dir, blocker) =>
-        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+      .use { dir =>
+        val classifier = new StanfordTextClassifier[IO](cfg)
         val modelFile = dir.resolve("test.ser.gz")
         for {
           _ <-
             LenientUri
               .fromJava(getClass.getResource("/test.ser.gz"))
-              .readURL[IO](4096, blocker)
-              .through(fs2.io.file.writeAll(modelFile, blocker))
+              .readURL[IO](4096)
+              .through(Files[IO].writeAll(modelFile))
               .compile
               .drain
           model = ClassifierModel(modelFile)

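With no Blocker to acquire, the second test needs only the temp-directory
Resource instead of the tupled (dir, blocker). A sketch of that shape using
a hypothetical helper standing in for docspell's File.withTempDir (the
helper and paths are assumptions, not the project's actual implementation):

    import cats.effect.{IO, Resource}
    import cats.effect.unsafe.implicits.global
    import java.nio.file.{Files => JFiles, Path, Paths}

    object TempDirExample {
      // Hypothetical stand-in for File.withTempDir[IO].
      def withTempDir(parent: Path, prefix: String): Resource[IO, Path] =
        Resource.make(IO.blocking(JFiles.createTempDirectory(parent, prefix)))(dir =>
          IO.blocking(JFiles.delete(dir))
        )

      def main(args: Array[String]): Unit =
        withTempDir(Paths.get("target"), "example") // assumes ./target exists
          .use(dir => IO.println(s"working in $dir"))
          .unsafeRunSync()
    }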

@@ -3,6 +3,7 @@ package docspell.analysis.nlp
 import java.nio.file.Paths

 import cats.effect.IO
+import cats.effect.unsafe.implicits.global

 import docspell.analysis.Env
 import docspell.common._
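
Both test files pick up cats.effect.unsafe.implicits.global, which supplies
the implicit IORuntime that unsafeRunSync() requires in CE3; it replaces the
CE2 pattern of declaring an implicit ContextShift from
ExecutionContext.global in every suite. A minimal sketch:

    import cats.effect.IO
    import cats.effect.unsafe.implicits.global // provides the implicit IORuntime

    object RuntimeExample {
      def main(args: Array[String]): Unit = {
        val answer = IO.pure(41).map(_ + 1).unsafeRunSync()
        assert(answer == 42)
      }
    }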