mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 09:58:26 +00:00
Initial impl of a text classifier based on stanford-nlp
This commit is contained in:
@ -298,7 +298,7 @@ docspell.joex {
|
||||
# These settings are used to configure the classifier. If
|
||||
# multiple are given, they are all tried and the "best" is
|
||||
# chosen at the end. See
|
||||
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
|
||||
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
|
||||
# for more info about these settings. The settings are almost
|
||||
# identical to the ones shown there, as they yielded best results with *my*
|
||||
# dataset.
|
||||
|
@ -2,7 +2,10 @@ package docspell.joex
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
|
||||
import docspell.analysis.TextAnalysisConfig
|
||||
import docspell.analysis.nlp.TextClassifierConfig
|
||||
import docspell.backend.Config.Files
|
||||
import docspell.common._
|
||||
import docspell.convert.ConvertConfig
|
||||
@ -62,7 +65,15 @@ object Config {
|
||||
) {
|
||||
|
||||
def textAnalysisConfig: TextAnalysisConfig =
|
||||
TextAnalysisConfig(maxLength)
|
||||
TextAnalysisConfig(
|
||||
maxLength,
|
||||
TextClassifierConfig(
|
||||
workingDir,
|
||||
NonEmptyList
|
||||
.fromList(classification.classifiers)
|
||||
.getOrElse(NonEmptyList.of(Map.empty))
|
||||
)
|
||||
)
|
||||
|
||||
def regexNerFileConfig: RegexNerFile.Config =
|
||||
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
|
||||
|
@ -0,0 +1,64 @@
|
||||
package docspell.joex.learn
|
||||
|
||||
import cats.data.Kleisli
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.ClassifierModel
|
||||
import docspell.analysis.nlp.TextClassifier.Data
|
||||
import docspell.backend.ops.OCollective
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.scheduler._
|
||||
|
||||
/** Job task that trains a text classifier for the collective named in the
  * task arguments.
  *
  * Only [[apply]] and [[logInactiveWarning]] are implemented so far; the
  * remaining helpers are `???` placeholders and throw
  * `scala.NotImplementedError` as soon as they are invoked.
  */
object LearnClassifierTask {

  /** Arguments this task is scheduled with. */
  type Args = LearnClassifierArgs

  /** Creates the training task.
    *
    * The task first looks up the active classifier settings for
    * `ctx.args.collective`. If settings are found, it selects training items
    * and runs `trainClassifier` on the analyser's classifier, passing the
    * resulting model to [[handleModel]]. If no settings are found, it only
    * logs a warning.
    *
    * @param cfg      text analysis section of the joex configuration
    * @param blocker  passed on to `analyser.classifier`
    * @param analyser provides the classifier used for training
    */
  def apply[F[_]: Sync: ContextShift](
      cfg: Config.TextAnalysis,
      blocker: Blocker,
      analyser: TextAnalyser[F]
  ): Task[F, Args, Unit] =
    Task { ctx =>
      val training: OptionT[F, Unit] =
        findActiveSettings[F](ctx.args.collective, cfg).flatMap { active =>
          // The item count is capped by the smaller of the joex config value
          // and the value from the collective's settings.
          val limit = math.min(cfg.classification.itemCount, active.itemCount)
          // A missing category is passed on as the empty string.
          val items = selectItems(ctx, limit, active.category.getOrElse(""))

          OptionT
            .liftF(
              analyser
                .classifier(blocker)
                .trainClassifier[Unit](ctx.logger, items)(Kleisli(handleModel(ctx)))
            )
            .map(_ => ())
        }

      // No settings found: nothing is trained, only a warning is logged.
      training.getOrElseF(logInactiveWarning(ctx.logger))
    }

  /** Callback receiving the trained model.
    *
    * Not implemented yet: throws `scala.NotImplementedError` when called.
    */
  private def handleModel[F[_]](
      ctx: Context[F, Args]
  )(trainedModel: ClassifierModel): F[Unit] =
    ???

  /** Produces the stream of training data handed to the classifier.
    *
    * Not implemented yet: throws `scala.NotImplementedError` when called.
    * NOTE(review): `max` and `category` presumably limit and filter the
    * selected items -- confirm once implemented.
    */
  private def selectItems[F[_]](
      ctx: Context[F, Args],
      max: Int,
      category: String
  ): Stream[F, Data] =
    ???

  /** Looks up the classifier settings for the given collective.
    *
    * Not implemented yet: throws `scala.NotImplementedError` when called.
    * The caller treats an empty result as "classification is disabled".
    */
  private def findActiveSettings[F[_]: Sync](
      coll: Ident,
      cfg: Config.TextAnalysis
  ): OptionT[F, OCollective.Classifier] =
    ???

  /** Logs a warning that no training was performed. */
  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] = {
    val message =
      "Classification is disabled. Check joex config and the collective settings."
    logger.warn(message)
  }
}
|
@ -4,7 +4,7 @@ import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.StanfordSettings
|
||||
import docspell.analysis.nlp.StanfordNerSettings
|
||||
import docspell.common._
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
@ -42,7 +42,7 @@ object TextAnalysis {
|
||||
analyser: TextAnalyser[F],
|
||||
nerFile: RegexNerFile[F]
|
||||
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
|
||||
val settings = StanfordSettings(ctx.args.meta.language, false, None)
|
||||
val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
|
||||
for {
|
||||
customNer <- nerFile.makeFile(ctx.args.meta.collective)
|
||||
sett = settings.copy(regexNer = customNer)
|
||||
|
Reference in New Issue
Block a user