Initial impl of a text classifier based on stanford-nlp

This commit is contained in:
Eike Kettner
2020-08-31 22:35:27 +02:00
parent 8c4f2e702b
commit 0c97b4ef76
16 changed files with 376 additions and 18 deletions

View File

@ -298,7 +298,7 @@ docspell.joex {
# These settings are used to configure the classifier. If
# multiple are given, they are all tried and the "best" is
# chosen at the end. See
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
# for more info about these settings. The values used here are
# almost identical to the ones shown there, as they yielded the
# best results with *my* dataset.

View File

@ -2,7 +2,10 @@ package docspell.joex
import java.nio.file.Path
import cats.data.NonEmptyList
import docspell.analysis.TextAnalysisConfig
import docspell.analysis.nlp.TextClassifierConfig
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
@ -62,7 +65,15 @@ object Config {
) {
def textAnalysisConfig: TextAnalysisConfig =
TextAnalysisConfig(maxLength)
TextAnalysisConfig(
maxLength,
TextClassifierConfig(
workingDir,
NonEmptyList
.fromList(classification.classifiers)
.getOrElse(NonEmptyList.of(Map.empty))
)
)
def regexNerFileConfig: RegexNerFile.Config =
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)

View File

@ -0,0 +1,64 @@
package docspell.joex.learn
import cats.data.Kleisli
import cats.data.OptionT
import cats.effect._
import fs2.Stream
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.TextClassifier.Data
import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler._
/** Scheduler task that trains a text classifier for one collective.
  *
  * The task only does work when `findActiveSettings` yields classifier
  * settings for the collective named in the task arguments; otherwise it
  * logs a warning and finishes.
  */
object LearnClassifierTask {

  /** Arguments this task is submitted with. */
  type Args = LearnClassifierArgs

  /** Creates the training task.
    *
    * When active settings exist, items are selected for the configured
    * category (the empty string if none is set) and handed to the
    * analyser's classifier; the resulting model is passed to
    * `handleModel`. The number of items is capped by the smaller of the
    * joex config value and the collective's own setting.
    *
    * @param cfg      text analysis section of the joex config
    * @param blocker  passed on to the analyser when obtaining the classifier
    * @param analyser provides the classifier used for training
    */
  def apply[F[_]: Sync: ContextShift](
      cfg: Config.TextAnalysis,
      blocker: Blocker,
      analyser: TextAnalyser[F]
  ): Task[F, Args, Unit] =
    Task { ctx =>
      findActiveSettings[F](ctx.args.collective, cfg)
        .semiflatMap { settings =>
          // Neither the global nor the per-collective limit may be exceeded.
          val maxItems = math.min(cfg.classification.itemCount, settings.itemCount)
          val trainingData = selectItems(ctx, maxItems, settings.category.getOrElse(""))

          analyser
            .classifier(blocker)
            .trainClassifier[Unit](ctx.logger, trainingData)(Kleisli(handleModel(ctx)))
        }
        .getOrElseF(logInactiveWarning(ctx.logger))
    }

  /** Callback invoked with the trained model. Not implemented yet. */
  private def handleModel[F[_]](
      ctx: Context[F, Args]
  )(trainedModel: ClassifierModel): F[Unit] =
    ???

  /** Produces the training data stream, taking the item limit `max` and
    * the `category` to learn. Not implemented yet.
    */
  private def selectItems[F[_]](
      ctx: Context[F, Args],
      max: Int,
      category: String
  ): Stream[F, Data] =
    ???

  /** Looks up the classifier settings of collective `coll`; an empty
    * result makes the task log a warning instead of training.
    * Not implemented yet.
    */
  private def findActiveSettings[F[_]: Sync](
      coll: Ident,
      cfg: Config.TextAnalysis
  ): OptionT[F, OCollective.Classifier] =
    ???

  /** Logs a warning that no training happens because classification is
    * not active.
    */
  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
    logger.warn(
      "Classification is disabled. Check joex config and the collective settings."
    )
}

View File

@ -4,7 +4,7 @@ import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.nlp.StanfordNerSettings
import docspell.common._
import docspell.joex.analysis.RegexNerFile
import docspell.joex.process.ItemData.AttachmentDates
@ -42,7 +42,7 @@ object TextAnalysis {
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(ctx.args.meta.language, false, None)
val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
for {
customNer <- nerFile.makeFile(ctx.args.meta.collective)
sett = settings.copy(regexNer = customNer)