Separate NER from classification

This commit is contained in:
Eike Kettner 2021-01-13 21:41:51 +01:00
parent f02f15e5bd
commit a699e87304
12 changed files with 25 additions and 28 deletions

View File

@ -3,13 +3,10 @@ package docspell.analysis
import cats.effect._
import cats.implicits._
import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
import docspell.analysis.contact.Contact
import docspell.analysis.date.DateFind
import docspell.analysis.nlp.PipelineCache
import docspell.analysis.nlp.StanfordNerClassifier
import docspell.analysis.nlp.StanfordNerSettings
import docspell.analysis.nlp.StanfordTextClassifier
import docspell.analysis.nlp.TextClassifier
import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
import docspell.common._
trait TextAnalyser[F[_]] {
@ -67,7 +64,7 @@ object TextAnalyser {
private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
: F[Vector[NerLabel]] =
StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
private def contactNer(text: String): F[Vector[NerLabel]] =
Sync[F].delay {

View File

@ -1,6 +1,6 @@
package docspell.analysis
import docspell.analysis.nlp.TextClassifierConfig
import docspell.analysis.classifier.TextClassifierConfig
import docspell.common._
case class TextAnalysisConfig(

View File

@ -1,4 +1,4 @@
package docspell.analysis.nlp
package docspell.analysis.classifier
import java.nio.file.Path

View File

@ -1,4 +1,4 @@
package docspell.analysis.nlp
package docspell.analysis.classifier
import java.nio.file.Path
@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref
import cats.implicits._
import fs2.Stream
import docspell.analysis.nlp.TextClassifier._
import docspell.analysis.classifier
import docspell.analysis.classifier.TextClassifier._
import docspell.analysis.nlp.Properties
import docspell.common._
import edu.stanford.nlp.classify.ColumnDataClassifier
@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
case Some(text) =>
Sync[F].delay {
val cls = ColumnDataClassifier.getClassifier(
model.model.normalize().toAbsolutePath().toString()
model.model.normalize().toAbsolutePath.toString
)
val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
Option(cat)
@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
cdc.trainClassifier(in.train.toString())
val score = cdc.testClassifier(in.test.toString())
TrainResult(score.first(), ClassifierModel(in.modelFile))
TrainResult(score.first(), classifier.ClassifierModel(in.modelFile))
}
_ <- logger.debug(s"Trained with result $res")
} yield res

View File

@ -1,9 +1,9 @@
package docspell.analysis.nlp
package docspell.analysis.classifier
import cats.data.Kleisli
import fs2.Stream
import docspell.analysis.nlp.TextClassifier.Data
import docspell.analysis.classifier.TextClassifier.Data
import docspell.common._
trait TextClassifier[F[_]] {

View File

@ -1,4 +1,4 @@
package docspell.analysis.nlp
package docspell.analysis.classifier
import java.nio.file.Path

View File

@ -9,7 +9,7 @@ import docspell.common._
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
object StanfordNerClassifier {
object StanfordNerAnnotator {
/** Runs named entity recognition on the given `text`.
*
@ -28,9 +28,9 @@ object StanfordNerClassifier {
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
cache
.obtain(cacheKey, settings)
.use(crf => Applicative[F].pure(runClassifier(crf, text)))
.use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
val doc = new CoreDocument(text)
nerClassifier.annotate(doc)
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector

View File

@ -1,4 +1,4 @@
package docspell.analysis.nlp
package docspell.analysis.classifier
import minitest._
import cats.effect._

View File

@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find english ner labels") {
val labels =
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText)
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find german ner labels") {
val labels =
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText)
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),

View File

@ -4,8 +4,7 @@ import java.nio.file.Path
import cats.data.NonEmptyList
import docspell.analysis.TextAnalysisConfig
import docspell.analysis.nlp.TextClassifierConfig
import docspell.analysis.{TextAnalysisConfig, classifier}
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
@ -69,7 +68,7 @@ object Config {
TextAnalysisConfig(
maxLength,
clearStanfordNlpInterval,
TextClassifierConfig(
classifier.TextClassifierConfig(
workingDir,
NonEmptyList
.fromList(classification.classifiers)

View File

@ -7,8 +7,8 @@ import cats.implicits._
import fs2.{Pipe, Stream}
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.TextClassifier.Data
import docspell.analysis.classifier.ClassifierModel
import docspell.analysis.classifier.TextClassifier.Data
import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config

View File

@ -5,9 +5,8 @@ import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
import docspell.analysis.nlp.StanfordNerSettings
import docspell.analysis.nlp.TextClassifier
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile