From a699e87304ac17be62a72d9a09e6d97493ec8cc9 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 13 Jan 2021 21:41:51 +0100 Subject: [PATCH] Separate ner from classification --- .../main/scala/docspell/analysis/TextAnalyser.scala | 9 +++------ .../scala/docspell/analysis/TextAnalysisConfig.scala | 2 +- .../analysis/{nlp => classifier}/ClassifierModel.scala | 2 +- .../{nlp => classifier}/StanfordTextClassifier.scala | 10 ++++++---- .../analysis/{nlp => classifier}/TextClassifier.scala | 4 ++-- .../{nlp => classifier}/TextClassifierConfig.scala | 2 +- ...dNerClassifier.scala => StanfordNerAnnotator.scala} | 6 +++--- .../StanfordTextClassifierSuite.scala | 2 +- .../docspell/analysis/nlp/TextAnalyserSuite.scala | 4 ++-- modules/joex/src/main/scala/docspell/joex/Config.scala | 5 ++--- .../docspell/joex/learn/LearnClassifierTask.scala | 4 ++-- .../scala/docspell/joex/process/TextAnalysis.scala | 3 +-- 12 files changed, 25 insertions(+), 28 deletions(-) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/ClassifierModel.scala (64%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/StanfordTextClassifier.scala (93%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/TextClassifier.scala (83%) rename modules/analysis/src/main/scala/docspell/analysis/{nlp => classifier}/TextClassifierConfig.scala (82%) rename modules/analysis/src/main/scala/docspell/analysis/nlp/{StanfordNerClassifier.scala => StanfordNerAnnotator.scala} (86%) rename modules/analysis/src/test/scala/docspell/analysis/{nlp => classifier}/StanfordTextClassifierSuite.scala (98%) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index c319b784..b67347ae 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -3,13 +3,10 @@ package docspell.analysis import cats.effect._ import cats.implicits._ +import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier} import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind -import docspell.analysis.nlp.PipelineCache -import docspell.analysis.nlp.StanfordNerClassifier -import docspell.analysis.nlp.StanfordNerSettings -import docspell.analysis.nlp.StanfordTextClassifier -import docspell.analysis.nlp.TextClassifier +import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings} import docspell.common._ trait TextAnalyser[F[_]] { @@ -67,7 +64,7 @@ object TextAnalyser { private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) : F[Vector[NerLabel]] = - StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) + StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text) private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay { diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index cb6e1d39..2dbfbfc4 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,6 +1,6 @@ package docspell.analysis -import docspell.analysis.nlp.TextClassifierConfig +import docspell.analysis.classifier.TextClassifierConfig import docspell.common._ case class TextAnalysisConfig( diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala similarity index 64% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala index 82f9f9cc..071a8e29 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/ClassifierModel.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala similarity index 93% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala index 091d9e16..edd1c7da 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path @@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref import cats.implicits._ import fs2.Stream -import docspell.analysis.nlp.TextClassifier._ +import docspell.analysis.classifier +import docspell.analysis.classifier.TextClassifier._ +import docspell.analysis.nlp.Properties import docspell.common._ import edu.stanford.nlp.classify.ColumnDataClassifier @@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( case Some(text) => Sync[F].delay { val cls = ColumnDataClassifier.getClassifier( - model.model.normalize().toAbsolutePath().toString() + model.model.normalize().toAbsolutePath.toString ) val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text))) Option(cat) @@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props))) cdc.trainClassifier(in.train.toString()) val score = cdc.testClassifier(in.test.toString()) - TrainResult(score.first(), ClassifierModel(in.modelFile)) + TrainResult(score.first(), classifier.ClassifierModel(in.modelFile)) } _ <- logger.debug(s"Trained with result $res") } yield res diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala similarity index 83% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala index f2927d0c..3569f499 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifier.scala @@ -1,9 +1,9 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import cats.data.Kleisli import fs2.Stream -import docspell.analysis.nlp.TextClassifier.Data +import docspell.analysis.classifier.TextClassifier.Data import docspell.common._ trait TextClassifier[F[_]] { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala similarity index 82% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala rename to modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala index e3baac46..bb628ebf 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/TextClassifierConfig.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import java.nio.file.Path diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala similarity index 86% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala index 3f196b8e..df9fa431 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala @@ -9,7 +9,7 @@ import docspell.common._ import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP} -object StanfordNerClassifier { +object StanfordNerAnnotator { /** Runs named entity recognition on the given `text`. * @@ -28,9 +28,9 @@ object StanfordNerClassifier { )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = cache .obtain(cacheKey, settings) - .use(crf => Applicative[F].pure(runClassifier(crf, text))) + .use(crf => Applicative[F].pure(nerAnnotate(crf, text))) - def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { + def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = { val doc = new CoreDocument(text) nerClassifier.annotate(doc) doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala similarity index 98% rename from modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala rename to modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala index e38ba703..0229585c 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala @@ -1,4 +1,4 @@ -package docspell.analysis.nlp +package docspell.analysis.classifier import minitest._ import cats.effect._ diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index b22093f1..e0dfc4a0 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find english ner labels") { val labels = - StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText) + StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find german ner labels") { val labels = - StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText) + StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 601d0049..8fba3582 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -4,8 +4,7 @@ import java.nio.file.Path import cats.data.NonEmptyList -import docspell.analysis.TextAnalysisConfig -import docspell.analysis.nlp.TextClassifierConfig +import docspell.analysis.{TextAnalysisConfig, classifier} import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -69,7 +68,7 @@ object Config { TextAnalysisConfig( maxLength, clearStanfordNlpInterval, - TextClassifierConfig( + classifier.TextClassifierConfig( workingDir, NonEmptyList .fromList(classification.classifiers) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 535b7f0d..d5c632c3 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -7,8 +7,8 @@ import cats.implicits._ import fs2.{Pipe, Stream} import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.ClassifierModel -import docspell.analysis.nlp.TextClassifier.Data +import docspell.analysis.classifier.ClassifierModel +import docspell.analysis.classifier.TextClassifier.Data import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 4a868d47..1fd2401a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -5,9 +5,8 @@ import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.ClassifierModel +import docspell.analysis.classifier.{ClassifierModel, TextClassifier} import docspell.analysis.nlp.StanfordNerSettings -import docspell.analysis.nlp.TextClassifier import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile