mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
Separate ner from classification
This commit is contained in:
parent
f02f15e5bd
commit
a699e87304
@ -3,13 +3,10 @@ package docspell.analysis
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
|
||||
import docspell.analysis.contact.Contact
|
||||
import docspell.analysis.date.DateFind
|
||||
import docspell.analysis.nlp.PipelineCache
|
||||
import docspell.analysis.nlp.StanfordNerClassifier
|
||||
import docspell.analysis.nlp.StanfordNerSettings
|
||||
import docspell.analysis.nlp.StanfordTextClassifier
|
||||
import docspell.analysis.nlp.TextClassifier
|
||||
import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
|
||||
import docspell.common._
|
||||
|
||||
trait TextAnalyser[F[_]] {
|
||||
@ -67,7 +64,7 @@ object TextAnalyser {
|
||||
|
||||
private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
|
||||
: F[Vector[NerLabel]] =
|
||||
StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
|
||||
StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
|
||||
|
||||
private def contactNer(text: String): F[Vector[NerLabel]] =
|
||||
Sync[F].delay {
|
||||
|
@ -1,6 +1,6 @@
|
||||
package docspell.analysis
|
||||
|
||||
import docspell.analysis.nlp.TextClassifierConfig
|
||||
import docspell.analysis.classifier.TextClassifierConfig
|
||||
import docspell.common._
|
||||
|
||||
case class TextAnalysisConfig(
|
||||
|
@ -1,4 +1,4 @@
|
||||
package docspell.analysis.nlp
|
||||
package docspell.analysis.classifier
|
||||
|
||||
import java.nio.file.Path
|
||||
|
@ -1,4 +1,4 @@
|
||||
package docspell.analysis.nlp
|
||||
package docspell.analysis.classifier
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.analysis.nlp.TextClassifier._
|
||||
import docspell.analysis.classifier
|
||||
import docspell.analysis.classifier.TextClassifier._
|
||||
import docspell.analysis.nlp.Properties
|
||||
import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.classify.ColumnDataClassifier
|
||||
@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
|
||||
case Some(text) =>
|
||||
Sync[F].delay {
|
||||
val cls = ColumnDataClassifier.getClassifier(
|
||||
model.model.normalize().toAbsolutePath().toString()
|
||||
model.model.normalize().toAbsolutePath.toString
|
||||
)
|
||||
val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
|
||||
Option(cat)
|
||||
@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
|
||||
val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
|
||||
cdc.trainClassifier(in.train.toString())
|
||||
val score = cdc.testClassifier(in.test.toString())
|
||||
TrainResult(score.first(), ClassifierModel(in.modelFile))
|
||||
TrainResult(score.first(), classifier.ClassifierModel(in.modelFile))
|
||||
}
|
||||
_ <- logger.debug(s"Trained with result $res")
|
||||
} yield res
|
@ -1,9 +1,9 @@
|
||||
package docspell.analysis.nlp
|
||||
package docspell.analysis.classifier
|
||||
|
||||
import cats.data.Kleisli
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.analysis.nlp.TextClassifier.Data
|
||||
import docspell.analysis.classifier.TextClassifier.Data
|
||||
import docspell.common._
|
||||
|
||||
trait TextClassifier[F[_]] {
|
@ -1,4 +1,4 @@
|
||||
package docspell.analysis.nlp
|
||||
package docspell.analysis.classifier
|
||||
|
||||
import java.nio.file.Path
|
||||
|
@ -9,7 +9,7 @@ import docspell.common._
|
||||
|
||||
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
|
||||
|
||||
object StanfordNerClassifier {
|
||||
object StanfordNerAnnotator {
|
||||
|
||||
/** Runs named entity recognition on the given `text`.
|
||||
*
|
||||
@ -28,9 +28,9 @@ object StanfordNerClassifier {
|
||||
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
|
||||
cache
|
||||
.obtain(cacheKey, settings)
|
||||
.use(crf => Applicative[F].pure(runClassifier(crf, text)))
|
||||
.use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
|
||||
|
||||
def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
||||
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
||||
val doc = new CoreDocument(text)
|
||||
nerClassifier.annotate(doc)
|
||||
doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
|
@ -1,4 +1,4 @@
|
||||
package docspell.analysis.nlp
|
||||
package docspell.analysis.classifier
|
||||
|
||||
import minitest._
|
||||
import cats.effect._
|
@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
|
||||
test("find english ner labels") {
|
||||
val labels =
|
||||
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
|
||||
StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText)
|
||||
val expect = Vector(
|
||||
NerLabel("Derek", NerTag.Person, 0, 5),
|
||||
NerLabel("Jeter", NerTag.Person, 6, 11),
|
||||
@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
|
||||
test("find german ner labels") {
|
||||
val labels =
|
||||
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
|
||||
StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText)
|
||||
val expect = Vector(
|
||||
NerLabel("Max", NerTag.Person, 0, 3),
|
||||
NerLabel("Mustermann", NerTag.Person, 4, 14),
|
||||
|
@ -4,8 +4,7 @@ import java.nio.file.Path
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
|
||||
import docspell.analysis.TextAnalysisConfig
|
||||
import docspell.analysis.nlp.TextClassifierConfig
|
||||
import docspell.analysis.{TextAnalysisConfig, classifier}
|
||||
import docspell.backend.Config.Files
|
||||
import docspell.common._
|
||||
import docspell.convert.ConvertConfig
|
||||
@ -69,7 +68,7 @@ object Config {
|
||||
TextAnalysisConfig(
|
||||
maxLength,
|
||||
clearStanfordNlpInterval,
|
||||
TextClassifierConfig(
|
||||
classifier.TextClassifierConfig(
|
||||
workingDir,
|
||||
NonEmptyList
|
||||
.fromList(classification.classifiers)
|
||||
|
@ -7,8 +7,8 @@ import cats.implicits._
|
||||
import fs2.{Pipe, Stream}
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.ClassifierModel
|
||||
import docspell.analysis.nlp.TextClassifier.Data
|
||||
import docspell.analysis.classifier.ClassifierModel
|
||||
import docspell.analysis.classifier.TextClassifier.Data
|
||||
import docspell.backend.ops.OCollective
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
|
@ -5,9 +5,8 @@ import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.ClassifierModel
|
||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||
import docspell.analysis.nlp.StanfordNerSettings
|
||||
import docspell.analysis.nlp.TextClassifier
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
|
Loading…
x
Reference in New Issue
Block a user