Separate ner from classification
This commit is contained in:
parent f02f15e5bd
commit a699e87304
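
In short, the text-classification code moves from docspell.analysis.nlp into a new docspell.analysis.classifier package, and the NER entry point is renamed from StanfordNerClassifier to StanfordNerAnnotator. As a hedged sketch of what callers import after this commit (collected from the new-side lines of the hunks below, not an authoritative list):

    import docspell.analysis.classifier.{ClassifierModel, StanfordTextClassifier, TextClassifier, TextClassifierConfig}
    import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
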
@@ -3,13 +3,10 @@ package docspell.analysis
 import cats.effect._
 import cats.implicits._
 
+import docspell.analysis.classifier.{StanfordTextClassifier, TextClassifier}
 import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
-import docspell.analysis.nlp.PipelineCache
-import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordNerSettings
-import docspell.analysis.nlp.StanfordTextClassifier
-import docspell.analysis.nlp.TextClassifier
+import docspell.analysis.nlp.{PipelineCache, StanfordNerAnnotator, StanfordNerSettings}
 import docspell.common._
 
 trait TextAnalyser[F[_]] {
@@ -67,7 +64,7 @@ object TextAnalyser {
 
           private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
               : F[Vector[NerLabel]] =
-            StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
+            StanfordNerAnnotator.nerAnnotate[F](key.id, cache)(settings, text)
 
           private def contactNer(text: String): F[Vector[NerLabel]] =
             Sync[F].delay {
@@ -1,6 +1,6 @@
 package docspell.analysis
 
-import docspell.analysis.nlp.TextClassifierConfig
+import docspell.analysis.classifier.TextClassifierConfig
 import docspell.common._
 
 case class TextAnalysisConfig(
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -7,7 +7,9 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream
 
-import docspell.analysis.nlp.TextClassifier._
+import docspell.analysis.classifier
+import docspell.analysis.classifier.TextClassifier._
+import docspell.analysis.nlp.Properties
 import docspell.common._
 
 import edu.stanford.nlp.classify.ColumnDataClassifier
@@ -43,7 +45,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       case Some(text) =>
         Sync[F].delay {
           val cls = ColumnDataClassifier.getClassifier(
-            model.model.normalize().toAbsolutePath().toString()
+            model.model.normalize().toAbsolutePath.toString
           )
           val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
           Option(cat)
@@ -65,7 +67,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
         val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
         cdc.trainClassifier(in.train.toString())
         val score = cdc.testClassifier(in.test.toString())
-        TrainResult(score.first(), ClassifierModel(in.modelFile))
+        TrainResult(score.first(), classifier.ClassifierModel(in.modelFile))
       }
       _ <- logger.debug(s"Trained with result $res")
     } yield res
@@ -1,9 +1,9 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import cats.data.Kleisli
 import fs2.Stream
 
-import docspell.analysis.nlp.TextClassifier.Data
+import docspell.analysis.classifier.TextClassifier.Data
 import docspell.common._
 
 trait TextClassifier[F[_]] {
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import java.nio.file.Path
 
@@ -9,7 +9,7 @@ import docspell.common._
 
 import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
 
-object StanfordNerClassifier {
+object StanfordNerAnnotator {
 
   /** Runs named entity recognition on the given `text`.
    *
@@ -28,9 +28,9 @@ object StanfordNerClassifier {
   )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
-      .use(crf => Applicative[F].pure(runClassifier(crf, text)))
+      .use(crf => Applicative[F].pure(nerAnnotate(crf, text)))
 
-  def runClassifier(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
+  def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
     val doc = new CoreDocument(text)
     nerClassifier.annotate(doc)
     doc.tokens().asScala.collect(Function.unlift(LabelConverter.toNerLabel)).toVector
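
For reference, a minimal sketch of calling the renamed annotator directly, mirroring the test suite further down. Only the nerAnnotate signature comes from the hunk above; the StanfordCoreNLP pipeline setup, the annotator list, and the sample text are assumptions (docspell normally obtains pipelines through PipelineCache):

    import java.util.Properties
    import edu.stanford.nlp.pipeline.StanfordCoreNLP
    import docspell.analysis.nlp.StanfordNerAnnotator
    import docspell.common.NerLabel

    // Assumed pipeline configuration for a plain NER run.
    val props = new Properties()
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner")
    val pipeline = new StanfordCoreNLP(props)

    // nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel]
    val labels: Vector[NerLabel] =
      StanfordNerAnnotator.nerAnnotate(pipeline, "Derek Jeter visited Berlin.")
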
@@ -1,4 +1,4 @@
-package docspell.analysis.nlp
+package docspell.analysis.classifier
 
 import minitest._
 import cats.effect._
@@ -13,7 +13,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
+      StanfordNerAnnotator.nerAnnotate(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -49,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
+      StanfordNerAnnotator.nerAnnotate(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),
@@ -4,8 +4,7 @@ import java.nio.file.Path
 
 import cats.data.NonEmptyList
 
-import docspell.analysis.TextAnalysisConfig
-import docspell.analysis.nlp.TextClassifierConfig
+import docspell.analysis.{TextAnalysisConfig, classifier}
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -69,7 +68,7 @@ object Config {
       TextAnalysisConfig(
         maxLength,
         clearStanfordNlpInterval,
-        TextClassifierConfig(
+        classifier.TextClassifierConfig(
           workingDir,
           NonEmptyList
             .fromList(classification.classifiers)
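
A hedged sketch of what the qualified reference above amounts to after the move. The working directory path and the classifier property map are hypothetical, and the parameter shape is inferred from the surrounding hunk (a Path plus a NonEmptyList of classifier settings), not from the full definition:

    import java.nio.file.Paths
    import cats.data.NonEmptyList
    import docspell.analysis.classifier

    // TextClassifierConfig now lives in docspell.analysis.classifier, hence the
    // qualified classifier.TextClassifierConfig(...) call in Config.scala.
    val textClassifierCfg =
      classifier.TextClassifierConfig(
        Paths.get("/tmp/docspell/classifier"),          // hypothetical working directory
        NonEmptyList.of(Map("useSplitWords" -> "true")) // assumed classifier settings map
      )
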
@@ -7,8 +7,8 @@ import cats.implicits._
 import fs2.{Pipe, Stream}
 
 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.ClassifierModel
-import docspell.analysis.nlp.TextClassifier.Data
+import docspell.analysis.classifier.ClassifierModel
+import docspell.analysis.classifier.TextClassifier.Data
 import docspell.backend.ops.OCollective
 import docspell.common._
 import docspell.joex.Config
@@ -5,9 +5,8 @@ import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
 import docspell.analysis.nlp.StanfordNerSettings
-import docspell.analysis.nlp.TextClassifier
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile