mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Reformat with scalafmt 3.0.0
This commit is contained in:
@ -155,10 +155,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
|
||||
case class TrainResult(score: Double, model: ClassifierModel)
|
||||
|
||||
def prepend(pre: String, data: Map[String, String]): Map[String, String] =
|
||||
data.toList
|
||||
.map({ case (k, v) =>
|
||||
if (k.startsWith(pre)) (k, v)
|
||||
else (pre + k, v)
|
||||
})
|
||||
.toMap
|
||||
data.toList.map { case (k, v) =>
|
||||
if (k.startsWith(pre)) (k, v)
|
||||
else (pre + k, v)
|
||||
}.toMap
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ object Domain {
|
||||
Tld
|
||||
.findTld(str)
|
||||
.map(tld => (str.dropRight(tld.length), tld))
|
||||
.map({ case (names, tld) =>
|
||||
.map { case (names, tld) =>
|
||||
names.split('.').toList match {
|
||||
case Nil => Left(s"Not a domain: $str")
|
||||
case segs
|
||||
@ -43,7 +43,7 @@ object Domain {
|
||||
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
|
||||
case _ => Left(s"Not a domain: $str")
|
||||
}
|
||||
})
|
||||
}
|
||||
.getOrElse(Left(s"Not a domain $str"))
|
||||
|
||||
def isDomain(str: String): Boolean =
|
||||
|
@ -160,11 +160,11 @@ object DateFind {
|
||||
Reader(words => Nel.of(reader, more: _*).map(_.read(words)).reduce)
|
||||
|
||||
def readFirst[A](f: Word => Option[A]): Reader[A] =
|
||||
Reader({
|
||||
Reader {
|
||||
case Nil => Result.Failure
|
||||
case a :: as =>
|
||||
f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
sealed trait Result[+A] {
|
||||
|
@ -15,7 +15,7 @@ object MonthName {
|
||||
|
||||
private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
|
||||
ns.foldLeft(n0) { (res, el) =>
|
||||
res.zip(el).map({ case (a, b) => a ++ b })
|
||||
res.zip(el).map { case (a, b) => a ++ b }
|
||||
}
|
||||
|
||||
private def forLang(lang: Language): List[List[String]] =
|
||||
|
@ -35,12 +35,12 @@ object Annotator {
|
||||
*
|
||||
* There are the following ways:
|
||||
*
|
||||
* - disabled: it returns a no-op annotator that always gives an empty list
|
||||
* - full: the complete stanford pipeline is used
|
||||
* - basic: only the ner classifier is used
|
||||
* - disabled: it returns a no-op annotator that always gives an empty list
|
||||
* - full: the complete stanford pipeline is used
|
||||
* - basic: only the ner classifier is used
|
||||
*
|
||||
* Additionally, if there is a regexNer-file specified, the regexner annotator is
|
||||
* also run. In case the full pipeline is used, this is already included.
|
||||
* Additionally, if there is a regexNer-file specified, the regexner annotator is also
|
||||
* run. In case the full pipeline is used, this is already included.
|
||||
*/
|
||||
def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
|
||||
mode match {
|
||||
|
@ -21,10 +21,9 @@ import edu.stanford.nlp.ie.crf.CRFClassifier
|
||||
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
|
||||
import org.log4s.getLogger
|
||||
|
||||
/** This is only using the CRFClassifier without building an analysis
|
||||
* pipeline. The ner-classifier cannot use results from POS-tagging
|
||||
* etc. and is therefore not as good as the [[StanfordNerAnnotator]].
|
||||
* But it uses less memory, while still being not bad.
|
||||
/** This is only using the CRFClassifier without building an analysis pipeline. The
|
||||
* ner-classifier cannot use results from POS-tagging etc. and is therefore not as good
|
||||
* as the [[StanfordNerAnnotator]]. But it uses less memory, while still being not bad.
|
||||
*/
|
||||
object BasicCRFAnnotator {
|
||||
private[this] val logger = getLogger
|
||||
|
@ -17,8 +17,8 @@ import docspell.common._
|
||||
|
||||
import org.log4s.getLogger
|
||||
|
||||
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
|
||||
* involves IO and initializing large objects.
|
||||
/** Creating the StanfordCoreNLP pipeline is quite expensive as it involves IO and
|
||||
* initializing large objects.
|
||||
*
|
||||
* Therefore, the instances are cached, because they are thread-safe.
|
||||
*
|
||||
|
@ -44,48 +44,48 @@ object Properties {
|
||||
|
||||
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
|
||||
"tokenize.language" -> "de",
|
||||
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
|
||||
"ner.statisticalOnly" -> "true",
|
||||
"ner.rulesOnly" -> "false",
|
||||
"ner.applyFineGrained" -> "false",
|
||||
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
|
||||
"tokenize.language" -> "de",
|
||||
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
|
||||
"ner.statisticalOnly" -> "true",
|
||||
"ner.rulesOnly" -> "false",
|
||||
"ner.applyFineGrained" -> "false",
|
||||
"ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
|
||||
"ner.useSUTime" -> "false", //only english, unused in docspell
|
||||
"ner.language" -> "de",
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
"ner.useSUTime" -> "false", //only english, unused in docspell
|
||||
"ner.language" -> "de",
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
def nerEnglish(regexNerMappingFile: Option[String]): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
|
||||
"tokenize.language" -> "en",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
|
||||
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
|
||||
"tokenize.language" -> "en",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
|
||||
"ner.statisticalOnly" -> "true",
|
||||
"ner.rulesOnly" -> "false",
|
||||
"ner.applyFineGrained" -> "false",
|
||||
"ner.applyNumericClassifiers" -> "false",
|
||||
"ner.useSUTime" -> "false",
|
||||
"ner.language" -> "en",
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile)
|
||||
|
||||
def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
|
||||
"tokenize.language" -> "fr",
|
||||
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
|
||||
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
|
||||
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
|
||||
"ner.statisticalOnly" -> "true",
|
||||
"ner.rulesOnly" -> "false",
|
||||
"ner.applyFineGrained" -> "false",
|
||||
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
|
||||
"tokenize.language" -> "fr",
|
||||
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
|
||||
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
|
||||
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
|
||||
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
|
||||
"ner.statisticalOnly" -> "true",
|
||||
"ner.rulesOnly" -> "false",
|
||||
"ner.applyFineGrained" -> "false",
|
||||
"ner.applyNumericClassifiers" -> "false",
|
||||
"ner.useSUTime" -> "false",
|
||||
"ner.language" -> "de",
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||
|
@ -22,13 +22,11 @@ object StanfordNerAnnotator {
|
||||
/** Runs named entity recognition on the given `text`.
|
||||
*
|
||||
* This uses the classifier pipeline from stanford-nlp, see
|
||||
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these
|
||||
* classifiers is quite expensive, it involves loading large model
|
||||
* files. The classifiers are thread-safe and so they are cached.
|
||||
* The `cacheKey` defines the "slot" where classifiers are stored
|
||||
* and retrieved. If for a given `cacheKey` the `settings` change,
|
||||
* a new classifier must be created. It will then replace the
|
||||
* previous one.
|
||||
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these classifiers is quite
|
||||
* expensive, it involves loading large model files. The classifiers are thread-safe
|
||||
* and so they are cached. The `cacheKey` defines the "slot" where classifiers are
|
||||
* stored and retrieved. If for a given `cacheKey` the `settings` change, a new
|
||||
* classifier must be created. It will then replace the previous one.
|
||||
*/
|
||||
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
|
||||
val doc = new CoreDocument(text)
|
||||
|
@ -17,18 +17,16 @@ object StanfordNerSettings {
|
||||
|
||||
/** Settings for configuring the stanford NER pipeline.
|
||||
*
|
||||
* The language is mandatory, only the provided ones are supported.
|
||||
* The `highRecall` only applies for non-English languages. For
|
||||
* non-English languages the english classifier is run as second
|
||||
* classifier and if `highRecall` is true, then it will be used to
|
||||
* tag untagged tokens. This may lead to a lot of false positives,
|
||||
* but since English is omnipresent in other languages, too it
|
||||
* depends on the use case for whether this is useful or not.
|
||||
* The language is mandatory, only the provided ones are supported. The `highRecall`
|
||||
* only applies for non-English languages. For non-English languages the english
|
||||
* classifier is run as second classifier and if `highRecall` is true, then it will be
|
||||
* used to tag untagged tokens. This may lead to a lot of false positives, but since
|
||||
* English is omnipresent in other languages, too, it depends on the use case for
|
||||
* whether this is useful or not.
|
||||
*
|
||||
* The `regexNer` allows specifying a text file as described here:
|
||||
* https://nlp.stanford.edu/software/regexner.html. This will be used
|
||||
* as a last step to tag untagged tokens using the provided list of
|
||||
* regexps.
|
||||
* https://nlp.stanford.edu/software/regexner.html. This will be used as a last step to
|
||||
* tag untagged tokens using the provided list of regexps.
|
||||
*/
|
||||
case class Full(
|
||||
lang: NLPLanguage,
|
||||
@ -36,7 +34,8 @@ object StanfordNerSettings {
|
||||
regexNer: Option[Path]
|
||||
) extends StanfordNerSettings
|
||||
|
||||
/** Not all languages are supported with predefined statistical models. This allows to provide regexps only.
|
||||
/** Not all languages are supported with predefined statistical models. This allows
|
||||
* providing regexps only.
|
||||
*/
|
||||
case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings
|
||||
|
||||
|
@ -37,9 +37,9 @@ class StanfordTextClassifierSuite extends FunSuite {
|
||||
.repeat
|
||||
.take(10)
|
||||
)
|
||||
.flatMap({ case (a, b) =>
|
||||
.flatMap { case (a, b) =>
|
||||
Stream.emits(Seq(a, b))
|
||||
})
|
||||
}
|
||||
.covary[IO]
|
||||
|
||||
val modelExists = {
|
||||
@ -52,7 +52,7 @@ class StanfordTextClassifierSuite extends FunSuite {
|
||||
}
|
||||
|
||||
test("run classifier") {
|
||||
val cfg = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
|
||||
val cfg = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
|
||||
val things = File.withTempDir[IO](File.path(Paths.get("target")), "testcls")
|
||||
|
||||
things
|
||||
|
Reference in New Issue
Block a user