Reformat with scalafmt 3.0.0

This commit is contained in:
Scala Steward
2021-08-19 08:50:30 +02:00
parent 5a2a0295ef
commit e4fecefaea
127 changed files with 558 additions and 658 deletions

View File

@ -155,10 +155,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
case class TrainResult(score: Double, model: ClassifierModel)
def prepend(pre: String, data: Map[String, String]): Map[String, String] =
data.toList
.map({ case (k, v) =>
if (k.startsWith(pre)) (k, v)
else (pre + k, v)
})
.toMap
data.toList.map { case (k, v) =>
if (k.startsWith(pre)) (k, v)
else (pre + k, v)
}.toMap
}

View File

@ -32,7 +32,7 @@ object Domain {
Tld
.findTld(str)
.map(tld => (str.dropRight(tld.length), tld))
.map({ case (names, tld) =>
.map { case (names, tld) =>
names.split('.').toList match {
case Nil => Left(s"Not a domain: $str")
case segs
@ -43,7 +43,7 @@ object Domain {
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
case _ => Left(s"Not a domain: $str")
}
})
}
.getOrElse(Left(s"Not a domain $str"))
def isDomain(str: String): Boolean =

View File

@ -160,11 +160,11 @@ object DateFind {
Reader(words => Nel.of(reader, more: _*).map(_.read(words)).reduce)
def readFirst[A](f: Word => Option[A]): Reader[A] =
Reader({
Reader {
case Nil => Result.Failure
case a :: as =>
f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
})
}
}
sealed trait Result[+A] {

View File

@ -15,7 +15,7 @@ object MonthName {
private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
ns.foldLeft(n0) { (res, el) =>
res.zip(el).map({ case (a, b) => a ++ b })
res.zip(el).map { case (a, b) => a ++ b }
}
private def forLang(lang: Language): List[List[String]] =

View File

@ -35,12 +35,12 @@ object Annotator {
*
* There are the following ways:
*
* - disabled: it returns a no-op annotator that always gives an empty list
* - full: the complete stanford pipeline is used
* - basic: only the ner classifier is used
* - disabled: it returns a no-op annotator that always gives an empty list
* - full: the complete stanford pipeline is used
* - basic: only the ner classifier is used
*
* Additionally, if there is a regexNer-file specified, the regexner annotator is
* also run. In case the full pipeline is used, this is already included.
* Additionally, if there is a regexNer-file specified, the regexner annotator is also
* run. In case the full pipeline is used, this is already included.
*/
def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
mode match {

View File

@ -21,10 +21,9 @@ import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
/** This is only using the CRFClassifier without building an analysis
* pipeline. The ner-classifier cannot use results from POS-tagging
* etc. and is therefore not as good as the [[StanfordNerAnnotator]].
* But it uses less memory, while still being not bad.
/** This is only using the CRFClassifier without building an analysis pipeline. The
* ner-classifier cannot use results from POS-tagging etc. and is therefore not as good
* as the [[StanfordNerAnnotator]]. But it uses less memory, while still being not bad.
*/
object BasicCRFAnnotator {
private[this] val logger = getLogger

View File

@ -17,8 +17,8 @@ import docspell.common._
import org.log4s.getLogger
/** Creating the StanfordCoreNLP pipeline is quite expensive as it
* involves IO and initializing large objects.
/** Creating the StanfordCoreNLP pipeline is quite expensive as it involves IO and
* initializing large objects.
*
* Therefore, the instances are cached, because they are thread-safe.
*

View File

@ -44,48 +44,48 @@ object Properties {
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
"ner.useSUTime" -> "false", //only english, unused in docspell
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
"ner.useSUTime" -> "false", //only english, unused in docspell
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerEnglish(regexNerMappingFile: Option[String]): JProps =
Properties(
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"tokenize.language" -> "en",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"tokenize.language" -> "en",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "en",
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile)
def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "fr",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "fr",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def regexNerOnly(regexNerMappingFile: Path): JProps =

View File

@ -22,13 +22,11 @@ object StanfordNerAnnotator {
/** Runs named entity recognition on the given `text`.
*
* This uses the classifier pipeline from stanford-nlp, see
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these
* classifiers is quite expensive, it involves loading large model
* files. The classifiers are thread-safe and so they are cached.
* The `cacheKey` defines the "slot" where classifiers are stored
* and retrieved. If for a given `cacheKey` the `settings` change,
* a new classifier must be created. It will then replace the
* previous one.
* https://nlp.stanford.edu/software/CRF-NER.html. Creating these classifiers is quite
* expensive: it involves loading large model files. The classifiers are thread-safe
* and so they are cached. The `cacheKey` defines the "slot" where classifiers are
* stored and retrieved. If for a given `cacheKey` the `settings` change, a new
* classifier must be created. It will then replace the previous one.
*/
def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
val doc = new CoreDocument(text)

View File

@ -17,18 +17,16 @@ object StanfordNerSettings {
/** Settings for configuring the stanford NER pipeline.
*
* The language is mandatory, only the provided ones are supported.
* The `highRecall` only applies for non-English languages. For
* non-English languages the english classifier is run as second
* classifier and if `highRecall` is true, then it will be used to
* tag untagged tokens. This may lead to a lot of false positives,
* but since English is omnipresent in other languages, too it
* depends on the use case for whether this is useful or not.
* The language is mandatory; only the provided ones are supported. The `highRecall`
* only applies for non-English languages. For non-English languages the English
* classifier is run as a second classifier and if `highRecall` is true, then it will be
* used to tag untagged tokens. This may lead to a lot of false positives, but since
* English is omnipresent in other languages too, it depends on the use case whether
* this is useful or not.
*
* The `regexNer` allows specifying a text file as described here:
* https://nlp.stanford.edu/software/regexner.html. This will be used
* as a last step to tag untagged tokens using the provided list of
* regexps.
* https://nlp.stanford.edu/software/regexner.html. This will be used as a last step to
* tag untagged tokens using the provided list of regexps.
*/
case class Full(
lang: NLPLanguage,
@ -36,7 +34,8 @@ object StanfordNerSettings {
regexNer: Option[Path]
) extends StanfordNerSettings
/** Not all languages are supported with predefined statistical models. This allows to provide regexps only.
/** Not all languages are supported with predefined statistical models. This allows
* providing regexps only.
*/
case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings

View File

@ -37,9 +37,9 @@ class StanfordTextClassifierSuite extends FunSuite {
.repeat
.take(10)
)
.flatMap({ case (a, b) =>
.flatMap { case (a, b) =>
Stream.emits(Seq(a, b))
})
}
.covary[IO]
val modelExists = {
@ -52,7 +52,7 @@ class StanfordTextClassifierSuite extends FunSuite {
}
test("run classifier") {
val cfg = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
val cfg = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
val things = File.withTempDir[IO](File.path(Paths.get("target")), "testcls")
things