Update scalafmt settings

Commit: 9013f2de5b
Parent: c37f1d7c31
Author: eikek
Date: 2021-09-22 17:23:24 +02:00

277 changed files with 1579 additions and 1615 deletions
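The hunks below are almost entirely whitespace: the new scalafmt settings stop aligning tokens such as = and -> across neighbouring lines and fold short scaladoc comments onto a single line. The .scalafmt.conf itself is not part of this excerpt; a minimal sketch of settings that produce this kind of reformat (the keys are real scalafmt options, the version and values are assumptions):

    # hypothetical .scalafmt.conf for this update (HOCON); version is assumed
    version = 3.0.0
    align.preset = none        # stop aligning =, -> and <- on adjacent lines
    docstrings.oneline = fold  # collapse scaladoc comments that fit on one line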

@@ -54,7 +54,7 @@ object TextAnalyser {
tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input))
tags1 <- contactNer(input)
dates <- dateNer(settings.lang, input)
list = tags0 ++ tags1
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)

@@ -31,10 +31,10 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
.withTempDir(cfg.workingDir, "trainclassifier")
.use { dir =>
for {
rawData <- writeDataFile(dir, data)
_ <- logger.debug(s"Learning from ${rawData.count} items.")
trainData <- splitData(logger, rawData)
scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
sorted = scores.sortBy(-_.score)
res <- handler(sorted.head.model)
} yield res
@@ -77,7 +77,7 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
} yield res
def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
val f = if (cfg.classifierConfigs.size > 1) 0.15 else 0.0
val nTest = (in.count * f).toLong
val td =
@@ -142,8 +142,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
props: Map[String, String]
): Map[String, String] =
prepend("2.", props) ++ Map(
"trainFile" -> trainData.train.absolutePathAsString,
"testFile" -> trainData.test.absolutePathAsString,
"trainFile" -> trainData.train.absolutePathAsString,
"testFile" -> trainData.test.absolutePathAsString,
"serializeTo" -> trainData.modelFile.absolutePathAsString
).toList
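splitData above holds back 15% of the items as a test set, but only when several classifier configs compete for the best score; with a single config there is nothing to compare and no hold-out is made. A self-contained sketch of that arithmetic (names are illustrative, not project code):

    // hypothetical standalone version of the hold-out computation in splitData
    def testCount(itemCount: Long, configCount: Int): Long = {
      val f = if (configCount > 1) 0.15 else 0.0
      (itemCount * f).toLong
    }

    testCount(1000L, 2) // 150 items go into the test file
    testCount(1000L, 1) // 0, every item is used for training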

@@ -33,7 +33,7 @@ object Contact {
if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
else {
val name = str.substring(0, atIdx)
val dom = str.substring(atIdx + 1)
Domain.isDomain(dom) && name.forall(c => !c.isWhitespace)
}
}
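The check above accepts a string only if it contains exactly one '@' that is not the first character, the local part has no whitespace, and the rest is a valid domain. A self-contained trace of that logic (Domain.isDomain is project code; a trivial stand-in is used here):

    // illustrative re-statement of the e-mail check above, not project code
    def looksLikeEmail(str: String): Boolean = {
      val atIdx = str.indexOf('@')
      if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
      else {
        val name = str.substring(0, atIdx)
        val dom = str.substring(atIdx + 1)
        dom.contains('.') && name.forall(c => !c.isWhitespace) // stand-in for Domain.isDomain
      }
    }

    looksLikeEmail("info@example.com") // true
    looksLikeEmail("a b@example.com")  // false: whitespace in the local part
    looksLikeEmail("a@b@example.com")  // false: more than one '@'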

@@ -14,8 +14,7 @@ private[analysis] object Tld {
def endsWithTld(str: String): Boolean =
findTld(str).isDefined
/** Some selected TLDs. */
private[this] val known = List(
".com",
".org",

@@ -177,17 +177,17 @@ object DateFind {
object Result {
final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
val toOption = Some(value)
def flatMap[B](f: A => Result[B]): Result[B] = f(value)
def map[B](f: A => B): Result[B] = Success(f(value), rest)
def next[B](r: Reader[B]): Result[(A, B)] =
r.read(rest).map(b => (value, b))
}
final case object Failure extends Result[Nothing] {
val toOption = None
def flatMap[B](f: Nothing => Result[B]): Result[B] = this
def map[B](f: Nothing => B): Result[B] = this
def next[B](r: Reader[B]): Result[(Nothing, B)] = this
}
implicit def resultSemigroup[A: Semigroup]: Semigroup[Result[A]] =
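Success and Failure above form a small parser-result type: flatMap chains dependent reads and short-circuits on the first Failure, while next pairs two sequential reads. A minimal self-contained sketch of the same shape (simplified, not the project's actual definitions):

    // simplified stand-in for the Result type shown above
    sealed trait Result[+A] {
      def flatMap[B](f: A => Result[B]): Result[B]
    }
    final case class Success[A](value: A, rest: List[String]) extends Result[A] {
      def flatMap[B](f: A => Result[B]): Result[B] = f(value)
    }
    case object Failure extends Result[Nothing] {
      def flatMap[B](f: Nothing => Result[B]): Result[B] = this
    }

    // reading two tokens in sequence stops at the first Failure
    def readToken(in: List[String]): Result[String] =
      in match {
        case w :: rest => Success(w, rest)
        case Nil       => Failure
      }

    readToken(List("12", "Nov")).flatMap(_ => readToken(List("Nov"))) // Success("Nov", Nil)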

@@ -74,9 +74,9 @@ object BasicCRFAnnotator {
}
final class Cache {
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
def forLang(language: NLPLanguage): Annotator =
language match {

@@ -38,9 +38,9 @@ object PipelineCache {
release: F[Unit]
): F[PipelineCache[F]] =
for {
data <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
cacheClear <- CacheClearing.create(data, clearInterval, release)
_ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
} yield new Impl[F](data, creator, cacheClear)
final private class Impl[F[_]: Async](
@@ -51,7 +51,7 @@ object PipelineCache {
def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] =
for {
_ <- cacheClear.withCache
id <- Resource.eval(makeSettingsId(settings))
nlp <- Resource.eval(
data.modify(cache => getOrCreate(key, id, cache, settings, creator))
@@ -73,13 +73,13 @@ object PipelineCache {
s"StanfordNLP settings changed for key $key. Creating new classifier"
)
val nlp = creator(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
case None =>
val nlp = creator(settings)
val e = Entry(id, nlp)
(cache.updated(key, e), nlp)
}
@@ -114,7 +114,7 @@ object PipelineCache {
release: F[Unit]
): F[CacheClearing[F]] =
for {
counter <- Ref.of(0L)
cleaning <- Ref.of(None: Option[Fiber[F, Throwable, Unit]])
log = Logger.log4s(logger)
result <-
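getOrCreate above keeps one annotator per key inside a Ref, building it on first access and swapping in a fresh entry when the settings id changes. A minimal sketch of the same Ref.modify pattern (standard cats-effect API; the value type is simplified):

    import cats.effect.{IO, Ref}

    // simplified: memoize one value per key inside a Ref
    def getOrCreate(ref: Ref[IO, Map[String, Int]], key: String, mk: => Int): IO[Int] =
      ref.modify { cache =>
        cache.get(key) match {
          case Some(v) => (cache, v) // hit: map unchanged
          case None =>
            val v = mk // miss: create and store
            (cache.updated(key, v), v)
        }
      }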

@@ -44,47 +44,47 @@ object Properties {
def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "de",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
"ner.useSUTime" -> "false", //only english, unused in docspell
"ner.language" -> "de",
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerEnglish(regexNerMappingFile: Option[String]): JProps =
Properties(
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"annotators" -> "tokenize,ssplit,pos,lemma,ner",
"tokenize.language" -> "en",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "en",
"ner.useSUTime" -> "false",
"ner.language" -> "en",
"ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile)
def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
"tokenize.language" -> "fr",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
"mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
"mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
"ner.statisticalOnly" -> "true",
"ner.rulesOnly" -> "false",
"ner.applyFineGrained" -> "false",
"ner.applyNumericClassifiers" -> "false",
"ner.useSUTime" -> "false",
"ner.language" -> "de",
"ner.useSUTime" -> "false",
"ner.language" -> "de",
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
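The maps above end up as java.util.Properties for a Stanford CoreNLP pipeline. A minimal sketch of how such properties are typically consumed (standard CoreNLP API, not docspell code; assumes the referenced model files are on the classpath):

    import java.util.Properties
    import edu.stanford.nlp.pipeline.StanfordCoreNLP

    val props = new Properties()
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner")
    val pipeline = new StanfordCoreNLP(props) // loads the configured annotators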

@@ -8,15 +8,14 @@ package docspell.analysis.split
import fs2.Stream
/** Splits text into words. */
object TextSplitter {
private[this] val trimChars =
".,…_[]^!<>=&ſ/{}*?()-:#$|~`+%\\\"'; \t\r\n".toSet
def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
val index = if (indexes.isEmpty) -1 else indexes.min
if (index < 0) Stream.emit(Word(str, start, start + str.length))
else if (index == 0) split(str.substring(1), sep, start + 1)
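split above looks for the earliest separator: it collects the first occurrence of every separator character, takes the minimum, emits the whole remainder as one Word when nothing is found, and skips a leading separator. A short trace of the index computation (values are illustrative; the branch for a separator in the middle is elided from this hunk):

    // illustrative trace of the separator search above
    val str = "hello world"
    val sep = Set(' ')
    val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0) // Set(5)
    val index = if (indexes.isEmpty) -1 else indexes.min            // 5
    // presumably the text before index 5 becomes Word("hello", 0, 5)
    // and splitting continues on "world" with start offset 6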

@@ -7,9 +7,9 @@
package docspell.analysis.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty
def nonEmpty: Boolean = !isEmpty
def length: Int = value.length
def trimLeft(chars: Set[Char]): Word = {
val v = value.dropWhile(chars.contains)

@@ -91,19 +91,19 @@ class StanfordNerAnnotatorSuite extends FunSuite {
val regexNerContent =
s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
|(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
|""".stripMargin
File
.withTempDir[IO](File.path(Paths.get("target")), "test-regex-ner")
.use { dir =>
for {
out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
_ <- IO(
assertEquals(