Mirror of https://github.com/TheAnachronism/docspell.git (synced 2025-06-22 02:18:26 +00:00)
Update scalafmt settings
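Every hunk below is a formatting-only change: column-aligned =, <- and -> tokens collapse to a single space, and short scaladoc comments fold onto one line. A plausible sketch of the corresponding .scalafmt.conf change (the option names are real scalafmt 3 settings, but the exact values are assumptions, not taken from the commit):

    # .scalafmt.conf (sketch; values are assumed)
    version = "3.0.0"          # assumed scalafmt 3 upgrade
    align.preset = none        # stop aligning =, <- and -> into columns
    docstrings.oneline = fold  # fold short "/** ... */" docs onto one line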
@@ -54,7 +54,7 @@ object TextAnalyser {
         tags0 <- stanfordNer(Nlp.Input(cacheKey, settings, logger, input))
         tags1 <- contactNer(input)
         dates <- dateNer(settings.lang, input)
-        list  = tags0 ++ tags1
+        list = tags0 ++ tags1
         spans = NerLabelSpan.build(list)
       } yield Result(spans ++ list, dates)
@@ -31,10 +31,10 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
       .withTempDir(cfg.workingDir, "trainclassifier")
       .use { dir =>
         for {
-          rawData   <- writeDataFile(dir, data)
-          _         <- logger.debug(s"Learning from ${rawData.count} items.")
+          rawData <- writeDataFile(dir, data)
+          _ <- logger.debug(s"Learning from ${rawData.count} items.")
           trainData <- splitData(logger, rawData)
-          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
+          scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
           sorted = scores.sortBy(-_.score)
           res <- handler(sorted.head.model)
         } yield res
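The traverse call above trains one candidate model per classifier config and collects every score before sortBy(-_.score) picks the winner (descending order, so the head is the best). A reduced sketch of that select-best step, using stand-in types that are not part of the diff:

    import cats.effect.IO
    import cats.syntax.traverse._

    // Stand-ins for the real config and score types (illustration only).
    final case class Config(name: String)
    final case class Scored(score: Double, model: String)

    // Train every candidate, then keep the highest-scoring model. Like the
    // original, this assumes at least one config (head throws on empty lists).
    def selectBest(train: Config => IO[Scored], configs: List[Config]): IO[Scored] =
      configs.traverse(train).map(_.sortBy(-_.score).head)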
@@ -77,7 +77,7 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
     } yield res

   def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
-    val f     = if (cfg.classifierConfigs.size > 1) 0.15 else 0.0
+    val f = if (cfg.classifierConfigs.size > 1) 0.15 else 0.0
     val nTest = (in.count * f).toLong

     val td =
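For example, with more than one classifier config and 1000 training items this gives f = 0.15 and nTest = 150; with a single config, f = 0.0 and no items are held out for testing.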
@@ -142,8 +142,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
       props: Map[String, String]
   ): Map[String, String] =
     prepend("2.", props) ++ Map(
-      "trainFile"   -> trainData.train.absolutePathAsString,
-      "testFile"    -> trainData.test.absolutePathAsString,
+      "trainFile" -> trainData.train.absolutePathAsString,
+      "testFile" -> trainData.test.absolutePathAsString,
       "serializeTo" -> trainData.modelFile.absolutePathAsString
     ).toList
@@ -33,7 +33,7 @@ object Contact {
     if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
     else {
       val name = str.substring(0, atIdx)
-      val dom  = str.substring(atIdx + 1)
+      val dom = str.substring(atIdx + 1)
       Domain.isDomain(dom) && name.forall(c => !c.isWhitespace)
     }
   }
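Read in isolation, the check rejects a missing or leading '@' and any second '@', then validates the domain part and a whitespace-free local part. A standalone sketch of the same logic (Domain.isDomain is passed in as a function here because its implementation is not part of this hunk):

    // Sketch of the check above; isDomain stands in for Domain.isDomain.
    def isMailAddress(str: String, isDomain: String => Boolean): Boolean = {
      val atIdx = str.indexOf('@')
      // "host", "@host" and "a@b@c" are all rejected up front
      if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
      else {
        val name = str.substring(0, atIdx)
        val dom = str.substring(atIdx + 1)
        isDomain(dom) && name.forall(c => !c.isWhitespace)
      }
    }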
@@ -14,8 +14,7 @@ private[analysis] object Tld {
   def endsWithTld(str: String): Boolean =
     findTld(str).isDefined

-  /** Some selected TLDs.
-    */
+  /** Some selected TLDs. */
   private[this] val known = List(
     ".com",
     ".org",
@@ -177,17 +177,17 @@ object DateFind {

   object Result {
     final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
-      val toOption                                 = Some(value)
+      val toOption = Some(value)
       def flatMap[B](f: A => Result[B]): Result[B] = f(value)
-      def map[B](f: A => B): Result[B]             = Success(f(value), rest)
+      def map[B](f: A => B): Result[B] = Success(f(value), rest)
       def next[B](r: Reader[B]): Result[(A, B)] =
         r.read(rest).map(b => (value, b))
     }
     final case object Failure extends Result[Nothing] {
-      val toOption                                        = None
+      val toOption = None
       def flatMap[B](f: Nothing => Result[B]): Result[B] = this
-      def map[B](f: Nothing => B): Result[B]              = this
-      def next[B](r: Reader[B]): Result[(Nothing, B)]     = this
+      def map[B](f: Nothing => B): Result[B] = this
+      def next[B](r: Reader[B]): Result[(Nothing, B)] = this
     }

     implicit def resultSemigroup[A: Semigroup]: Semigroup[Result[A]] =
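Success and Failure form a minimal parser-combinator result over a word list: map and flatMap thread the unconsumed words along, and next sequences a second reader. A hypothetical composition based only on the signatures visible above (assuming Reader[B] exposes read(List[Word]): Result[B], as the call r.read(rest) suggests):

    // Hypothetical helper: run ra, then run rb on whatever words remain.
    def both[A, B](ra: Reader[A], rb: Reader[B])(words: List[Word]): Result[(A, B)] =
      ra.read(words).next(rb)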
@@ -74,9 +74,9 @@ object BasicCRFAnnotator {
   }

   final class Cache {
-    private[this] lazy val germanNerClassifier  = makeAnnotator(Language.German)
+    private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
     private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
-    private[this] lazy val frenchNerClassifier  = makeAnnotator(Language.French)
+    private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)

     def forLang(language: NLPLanguage): Annotator =
       language match {
@@ -38,9 +38,9 @@ object PipelineCache {
       release: F[Unit]
   ): F[PipelineCache[F]] =
     for {
-      data       <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
+      data <- Ref.of(Map.empty[String, Entry[Annotator[F]]])
       cacheClear <- CacheClearing.create(data, clearInterval, release)
-      _          <- Logger.log4s(logger).info("Creating nlp pipeline cache")
+      _ <- Logger.log4s(logger).info("Creating nlp pipeline cache")
     } yield new Impl[F](data, creator, cacheClear)

   final private class Impl[F[_]: Async](
@@ -51,7 +51,7 @@ object PipelineCache {

     def obtain(key: String, settings: NlpSettings): Resource[F, Annotator[F]] =
       for {
-        _   <- cacheClear.withCache
+        _ <- cacheClear.withCache
         id <- Resource.eval(makeSettingsId(settings))
         nlp <- Resource.eval(
           data.modify(cache => getOrCreate(key, id, cache, settings, creator))
@@ -73,13 +73,13 @@ object PipelineCache {
             s"StanfordNLP settings changed for key $key. Creating new classifier"
           )
           val nlp = creator(settings)
-          val e   = Entry(id, nlp)
+          val e = Entry(id, nlp)
           (cache.updated(key, e), nlp)
         }

       case None =>
         val nlp = creator(settings)
-        val e   = Entry(id, nlp)
+        val e = Entry(id, nlp)
         (cache.updated(key, e), nlp)
     }
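Both branches run inside data.modify, so the lookup-or-create is atomic on the Ref. A reduced sketch of the same pattern outside this file (cats-effect 3 API; names simplified, not the file's exact code):

    import cats.effect.{IO, Ref}

    // Atomic get-or-create on a Ref-backed map: modify returns the updated
    // map together with the value handed back to the caller.
    def getOrCreate[V](ref: Ref[IO, Map[String, V]], key: String)(mk: => V): IO[V] =
      ref.modify { m =>
        m.get(key) match {
          case Some(v) => (m, v)
          case None =>
            val v = mk
            (m.updated(key, v), v)
        }
      }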
@@ -114,7 +114,7 @@ object PipelineCache {
       release: F[Unit]
   ): F[CacheClearing[F]] =
     for {
-      counter  <- Ref.of(0L)
+      counter <- Ref.of(0L)
       cleaning <- Ref.of(None: Option[Fiber[F, Throwable, Unit]])
       log = Logger.log4s(logger)
       result <-
@@ -44,47 +44,47 @@ object Properties {

   def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
     Properties(
-      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
-      "tokenize.language" -> "de",
-      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
-      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
-      "ner.statisticalOnly" -> "true",
-      "ner.rulesOnly" -> "false",
+      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "tokenize.language" -> "de",
+      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
       "ner.applyFineGrained" -> "false",
       "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
       "ner.useSUTime" -> "false", //only english, unused in docspell
-      "ner.language" -> "de",
+      "ner.language" -> "de",
       "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
     ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

   def nerEnglish(regexNerMappingFile: Option[String]): JProps =
     Properties(
-      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
+      "annotators" -> "tokenize,ssplit,pos,lemma,ner",
       "tokenize.language" -> "en",
       "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
-      "ner.statisticalOnly" -> "true",
-      "ner.rulesOnly" -> "false",
-      "ner.applyFineGrained" -> "false",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
+      "ner.applyFineGrained" -> "false",
       "ner.applyNumericClassifiers" -> "false",
-      "ner.useSUTime" -> "false",
-      "ner.language" -> "en",
+      "ner.useSUTime" -> "false",
+      "ner.language" -> "en",
       "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
     ).withRegexNer(regexNerMappingFile)

   def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
     Properties(
-      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "annotators" -> "tokenize,ssplit,mwt,pos,lemma,ner",
       "tokenize.language" -> "fr",
-      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
-      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
+      "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
+      "mwt.pos.model" -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
       "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
-      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
-      "ner.statisticalOnly" -> "true",
-      "ner.rulesOnly" -> "false",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
+      "ner.statisticalOnly" -> "true",
+      "ner.rulesOnly" -> "false",
       "ner.applyFineGrained" -> "false",
       "ner.applyNumericClassifiers" -> "false",
-      "ner.useSUTime" -> "false",
-      "ner.language" -> "de",
+      "ner.useSUTime" -> "false",
+      "ner.language" -> "de",
       "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
     ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
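These property sets are presumably handed to CoreNLP to build the annotation pipeline; the call site is not part of this diff. A sketch, assuming JProps is an alias for java.util.Properties (the standard StanfordCoreNLP constructor takes one):

    import edu.stanford.nlp.pipeline.StanfordCoreNLP

    // Hypothetical: build a German NER pipeline from the properties above.
    val german = new StanfordCoreNLP(Properties.nerGerman(None, highRecall = false))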
@@ -8,15 +8,14 @@ package docspell.analysis.split

 import fs2.Stream

-/** Splits text into words.
-  */
+/** Splits text into words. */
 object TextSplitter {
   private[this] val trimChars =
     ".,…_[]^!<>=&ſ/{}*?()-:#$|~`+%\\\"'; \t\r\n".toSet

   def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
     val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
-    val index   = if (indexes.isEmpty) -1 else indexes.min
+    val index = if (indexes.isEmpty) -1 else indexes.min

     if (index < 0) Stream.emit(Word(str, start, start + str.length))
     else if (index == 0) split(str.substring(1), sep, start + 1)
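A small usage sketch (a pure fs2 stream, so no effect type is needed; the expected output assumes the truncated else branch emits the prefix word and recurses past the separator):

    import fs2.Pure

    // Split on space and comma; begin/end are offsets into the input string.
    val words = TextSplitter.split[Pure]("Hello, world", Set(' ', ',')).toList
    // likely: List(Word("Hello", 0, 5), Word("world", 7, 12))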
@@ -7,9 +7,9 @@
 package docspell.analysis.split

 case class Word(value: String, begin: Int, end: Int) {
-  def isEmpty: Boolean  = value.isEmpty
+  def isEmpty: Boolean = value.isEmpty
   def nonEmpty: Boolean = !isEmpty
-  def length: Int       = value.length
+  def length: Int = value.length

   def trimLeft(chars: Set[Char]): Word = {
     val v = value.dropWhile(chars.contains)
@@ -91,19 +91,19 @@ class StanfordNerAnnotatorSuite extends FunSuite {

     val regexNerContent =
       s"""(?i)volantino ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
-         |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
-         |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
-         |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
-         |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
-         |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
-         |""".stripMargin
+         |(?i)volantino${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+         |(?i)ag${"\t"}ORGANIZATION${"\t"}LOCATION,PERSON,MISC${"\t"}3
+         |(?i)andrea rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+         |(?i)andrea${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+         |(?i)rossi${"\t"}PERSON${"\t"}LOCATION,MISC${"\t"}2
+         |""".stripMargin

     File
       .withTempDir[IO](File.path(Paths.get("target")), "test-regex-ner")
       .use { dir =>
         for {
           out <- File.writeString[IO](dir / "regex.txt", regexNerContent)
-          ann    = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
+          ann = StanfordNerAnnotator.makePipeline(StanfordNerSettings.RegexOnly(out))
           labels = StanfordNerAnnotator.nerAnnotate(ann, "Hello Andrea Rossi, can you.")
           _ <- IO(
             assertEquals(