Reformat with scalafmt 3.0.0

2025-08-05 02:24:52 +00:00 · 2021-08-19 08:50:30 +02:00
parent 5a2a0295ef
commit e4fecefaea
127 changed files with 558 additions and 658 deletions
--- a/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/classifier/StanfordTextClassifier.scala
@@ -155,10 +155,8 @@ final class StanfordTextClassifier[F[_]: Async](cfg: TextClassifierConfig)
  case class TrainResult(score: Double, model: ClassifierModel)

  def prepend(pre: String, data: Map[String, String]): Map[String, String] =
-    data.toList
-      .map({ case (k, v) =>
-        if (k.startsWith(pre)) (k, v)
-        else (pre + k, v)
-      })
-      .toMap
+    data.toList.map { case (k, v) =>
+      if (k.startsWith(pre)) (k, v)
+      else (pre + k, v)
+    }.toMap
 }
--- a/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala
@@ -32,7 +32,7 @@ object Domain {
    Tld
      .findTld(str)
      .map(tld => (str.dropRight(tld.length), tld))
-      .map({ case (names, tld) =>
+      .map { case (names, tld) =>
        names.split('.').toList match {
          case Nil => Left(s"Not a domain: $str")
          case segs
@@ -43,7 +43,7 @@ object Domain {
            Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
          case _ => Left(s"Not a domain: $str")
        }
-      })
+      }
      .getOrElse(Left(s"Not a domain $str"))

  def isDomain(str: String): Boolean =
--- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
@@ -160,11 +160,11 @@ object DateFind {
        Reader(words => Nel.of(reader, more: _*).map(_.read(words)).reduce)

      def readFirst[A](f: Word => Option[A]): Reader[A] =
-        Reader({
+        Reader {
          case Nil => Result.Failure
          case a :: as =>
            f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
-        })
+        }
    }

    sealed trait Result[+A] {
--- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala
@@ -15,7 +15,7 @@ object MonthName {

  private def merge(n0: List[List[String]], ns: List[List[String]]*): List[List[String]] =
    ns.foldLeft(n0) { (res, el) =>
-      res.zip(el).map({ case (a, b) => a ++ b })
+      res.zip(el).map { case (a, b) => a ++ b }
    }

  private def forLang(lang: Language): List[List[String]] =
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Annotator.scala
@@ -35,12 +35,12 @@ object Annotator {
    *
    * There are the following ways:
    *
-    * - disabled: it returns a no-op annotator that always gives an empty list
-    * - full: the complete stanford pipeline is used
-    * - basic: only the ner classifier is used
+    *   - disabled: it returns a no-op annotator that always gives an empty list
+    *   - full: the complete stanford pipeline is used
+    *   - basic: only the ner classifier is used
    *
-    * Additionally, if there is a regexNer-file specified, the regexner annotator is
-    * also run. In case the full pipeline is used, this is already included.
+    * Additionally, if there is a regexNer-file specified, the regexner annotator is also
+    * run. In case the full pipeline is used, this is already included.
    */
  def apply[F[_]: Sync](mode: NlpMode)(settings: NlpSettings): Annotator[F] =
    mode match {
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala
@@ -21,10 +21,9 @@ import edu.stanford.nlp.ie.crf.CRFClassifier
 import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
 import org.log4s.getLogger

-/** This is only using the CRFClassifier without building an analysis
-  * pipeline. The ner-classifier cannot use results from POS-tagging
-  * etc. and is therefore not as good as the [[StanfordNerAnnotator]].
-  * But it uses less memory, while still being not bad.
+/** This is only using the CRFClassifier without building an analysis pipeline. The
+  * ner-classifier cannot use results from POS-tagging etc. and is therefore not as good
+  * as the [[StanfordNerAnnotator]]. But it uses less memory, while still being not bad.
  */
 object BasicCRFAnnotator {
  private[this] val logger = getLogger
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
@@ -17,8 +17,8 @@ import docspell.common._

 import org.log4s.getLogger

-/** Creating the StanfordCoreNLP pipeline is quite expensive as it
-  * involves IO and initializing large objects.
+/** Creating the StanfordCoreNLP pipeline is quite expensive as it involves IO and
+  * initializing large objects.
  *
  * Therefore, the instances are cached, because they are thread-safe.
  *
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
@@ -44,48 +44,48 @@ object Properties {

  def nerGerman(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
-      "annotators"                  -> "tokenize,ssplit,mwt,pos,lemma,ner",
-      "tokenize.language"           -> "de",
-      "mwt.mappingFile"             -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
-      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
-      "ner.statisticalOnly"         -> "true",
-      "ner.rulesOnly"               -> "false",
-      "ner.applyFineGrained"        -> "false",
+      "annotators"           -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "tokenize.language"    -> "de",
+      "mwt.mappingFile"      -> "edu/stanford/nlp/models/mwt/german/german-mwt.tsv",
+      "pos.model"            -> "edu/stanford/nlp/models/pos-tagger/german-ud.tagger",
+      "ner.statisticalOnly"  -> "true",
+      "ner.rulesOnly"        -> "false",
+      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false", //only english supported, not needed currently
-      "ner.useSUTime"               -> "false", //only english, unused in docspell
-      "ner.language"                -> "de",
-      "ner.model"                   -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+      "ner.useSUTime" -> "false", //only english, unused in docspell
+      "ner.language"  -> "de",
+      "ner.model" -> "edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  def nerEnglish(regexNerMappingFile: Option[String]): JProps =
    Properties(
-      "annotators"                  -> "tokenize,ssplit,pos,lemma,ner",
-      "tokenize.language"           -> "en",
-      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
+      "annotators"        -> "tokenize,ssplit,pos,lemma,ner",
+      "tokenize.language" -> "en",
+      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger",
      "ner.statisticalOnly"         -> "true",
      "ner.rulesOnly"               -> "false",
      "ner.applyFineGrained"        -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime"               -> "false",
      "ner.language"                -> "en",
-      "ner.model"                   -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+      "ner.model" -> "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile)

  def nerFrench(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
    Properties(
-      "annotators"                  -> "tokenize,ssplit,mwt,pos,lemma,ner",
-      "tokenize.language"           -> "fr",
-      "mwt.mappingFile"             -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
-      "mwt.pos.model"               -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
-      "mwt.statisticalMappingFile"  -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
-      "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
-      "ner.statisticalOnly"         -> "true",
-      "ner.rulesOnly"               -> "false",
-      "ner.applyFineGrained"        -> "false",
+      "annotators"        -> "tokenize,ssplit,mwt,pos,lemma,ner",
+      "tokenize.language" -> "fr",
+      "mwt.mappingFile"   -> "edu/stanford/nlp/models/mwt/french/french-mwt.tsv",
+      "mwt.pos.model"     -> "edu/stanford/nlp/models/mwt/french/french-mwt.tagger",
+      "mwt.statisticalMappingFile" -> "edu/stanford/nlp/models/mwt/french/french-mwt-statistical.tsv",
+      "pos.model"            -> "edu/stanford/nlp/models/pos-tagger/french-ud.tagger",
+      "ner.statisticalOnly"  -> "true",
+      "ner.rulesOnly"        -> "false",
+      "ner.applyFineGrained" -> "false",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime"               -> "false",
      "ner.language"                -> "de",
-      "ner.model"                   -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+      "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
    ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)

  def regexNerOnly(regexNerMappingFile: Path): JProps =
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerAnnotator.scala
@@ -22,13 +22,11 @@ object StanfordNerAnnotator {
  /** Runs named entity recognition on the given `text`.
    *
    * This uses the classifier pipeline from stanford-nlp, see
-    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these
-    * classifiers is quite expensive, it involves loading large model
-    * files. The classifiers are thread-safe and so they are cached.
-    * The `cacheKey` defines the "slot" where classifiers are stored
-    * and retrieved. If for a given `cacheKey` the `settings` change,
-    * a new classifier must be created. It will then replace the
-    * previous one.
+    * https://nlp.stanford.edu/software/CRF-NER.html. Creating these classifiers is quite
+    * expensive, it involves loading large model files. The classifiers are thread-safe
+    * and so they are cached. The `cacheKey` defines the "slot" where classifiers are
+    * stored and retrieved. If for a given `cacheKey` the `settings` change, a new
+    * classifier must be created. It will then replace the previous one.
    */
  def nerAnnotate(nerClassifier: StanfordCoreNLP, text: String): Vector[NerLabel] = {
    val doc = new CoreDocument(text)
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
@@ -17,18 +17,16 @@ object StanfordNerSettings {

  /** Settings for configuring the stanford NER pipeline.
    *
-    * The language is mandatory, only the provided ones are supported.
-    * The `highRecall` only applies for non-English languages. For
-    * non-English languages the english classifier is run as second
-    * classifier and if `highRecall` is true, then it will be used to
-    * tag untagged tokens. This may lead to a lot of false positives,
-    * but since English is omnipresent in other languages, too it
-    * depends on the use case for whether this is useful or not.
+    * The language is mandatory, only the provided ones are supported. The `highRecall`
+    * only applies for non-English languages. For non-English languages the english
+    * classifier is run as second classifier and if `highRecall` is true, then it will be
+    * used to tag untagged tokens. This may lead to a lot of false positives, but since
+    * English is omnipresent in other languages, too it depends on the use case for
+    * whether this is useful or not.
    *
    * The `regexNer` allows to specify a text file as described here:
-    * https://nlp.stanford.edu/software/regexner.html. This will be used
-    * as a last step to tag untagged tokens using the provided list of
-    * regexps.
+    * https://nlp.stanford.edu/software/regexner.html. This will be used as a last step to
+    * tag untagged tokens using the provided list of regexps.
    */
  case class Full(
      lang: NLPLanguage,
@@ -36,7 +34,8 @@ object StanfordNerSettings {
      regexNer: Option[Path]
  ) extends StanfordNerSettings

-  /** Not all languages are supported with predefined statistical models. This allows to provide regexps only.
+  /** Not all languages are supported with predefined statistical models. This allows to
+    * provide regexps only.
    */
  case class RegexOnly(regexNerFile: Path) extends StanfordNerSettings

--- a/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/classifier/StanfordTextClassifierSuite.scala
@@ -37,9 +37,9 @@ class StanfordTextClassifierSuite extends FunSuite {
            .repeat
            .take(10)
        )
-        .flatMap({ case (a, b) =>
+        .flatMap { case (a, b) =>
          Stream.emits(Seq(a, b))
-        })
+        }
        .covary[IO]

    val modelExists = {
@@ -52,7 +52,7 @@ class StanfordTextClassifierSuite extends FunSuite {
  }

  test("run classifier") {
-    val cfg    = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
+    val cfg = TextClassifierConfig(File.path(Paths.get("target")), NonEmptyList.of(Map()))
    val things = File.withTempDir[IO](File.path(Paths.get("target")), "testcls")

    things