Apply scalafmt to all files

Eike Kettner
2019-12-30 21:44:13 +01:00
parent 57e274e2b0
commit fc3e22e399
133 changed files with 3003 additions and 2112 deletions

View File

@@ -16,24 +16,28 @@ case class Domain(labels: NonEmptyList[String], tld: String) {
object Domain {
def domainFromUri(uri: String): Either[String, Domain] =
LenientUri.parse(if (uri.contains("://")) uri else s"http://$uri").
flatMap(uri => uri.authority.toRight("Uri has no authoriry part")).
flatMap(auth => parse(auth))
LenientUri
.parse(if (uri.contains("://")) uri else s"http://$uri")
.flatMap(uri => uri.authority.toRight("Uri has no authoriry part"))
.flatMap(auth => parse(auth))
def parse(str: String): Either[String, Domain] = {
Tld.findTld(str).
map(tld => (str.dropRight(tld.length), tld)).
map({ case (names, tld) =>
names.split('.').toList match {
case Nil => Left(s"Not a domain: $str")
case segs if segs.forall(label =>
label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')) =>
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
case _ => Left(s"Not a domain: $str")
}
}).
getOrElse(Left(s"Not a domain $str"))
}
def parse(str: String): Either[String, Domain] =
Tld
.findTld(str)
.map(tld => (str.dropRight(tld.length), tld))
.map({
case (names, tld) =>
names.split('.').toList match {
case Nil => Left(s"Not a domain: $str")
case segs
if segs.forall(label =>
label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')
) =>
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
case _ => Left(s"Not a domain: $str")
}
})
.getOrElse(Left(s"Not a domain $str"))
def isDomain(str: String): Boolean =
parse(str).isRight
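
Not part of the diff: a minimal usage sketch of the Domain helpers reformatted above. The inputs are illustrative, and the results depend on the TLD table behind Tld.findTld.

// Hypothetical call site, not from this commit; assumes it sits in the same package as Domain.
object DomainUsageExample {
  // assumes Tld.findTld recognizes ".com"; otherwise these yield Left/false
  val fromUri: Either[String, Domain] =
    Domain.domainFromUri("https://docs.example.com/some/path")

  val ok: Boolean  = Domain.isDomain("mail.example.com") // true when ".com" is a known TLD
  val bad: Boolean = Domain.isDomain("has space.com")    // false: labels allow only letters, digits and '-'
}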

View File

@@ -20,20 +20,23 @@ object File {
def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay {
val count = new AtomicInteger(0)
Files.walkFileTree(dir, new SimpleFileVisitor[Path]() {
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
Files.deleteIfExists(file)
count.incrementAndGet()
FileVisitResult.CONTINUE
}
override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
Option(e) match {
case Some(ex) => throw ex
case None =>
Files.deleteIfExists(dir)
FileVisitResult.CONTINUE
Files.walkFileTree(
dir,
new SimpleFileVisitor[Path]() {
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
Files.deleteIfExists(file)
count.incrementAndGet()
FileVisitResult.CONTINUE
}
})
override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
Option(e) match {
case Some(ex) => throw ex
case None =>
Files.deleteIfExists(dir)
FileVisitResult.CONTINUE
}
}
)
count.get
}
@@ -44,12 +47,14 @@ object File {
if (Files.isDirectory(path)) deleteDirectory(path)
else deleteFile(path).map(_ => 1)
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)
(f: Path => Stream[F, A]): Stream[F, A] =
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)(
f: Path => Stream[F, A]
): Stream[F, A] =
Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f)
def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
val javaList = Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
val javaList =
Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
javaList.asScala.toList.sortBy(_.getFileName.toString)
}
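
Not part of the diff: a small sketch of how the File helpers above compose. The paths, prefix and predicate are made up for illustration, and the object is assumed to live next to File.

// Hypothetical usage, not from this commit.
import cats.effect.IO
import fs2.Stream
import java.nio.file.{Path, Paths}

object FileUsageExample {
  // count the PDF files directly inside a directory
  def countPdfs(dir: Path): IO[Int] =
    File.listFiles[IO](p => p.getFileName.toString.endsWith(".pdf"), dir).map(_.size)

  // run the count inside a fresh temp dir below /tmp; the dir is deleted when the stream finishes
  val run: Stream[IO, Int] =
    File.withTempDir[IO, Int](Paths.get("/tmp"), "example-")(tmp => Stream.eval(countPdfs(tmp)))
}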

View File

@@ -1,8 +1,6 @@
package docspell.text.ocr
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {
}
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
object MimeTypeHint {
val none = MimeTypeHint(None, None)

View File

@@ -6,12 +6,23 @@ import fs2.Stream
object TextExtract {
def extract[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extract[F[_]: Sync: ContextShift](
in: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
extractOCR(in, blocker, lang, config)
def extractOCR[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
Stream.eval(TikaMimetype.detect(in)).
flatMap({
def extractOCR[F[_]: Sync: ContextShift](
in: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
Stream
.eval(TikaMimetype.detect(in))
.flatMap({
case mt if !config.isAllowed(mt) =>
raiseError(s"File `$mt` not allowed")

View File

@@ -11,20 +11,16 @@ object TextSplitter {
def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
val index = if (indexes.isEmpty) - 1 else indexes.min
val index = if (indexes.isEmpty) -1 else indexes.min
if (index < 0) Stream.emit(Word(str, start, start + str.length))
else if (index == 0) split(str.substring(1), sep, start + 1)
else Stream.emit(Word(str.substring(0, index), start, start + index)) ++
Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
else
Stream.emit(Word(str.substring(0, index), start, start + index)) ++
Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
}
def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
split(str, sep, start).
map(w => w.trim(trimChars)).
filter(_.nonEmpty).
map(_.toLower)
}
def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] =
split(str, sep, start).map(w => w.trim(trimChars)).filter(_.nonEmpty).map(_.toLower)
}
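
Not part of the diff: splitToken above is driven the same way as in the spec further down; the input string here is illustrative, and the object is assumed to share TextSplitter's package.

// Hypothetical usage, not from this commit.
object SplitterUsageExample {
  // splitToken = split on separators + trim(trimChars) + drop empty words + lower-case
  val words: Vector[Word] =
    TextSplitter.splitToken("Hello,  World!", " \t\r\n".toSet).toVector
  // each Word keeps its begin/end offsets into the original string
}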

View File

@@ -1,9 +1,9 @@
package docspell.text.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty
def isEmpty: Boolean = value.isEmpty
def nonEmpty: Boolean = !isEmpty
def length : Int = value.length
def length: Int = value.length
def trimLeft(chars: Set[Char]): Word = {
val v = value.dropWhile(chars.contains)

View File

@@ -7,20 +7,21 @@ import fs2.Stream
import scala.concurrent.ExecutionContext
object TestFiles {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
val letterSourceDE: Stream[IO, Byte] =
LenientUri.fromJava(getClass.getResource("/letter-de-source.pdf")).
readURL[IO](16 * 1024, blocker)
LenientUri
.fromJava(getClass.getResource("/letter-de-source.pdf"))
.readURL[IO](16 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
LenientUri.fromJava(getClass.getResource("/letter-en-source.pdf")).
readURL[IO](16 * 1024, blocker)
LenientUri
.fromJava(getClass.getResource("/letter-en-source.pdf"))
.readURL[IO](16 * 1024, blocker)
val letterDEText = """Max Mustermann
val letterDEText =
"""Max Mustermann
|
|Lilienweg 21
|
@@ -52,7 +53,8 @@ object TestFiles {
|Max Mustermann
|""".stripMargin.trim
val letterENText = """Derek Jeter
val letterENText =
"""Derek Jeter
|
|123 Elm Ave.
|

View File

@@ -22,11 +22,9 @@ object ContactAnnotateSpec extends SimpleTestSuite {
val labels = Contact.annotate(text)
assertEquals(labels.size, 2)
assertEquals(labels(0),
NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
assertEquals(labels(0), NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
assertEquals(labels(1),
NerLabel("example.com", NerTag.Website, 308, 319))
assertEquals(labels(1), NerLabel("example.com", NerTag.Website, 308, 319))
assertEquals(text.substring(308, 319).toLowerCase, "example.com")
}
}

View File

@@ -8,45 +8,49 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find english ner labels") {
val labels = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
val expect = Vector(NerLabel("Derek",NerTag.Person,0,5)
, NerLabel("Jeter",NerTag.Person,6,11)
, NerLabel("Treesville",NerTag.Person,27,37)
, NerLabel("Derek",NerTag.Person,69,74)
, NerLabel("Jeter",NerTag.Person,75,80)
, NerLabel("Treesville",NerTag.Location,96,106)
, NerLabel("M.",NerTag.Person,142,144)
, NerLabel("Leat",NerTag.Person,145,149)
, NerLabel("Syrup",NerTag.Organization,160,165)
, NerLabel("Production",NerTag.Organization,166,176)
, NerLabel("Old",NerTag.Organization,177,180)
, NerLabel("Sticky",NerTag.Organization,181,187)
, NerLabel("Pancake",NerTag.Organization,188,195)
, NerLabel("Company",NerTag.Organization,196,203)
, NerLabel("Maple",NerTag.Location,208,213)
, NerLabel("Lane",NerTag.Location,214,218)
, NerLabel("Forest",NerTag.Location,220,226)
, NerLabel("Hemptown",NerTag.Location,241,249)
, NerLabel("Little",NerTag.Organization,349,355)
, NerLabel("League",NerTag.Organization,356,362)
, NerLabel("Derek",NerTag.Person,1119,1124)
, NerLabel("Jeter",NerTag.Person,1125,1130))
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
NerLabel("Derek", NerTag.Person, 69, 74),
NerLabel("Jeter", NerTag.Person, 75, 80),
NerLabel("Treesville", NerTag.Location, 96, 106),
NerLabel("M.", NerTag.Person, 142, 144),
NerLabel("Leat", NerTag.Person, 145, 149),
NerLabel("Syrup", NerTag.Organization, 160, 165),
NerLabel("Production", NerTag.Organization, 166, 176),
NerLabel("Old", NerTag.Organization, 177, 180),
NerLabel("Sticky", NerTag.Organization, 181, 187),
NerLabel("Pancake", NerTag.Organization, 188, 195),
NerLabel("Company", NerTag.Organization, 196, 203),
NerLabel("Maple", NerTag.Location, 208, 213),
NerLabel("Lane", NerTag.Location, 214, 218),
NerLabel("Forest", NerTag.Location, 220, 226),
NerLabel("Hemptown", NerTag.Location, 241, 249),
NerLabel("Little", NerTag.Organization, 349, 355),
NerLabel("League", NerTag.Organization, 356, 362),
NerLabel("Derek", NerTag.Person, 1119, 1124),
NerLabel("Jeter", NerTag.Person, 1125, 1130)
)
assertEquals(labels, expect)
}
test("find german ner labels") {
val labels = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
val expect = Vector(NerLabel("Max", NerTag.Person, 0, 3)
, NerLabel("Mustermann", NerTag.Person, 4, 14)
, NerLabel("Lilienweg", NerTag.Location, 16, 25)
, NerLabel("Max", NerTag.Person, 77, 80)
, NerLabel("Mustermann", NerTag.Person, 81, 91)
, NerLabel("Lilienweg", NerTag.Location, 93, 102)
, NerLabel("EasyCare", NerTag.Organization, 124, 132)
, NerLabel("AG", NerTag.Organization, 133, 135)
, NerLabel("Ackerweg", NerTag.Location, 158, 166)
, NerLabel("Nebendorf", NerTag.Location, 184, 193)
, NerLabel("Max", NerTag.Person, 505, 508)
, NerLabel("Mustermann", NerTag.Person, 509, 519))
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),
NerLabel("Lilienweg", NerTag.Location, 16, 25),
NerLabel("Max", NerTag.Person, 77, 80),
NerLabel("Mustermann", NerTag.Person, 81, 91),
NerLabel("Lilienweg", NerTag.Location, 93, 102),
NerLabel("EasyCare", NerTag.Organization, 124, 132),
NerLabel("AG", NerTag.Organization, 133, 135),
NerLabel("Ackerweg", NerTag.Location, 158, 166),
NerLabel("Nebendorf", NerTag.Location, 184, 193),
NerLabel("Max", NerTag.Person, 505, 508),
NerLabel("Mustermann", NerTag.Person, 509, 519)
)
assertEquals(labels, expect)
}
}

View File

@@ -9,16 +9,22 @@ object TextExtractionSuite extends SimpleTestSuite {
test("extract english pdf") {
ignore()
val text = TextExtract.extract[IO](letterSourceEN, blocker, "eng", Config.default).
compile.lastOrError.unsafeRunSync()
val text = TextExtract
.extract[IO](letterSourceEN, blocker, "eng", Config.default)
.compile
.lastOrError
.unsafeRunSync()
println(text)
}
test("extract german pdf") {
ignore()
val expect = TestFiles.letterDEText
val extract = TextExtract.extract[IO](letterSourceDE, blocker, "deu", Config.default).
compile.lastOrError.unsafeRunSync()
val extract = TextExtract
.extract[IO](letterSourceDE, blocker, "deu", Config.default)
.compile
.lastOrError
.unsafeRunSync()
assertEquals(extract.trim, expect.trim)
}

View File

@@ -15,7 +15,6 @@ object TestSplitterSpec extends SimpleTestSuite {
val words = TextSplitter.splitToken(text, " \t\r\n".toSet).toVector
assertEquals(words.size, 31)
assertEquals(words(13), Word("bitte", 109, 114))
assertEquals(text.substring(109, 114).toLowerCase, "bitte")