mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-07-04 16:48:26 +00:00
Apply scalafmt to all files
This commit is contained in:
@ -16,24 +16,28 @@ case class Domain(labels: NonEmptyList[String], tld: String) {
|
||||
object Domain {
|
||||
|
||||
def domainFromUri(uri: String): Either[String, Domain] =
|
||||
LenientUri.parse(if (uri.contains("://")) uri else s"http://$uri").
|
||||
flatMap(uri => uri.authority.toRight("Uri has no authoriry part")).
|
||||
flatMap(auth => parse(auth))
|
||||
LenientUri
|
||||
.parse(if (uri.contains("://")) uri else s"http://$uri")
|
||||
.flatMap(uri => uri.authority.toRight("Uri has no authoriry part"))
|
||||
.flatMap(auth => parse(auth))
|
||||
|
||||
def parse(str: String): Either[String, Domain] = {
|
||||
Tld.findTld(str).
|
||||
map(tld => (str.dropRight(tld.length), tld)).
|
||||
map({ case (names, tld) =>
|
||||
names.split('.').toList match {
|
||||
case Nil => Left(s"Not a domain: $str")
|
||||
case segs if segs.forall(label =>
|
||||
label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')) =>
|
||||
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
|
||||
case _ => Left(s"Not a domain: $str")
|
||||
}
|
||||
}).
|
||||
getOrElse(Left(s"Not a domain $str"))
|
||||
}
|
||||
def parse(str: String): Either[String, Domain] =
|
||||
Tld
|
||||
.findTld(str)
|
||||
.map(tld => (str.dropRight(tld.length), tld))
|
||||
.map({
|
||||
case (names, tld) =>
|
||||
names.split('.').toList match {
|
||||
case Nil => Left(s"Not a domain: $str")
|
||||
case segs
|
||||
if segs.forall(label =>
|
||||
label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')
|
||||
) =>
|
||||
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
|
||||
case _ => Left(s"Not a domain: $str")
|
||||
}
|
||||
})
|
||||
.getOrElse(Left(s"Not a domain $str"))
|
||||
|
||||
def isDomain(str: String): Boolean =
|
||||
parse(str).isRight
|
||||
|
@ -20,20 +20,23 @@ object File {
|
||||
|
||||
def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay {
|
||||
val count = new AtomicInteger(0)
|
||||
Files.walkFileTree(dir, new SimpleFileVisitor[Path]() {
|
||||
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
|
||||
Files.deleteIfExists(file)
|
||||
count.incrementAndGet()
|
||||
FileVisitResult.CONTINUE
|
||||
}
|
||||
override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
|
||||
Option(e) match {
|
||||
case Some(ex) => throw ex
|
||||
case None =>
|
||||
Files.deleteIfExists(dir)
|
||||
FileVisitResult.CONTINUE
|
||||
Files.walkFileTree(
|
||||
dir,
|
||||
new SimpleFileVisitor[Path]() {
|
||||
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
|
||||
Files.deleteIfExists(file)
|
||||
count.incrementAndGet()
|
||||
FileVisitResult.CONTINUE
|
||||
}
|
||||
})
|
||||
override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
|
||||
Option(e) match {
|
||||
case Some(ex) => throw ex
|
||||
case None =>
|
||||
Files.deleteIfExists(dir)
|
||||
FileVisitResult.CONTINUE
|
||||
}
|
||||
}
|
||||
)
|
||||
count.get
|
||||
}
|
||||
|
||||
@ -44,12 +47,14 @@ object File {
|
||||
if (Files.isDirectory(path)) deleteDirectory(path)
|
||||
else deleteFile(path).map(_ => 1)
|
||||
|
||||
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)
|
||||
(f: Path => Stream[F, A]): Stream[F, A] =
|
||||
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)(
|
||||
f: Path => Stream[F, A]
|
||||
): Stream[F, A] =
|
||||
Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f)
|
||||
|
||||
def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
|
||||
val javaList = Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
|
||||
val javaList =
|
||||
Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
|
||||
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
||||
}
|
||||
|
||||
|
@ -1,8 +1,6 @@
|
||||
package docspell.text.ocr
|
||||
|
||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {
|
||||
|
||||
}
|
||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
|
||||
|
||||
object MimeTypeHint {
|
||||
val none = MimeTypeHint(None, None)
|
||||
|
@ -6,12 +6,23 @@ import fs2.Stream
|
||||
|
||||
object TextExtract {
|
||||
|
||||
def extract[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
|
||||
def extract[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
lang: String,
|
||||
config: Config
|
||||
): Stream[F, String] =
|
||||
extractOCR(in, blocker, lang, config)
|
||||
|
||||
def extractOCR[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
|
||||
Stream.eval(TikaMimetype.detect(in)).
|
||||
flatMap({
|
||||
def extractOCR[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
lang: String,
|
||||
config: Config
|
||||
): Stream[F, String] =
|
||||
Stream
|
||||
.eval(TikaMimetype.detect(in))
|
||||
.flatMap({
|
||||
case mt if !config.isAllowed(mt) =>
|
||||
raiseError(s"File `$mt` not allowed")
|
||||
|
||||
|
@ -11,20 +11,16 @@ object TextSplitter {
|
||||
|
||||
def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
|
||||
val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
|
||||
val index = if (indexes.isEmpty) - 1 else indexes.min
|
||||
val index = if (indexes.isEmpty) -1 else indexes.min
|
||||
|
||||
if (index < 0) Stream.emit(Word(str, start, start + str.length))
|
||||
else if (index == 0) split(str.substring(1), sep, start + 1)
|
||||
else Stream.emit(Word(str.substring(0, index), start, start + index)) ++
|
||||
Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
|
||||
else
|
||||
Stream.emit(Word(str.substring(0, index), start, start + index)) ++
|
||||
Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
|
||||
}
|
||||
|
||||
|
||||
def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
|
||||
split(str, sep, start).
|
||||
map(w => w.trim(trimChars)).
|
||||
filter(_.nonEmpty).
|
||||
map(_.toLower)
|
||||
}
|
||||
def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] =
|
||||
split(str, sep, start).map(w => w.trim(trimChars)).filter(_.nonEmpty).map(_.toLower)
|
||||
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
package docspell.text.split
|
||||
|
||||
case class Word(value: String, begin: Int, end: Int) {
|
||||
def isEmpty: Boolean = value.isEmpty
|
||||
def isEmpty: Boolean = value.isEmpty
|
||||
def nonEmpty: Boolean = !isEmpty
|
||||
def length : Int = value.length
|
||||
def length: Int = value.length
|
||||
|
||||
def trimLeft(chars: Set[Char]): Word = {
|
||||
val v = value.dropWhile(chars.contains)
|
||||
|
@ -7,20 +7,21 @@ import fs2.Stream
|
||||
import scala.concurrent.ExecutionContext
|
||||
|
||||
object TestFiles {
|
||||
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
|
||||
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
|
||||
implicit val CS = IO.contextShift(ExecutionContext.global)
|
||||
|
||||
|
||||
val letterSourceDE: Stream[IO, Byte] =
|
||||
LenientUri.fromJava(getClass.getResource("/letter-de-source.pdf")).
|
||||
readURL[IO](16 * 1024, blocker)
|
||||
LenientUri
|
||||
.fromJava(getClass.getResource("/letter-de-source.pdf"))
|
||||
.readURL[IO](16 * 1024, blocker)
|
||||
|
||||
val letterSourceEN: Stream[IO, Byte] =
|
||||
LenientUri.fromJava(getClass.getResource("/letter-en-source.pdf")).
|
||||
readURL[IO](16 * 1024, blocker)
|
||||
LenientUri
|
||||
.fromJava(getClass.getResource("/letter-en-source.pdf"))
|
||||
.readURL[IO](16 * 1024, blocker)
|
||||
|
||||
|
||||
val letterDEText = """Max Mustermann
|
||||
val letterDEText =
|
||||
"""Max Mustermann
|
||||
|
|
||||
|Lilienweg 21
|
||||
|
|
||||
@ -52,7 +53,8 @@ object TestFiles {
|
||||
|Max Mustermann
|
||||
|""".stripMargin.trim
|
||||
|
||||
val letterENText = """Derek Jeter
|
||||
val letterENText =
|
||||
"""Derek Jeter
|
||||
|
|
||||
|123 Elm Ave.
|
||||
|
|
||||
|
@ -22,11 +22,9 @@ object ContactAnnotateSpec extends SimpleTestSuite {
|
||||
|
||||
val labels = Contact.annotate(text)
|
||||
assertEquals(labels.size, 2)
|
||||
assertEquals(labels(0),
|
||||
NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
|
||||
assertEquals(labels(0), NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
|
||||
assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
|
||||
assertEquals(labels(1),
|
||||
NerLabel("example.com", NerTag.Website, 308, 319))
|
||||
assertEquals(labels(1), NerLabel("example.com", NerTag.Website, 308, 319))
|
||||
assertEquals(text.substring(308, 319).toLowerCase, "example.com")
|
||||
}
|
||||
}
|
||||
|
@ -8,45 +8,49 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
|
||||
test("find english ner labels") {
|
||||
val labels = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
|
||||
val expect = Vector(NerLabel("Derek",NerTag.Person,0,5)
|
||||
, NerLabel("Jeter",NerTag.Person,6,11)
|
||||
, NerLabel("Treesville",NerTag.Person,27,37)
|
||||
, NerLabel("Derek",NerTag.Person,69,74)
|
||||
, NerLabel("Jeter",NerTag.Person,75,80)
|
||||
, NerLabel("Treesville",NerTag.Location,96,106)
|
||||
, NerLabel("M.",NerTag.Person,142,144)
|
||||
, NerLabel("Leat",NerTag.Person,145,149)
|
||||
, NerLabel("Syrup",NerTag.Organization,160,165)
|
||||
, NerLabel("Production",NerTag.Organization,166,176)
|
||||
, NerLabel("Old",NerTag.Organization,177,180)
|
||||
, NerLabel("Sticky",NerTag.Organization,181,187)
|
||||
, NerLabel("Pancake",NerTag.Organization,188,195)
|
||||
, NerLabel("Company",NerTag.Organization,196,203)
|
||||
, NerLabel("Maple",NerTag.Location,208,213)
|
||||
, NerLabel("Lane",NerTag.Location,214,218)
|
||||
, NerLabel("Forest",NerTag.Location,220,226)
|
||||
, NerLabel("Hemptown",NerTag.Location,241,249)
|
||||
, NerLabel("Little",NerTag.Organization,349,355)
|
||||
, NerLabel("League",NerTag.Organization,356,362)
|
||||
, NerLabel("Derek",NerTag.Person,1119,1124)
|
||||
, NerLabel("Jeter",NerTag.Person,1125,1130))
|
||||
val expect = Vector(
|
||||
NerLabel("Derek", NerTag.Person, 0, 5),
|
||||
NerLabel("Jeter", NerTag.Person, 6, 11),
|
||||
NerLabel("Treesville", NerTag.Person, 27, 37),
|
||||
NerLabel("Derek", NerTag.Person, 69, 74),
|
||||
NerLabel("Jeter", NerTag.Person, 75, 80),
|
||||
NerLabel("Treesville", NerTag.Location, 96, 106),
|
||||
NerLabel("M.", NerTag.Person, 142, 144),
|
||||
NerLabel("Leat", NerTag.Person, 145, 149),
|
||||
NerLabel("Syrup", NerTag.Organization, 160, 165),
|
||||
NerLabel("Production", NerTag.Organization, 166, 176),
|
||||
NerLabel("Old", NerTag.Organization, 177, 180),
|
||||
NerLabel("Sticky", NerTag.Organization, 181, 187),
|
||||
NerLabel("Pancake", NerTag.Organization, 188, 195),
|
||||
NerLabel("Company", NerTag.Organization, 196, 203),
|
||||
NerLabel("Maple", NerTag.Location, 208, 213),
|
||||
NerLabel("Lane", NerTag.Location, 214, 218),
|
||||
NerLabel("Forest", NerTag.Location, 220, 226),
|
||||
NerLabel("Hemptown", NerTag.Location, 241, 249),
|
||||
NerLabel("Little", NerTag.Organization, 349, 355),
|
||||
NerLabel("League", NerTag.Organization, 356, 362),
|
||||
NerLabel("Derek", NerTag.Person, 1119, 1124),
|
||||
NerLabel("Jeter", NerTag.Person, 1125, 1130)
|
||||
)
|
||||
assertEquals(labels, expect)
|
||||
}
|
||||
|
||||
test("find german ner labels") {
|
||||
val labels = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
|
||||
val expect = Vector(NerLabel("Max", NerTag.Person, 0, 3)
|
||||
, NerLabel("Mustermann", NerTag.Person, 4, 14)
|
||||
, NerLabel("Lilienweg", NerTag.Location, 16, 25)
|
||||
, NerLabel("Max", NerTag.Person, 77, 80)
|
||||
, NerLabel("Mustermann", NerTag.Person, 81, 91)
|
||||
, NerLabel("Lilienweg", NerTag.Location, 93, 102)
|
||||
, NerLabel("EasyCare", NerTag.Organization, 124, 132)
|
||||
, NerLabel("AG", NerTag.Organization, 133, 135)
|
||||
, NerLabel("Ackerweg", NerTag.Location, 158, 166)
|
||||
, NerLabel("Nebendorf", NerTag.Location, 184, 193)
|
||||
, NerLabel("Max", NerTag.Person, 505, 508)
|
||||
, NerLabel("Mustermann", NerTag.Person, 509, 519))
|
||||
val expect = Vector(
|
||||
NerLabel("Max", NerTag.Person, 0, 3),
|
||||
NerLabel("Mustermann", NerTag.Person, 4, 14),
|
||||
NerLabel("Lilienweg", NerTag.Location, 16, 25),
|
||||
NerLabel("Max", NerTag.Person, 77, 80),
|
||||
NerLabel("Mustermann", NerTag.Person, 81, 91),
|
||||
NerLabel("Lilienweg", NerTag.Location, 93, 102),
|
||||
NerLabel("EasyCare", NerTag.Organization, 124, 132),
|
||||
NerLabel("AG", NerTag.Organization, 133, 135),
|
||||
NerLabel("Ackerweg", NerTag.Location, 158, 166),
|
||||
NerLabel("Nebendorf", NerTag.Location, 184, 193),
|
||||
NerLabel("Max", NerTag.Person, 505, 508),
|
||||
NerLabel("Mustermann", NerTag.Person, 509, 519)
|
||||
)
|
||||
assertEquals(labels, expect)
|
||||
}
|
||||
}
|
||||
|
@ -9,16 +9,22 @@ object TextExtractionSuite extends SimpleTestSuite {
|
||||
|
||||
test("extract english pdf") {
|
||||
ignore()
|
||||
val text = TextExtract.extract[IO](letterSourceEN, blocker, "eng", Config.default).
|
||||
compile.lastOrError.unsafeRunSync()
|
||||
val text = TextExtract
|
||||
.extract[IO](letterSourceEN, blocker, "eng", Config.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
println(text)
|
||||
}
|
||||
|
||||
test("extract german pdf") {
|
||||
ignore()
|
||||
val expect = TestFiles.letterDEText
|
||||
val extract = TextExtract.extract[IO](letterSourceDE, blocker, "deu", Config.default).
|
||||
compile.lastOrError.unsafeRunSync()
|
||||
val extract = TextExtract
|
||||
.extract[IO](letterSourceDE, blocker, "deu", Config.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
|
||||
assertEquals(extract.trim, expect.trim)
|
||||
}
|
||||
|
@ -15,7 +15,6 @@ object TestSplitterSpec extends SimpleTestSuite {
|
||||
|
||||
val words = TextSplitter.splitToken(text, " \t\r\n".toSet).toVector
|
||||
|
||||
|
||||
assertEquals(words.size, 31)
|
||||
assertEquals(words(13), Word("bitte", 109, 114))
|
||||
assertEquals(text.substring(109, 114).toLowerCase, "bitte")
|
||||
|
Reference in New Issue
Block a user