mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-11-03 18:00:11 +00:00 
			
		
		
		
	Apply scalafmt to all files
This commit is contained in:
		@@ -16,24 +16,28 @@ case class Domain(labels: NonEmptyList[String], tld: String) {
 | 
			
		||||
object Domain {
 | 
			
		||||
 | 
			
		||||
  def domainFromUri(uri: String): Either[String, Domain] =
 | 
			
		||||
    LenientUri.parse(if (uri.contains("://")) uri else s"http://$uri").
 | 
			
		||||
      flatMap(uri => uri.authority.toRight("Uri has no authoriry part")).
 | 
			
		||||
      flatMap(auth => parse(auth))
 | 
			
		||||
    LenientUri
 | 
			
		||||
      .parse(if (uri.contains("://")) uri else s"http://$uri")
 | 
			
		||||
      .flatMap(uri => uri.authority.toRight("Uri has no authoriry part"))
 | 
			
		||||
      .flatMap(auth => parse(auth))
 | 
			
		||||
 | 
			
		||||
  def parse(str: String): Either[String, Domain] = {
 | 
			
		||||
    Tld.findTld(str).
 | 
			
		||||
      map(tld => (str.dropRight(tld.length), tld)).
 | 
			
		||||
      map({ case (names, tld) =>
 | 
			
		||||
        names.split('.').toList match {
 | 
			
		||||
          case Nil => Left(s"Not a domain: $str")
 | 
			
		||||
          case segs if segs.forall(label =>
 | 
			
		||||
            label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')) =>
 | 
			
		||||
            Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
 | 
			
		||||
          case _ => Left(s"Not a domain: $str")
 | 
			
		||||
        }
 | 
			
		||||
      }).
 | 
			
		||||
      getOrElse(Left(s"Not a domain $str"))
 | 
			
		||||
  }
 | 
			
		||||
  def parse(str: String): Either[String, Domain] =
 | 
			
		||||
    Tld
 | 
			
		||||
      .findTld(str)
 | 
			
		||||
      .map(tld => (str.dropRight(tld.length), tld))
 | 
			
		||||
      .map({
 | 
			
		||||
        case (names, tld) =>
 | 
			
		||||
          names.split('.').toList match {
 | 
			
		||||
            case Nil => Left(s"Not a domain: $str")
 | 
			
		||||
            case segs
 | 
			
		||||
                if segs.forall(label =>
 | 
			
		||||
                  label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')
 | 
			
		||||
                ) =>
 | 
			
		||||
              Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
 | 
			
		||||
            case _ => Left(s"Not a domain: $str")
 | 
			
		||||
          }
 | 
			
		||||
      })
 | 
			
		||||
      .getOrElse(Left(s"Not a domain $str"))
 | 
			
		||||
 | 
			
		||||
  def isDomain(str: String): Boolean =
 | 
			
		||||
    parse(str).isRight
 | 
			
		||||
 
 | 
			
		||||
@@ -20,20 +20,23 @@ object File {
 | 
			
		||||
 | 
			
		||||
  def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay {
 | 
			
		||||
    val count = new AtomicInteger(0)
 | 
			
		||||
    Files.walkFileTree(dir, new SimpleFileVisitor[Path]() {
 | 
			
		||||
      override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
 | 
			
		||||
        Files.deleteIfExists(file)
 | 
			
		||||
        count.incrementAndGet()
 | 
			
		||||
        FileVisitResult.CONTINUE
 | 
			
		||||
      }
 | 
			
		||||
      override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
 | 
			
		||||
        Option(e) match {
 | 
			
		||||
          case Some(ex) => throw ex
 | 
			
		||||
          case None =>
 | 
			
		||||
            Files.deleteIfExists(dir)
 | 
			
		||||
            FileVisitResult.CONTINUE
 | 
			
		||||
    Files.walkFileTree(
 | 
			
		||||
      dir,
 | 
			
		||||
      new SimpleFileVisitor[Path]() {
 | 
			
		||||
        override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
 | 
			
		||||
          Files.deleteIfExists(file)
 | 
			
		||||
          count.incrementAndGet()
 | 
			
		||||
          FileVisitResult.CONTINUE
 | 
			
		||||
        }
 | 
			
		||||
    })
 | 
			
		||||
        override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
 | 
			
		||||
          Option(e) match {
 | 
			
		||||
            case Some(ex) => throw ex
 | 
			
		||||
            case None =>
 | 
			
		||||
              Files.deleteIfExists(dir)
 | 
			
		||||
              FileVisitResult.CONTINUE
 | 
			
		||||
          }
 | 
			
		||||
      }
 | 
			
		||||
    )
 | 
			
		||||
    count.get
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -44,12 +47,14 @@ object File {
 | 
			
		||||
    if (Files.isDirectory(path)) deleteDirectory(path)
 | 
			
		||||
    else deleteFile(path).map(_ => 1)
 | 
			
		||||
 | 
			
		||||
  def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)
 | 
			
		||||
    (f: Path => Stream[F, A]): Stream[F, A] =
 | 
			
		||||
  def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)(
 | 
			
		||||
      f: Path => Stream[F, A]
 | 
			
		||||
  ): Stream[F, A] =
 | 
			
		||||
    Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f)
 | 
			
		||||
 | 
			
		||||
  def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
 | 
			
		||||
    val javaList = Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
 | 
			
		||||
    val javaList =
 | 
			
		||||
      Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
 | 
			
		||||
    javaList.asScala.toList.sortBy(_.getFileName.toString)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,6 @@
 | 
			
		||||
package docspell.text.ocr
 | 
			
		||||
 | 
			
		||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
 | 
			
		||||
 | 
			
		||||
object MimeTypeHint {
 | 
			
		||||
  val none = MimeTypeHint(None, None)
 | 
			
		||||
 
 | 
			
		||||
@@ -6,12 +6,23 @@ import fs2.Stream
 | 
			
		||||
 | 
			
		||||
object TextExtract {
 | 
			
		||||
 | 
			
		||||
  def extract[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
 | 
			
		||||
  def extract[F[_]: Sync: ContextShift](
 | 
			
		||||
      in: Stream[F, Byte],
 | 
			
		||||
      blocker: Blocker,
 | 
			
		||||
      lang: String,
 | 
			
		||||
      config: Config
 | 
			
		||||
  ): Stream[F, String] =
 | 
			
		||||
    extractOCR(in, blocker, lang, config)
 | 
			
		||||
 | 
			
		||||
  def extractOCR[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
 | 
			
		||||
    Stream.eval(TikaMimetype.detect(in)).
 | 
			
		||||
      flatMap({
 | 
			
		||||
  def extractOCR[F[_]: Sync: ContextShift](
 | 
			
		||||
      in: Stream[F, Byte],
 | 
			
		||||
      blocker: Blocker,
 | 
			
		||||
      lang: String,
 | 
			
		||||
      config: Config
 | 
			
		||||
  ): Stream[F, String] =
 | 
			
		||||
    Stream
 | 
			
		||||
      .eval(TikaMimetype.detect(in))
 | 
			
		||||
      .flatMap({
 | 
			
		||||
        case mt if !config.isAllowed(mt) =>
 | 
			
		||||
          raiseError(s"File `$mt` not allowed")
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -11,20 +11,16 @@ object TextSplitter {
 | 
			
		||||
 | 
			
		||||
  def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
 | 
			
		||||
    val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
 | 
			
		||||
    val index = if (indexes.isEmpty) - 1 else indexes.min
 | 
			
		||||
    val index   = if (indexes.isEmpty) -1 else indexes.min
 | 
			
		||||
 | 
			
		||||
    if (index < 0) Stream.emit(Word(str, start, start + str.length))
 | 
			
		||||
    else if (index == 0) split(str.substring(1), sep, start + 1)
 | 
			
		||||
    else Stream.emit(Word(str.substring(0, index), start, start + index)) ++
 | 
			
		||||
      Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
 | 
			
		||||
    else
 | 
			
		||||
      Stream.emit(Word(str.substring(0, index), start, start + index)) ++
 | 
			
		||||
        Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
 | 
			
		||||
    split(str, sep, start).
 | 
			
		||||
      map(w => w.trim(trimChars)).
 | 
			
		||||
      filter(_.nonEmpty).
 | 
			
		||||
      map(_.toLower)
 | 
			
		||||
  }
 | 
			
		||||
  def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] =
 | 
			
		||||
    split(str, sep, start).map(w => w.trim(trimChars)).filter(_.nonEmpty).map(_.toLower)
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -1,9 +1,9 @@
 | 
			
		||||
package docspell.text.split
 | 
			
		||||
 | 
			
		||||
case class Word(value: String, begin: Int, end: Int) {
 | 
			
		||||
  def isEmpty: Boolean = value.isEmpty
 | 
			
		||||
  def isEmpty: Boolean  = value.isEmpty
 | 
			
		||||
  def nonEmpty: Boolean = !isEmpty
 | 
			
		||||
  def length : Int = value.length
 | 
			
		||||
  def length: Int       = value.length
 | 
			
		||||
 | 
			
		||||
  def trimLeft(chars: Set[Char]): Word = {
 | 
			
		||||
    val v = value.dropWhile(chars.contains)
 | 
			
		||||
 
 | 
			
		||||
@@ -7,20 +7,21 @@ import fs2.Stream
 | 
			
		||||
import scala.concurrent.ExecutionContext
 | 
			
		||||
 | 
			
		||||
object TestFiles {
 | 
			
		||||
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
 | 
			
		||||
  val blocker     = Blocker.liftExecutionContext(ExecutionContext.global)
 | 
			
		||||
  implicit val CS = IO.contextShift(ExecutionContext.global)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  val letterSourceDE: Stream[IO, Byte] =
 | 
			
		||||
    LenientUri.fromJava(getClass.getResource("/letter-de-source.pdf")).
 | 
			
		||||
      readURL[IO](16 * 1024, blocker)
 | 
			
		||||
    LenientUri
 | 
			
		||||
      .fromJava(getClass.getResource("/letter-de-source.pdf"))
 | 
			
		||||
      .readURL[IO](16 * 1024, blocker)
 | 
			
		||||
 | 
			
		||||
  val letterSourceEN: Stream[IO, Byte] =
 | 
			
		||||
    LenientUri.fromJava(getClass.getResource("/letter-en-source.pdf")).
 | 
			
		||||
      readURL[IO](16 * 1024, blocker)
 | 
			
		||||
    LenientUri
 | 
			
		||||
      .fromJava(getClass.getResource("/letter-en-source.pdf"))
 | 
			
		||||
      .readURL[IO](16 * 1024, blocker)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  val letterDEText = """Max Mustermann
 | 
			
		||||
  val letterDEText =
 | 
			
		||||
    """Max Mustermann
 | 
			
		||||
               |
 | 
			
		||||
               |Lilienweg 21
 | 
			
		||||
               |
 | 
			
		||||
@@ -52,7 +53,8 @@ object TestFiles {
 | 
			
		||||
               |Max Mustermann
 | 
			
		||||
               |""".stripMargin.trim
 | 
			
		||||
 | 
			
		||||
  val letterENText = """Derek Jeter
 | 
			
		||||
  val letterENText =
 | 
			
		||||
    """Derek Jeter
 | 
			
		||||
                       |
 | 
			
		||||
                       |123 Elm Ave.
 | 
			
		||||
                       |
 | 
			
		||||
 
 | 
			
		||||
@@ -22,11 +22,9 @@ object ContactAnnotateSpec extends SimpleTestSuite {
 | 
			
		||||
 | 
			
		||||
    val labels = Contact.annotate(text)
 | 
			
		||||
    assertEquals(labels.size, 2)
 | 
			
		||||
    assertEquals(labels(0),
 | 
			
		||||
      NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
 | 
			
		||||
    assertEquals(labels(0), NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
 | 
			
		||||
    assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
 | 
			
		||||
    assertEquals(labels(1),
 | 
			
		||||
      NerLabel("example.com", NerTag.Website, 308, 319))
 | 
			
		||||
    assertEquals(labels(1), NerLabel("example.com", NerTag.Website, 308, 319))
 | 
			
		||||
    assertEquals(text.substring(308, 319).toLowerCase, "example.com")
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -8,45 +8,49 @@ object TextAnalyserSuite extends SimpleTestSuite {
 | 
			
		||||
 | 
			
		||||
  test("find english ner labels") {
 | 
			
		||||
    val labels = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
 | 
			
		||||
    val expect = Vector(NerLabel("Derek",NerTag.Person,0,5)
 | 
			
		||||
      , NerLabel("Jeter",NerTag.Person,6,11)
 | 
			
		||||
      , NerLabel("Treesville",NerTag.Person,27,37)
 | 
			
		||||
      , NerLabel("Derek",NerTag.Person,69,74)
 | 
			
		||||
      , NerLabel("Jeter",NerTag.Person,75,80)
 | 
			
		||||
      , NerLabel("Treesville",NerTag.Location,96,106)
 | 
			
		||||
      , NerLabel("M.",NerTag.Person,142,144)
 | 
			
		||||
      , NerLabel("Leat",NerTag.Person,145,149)
 | 
			
		||||
      , NerLabel("Syrup",NerTag.Organization,160,165)
 | 
			
		||||
      , NerLabel("Production",NerTag.Organization,166,176)
 | 
			
		||||
      , NerLabel("Old",NerTag.Organization,177,180)
 | 
			
		||||
      , NerLabel("Sticky",NerTag.Organization,181,187)
 | 
			
		||||
      , NerLabel("Pancake",NerTag.Organization,188,195)
 | 
			
		||||
      , NerLabel("Company",NerTag.Organization,196,203)
 | 
			
		||||
      , NerLabel("Maple",NerTag.Location,208,213)
 | 
			
		||||
      , NerLabel("Lane",NerTag.Location,214,218)
 | 
			
		||||
      , NerLabel("Forest",NerTag.Location,220,226)
 | 
			
		||||
      , NerLabel("Hemptown",NerTag.Location,241,249)
 | 
			
		||||
      , NerLabel("Little",NerTag.Organization,349,355)
 | 
			
		||||
      , NerLabel("League",NerTag.Organization,356,362)
 | 
			
		||||
      , NerLabel("Derek",NerTag.Person,1119,1124)
 | 
			
		||||
      , NerLabel("Jeter",NerTag.Person,1125,1130))
 | 
			
		||||
    val expect = Vector(
 | 
			
		||||
      NerLabel("Derek", NerTag.Person, 0, 5),
 | 
			
		||||
      NerLabel("Jeter", NerTag.Person, 6, 11),
 | 
			
		||||
      NerLabel("Treesville", NerTag.Person, 27, 37),
 | 
			
		||||
      NerLabel("Derek", NerTag.Person, 69, 74),
 | 
			
		||||
      NerLabel("Jeter", NerTag.Person, 75, 80),
 | 
			
		||||
      NerLabel("Treesville", NerTag.Location, 96, 106),
 | 
			
		||||
      NerLabel("M.", NerTag.Person, 142, 144),
 | 
			
		||||
      NerLabel("Leat", NerTag.Person, 145, 149),
 | 
			
		||||
      NerLabel("Syrup", NerTag.Organization, 160, 165),
 | 
			
		||||
      NerLabel("Production", NerTag.Organization, 166, 176),
 | 
			
		||||
      NerLabel("Old", NerTag.Organization, 177, 180),
 | 
			
		||||
      NerLabel("Sticky", NerTag.Organization, 181, 187),
 | 
			
		||||
      NerLabel("Pancake", NerTag.Organization, 188, 195),
 | 
			
		||||
      NerLabel("Company", NerTag.Organization, 196, 203),
 | 
			
		||||
      NerLabel("Maple", NerTag.Location, 208, 213),
 | 
			
		||||
      NerLabel("Lane", NerTag.Location, 214, 218),
 | 
			
		||||
      NerLabel("Forest", NerTag.Location, 220, 226),
 | 
			
		||||
      NerLabel("Hemptown", NerTag.Location, 241, 249),
 | 
			
		||||
      NerLabel("Little", NerTag.Organization, 349, 355),
 | 
			
		||||
      NerLabel("League", NerTag.Organization, 356, 362),
 | 
			
		||||
      NerLabel("Derek", NerTag.Person, 1119, 1124),
 | 
			
		||||
      NerLabel("Jeter", NerTag.Person, 1125, 1130)
 | 
			
		||||
    )
 | 
			
		||||
    assertEquals(labels, expect)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  test("find german ner labels") {
 | 
			
		||||
    val labels = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
 | 
			
		||||
    val expect = Vector(NerLabel("Max", NerTag.Person, 0, 3)
 | 
			
		||||
      , NerLabel("Mustermann", NerTag.Person, 4, 14)
 | 
			
		||||
      , NerLabel("Lilienweg", NerTag.Location, 16, 25)
 | 
			
		||||
      , NerLabel("Max", NerTag.Person, 77, 80)
 | 
			
		||||
      , NerLabel("Mustermann", NerTag.Person, 81, 91)
 | 
			
		||||
      , NerLabel("Lilienweg", NerTag.Location, 93, 102)
 | 
			
		||||
      , NerLabel("EasyCare", NerTag.Organization, 124, 132)
 | 
			
		||||
      , NerLabel("AG", NerTag.Organization, 133, 135)
 | 
			
		||||
      , NerLabel("Ackerweg", NerTag.Location, 158, 166)
 | 
			
		||||
      , NerLabel("Nebendorf", NerTag.Location, 184, 193)
 | 
			
		||||
      , NerLabel("Max", NerTag.Person, 505, 508)
 | 
			
		||||
      , NerLabel("Mustermann", NerTag.Person, 509, 519))
 | 
			
		||||
    val expect = Vector(
 | 
			
		||||
      NerLabel("Max", NerTag.Person, 0, 3),
 | 
			
		||||
      NerLabel("Mustermann", NerTag.Person, 4, 14),
 | 
			
		||||
      NerLabel("Lilienweg", NerTag.Location, 16, 25),
 | 
			
		||||
      NerLabel("Max", NerTag.Person, 77, 80),
 | 
			
		||||
      NerLabel("Mustermann", NerTag.Person, 81, 91),
 | 
			
		||||
      NerLabel("Lilienweg", NerTag.Location, 93, 102),
 | 
			
		||||
      NerLabel("EasyCare", NerTag.Organization, 124, 132),
 | 
			
		||||
      NerLabel("AG", NerTag.Organization, 133, 135),
 | 
			
		||||
      NerLabel("Ackerweg", NerTag.Location, 158, 166),
 | 
			
		||||
      NerLabel("Nebendorf", NerTag.Location, 184, 193),
 | 
			
		||||
      NerLabel("Max", NerTag.Person, 505, 508),
 | 
			
		||||
      NerLabel("Mustermann", NerTag.Person, 509, 519)
 | 
			
		||||
    )
 | 
			
		||||
    assertEquals(labels, expect)
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -9,16 +9,22 @@ object TextExtractionSuite extends SimpleTestSuite {
 | 
			
		||||
 | 
			
		||||
  test("extract english pdf") {
 | 
			
		||||
    ignore()
 | 
			
		||||
    val text = TextExtract.extract[IO](letterSourceEN, blocker, "eng", Config.default).
 | 
			
		||||
      compile.lastOrError.unsafeRunSync()
 | 
			
		||||
    val text = TextExtract
 | 
			
		||||
      .extract[IO](letterSourceEN, blocker, "eng", Config.default)
 | 
			
		||||
      .compile
 | 
			
		||||
      .lastOrError
 | 
			
		||||
      .unsafeRunSync()
 | 
			
		||||
    println(text)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  test("extract german pdf") {
 | 
			
		||||
    ignore()
 | 
			
		||||
    val expect = TestFiles.letterDEText
 | 
			
		||||
    val extract = TextExtract.extract[IO](letterSourceDE, blocker, "deu", Config.default).
 | 
			
		||||
      compile.lastOrError.unsafeRunSync()
 | 
			
		||||
    val extract = TextExtract
 | 
			
		||||
      .extract[IO](letterSourceDE, blocker, "deu", Config.default)
 | 
			
		||||
      .compile
 | 
			
		||||
      .lastOrError
 | 
			
		||||
      .unsafeRunSync()
 | 
			
		||||
 | 
			
		||||
    assertEquals(extract.trim, expect.trim)
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
		||||
@@ -15,7 +15,6 @@ object TestSplitterSpec extends SimpleTestSuite {
 | 
			
		||||
 | 
			
		||||
    val words = TextSplitter.splitToken(text, " \t\r\n".toSet).toVector
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    assertEquals(words.size, 31)
 | 
			
		||||
    assertEquals(words(13), Word("bitte", 109, 114))
 | 
			
		||||
    assertEquals(text.substring(109, 114).toLowerCase, "bitte")
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user