scalafmtAll

2025-08-05 02:24:52 +00:00 · 2020-03-26 18:26:00 +01:00
parent 09ea724c13
commit 9656ba62f4
91 changed files with 871 additions and 295 deletions
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -13,7 +13,11 @@ import docspell.files.ImageSize

 trait Extraction[F[_]] {

-  def extractText(data: Stream[F, Byte], dataType: DataType, lang: Language): F[ExtractResult]
+  def extractText(
+      data: Stream[F, Byte],
+      dataType: DataType,
+      lang: Language
+  ): F[ExtractResult]

 }

@@ -71,13 +75,17 @@ object Extraction {
                  doExtract
                }
              case None =>
-                logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
+                logger.info(
+                  s"Cannot read image data from ${mt.asString}. Extracting anyways."
+                ) *>
                  doExtract
            }

          case OdfType.ContainerMatch(_) =>
            logger
-              .info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
+              .info(
+                s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
+              ) *>
              OdfExtract.get(data).map(ExtractResult.fromEither)

          case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
--- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
@@ -135,7 +135,9 @@ object Ocr {
      .map(_ => targetFile)
      .handleErrorWith { th =>
        logger
-          .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
+          .warn(
+            s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction."
+          )
        Stream.emit(img)
      }
  }
@@ -152,10 +154,15 @@ object Ocr {
  ): Stream[F, String] =
    // tesseract cannot cope with absolute filenames
    // so use the parent as working dir
-    runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
-      val cmd = config.tesseract.command
-        .replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
-      SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
+    runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
+      uimg =>
+        val cmd = config.tesseract.command
+          .replace(
+            Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
+          )
+        SystemCommand
+          .execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
+          .map(_.stdout)
    }

  /** Run tesseract on the given image file and return the extracted
--- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala
@@ -41,11 +41,16 @@ object OcrConfig {
      Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
    ),
    unpaper = Unpaper(
-      SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
+      SystemCommand
+        .Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
    ),
    tesseract = Tesseract(
      SystemCommand
-        .Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
+        .Config(
+          "tesseract",
+          Seq("{{file}}", "stdout", "-l", "{{lang}}"),
+          Duration.minutes(1)
+        )
    )
  )
 }
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -14,7 +14,9 @@ import fs2.Stream
 object PdfboxExtract {

  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
-    data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
+    data.compile
+      .to(Array)
+      .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)

  def get(is: InputStream): Either[Throwable, String] =
    Using(PDDocument.load(is))(readText).toEither.flatten
--- a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
@@ -20,10 +20,16 @@ import docspell.files.TikaMimetype

 object PoiExtract {

-  def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
+  def get[F[_]: Sync](
+      data: Stream[F, Byte],
+      hint: MimeTypeHint
+  ): F[Either[Throwable, String]] =
    TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))

-  def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] =
+  def get[F[_]: Sync](
+      data: Stream[F, Byte],
+      mime: MimeType
+  ): F[Either[Throwable, String]] =
    mime match {
      case PoiType.doc =>
        getDoc(data)
--- a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala
@@ -6,10 +6,11 @@ object PoiType {

  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")
-  val docx     = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
-  val xlsx     = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
-  val xls      = MimeType.application("vnd.ms-excel")
-  val doc      = MimeType.application("msword")
+  val docx =
+    MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
+  val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+  val xls  = MimeType.application("vnd.ms-excel")
+  val doc  = MimeType.application("msword")

  val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)