Upgrade code base to Cats Effect 3 (CE3)

2025-08-05 02:24:52 +00:00 · 2021-06-21 21:33:54 +02:00
parent 903ec26e54
commit bd791b4593
146 changed files with 638 additions and 758 deletions
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@ -25,8 +25,7 @@ trait Extraction[F[_]] {

 object Extraction {

-  def create[F[_]: Sync: ContextShift](
-      blocker: Blocker,
+  def create[F[_]: Async](
      logger: Logger[F],
      cfg: ExtractConfig
  ): Extraction[F] =
@ -39,7 +38,7 @@ object Extraction {
        TikaMimetype.resolve(dataType, data).flatMap {
          case MimeType.PdfMatch(_) =>
            PdfExtract
-              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
+              .get(data, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
              .map(ExtractResult.fromEitherResult)

          case PoiType(mt) =>
@ -59,7 +58,7 @@ object Extraction {

          case OcrType(mt) =>
            val doExtract = TextExtract
-              .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
+              .extractOCR(data, logger, lang.iso3, cfg.ocr)
              .compile
              .lastOrError
              .map(_.value)
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@ -17,9 +17,8 @@ object PdfExtract {
      Result(t._1, t._2)
  }

-  def get[F[_]: Sync: ContextShift](
+  def get[F[_]: Async](
      in: Stream[F, Byte],
-      blocker: Blocker,
      lang: Language,
      stripMinLen: Int,
      ocrCfg: OcrConfig,
@ -27,7 +26,7 @@ object PdfExtract {
  ): F[Either[Throwable, Result]] = {

    val runOcr =
-      TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
+      TextExtract.extractOCR(in, logger, lang.iso3, ocrCfg).compile.lastOrError

    def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
      if (ocrStr.length > strippedRes._1.length)
--- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
@ -2,7 +2,7 @@ package docspell.extract.ocr

 import java.nio.file.Path

-import cats.effect.{Blocker, ContextShift, Sync}
+import cats.effect._
 import fs2.Stream

 import docspell.common._
@ -11,16 +11,15 @@ object Ocr {

  /** Extract the text of all pages in the given pdf file.
    */
-  def extractPdf[F[_]: Sync: ContextShift](
+  def extractPdf[F[_]: Async](
      pdf: Stream[F, Byte],
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): F[Option[String]] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
-      runGhostscript(pdf, config, wd, blocker, logger)
-        .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config))
+      runGhostscript(pdf, config, wd, logger)
+        .flatMap(tmpImg => runTesseractFile(tmpImg, logger, lang, config))
        .fold1(_ + "\n\n\n" + _)
        .compile
        .last
@ -28,47 +27,43 @@ object Ocr {

  /** Extract the text from the given image file
    */
-  def extractImage[F[_]: Sync: ContextShift](
+  def extractImage[F[_]: Async](
      img: Stream[F, Byte],
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): Stream[F, String] =
-    runTesseractStdin(img, blocker, logger, lang, config)
+    runTesseractStdin(img, logger, lang, config)

-  def extractPdFFile[F[_]: Sync: ContextShift](
+  def extractPdFFile[F[_]: Async](
      pdf: Path,
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): F[Option[String]] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
-      runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
-        .flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config))
+      runGhostscriptFile(pdf, config.ghostscript.command, wd, logger)
+        .flatMap(tif => runTesseractFile(tif, logger, lang, config))
        .fold1(_ + "\n\n\n" + _)
        .compile
        .last
    }

-  def extractImageFile[F[_]: Sync: ContextShift](
+  def extractImageFile[F[_]: Async](
      img: Path,
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): Stream[F, String] =
-    runTesseractFile(img, blocker, logger, lang, config)
+    runTesseractFile(img, logger, lang, config)

  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned.
    */
-  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscript[F[_]: Async](
      pdf: Stream[F, Byte],
      cfg: OcrConfig,
      wd: Path,
-      blocker: Blocker,
      logger: Logger[F]
  ): Stream[F, Path] = {
    val xargs =
@ -84,19 +79,18 @@ object Ocr {
        )
      )
    SystemCommand
-      .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
-      .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
+      .execSuccess(cmd, logger, wd = Some(wd), stdin = pdf)
+      .evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
      .flatMap(fs => Stream.emits(fs))
  }

  /** Run ghostscript on the pdf file at the given path to extract all
    * its pages into tiff files. The files are stored to a temporary
    * location on disk and returned.
    */
-  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscriptFile[F[_]: Async](
      pdf: Path,
      ghostscript: SystemCommand.Config,
      wd: Path,
-      blocker: Blocker,
      logger: Logger[F]
  ): Stream[F, Path] = {
    val cmd = ghostscript.replace(
@ -106,8 +100,8 @@ object Ocr {
      )
    )
    SystemCommand
-      .execSuccess[F](cmd, blocker, logger, wd = Some(wd))
-      .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
+      .execSuccess[F](cmd, logger, wd = Some(wd))
+      .evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
      .flatMap(fs => Stream.emits(fs))
  }

@ -117,11 +111,10 @@ object Ocr {
  /** Run unpaper to optimize the image for ocr. The
    * files are stored to a temporary location on disk and returned.
    */
-  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
+  private[extract] def runUnpaperFile[F[_]: Async](
      img: Path,
      unpaper: SystemCommand.Config,
      wd: Path,
-      blocker: Blocker,
      logger: Logger[F]
  ): Stream[F, Path] = {
    val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
@ -132,7 +125,7 @@ object Ocr {
      )
    )
    SystemCommand
-      .execSuccess[F](cmd, blocker, logger, wd = Some(wd))
+      .execSuccess[F](cmd, logger, wd = Some(wd))
      .map(_ => targetFile)
      .handleErrorWith { th =>
        logger
@ -146,39 +139,36 @@ object Ocr {
  /** Run tesseract on the given image file and return the extracted
    * text.
    */
-  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractFile[F[_]: Async](
      img: Path,
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): Stream[F, String] =
    // tesseract cannot cope with absolute filenames
    // so use the parent as working dir
-    runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
-      uimg =>
-        val cmd = config.tesseract.command
-          .replace(
-            Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
-          )
-        SystemCommand
-          .execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
-          .map(_.stdout)
+    runUnpaperFile(img, config.unpaper.command, img.getParent, logger).flatMap { uimg =>
+      val cmd = config.tesseract.command
+        .replace(
+          Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
+        )
+      SystemCommand
+        .execSuccess[F](cmd, logger, wd = Some(uimg.getParent))
+        .map(_.stdout)
    }

  /** Run tesseract on the image data supplied via stdin and return
    * the extracted text.
    */
-  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractStdin[F[_]: Async](
      img: Stream[F, Byte],
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): Stream[F, String] = {
    val cmd = config.tesseract.command
      .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
-    SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
+    SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout)
  }

  private def fixLanguage(lang: String): String =
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@ -1,6 +1,6 @@
 package docspell.extract.ocr

-import cats.effect.{Blocker, ContextShift, Sync}
+import cats.effect._
 import fs2.Stream

 import docspell.common._
@ -9,18 +9,16 @@ import docspell.files._

 object TextExtract {

-  def extract[F[_]: Sync: ContextShift](
+  def extract[F[_]: Async](
      in: Stream[F, Byte],
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
  ): Stream[F, Text] =
-    extractOCR(in, blocker, logger, lang, config)
+    extractOCR(in, logger, lang, config)

-  def extractOCR[F[_]: Sync: ContextShift](
+  def extractOCR[F[_]: Async](
      in: Stream[F, Byte],
-      blocker: Blocker,
      logger: Logger[F],
      lang: String,
      config: OcrConfig
@ -29,10 +27,10 @@ object TextExtract {
      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap({
        case MimeType.pdf =>
-          Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
+          Stream.eval(Ocr.extractPdf(in, logger, lang, config)).unNoneTerminate

        case mt if mt.primary == "image" =>
-          Ocr.extractImage(in, blocker, logger, lang, config)
+          Ocr.extractImage(in, logger, lang, config)

        case mt =>
          raiseError(s"File `$mt` not supported")
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxPreview.scala
@ -12,6 +12,7 @@ import fs2.Stream
 import org.apache.commons.io.output.ByteArrayOutputStream
 import org.apache.pdfbox.pdmodel.PDDocument
 import org.apache.pdfbox.rendering.PDFRenderer
+import scodec.bits.ByteVector

 trait PdfboxPreview[F[_]] {

@ -50,7 +51,7 @@ object PdfboxPreview {
  private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
    val out = new ByteArrayOutputStream()
    ImageIO.write(img, "PNG", out)
-    Stream.chunk(Chunk.bytes(out.toByteArray()))
+    Stream.chunk(Chunk.byteVector(ByteVector.view(out.toByteArray())))
  }

 }
--- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
+++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
@ -1,6 +1,7 @@
 package docspell.extract.ocr

 import cats.effect.IO
+import cats.effect.unsafe.implicits.global

 import docspell.common.Logger
 import docspell.files.TestFiles
@ -14,7 +15,7 @@ class TextExtractionSuite extends FunSuite {

  test("extract english pdf".ignore) {
    val text = TextExtract
-      .extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default)
+      .extract[IO](letterSourceEN, logger, "eng", OcrConfig.default)
      .compile
      .lastOrError
      .unsafeRunSync()
@ -24,7 +25,7 @@ class TextExtractionSuite extends FunSuite {
  test("extract german pdf".ignore) {
    val expect = TestFiles.letterDEText
    val extract = TextExtract
-      .extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default)
+      .extract[IO](letterSourceDE, logger, "deu", OcrConfig.default)
      .compile
      .lastOrError
      .unsafeRunSync()
--- a/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
@ -1,14 +1,13 @@
 package docspell.extract.odf

 import cats.effect._
+import cats.effect.unsafe.implicits.global

-import docspell.files.{ExampleFiles, TestFiles}
+import docspell.files.ExampleFiles

 import munit._

 class OdfExtractTest extends FunSuite {
-  val blocker     = TestFiles.blocker
-  implicit val CS = TestFiles.CS

  val files = List(
    ExampleFiles.examples_sample_odt -> 6372,
@ -21,7 +20,7 @@ class OdfExtractTest extends FunSuite {
      val str1 = OdfExtract.get(is).fold(throw _, identity)
      assertEquals(str1.length, len)

-      val data = file.readURL[IO](8192, blocker)
+      val data = file.readURL[IO](8192)
      val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
      assertEquals(str2, str1)
    }
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
@ -1,14 +1,13 @@
 package docspell.extract.pdfbox

 import cats.effect._
+import cats.effect.unsafe.implicits.global

 import docspell.files.{ExampleFiles, TestFiles}

 import munit._

 class PdfboxExtractTest extends FunSuite {
-  val blocker     = TestFiles.blocker
-  implicit val CS = TestFiles.CS

  val textPDFs = List(
    ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
@ -27,7 +26,7 @@ class PdfboxExtractTest extends FunSuite {

  test("extract text from text PDFs via Stream") {
    textPDFs.foreach { case (file, txt) =>
-      val data     = file.readURL[IO](8192, blocker)
+      val data     = file.readURL[IO](8192)
      val str      = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
      val received = removeFormatting(str.value)
      val expect   = removeFormatting(txt)
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxPreviewTest.scala
@ -3,15 +3,15 @@ package docspell.extract.pdfbox
 import java.nio.file.Path

 import cats.effect._
+import cats.effect.unsafe.implicits.global
 import fs2.Stream
+import fs2.io.file.Files

-import docspell.files.{ExampleFiles, TestFiles}
+import docspell.files.ExampleFiles

 import munit._

 class PdfboxPreviewTest extends FunSuite {
-  val blocker     = TestFiles.blocker
-  implicit val CS = TestFiles.CS

  val testPDFs = List(
    ExampleFiles.letter_de_pdf     -> "7d98be75b239816d6c751b3f3c56118ebf1a4632c43baf35a68a662f9d595ab8",
@ -21,7 +21,7 @@ class PdfboxPreviewTest extends FunSuite {

  test("extract first page image from PDFs".flaky) {
    testPDFs.foreach { case (file, checksum) =>
-      val data = file.readURL[IO](8192, blocker)
+      val data = file.readURL[IO](8192)
      val sha256out =
        Stream
          .eval(PdfboxPreview[IO](PreviewConfig(48)))
@ -42,7 +42,7 @@ class PdfboxPreviewTest extends FunSuite {
  def writeToFile(data: Stream[IO, Byte], file: Path): IO[Unit] =
    data
      .through(
-        fs2.io.file.writeAll(file, blocker)
+        Files[IO].writeAll(file)
      )
      .compile
      .drain
--- a/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala
@ -1,15 +1,14 @@
 package docspell.extract.poi

 import cats.effect._
+import cats.effect.unsafe.implicits.global

 import docspell.common.MimeTypeHint
-import docspell.files.{ExampleFiles, TestFiles}
+import docspell.files.ExampleFiles

 import munit._

 class PoiExtractTest extends FunSuite {
-  val blocker     = TestFiles.blocker
-  implicit val CS = TestFiles.CS

  val officeFiles = List(
    ExampleFiles.examples_sample_doc  -> 6241,
@ -21,13 +20,13 @@ class PoiExtractTest extends FunSuite {
  test("extract text from ms office files") {
    officeFiles.foreach { case (file, len) =>
      val str1 = PoiExtract
-        .get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
+        .get[IO](file.readURL[IO](8192), MimeTypeHint.none)
        .unsafeRunSync()
        .fold(throw _, identity)

      val str2 = PoiExtract
        .get[IO](
-          file.readURL[IO](8192, blocker),
+          file.readURL[IO](8192),
          MimeTypeHint(Some(file.path.segments.last), None)
        )
        .unsafeRunSync()