mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Upgrade code base to CE3
This commit is contained in:
@ -25,8 +25,7 @@ trait Extraction[F[_]] {
|
||||
|
||||
object Extraction {
|
||||
|
||||
def create[F[_]: Sync: ContextShift](
|
||||
blocker: Blocker,
|
||||
def create[F[_]: Async](
|
||||
logger: Logger[F],
|
||||
cfg: ExtractConfig
|
||||
): Extraction[F] =
|
||||
@ -39,7 +38,7 @@ object Extraction {
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.get(data, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(ExtractResult.fromEitherResult)
|
||||
|
||||
case PoiType(mt) =>
|
||||
@ -59,7 +58,7 @@ object Extraction {
|
||||
|
||||
case OcrType(mt) =>
|
||||
val doExtract = TextExtract
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.extractOCR(data, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(_.value)
|
||||
|
@ -17,9 +17,8 @@ object PdfExtract {
|
||||
Result(t._1, t._2)
|
||||
}
|
||||
|
||||
def get[F[_]: Sync: ContextShift](
|
||||
def get[F[_]: Async](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
lang: Language,
|
||||
stripMinLen: Int,
|
||||
ocrCfg: OcrConfig,
|
||||
@ -27,7 +26,7 @@ object PdfExtract {
|
||||
): F[Either[Throwable, Result]] = {
|
||||
|
||||
val runOcr =
|
||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||
TextExtract.extractOCR(in, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||
|
||||
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
|
||||
if (ocrStr.length > strippedRes._1.length)
|
||||
|
@ -2,7 +2,7 @@ package docspell.extract.ocr
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect.{Blocker, ContextShift, Sync}
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
@ -11,16 +11,15 @@ object Ocr {
|
||||
|
||||
/** Extract the text of all pages in the given pdf file.
|
||||
*/
|
||||
def extractPdf[F[_]: Sync: ContextShift](
|
||||
def extractPdf[F[_]: Async](
|
||||
pdf: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscript(pdf, config, wd, blocker, logger)
|
||||
.flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config))
|
||||
runGhostscript(pdf, config, wd, logger)
|
||||
.flatMap(tmpImg => runTesseractFile(tmpImg, logger, lang, config))
|
||||
.fold1(_ + "\n\n\n" + _)
|
||||
.compile
|
||||
.last
|
||||
@ -28,47 +27,43 @@ object Ocr {
|
||||
|
||||
/** Extract the text from the given image file
|
||||
*/
|
||||
def extractImage[F[_]: Sync: ContextShift](
|
||||
def extractImage[F[_]: Async](
|
||||
img: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
runTesseractStdin(img, blocker, logger, lang, config)
|
||||
runTesseractStdin(img, logger, lang, config)
|
||||
|
||||
def extractPdFFile[F[_]: Sync: ContextShift](
|
||||
def extractPdFFile[F[_]: Async](
|
||||
pdf: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
|
||||
.flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config))
|
||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, logger)
|
||||
.flatMap(tif => runTesseractFile(tif, logger, lang, config))
|
||||
.fold1(_ + "\n\n\n" + _)
|
||||
.compile
|
||||
.last
|
||||
}
|
||||
|
||||
def extractImageFile[F[_]: Sync: ContextShift](
|
||||
def extractImageFile[F[_]: Async](
|
||||
img: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
runTesseractFile(img, blocker, logger, lang, config)
|
||||
runTesseractFile(img, logger, lang, config)
|
||||
|
||||
/** Run ghostscript to extract all pdf pages into tiff files. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
*/
|
||||
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
||||
private[extract] def runGhostscript[F[_]: Async](
|
||||
pdf: Stream[F, Byte],
|
||||
cfg: OcrConfig,
|
||||
wd: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val xargs =
|
||||
@ -84,19 +79,18 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
|
||||
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||
.execSuccess(cmd, logger, wd = Some(wd), stdin = pdf)
|
||||
.evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
|
||||
.flatMap(fs => Stream.emits(fs))
|
||||
}
|
||||
|
||||
/** Run ghostscript on the given pdf file to extract all pages into tiff files. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
*/
|
||||
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runGhostscriptFile[F[_]: Async](
|
||||
pdf: Path,
|
||||
ghostscript: SystemCommand.Config,
|
||||
wd: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val cmd = ghostscript.replace(
|
||||
@ -106,8 +100,8 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
|
||||
.execSuccess[F](cmd, logger, wd = Some(wd))
|
||||
.evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
|
||||
.flatMap(fs => Stream.emits(fs))
|
||||
}
|
||||
|
||||
@ -117,11 +111,10 @@ object Ocr {
|
||||
/** Run unpaper to optimize the image for ocr. The
|
||||
* result is returned as a sibling file of the input image, named `u-<filename>`.
|
||||
*/
|
||||
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runUnpaperFile[F[_]: Async](
|
||||
img: Path,
|
||||
unpaper: SystemCommand.Config,
|
||||
wd: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
): Stream[F, Path] = {
|
||||
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
|
||||
@ -132,7 +125,7 @@ object Ocr {
|
||||
)
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
|
||||
.execSuccess[F](cmd, logger, wd = Some(wd))
|
||||
.map(_ => targetFile)
|
||||
.handleErrorWith { th =>
|
||||
logger
|
||||
@ -146,39 +139,36 @@ object Ocr {
|
||||
/** Run tesseract on the given image file and return the extracted
|
||||
* text.
|
||||
*/
|
||||
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runTesseractFile[F[_]: Async](
|
||||
img: Path,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
// tesseract cannot cope with absolute filenames
|
||||
// so use the parent as working dir
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
|
||||
uimg =>
|
||||
val cmd = config.tesseract.command
|
||||
.replace(
|
||||
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
|
||||
.map(_.stdout)
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, logger).flatMap { uimg =>
|
||||
val cmd = config.tesseract.command
|
||||
.replace(
|
||||
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, logger, wd = Some(uimg.getParent))
|
||||
.map(_.stdout)
|
||||
}
|
||||
|
||||
/** Run tesseract on the image data supplied via stdin and return the extracted
|
||||
* text.
|
||||
*/
|
||||
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
|
||||
private[extract] def runTesseractStdin[F[_]: Async](
|
||||
img: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] = {
|
||||
val cmd = config.tesseract.command
|
||||
.replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
|
||||
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
|
||||
SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout)
|
||||
}
|
||||
|
||||
private def fixLanguage(lang: String): String =
|
||||
|
@ -1,6 +1,6 @@
|
||||
package docspell.extract.ocr
|
||||
|
||||
import cats.effect.{Blocker, ContextShift, Sync}
|
||||
import cats.effect._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
@ -9,18 +9,16 @@ import docspell.files._
|
||||
|
||||
object TextExtract {
|
||||
|
||||
def extract[F[_]: Sync: ContextShift](
|
||||
def extract[F[_]: Async](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, Text] =
|
||||
extractOCR(in, blocker, logger, lang, config)
|
||||
extractOCR(in, logger, lang, config)
|
||||
|
||||
def extractOCR[F[_]: Sync: ContextShift](
|
||||
def extractOCR[F[_]: Async](
|
||||
in: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
@ -29,10 +27,10 @@ object TextExtract {
|
||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||
.flatMap({
|
||||
case MimeType.pdf =>
|
||||
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
||||
Stream.eval(Ocr.extractPdf(in, logger, lang, config)).unNoneTerminate
|
||||
|
||||
case mt if mt.primary == "image" =>
|
||||
Ocr.extractImage(in, blocker, logger, lang, config)
|
||||
Ocr.extractImage(in, logger, lang, config)
|
||||
|
||||
case mt =>
|
||||
raiseError(s"File `$mt` not supported")
|
||||
|
@ -12,6 +12,7 @@ import fs2.Stream
|
||||
import org.apache.commons.io.output.ByteArrayOutputStream
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.rendering.PDFRenderer
|
||||
import scodec.bits.ByteVector
|
||||
|
||||
trait PdfboxPreview[F[_]] {
|
||||
|
||||
@ -50,7 +51,7 @@ object PdfboxPreview {
|
||||
private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
|
||||
val out = new ByteArrayOutputStream()
|
||||
ImageIO.write(img, "PNG", out)
|
||||
Stream.chunk(Chunk.bytes(out.toByteArray()))
|
||||
Stream.chunk(Chunk.byteVector(ByteVector.view(out.toByteArray())))
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package docspell.extract.ocr
|
||||
|
||||
import cats.effect.IO
|
||||
import cats.effect.unsafe.implicits.global
|
||||
|
||||
import docspell.common.Logger
|
||||
import docspell.files.TestFiles
|
||||
@ -14,7 +15,7 @@ class TextExtractionSuite extends FunSuite {
|
||||
|
||||
test("extract english pdf".ignore) {
|
||||
val text = TextExtract
|
||||
.extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default)
|
||||
.extract[IO](letterSourceEN, logger, "eng", OcrConfig.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
@ -24,7 +25,7 @@ class TextExtractionSuite extends FunSuite {
|
||||
test("extract german pdf".ignore) {
|
||||
val expect = TestFiles.letterDEText
|
||||
val extract = TextExtract
|
||||
.extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default)
|
||||
.extract[IO](letterSourceDE, logger, "deu", OcrConfig.default)
|
||||
.compile
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
|
@ -1,14 +1,13 @@
|
||||
package docspell.extract.odf
|
||||
|
||||
import cats.effect._
|
||||
import cats.effect.unsafe.implicits.global
|
||||
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import docspell.files.ExampleFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class OdfExtractTest extends FunSuite {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val files = List(
|
||||
ExampleFiles.examples_sample_odt -> 6372,
|
||||
@ -21,7 +20,7 @@ class OdfExtractTest extends FunSuite {
|
||||
val str1 = OdfExtract.get(is).fold(throw _, identity)
|
||||
assertEquals(str1.length, len)
|
||||
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val data = file.readURL[IO](8192)
|
||||
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
|
||||
assertEquals(str2, str1)
|
||||
}
|
||||
|
@ -1,14 +1,13 @@
|
||||
package docspell.extract.pdfbox
|
||||
|
||||
import cats.effect._
|
||||
import cats.effect.unsafe.implicits.global
|
||||
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
|
||||
import munit._
|
||||
|
||||
class PdfboxExtractTest extends FunSuite {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val textPDFs = List(
|
||||
ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
|
||||
@ -27,7 +26,7 @@ class PdfboxExtractTest extends FunSuite {
|
||||
|
||||
test("extract text from text PDFs via Stream") {
|
||||
textPDFs.foreach { case (file, txt) =>
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val data = file.readURL[IO](8192)
|
||||
val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
|
||||
val received = removeFormatting(str.value)
|
||||
val expect = removeFormatting(txt)
|
||||
|
@ -3,15 +3,15 @@ package docspell.extract.pdfbox
|
||||
import java.nio.file.Path
|
||||
|
||||
import cats.effect._
|
||||
import cats.effect.unsafe.implicits.global
|
||||
import fs2.Stream
|
||||
import fs2.io.file.Files
|
||||
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import docspell.files.ExampleFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class PdfboxPreviewTest extends FunSuite {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val testPDFs = List(
|
||||
ExampleFiles.letter_de_pdf -> "7d98be75b239816d6c751b3f3c56118ebf1a4632c43baf35a68a662f9d595ab8",
|
||||
@ -21,7 +21,7 @@ class PdfboxPreviewTest extends FunSuite {
|
||||
|
||||
test("extract first page image from PDFs".flaky) {
|
||||
testPDFs.foreach { case (file, checksum) =>
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val data = file.readURL[IO](8192)
|
||||
val sha256out =
|
||||
Stream
|
||||
.eval(PdfboxPreview[IO](PreviewConfig(48)))
|
||||
@ -42,7 +42,7 @@ class PdfboxPreviewTest extends FunSuite {
|
||||
def writeToFile(data: Stream[IO, Byte], file: Path): IO[Unit] =
|
||||
data
|
||||
.through(
|
||||
fs2.io.file.writeAll(file, blocker)
|
||||
Files[IO].writeAll(file)
|
||||
)
|
||||
.compile
|
||||
.drain
|
||||
|
@ -1,15 +1,14 @@
|
||||
package docspell.extract.poi
|
||||
|
||||
import cats.effect._
|
||||
import cats.effect.unsafe.implicits.global
|
||||
|
||||
import docspell.common.MimeTypeHint
|
||||
import docspell.files.{ExampleFiles, TestFiles}
|
||||
import docspell.files.ExampleFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class PoiExtractTest extends FunSuite {
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val officeFiles = List(
|
||||
ExampleFiles.examples_sample_doc -> 6241,
|
||||
@ -21,13 +20,13 @@ class PoiExtractTest extends FunSuite {
|
||||
test("extract text from ms office files") {
|
||||
officeFiles.foreach { case (file, len) =>
|
||||
val str1 = PoiExtract
|
||||
.get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
|
||||
.get[IO](file.readURL[IO](8192), MimeTypeHint.none)
|
||||
.unsafeRunSync()
|
||||
.fold(throw _, identity)
|
||||
|
||||
val str2 = PoiExtract
|
||||
.get[IO](
|
||||
file.readURL[IO](8192, blocker),
|
||||
file.readURL[IO](8192),
|
||||
MimeTypeHint(Some(file.path.segments.last), None)
|
||||
)
|
||||
.unsafeRunSync()
|
||||
|
Reference in New Issue
Block a user