Upgrade code base to CE3

This commit is contained in:
eikek
2021-06-21 21:33:54 +02:00
parent 903ec26e54
commit bd791b4593
146 changed files with 638 additions and 758 deletions

View File

@ -25,8 +25,7 @@ trait Extraction[F[_]] {
object Extraction {
def create[F[_]: Sync: ContextShift](
blocker: Blocker,
def create[F[_]: Async](
logger: Logger[F],
cfg: ExtractConfig
): Extraction[F] =
@ -39,7 +38,7 @@ object Extraction {
TikaMimetype.resolve(dataType, data).flatMap {
case MimeType.PdfMatch(_) =>
PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.get(data, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(ExtractResult.fromEitherResult)
case PoiType(mt) =>
@ -59,7 +58,7 @@ object Extraction {
case OcrType(mt) =>
val doExtract = TextExtract
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.extractOCR(data, logger, lang.iso3, cfg.ocr)
.compile
.lastOrError
.map(_.value)

View File

@ -17,9 +17,8 @@ object PdfExtract {
Result(t._1, t._2)
}
def get[F[_]: Sync: ContextShift](
def get[F[_]: Async](
in: Stream[F, Byte],
blocker: Blocker,
lang: Language,
stripMinLen: Int,
ocrCfg: OcrConfig,
@ -27,7 +26,7 @@ object PdfExtract {
): F[Either[Throwable, Result]] = {
val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
TextExtract.extractOCR(in, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
if (ocrStr.length > strippedRes._1.length)

View File

@ -2,7 +2,7 @@ package docspell.extract.ocr
import java.nio.file.Path
import cats.effect.{Blocker, ContextShift, Sync}
import cats.effect._
import fs2.Stream
import docspell.common._
@ -11,16 +11,15 @@ object Ocr {
/** Extract the text of all pages in the given pdf file.
*/
def extractPdf[F[_]: Sync: ContextShift](
def extractPdf[F[_]: Async](
pdf: Stream[F, Byte],
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): F[Option[String]] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
runGhostscript(pdf, config, wd, blocker, logger)
.flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config))
runGhostscript(pdf, config, wd, logger)
.flatMap(tmpImg => runTesseractFile(tmpImg, logger, lang, config))
.fold1(_ + "\n\n\n" + _)
.compile
.last
@ -28,47 +27,43 @@ object Ocr {
/** Extract the text from the given image, supplied as a stream of bytes.
*
* Delegates to `runTesseractStdin`, handing over the image bytes together
* with `logger`, `lang` and `config`.
*
* NOTE(review): the signature and the call each appear twice below (with
* and without `blocker` / `Sync: ContextShift`). This looks like the
* removed and the added line of the CE3 upgrade listed side by side —
* confirm which variant is current.
*/
def extractImage[F[_]: Sync: ContextShift](
def extractImage[F[_]: Async](
img: Stream[F, Byte],
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] =
runTesseractStdin(img, blocker, logger, lang, config)
runTesseractStdin(img, logger, lang, config)
/** Extract the text of the pdf file at the given path.
*
* Runs inside the directory provided by
* `File.withTempDir(config.ghostscript.workingDir, "extractpdf")`:
* `runGhostscriptFile` is run on the pdf with that directory as working
* dir, every path it emits is passed to `runTesseractFile`, and the
* resulting texts are joined with three newlines between them.
*
* The result is the last (i.e. the joined) element of that stream, or
* `None` if the stream emitted nothing.
*
* NOTE(review): lines containing `blocker` / `Sync: ContextShift` appear
* to be the pre-CE3 side of the upgrade, listed next to their
* replacements — confirm which variant is current.
*/
def extractPdFFile[F[_]: Sync: ContextShift](
def extractPdFFile[F[_]: Async](
pdf: Path,
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): F[Option[String]] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger)
.flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config))
runGhostscriptFile(pdf, config.ghostscript.command, wd, logger)
.flatMap(tif => runTesseractFile(tif, logger, lang, config))
// concatenate the per-file texts, separated by three newlines
.fold1(_ + "\n\n\n" + _)
.compile
.last
}
/** Extract the text from the image file at the given path.
*
* Delegates directly to `runTesseractFile` with the same `logger`, `lang`
* and `config`.
*
* NOTE(review): lines containing `blocker` / `Sync: ContextShift` appear
* to be the pre-CE3 side of the upgrade, listed next to their
* replacements — confirm which variant is current.
*/
def extractImageFile[F[_]: Sync: ContextShift](
def extractImageFile[F[_]: Async](
img: Path,
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] =
runTesseractFile(img, blocker, logger, lang, config)
runTesseractFile(img, logger, lang, config)
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
private[extract] def runGhostscript[F[_]: Async](
pdf: Stream[F, Byte],
cfg: OcrConfig,
wd: Path,
blocker: Blocker,
logger: Logger[F]
): Stream[F, Path] = {
val xargs =
@ -84,19 +79,18 @@ object Ocr {
)
)
SystemCommand
.execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf)
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
.execSuccess(cmd, logger, wd = Some(wd), stdin = pdf)
.evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
.flatMap(fs => Stream.emits(fs))
}
/** Run ghostscript on the pdf file at the given path to extract all
* pdf pages into tiff files. The files are stored in the working
* directory `wd` on disk and returned.
*/
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
private[extract] def runGhostscriptFile[F[_]: Async](
pdf: Path,
ghostscript: SystemCommand.Config,
wd: Path,
blocker: Blocker,
logger: Logger[F]
): Stream[F, Path] = {
val cmd = ghostscript.replace(
@ -106,8 +100,8 @@ object Ocr {
)
)
SystemCommand
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
.evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd))
.execSuccess[F](cmd, logger, wd = Some(wd))
.evalMap(_ => File.listJFiles(pathEndsWith(".tif"), wd))
.flatMap(fs => Stream.emits(fs))
}
@ -117,11 +111,10 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
private[extract] def runUnpaperFile[F[_]: Async](
img: Path,
unpaper: SystemCommand.Config,
wd: Path,
blocker: Blocker,
logger: Logger[F]
): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
@ -132,7 +125,7 @@ object Ocr {
)
)
SystemCommand
.execSuccess[F](cmd, blocker, logger, wd = Some(wd))
.execSuccess[F](cmd, logger, wd = Some(wd))
.map(_ => targetFile)
.handleErrorWith { th =>
logger
@ -146,39 +139,36 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*
* The image is first passed to `runUnpaperFile`, using the image's parent
* directory as working directory. Tesseract is then run on the file that
* step emits, and the command's stdout is emitted as the result.
*
* NOTE(review): lines containing `blocker` / `Sync: ContextShift` appear
* to be the pre-CE3 side of the upgrade, listed next to their
* replacements (the body is therefore shown twice) — confirm which
* variant is current.
*/
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
private[extract] def runTesseractFile[F[_]: Async](
img: Path,
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] =
// tesseract cannot cope with absolute filenames
// so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
uimg =>
val cmd = config.tesseract.command
.replace(
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
)
SystemCommand
.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
.map(_.stdout)
runUnpaperFile(img, config.unpaper.command, img.getParent, logger).flatMap { uimg =>
// only the bare file name is substituted for {{file}}; the command
// runs with the file's parent directory as working dir
val cmd = config.tesseract.command
.replace(
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
)
SystemCommand
.execSuccess[F](cmd, logger, wd = Some(uimg.getParent))
.map(_.stdout)
}
/** Run tesseract on image data passed via stdin and return the
* extracted text.
*
* The `{{file}}` placeholder of the configured tesseract command is
* replaced with the literal `stdin`, `{{lang}}` with `fixLanguage(lang)`.
* The image bytes are supplied as the command's stdin and the command's
* stdout is emitted as the result.
*
* NOTE(review): lines containing `blocker` / `Sync: ContextShift` appear
* to be the pre-CE3 side of the upgrade, listed next to their
* replacements — confirm which variant is current.
*/
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
private[extract] def runTesseractStdin[F[_]: Async](
img: Stream[F, Byte],
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] = {
val cmd = config.tesseract.command
.replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))
SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout)
SystemCommand.execSuccess(cmd, logger, stdin = img).map(_.stdout)
}
private def fixLanguage(lang: String): String =

View File

@ -1,6 +1,6 @@
package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import cats.effect._
import fs2.Stream
import docspell.common._
@ -9,18 +9,16 @@ import docspell.files._
object TextExtract {
/** Extract text from the given input bytes.
*
* Simply forwards to `extractOCR` with the same input, logger, language
* and OCR configuration.
*
* NOTE(review): lines containing `blocker` / `Sync: ContextShift` appear
* to be the pre-CE3 side of the upgrade, listed next to their
* replacements — confirm which variant is current.
*/
def extract[F[_]: Sync: ContextShift](
def extract[F[_]: Async](
in: Stream[F, Byte],
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, Text] =
extractOCR(in, blocker, logger, lang, config)
extractOCR(in, logger, lang, config)
def extractOCR[F[_]: Sync: ContextShift](
def extractOCR[F[_]: Async](
in: Stream[F, Byte],
blocker: Blocker,
logger: Logger[F],
lang: String,
config: OcrConfig
@ -29,10 +27,10 @@ object TextExtract {
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
.flatMap({
case MimeType.pdf =>
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
Stream.eval(Ocr.extractPdf(in, logger, lang, config)).unNoneTerminate
case mt if mt.primary == "image" =>
Ocr.extractImage(in, blocker, logger, lang, config)
Ocr.extractImage(in, logger, lang, config)
case mt =>
raiseError(s"File `$mt` not supported")

View File

@ -12,6 +12,7 @@ import fs2.Stream
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.PDFRenderer
import scodec.bits.ByteVector
trait PdfboxPreview[F[_]] {
@ -50,7 +51,7 @@ object PdfboxPreview {
/** Encode the given image as PNG and return the bytes as a stream.
*
* The image is written to an in-memory buffer at the time this method is
* called (the write is not suspended in `F`). The buffer's content is
* then emitted as a single chunk.
*/
private def pngStream[F[_]](img: RenderedImage): Stream[F, Byte] = {
val out = new ByteArrayOutputStream()
ImageIO.write(img, "PNG", out)
// NOTE(review): the two lines below look like the removed (`Chunk.bytes`)
// and the added (`Chunk.byteVector`) form of the same statement — confirm
// which one is current.
Stream.chunk(Chunk.bytes(out.toByteArray()))
Stream.chunk(Chunk.byteVector(ByteVector.view(out.toByteArray())))
}
}