mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Configure pdf extraction; move Logger and DataType to common
This commit is contained in:
@ -1,21 +0,0 @@
|
||||
package docspell.extract
|
||||
|
||||
import docspell.common.{MimeType, MimeTypeHint}
|
||||
|
||||
/** Describes what is known about the MIME type of some data.
  *
  * The type is either given exactly ([[DataType.Exact]]) or only a hint
  * about it is available ([[DataType.Hint]]).
  */
sealed trait DataType

object DataType {

  /** The MIME type is known exactly.
    *
    * Declared `final` so the sealed hierarchy cannot be extended elsewhere.
    */
  final case class Exact(mime: MimeType) extends DataType

  /** Only a hint about the MIME type is available.
    *
    * Declared `final` so the sealed hierarchy cannot be extended elsewhere.
    */
  final case class Hint(hint: MimeTypeHint) extends DataType

  /** Wraps the given MIME type into an [[Exact]] data type. */
  def apply(mt: MimeType): DataType =
    Exact(mt)

  /** Creates a [[Hint]] data type from a file name, using
    * `MimeTypeHint.filename` to build the hint.
    */
  def filename(name: String): DataType =
    Hint(MimeTypeHint.filename(name))
}
|
@ -2,4 +2,4 @@ package docspell.extract
|
||||
|
||||
import docspell.extract.ocr.OcrConfig
|
||||
|
||||
/** Settings for text extraction.
  *
  * @param ocr the OCR settings
  */
final case class ExtractConfig(ocr: OcrConfig)
|
||||
/** Settings for text extraction.
  *
  * @param ocr the OCR settings
  * @param pdf the PDF-specific settings
  */
final case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
||||
|
@ -29,14 +29,10 @@ object Extraction {
|
||||
dataType: DataType,
|
||||
lang: Language
|
||||
): F[ExtractResult] = {
|
||||
val mime = dataType match {
|
||||
case DataType.Exact(mt) => mt.pure[F]
|
||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||
}
|
||||
mime.flatMap {
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.pdf =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, 5, cfg.ocr, logger)
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
case PoiType(mt) =>
|
||||
|
@ -0,0 +1,3 @@
|
||||
package docspell.extract
|
||||
|
||||
/** Settings for PDF text extraction.
  *
  * @param minTextLen a text length threshold; presumably the minimum length
  *   of text stripped from a PDF below which OCR is tried instead -- TODO
  *   confirm against the PDF extraction code
  */
final case class PdfConfig(minTextLen: Int)
|
@ -43,7 +43,7 @@ object PdfExtract {
|
||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
||||
else
|
||||
logger
|
||||
.info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *>
|
||||
.info(s"Stripped text from PDF is small (${str.length}). Trying with OCR.") *>
|
||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
||||
)
|
||||
} yield res
|
||||
|
@ -17,13 +17,15 @@ object Ocr {
|
||||
blocker: Blocker,
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscript(pdf, config, wd, blocker)
|
||||
.flatMap({ tmpImg =>
|
||||
runTesseractFile(tmpImg, blocker, lang, config)
|
||||
})
|
||||
.fold1(_ + "\n\n\n" + _)
|
||||
.fold1(_ + "\n\n\n" + _).
|
||||
compile.
|
||||
last
|
||||
}
|
||||
|
||||
/** Extract the text from the given image file
|
||||
@ -41,13 +43,15 @@ object Ocr {
|
||||
blocker: Blocker,
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
||||
): F[Option[String]] =
|
||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
||||
.flatMap({ tif =>
|
||||
runTesseractFile(tif, blocker, lang, config)
|
||||
})
|
||||
.fold1(_ + "\n\n\n" + _)
|
||||
.fold1(_ + "\n\n\n" + _).
|
||||
compile.
|
||||
last
|
||||
}
|
||||
|
||||
def extractImageFile[F[_]: Sync: ContextShift](
|
||||
|
@ -28,7 +28,7 @@ object TextExtract {
|
||||
raiseError(s"File `$mt` not allowed")
|
||||
|
||||
case MimeType.pdf =>
|
||||
Ocr.extractPdf(in, blocker, lang, config)
|
||||
Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate
|
||||
|
||||
case mt if mt.primary == "image" =>
|
||||
Ocr.extractImage(in, blocker, lang, config)
|
||||
|
Reference in New Issue
Block a user