Reorganize processing code
Use separate modules for
- text extraction
- conversion to pdf
- text analysis
@ -0,0 +1,54 @@
package docspell.extract.ocr

import java.nio.file.{Path, Paths}

import docspell.common._

case class Config(
    allowedContentTypes: Set[MimeType],
    ghostscript: Config.Ghostscript,
    pageRange: Config.PageRange,
    unpaper: Config.Unpaper,
    tesseract: Config.Tesseract
) {

  def isAllowed(mt: MimeType): Boolean =
    allowedContentTypes contains mt
}

object Config {
  case class PageRange(begin: Int)

  case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
  case class Tesseract(command: SystemCommand.Config)
  case class Unpaper(command: SystemCommand.Config)

  val default = Config(
    allowedContentTypes = Set(
      MimeType.pdf,
      MimeType.png,
      MimeType.jpeg,
      MimeType.tiff
    ),
    pageRange = PageRange(10),
    ghostscript = Ghostscript(
      SystemCommand.Config(
        "gs",
        Seq(
          "-dNOPAUSE",
          "-dBATCH",
          "-dSAFER",
          "-sDEVICE=tiffscaled8",
          "-sOutputFile={{outfile}}",
          "{{infile}}"
        ),
        Duration.seconds(30)
      ),
      Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
    ),
    unpaper = Unpaper(
      SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
    ),
    tesseract = Tesseract(
      SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
    )
  )
}
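The default configuration above assumes gs, unpaper, and tesseract are available on the PATH. A minimal sketch of overriding individual settings through the generated case-class copy methods; the object name, page limit, and working directory are illustrative assumptions, not part of this commit:

// Sketch (not part of this commit): override individual defaults via case-class copy.
import java.nio.file.Paths
import docspell.extract.ocr.Config

object CustomOcrConfig {
  val custom: Config =
    Config.default.copy(
      pageRange = Config.PageRange(5),                         // OCR at most the first 5 pages
      ghostscript = Config.default.ghostscript
        .copy(workingDir = Paths.get("/var/tmp/docspell-ocr")) // hypothetical working directory
    )
}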
modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala (new file, 191 lines)
@ -0,0 +1,191 @@
package docspell.extract.ocr

import java.nio.file.Path

import cats.effect.{Blocker, ContextShift, Sync}
import fs2.Stream
import org.log4s._

import docspell.common._

object Ocr {
  private[this] val logger = getLogger

  /** Extract the text of all pages in the given pdf file.
    */
  def extractPdf[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
      runGhostscript(pdf, config, wd, blocker)
        .flatMap({ tmpImg =>
          runTesseractFile(tmpImg, blocker, lang, config)
        })
        .fold1(_ + "\n\n\n" + _)
    }

  /** Extract the text from the given image file.
    */
  def extractImage[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    runTesseractStdin(img, blocker, lang, config)

  def extractPdfFile[F[_]: Sync: ContextShift](
      pdf: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
      runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
        .flatMap({ tif =>
          runTesseractFile(tif, blocker, lang, config)
        })
        .fold1(_ + "\n\n\n" + _)
    }

  def extractImageFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    runTesseractFile(img, blocker, lang, config)

  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned.
    */
  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      cfg: Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val xargs =
      if (cfg.pageRange.begin > 0)
        s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
      else cfg.ghostscript.command.args
    val cmd = cfg.ghostscript.command
      .copy(args = xargs)
      .mapArgs(
        replace(
          Map(
            "{{infile}}" -> "-",
            "{{outfile}}" -> "%d.tif"
          )
        )
      )
    SystemCommand
      .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
      .evalMap({ _ =>
        File.listFiles(pathEndsWith(".tif"), wd)
      })
      .flatMap(fs => Stream.emits(fs))
  }

  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned.
    */
  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
      pdf: Path,
      ghostscript: SystemCommand.Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val cmd = ghostscript.mapArgs(
      replace(
        Map(
          "{{infile}}" -> pdf.toAbsolutePath.toString,
          "{{outfile}}" -> "%d.tif"
        )
      )
    )
    SystemCommand
      .execSuccess[F](cmd, blocker, wd = Some(wd))
      .evalMap({ _ =>
        File.listFiles(pathEndsWith(".tif"), wd)
      })
      .flatMap(fs => Stream.emits(fs))
  }

  private def pathEndsWith(ext: String): Path => Boolean =
    p => p.getFileName.toString.endsWith(ext)

  /** Run unpaper to optimize the image for ocr. The
    * files are stored to a temporary location on disk and returned.
    */
  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
      img: Path,
      unpaper: SystemCommand.Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
    val cmd = unpaper.mapArgs(
      replace(
        Map(
          "{{infile}}" -> img.toAbsolutePath.toString,
          "{{outfile}}" -> targetFile.toString
        )
      )
    )
    SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
      th =>
        logger
          .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
        Stream.emit(img)
    }
  }

  /** Run tesseract on the given image file and return the extracted
    * text.
    */
  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    // tesseract cannot cope with absolute filenames
    // so use the parent as working dir
    runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
      val cmd = config.tesseract.command.mapArgs(
        replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
      )
      SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
    }

  /** Run tesseract on the given image stream (fed via stdin) and return
    * the extracted text.
    */
  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] = {
    val cmd = config.tesseract.command
      .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
    SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
  }

  private def replace(repl: Map[String, String]): String => String =
    s =>
      repl.foldLeft(s) {
        case (res, (k, v)) =>
          res.replace(k, v)
      }

  private def fixLanguage(lang: String): String =
    lang match {
      case "de" => "deu"
      case "en" => "eng"
      case l    => l
    }
}
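For file-based input the Ocr helpers can also be called directly, bypassing MIME detection. A minimal sketch, assuming cats-effect 2 style IOApp semantics, locally installed tesseract and unpaper binaries, and a hypothetical input file scan.png; the object name is an assumption as well:

// Sketch (not part of this commit): OCR a single image file with the default config.
import java.nio.file.Paths
import cats.effect.{Blocker, ExitCode, IO, IOApp}
import docspell.extract.ocr.{Config, Ocr}

object OcrImageExample extends IOApp {
  def run(args: List[String]): IO[ExitCode] =
    Blocker[IO].use { blocker =>
      Ocr
        .extractImageFile[IO](Paths.get("scan.png"), blocker, "eng", Config.default)
        .compile
        .lastOrError                        // the extracted text as one string
        .flatMap(text => IO(println(text)))
        .as(ExitCode.Success)
    }
}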
@ -0,0 +1,42 @@
package docspell.extract.ocr

import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._
import docspell.files._
import fs2.Stream

object TextExtract {

  def extract[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    extractOCR(in, blocker, lang, config)

  def extractOCR[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    Stream
      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap({
        case mt if !config.isAllowed(mt) =>
          raiseError(s"File `$mt` not allowed")

        case MimeType.pdf =>
          Ocr.extractPdf(in, blocker, lang, config)

        case mt if mt.primary == "image" =>
          Ocr.extractImage(in, blocker, lang, config)

        case mt =>
          raiseError(s"File `$mt` not supported")
      })

  private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
    Stream.raiseError[F](new Exception(msg))
}
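TextExtract is the stream-based entry point: it detects the MIME type via Tika and then dispatches to the pdf or image path in Ocr. A rough sketch of driving it from a file on disk; the object name, the letter.pdf path, and the "deu" language code are illustrative assumptions (the test suite below exercises the same call against bundled example files):

// Sketch (not part of this commit): run the full pipeline on a PDF read from disk.
import java.nio.file.Paths
import cats.effect.{Blocker, ExitCode, IO, IOApp}
import docspell.extract.ocr.{Config, TextExtract}

object TextExtractExample extends IOApp {
  def run(args: List[String]): IO[ExitCode] =
    Blocker[IO].use { blocker =>
      val bytes = fs2.io.file.readAll[IO](Paths.get("letter.pdf"), blocker, 16 * 1024)
      TextExtract
        .extract[IO](bytes, blocker, "deu", Config.default)  // MIME type is detected first
        .compile
        .lastOrError
        .flatMap(text => IO(println(text)))
        .as(ExitCode.Success)
    }
}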
modules/extract/src/test/resources/logback.xml (new file, 14 lines)
@ -0,0 +1,14 @@
<configuration>
  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <withJansi>true</withJansi>

    <encoder>
      <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
    </encoder>
  </appender>

  <logger name="docspell" level="debug" />
  <root level="INFO">
    <appender-ref ref="STDOUT" />
  </root>
</configuration>
@ -0,0 +1,30 @@
package docspell.extract

import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._

import scala.concurrent.ExecutionContext

object TestFiles {
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  val letterSourceDE: Stream[IO, Byte] =
    ExampleFiles.letter_de_pdf
      .readURL[IO](16 * 1024, blocker)

  val letterSourceEN: Stream[IO, Byte] =
    ExampleFiles.letter_en_pdf
      .readURL[IO](16 * 1024, blocker)

  lazy val letterDEText =
    ExampleFiles.letter_de_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync

  lazy val letterENText =
    ExampleFiles.letter_en_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync
}
@ -0,0 +1,42 @@
package docspell.extract.ocr

import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import minitest.SimpleTestSuite

object TextExtractionSuite extends SimpleTestSuite {
  import TestFiles._

  test("extract english pdf") {
    ignore()
    val text = TextExtract
      .extract[IO](letterSourceEN, blocker, "eng", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()
    println(text)
  }

  test("extract german pdf") {
    ignore()
    val expect = TestFiles.letterDEText
    val extract = TextExtract
      .extract[IO](letterSourceDE, blocker, "deu", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()

    assertEquals(extract.trim, expect.trim)
  }

  test("find mimetypes") {
    ExampleFiles.all.foreach { url =>
      TikaMimetype
        .detect(url.readURL[IO](8192, blocker), MimeTypeHint.none)
        .map(mt => println(url.asString + ": " + mt.asString))
        .unsafeRunSync
    }
  }
}