Reorganize processing code

Use separate modules for

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner
2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -0,0 +1,54 @@
package docspell.extract.ocr
import java.nio.file.{Path, Paths}
import docspell.common._
/** Settings for OCR based text extraction.
  *
  * @param allowedContentTypes mime types that are accepted for extraction
  * @param ghostscript the ghostscript command and its working directory
  * @param pageRange page limit; `Ocr.runGhostscript` passes `begin` to
  *   ghostscript as `-dLastPage` when it is greater than zero
  * @param unpaper the unpaper command
  * @param tesseract the tesseract command
  */
case class Config(
    allowedContentTypes: Set[MimeType],
    ghostscript: Config.Ghostscript,
    pageRange: Config.PageRange,
    unpaper: Config.Unpaper,
    tesseract: Config.Tesseract
) {

  /** Tells whether the given mime type is a member of `allowedContentTypes`. */
  def isAllowed(mt: MimeType): Boolean =
    allowedContentTypes.contains(mt)
}
object Config {

  /** Page limit for pdf processing. */
  case class PageRange(begin: Int)

  /** The ghostscript command together with the directory below which
    * temporary files are created.
    */
  case class Ghostscript(command: SystemCommand.Config, workingDir: Path)

  /** The tesseract command. */
  case class Tesseract(command: SystemCommand.Config)

  /** The unpaper command. */
  case class Unpaper(command: SystemCommand.Config)

  /** Default settings: pdf, png, jpeg and tiff input, a page limit of 10
    * and the commands `gs`, `unpaper` and `tesseract` as spelled out below.
    */
  val default: Config = {
    val gsCommand = SystemCommand.Config(
      "gs",
      Seq(
        "-dNOPAUSE",
        "-dBATCH",
        "-dSAFER",
        "-sDEVICE=tiffscaled8",
        "-sOutputFile={{outfile}}",
        "{{infile}}"
      ),
      Duration.seconds(30)
    )
    // temporary files go below the JVM's temp directory
    val gsWorkingDir =
      Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")

    val unpaperCommand =
      SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))

    val tesseractCommand =
      SystemCommand.Config(
        "tesseract",
        Seq("{{file}}", "stdout", "-l", "{{lang}}"),
        Duration.minutes(1)
      )

    Config(
      allowedContentTypes = Set(MimeType.pdf, MimeType.png, MimeType.jpeg, MimeType.tiff),
      ghostscript = Ghostscript(gsCommand, gsWorkingDir),
      pageRange = PageRange(10),
      unpaper = Unpaper(unpaperCommand),
      tesseract = Tesseract(tesseractCommand)
    )
  }
}

View File

@ -0,0 +1,191 @@
package docspell.extract.ocr
import java.nio.file.Path
import cats.effect.{Blocker, ContextShift, Sync}
import fs2.Stream
import org.log4s._
import docspell.common._
/** Text extraction from pdf and image input by running the external
  * commands configured in [[Config]]: ghostscript renders pdf pages to
  * tiff images, unpaper optionally cleans up an image and tesseract
  * performs the actual text recognition.
  */
object Ocr {
  private[this] val logger = getLogger

  /** Extract the text of all pages in the given pdf file.
    *
    * The pdf is rendered into one image per page inside a temporary
    * directory below `config.ghostscript.workingDir`. Each image is run
    * through tesseract and the page texts are joined with three
    * newlines. The page limit of `config.pageRange` applies.
    */
  def extractPdf[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
      runGhostscript(pdf, config, wd, blocker)
        .flatMap({ tmpImg =>
          runTesseractFile(tmpImg, blocker, lang, config)
        })
        .fold1(_ + "\n\n\n" + _)
    }

  /** Extract the text from the given image file. The image bytes are
    * passed to tesseract via stdin; unpaper is not involved.
    */
  def extractImage[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    runTesseractStdin(img, blocker, lang, config)

  /** Extract the text of all pages of the pdf file at the given path.
    *
    * Works like [[extractPdf]], but hands the file name to ghostscript
    * instead of streaming the bytes. Note that this variant uses the
    * configured ghostscript command as is, i.e. without adding the
    * page limit of `config.pageRange`.
    */
  def extractPdFFile[F[_]: Sync: ContextShift](
      pdf: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
      runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
        .flatMap({ tif =>
          runTesseractFile(tif, blocker, lang, config)
        })
        .fold1(_ + "\n\n\n" + _)
    }

  /** Extract the text of the image file at the given path. */
  def extractImageFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    runTesseractFile(img, blocker, lang, config)

  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned
    * in page order.
    *
    * The pdf is fed to ghostscript via stdin (`{{infile}}` becomes `-`).
    * If `cfg.pageRange.begin` is greater than zero, it is prepended to
    * the arguments as `-dLastPage`.
    */
  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      cfg: Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val xargs =
      if (cfg.pageRange.begin > 0)
        s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
      else cfg.ghostscript.command.args
    val cmd = cfg.ghostscript.command
      .copy(args = xargs)
      .mapArgs(
        replace(
          Map(
            "{{infile}}"  -> "-",
            "{{outfile}}" -> "%d.tif"
          )
        )
      )
    SystemCommand
      .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
      .evalMap({ _ =>
        File.listFiles(pathEndsWith(".tif"), wd)
      })
      // do not rely on the order of the directory listing
      .flatMap(fs => Stream.emits(fs.sortBy(pageNumber)))
  }

  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned
    * in page order.
    *
    * The absolute name of `pdf` is substituted for `{{infile}}`.
    */
  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
      pdf: Path,
      ghostscript: SystemCommand.Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val cmd = ghostscript.mapArgs(
      replace(
        Map(
          "{{infile}}"  -> pdf.toAbsolutePath.toString,
          "{{outfile}}" -> "%d.tif"
        )
      )
    )
    SystemCommand
      .execSuccess[F](cmd, blocker, wd = Some(wd))
      .evalMap({ _ =>
        File.listFiles(pathEndsWith(".tif"), wd)
      })
      // do not rely on the order of the directory listing
      .flatMap(fs => Stream.emits(fs.sortBy(pageNumber)))
  }

  private def pathEndsWith(ext: String): Path => Boolean =
    p => p.getFileName.toString.endsWith(ext)

  /** Sort key for the page images written with the `%d.tif` output
    * template: the leading number of the file name. Comparing numbers
    * keeps `10.tif` behind `2.tif`, which comparing names would not.
    * Files without a leading number are sorted last.
    */
  private def pageNumber(p: Path): Int =
    scala.util
      .Try(p.getFileName.toString.takeWhile(_.isDigit).toInt)
      .getOrElse(Int.MaxValue)

  /** Run unpaper to optimize the image for ocr. The
    * files are stored to a temporary location on disk and returned.
    *
    * The result is written next to `img` using the file name prefixed
    * with `u-`. Should the command fail, a warning is logged and the
    * unmodified input file is returned instead.
    */
  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
      img: Path,
      unpaper: SystemCommand.Config,
      wd: Path,
      blocker: Blocker
  ): Stream[F, Path] = {
    val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
    val cmd = unpaper.mapArgs(
      replace(
        Map(
          "{{infile}}"  -> img.toAbsolutePath.toString,
          "{{outfile}}" -> targetFile.toString
        )
      )
    )
    SystemCommand
      .execSuccess[F](cmd, blocker, wd = Some(wd))
      .map(_ => targetFile)
      .handleErrorWith { th =>
        // logging is a side effect, so run it as part of the stream
        Stream.eval(Sync[F].delay {
          logger
            .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
          img
        })
      }
  }

  /** Run tesseract on the given image file and return the extracted
    * text. The image is passed through unpaper first.
    */
  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] = {
    // A relative path made of a single name has no parent (`getParent`
    // returns null); the absolute form always provides a directory.
    val absImg = img.toAbsolutePath
    // tesseract cannot cope with absolute filenames
    // so use the parent as working dir
    runUnpaperFile(absImg, config.unpaper.command, absImg.getParent, blocker).flatMap { uimg =>
      val cmd = config.tesseract.command.mapArgs(
        replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
      )
      SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
    }
  }

  /** Run tesseract on the given image bytes, passed via stdin, and
    * return the extracted text.
    */
  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] = {
    val cmd = config.tesseract.command
      .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
    SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
  }

  /** Creates a function that replaces every key of `repl` occurring in
    * its argument with the corresponding value.
    */
  private def replace(repl: Map[String, String]): String => String =
    s =>
      repl.foldLeft(s) {
        case (res, (k, v)) =>
          res.replace(k, v)
      }

  /** Maps the two-letter codes `de` and `en` to the three-letter codes
    * given to tesseract; any other value is passed on unchanged.
    */
  private def fixLanguage(lang: String): String =
    lang match {
      case "de" => "deu"
      case "en" => "eng"
      case l    => l
    }
}

View File

@ -0,0 +1,42 @@
package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._
import docspell.files._
import fs2.Stream
/** Entry point for extracting text from a stream of bytes. */
object TextExtract {

  /** Extracts the text from the given input. Currently this is the
    * same as [[extractOCR]].
    */
  def extract[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    extractOCR(in, blocker, lang, config)

  /** Detects the mime type of the input and runs the matching OCR
    * extraction. The stream fails if the detected type is not allowed
    * by `config` or is neither pdf nor an image.
    */
  def extractOCR[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    Stream
      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap(mime => extractByType(mime, in, blocker, lang, config))

  /** Selects the extraction for an already detected mime type. */
  private def extractByType[F[_]: Sync: ContextShift](
      mime: MimeType,
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
      config: Config
  ): Stream[F, String] =
    if (!config.isAllowed(mime)) raiseError[F](s"File `$mime` not allowed")
    else if (MimeType.pdf == mime) Ocr.extractPdf(in, blocker, lang, config)
    else if (mime.primary == "image") Ocr.extractImage(in, blocker, lang, config)
    else raiseError[F](s"File `$mime` not supported")

  /** A stream that fails with an exception carrying the given message. */
  private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] = {
    val failure = new Exception(msg)
    Stream.raiseError[F](failure)
  }
}

View File

@ -0,0 +1,14 @@
<configuration>
<!-- Logback setup: everything is written to the console through the STDOUT appender. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<!-- Ask logback to use Jansi for the console stream so the colour codes below are rendered. -->
<withJansi>true</withJansi>
<encoder>
<!-- Line layout: thread name, colour-highlighted level, abbreviated logger name in cyan, message. -->
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<!-- Loggers below "docspell" log at debug level; all others use the root level. -->
<logger name="docspell" level="debug" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@ -0,0 +1,30 @@
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
/** Fixtures shared by the extraction tests: the example letters as
  * byte streams and their expected texts.
  */
object TestFiles {

  /** Blocker backed by the global execution context. */
  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)

  // Implicit definitions should carry an explicit type; the name is
  // written fully qualified because `ContextShift` is not imported.
  implicit val CS: cats.effect.ContextShift[IO] = IO.contextShift(ExecutionContext.global)

  /** Bytes of the german example letter (pdf). */
  val letterSourceDE: Stream[IO, Byte] =
    ExampleFiles.letter_de_pdf
      .readURL[IO](16 * 1024, blocker)

  /** Bytes of the english example letter (pdf). */
  val letterSourceEN: Stream[IO, Byte] =
    ExampleFiles.letter_en_pdf
      .readURL[IO](16 * 1024, blocker)

  /** Text of the german example letter, read on first access. */
  lazy val letterDEText: String =
    ExampleFiles.letter_de_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync()

  /** Text of the english example letter, read on first access. */
  lazy val letterENText: String =
    ExampleFiles.letter_en_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync()
}

View File

@ -0,0 +1,42 @@
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import minitest.SimpleTestSuite
/** Tests for [[TextExtract]]. The two extraction tests start with
  * `ignore()`; the mime type test prints the detected type of every
  * example file.
  */
object TextExtractionSuite extends SimpleTestSuite {
  import TestFiles._

  test("extract english pdf") {
    ignore()
    val extraction = TextExtract.extract[IO](letterSourceEN, blocker, "eng", Config.default)
    val text       = extraction.compile.lastOrError.unsafeRunSync()
    println(text)
  }

  test("extract german pdf") {
    ignore()
    val expect     = TestFiles.letterDEText
    val extraction = TextExtract.extract[IO](letterSourceDE, blocker, "deu", Config.default)
    val extract    = extraction.compile.lastOrError.unsafeRunSync()
    assertEquals(extract.trim, expect.trim)
  }

  test("find mimetypes") {
    for (url <- ExampleFiles.all) {
      // detect the type of each example file and print "<url>: <mimetype>"
      val report = TikaMimetype
        .detect(url.readURL[IO](8192, blocker), MimeTypeHint.none)
        .map(mt => println(s"${url.asString}: ${mt.asString}"))
      report.unsafeRunSync()
    }
  }
}