From e0682464b5c73d8170893d68fea49db02fd4427e Mon Sep 17 00:00:00 2001 From: Eike Kettner <eike.kettner@posteo.de> Date: Mon, 17 Feb 2020 14:01:36 +0100 Subject: [PATCH] Configure pdf extraction; move Logger and DataType to common --- .../scala/docspell/common}/DataType.scala | 4 +-- .../src/main/scala/docspell/common/File.scala | 16 +++++----- .../main/scala/docspell/common/Logger.scala | 29 +++++++++++++++++++ .../docspell/extract/ExtractConfig.scala | 2 +- .../scala/docspell/extract/Extraction.scala | 8 ++--- .../scala/docspell/extract/PdfConfig.scala | 3 ++ .../scala/docspell/extract/PdfExtract.scala | 2 +- .../main/scala/docspell/extract/ocr/Ocr.scala | 16 ++++++---- .../docspell/extract/ocr/TextExtract.scala | 2 +- .../scala/docspell/files/TikaMimetype.scala | 5 ++++ .../docspell/joex/scheduler/Context.scala | 2 +- .../{Logger.scala => QueueLogger.scala} | 2 +- 12 files changed, 64 insertions(+), 27 deletions(-) rename modules/{extract/src/main/scala/docspell/extract => common/src/main/scala/docspell/common}/DataType.scala (79%) create mode 100644 modules/extract/src/main/scala/docspell/extract/PdfConfig.scala rename modules/joex/src/main/scala/docspell/joex/scheduler/{Logger.scala => QueueLogger.scala} (98%) diff --git a/modules/extract/src/main/scala/docspell/extract/DataType.scala b/modules/common/src/main/scala/docspell/common/DataType.scala similarity index 79% rename from modules/extract/src/main/scala/docspell/extract/DataType.scala rename to modules/common/src/main/scala/docspell/common/DataType.scala index 7d4c28d6..69d2a883 100644 --- a/modules/extract/src/main/scala/docspell/extract/DataType.scala +++ b/modules/common/src/main/scala/docspell/common/DataType.scala @@ -1,6 +1,4 @@ -package docspell.extract - -import docspell.common.{MimeType, MimeTypeHint} +package docspell.common sealed trait DataType { diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 22c39188..41884949 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -4,11 +4,10 @@ import java.io.IOException import java.nio.file.attribute.BasicFileAttributes import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor} import java.util.concurrent.atomic.AtomicInteger -import scala.jdk.CollectionConverters._ +import scala.jdk.CollectionConverters._ import cats.implicits._ -import cats.effect.Sync -import fs2.Stream +import cats.effect.{Blocker, ContextShift, Resource, Sync} object File { @@ -40,6 +39,9 @@ object File { count.get } + def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] = + Sync[F].delay(Files.exists(file) && Files.size(file) > minSize) + def deleteFile[F[_]: Sync](file: Path): F[Unit] = Sync[F].delay(Files.deleteIfExists(file)).map(_ => ()) @@ -47,10 +49,8 @@ object File { if (Files.isDirectory(path)) deleteDirectory(path) else deleteFile(path).map(_ => 1) - def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)( - f: Path => Stream[F, A] - ): Stream[F, A] = - Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f) + def withTempDir[F[_]: Sync](parent: Path, prefix: String): Resource[F, Path] = + Resource.make(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())) def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay { val javaList = @@ -58,4 +58,6 @@ object File { javaList.asScala.toList.sortBy(_.getFileName.toString) } + def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) = + fs2.io.file.readAll(file, blocker, chunkSize) } diff --git a/modules/common/src/main/scala/docspell/common/Logger.scala b/modules/common/src/main/scala/docspell/common/Logger.scala index cb155ca1..e95264b0 100644 --- a/modules/common/src/main/scala/docspell/common/Logger.scala +++ b/modules/common/src/main/scala/docspell/common/Logger.scala @@ -1,5 +1,9 @@ package docspell.common +import cats.effect.Sync +import docspell.common.syntax.all._ +import org.log4s.{Logger => Log4sLogger} + trait Logger[F[_]] { def trace(msg: => String): F[Unit] @@ -10,3 +14,28 @@ trait Logger[F[_]] { def error(msg: => String): F[Unit] } + +object Logger { + + + def log4s[F[_]: Sync](log: Log4sLogger): Logger[F] = new Logger[F] { + def trace(msg: => String): F[Unit] = + log.ftrace(msg) + + def debug(msg: => String): F[Unit] = + log.fdebug(msg) + + def info(msg: => String): F[Unit] = + log.finfo(msg) + + def warn(msg: => String): F[Unit] = + log.fwarn(msg) + + def error(ex: Throwable)(msg: => String): F[Unit] = + log.ferror(ex)(msg) + + def error(msg: => String): F[Unit] = + log.ferror(msg) + } + +} \ No newline at end of file diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala index 76b65537..b4951686 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -2,4 +2,4 @@ package docspell.extract import docspell.extract.ocr.OcrConfig -case class ExtractConfig(ocr: OcrConfig) +case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index ebc44591..8ef052c8 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -29,14 +29,10 @@ object Extraction { dataType: DataType, lang: Language ): F[ExtractResult] = { - val mime = dataType match { - case DataType.Exact(mt) => mt.pure[F] - case DataType.Hint(hint) => TikaMimetype.detect(data, hint) - } - mime.flatMap { + TikaMimetype.resolve(dataType, data).flatMap { case MimeType.pdf => PdfExtract - .get(data, blocker, lang, 5, cfg.ocr, logger) + .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .map(ExtractResult.fromEither) case PoiType(mt) => diff --git a/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala b/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala new file mode 100644 index 00000000..7d4476f8 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala @@ -0,0 +1,3 @@ +package docspell.extract + +case class PdfConfig (minTextLen: Int) diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 2489b391..20d800bb 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -43,7 +43,7 @@ object PdfExtract { if (str.length >= stripMinLen) str.pure[F].attempt else logger - .info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *> + .info(s"Stripped text from PDF is small (${str.length}). Trying with OCR.") *> runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt ) } yield res diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index fbdece9c..4e97e928 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -17,13 +17,15 @@ object Ocr { blocker: Blocker, lang: String, config: OcrConfig - ): Stream[F, String] = - File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => + ): F[Option[String]] = + File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => runGhostscript(pdf, config, wd, blocker) .flatMap({ tmpImg => runTesseractFile(tmpImg, blocker, lang, config) }) - .fold1(_ + "\n\n\n" + _) + .fold1(_ + "\n\n\n" + _). + compile. + last } /** Extract the text from the given image file @@ -41,13 +43,15 @@ object Ocr { blocker: Blocker, lang: String, config: OcrConfig - ): Stream[F, String] = - File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => + ): F[Option[String]] = + File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) .flatMap({ tif => runTesseractFile(tif, blocker, lang, config) }) - .fold1(_ + "\n\n\n" + _) + .fold1(_ + "\n\n\n" + _). + compile. + last } def extractImageFile[F[_]: Sync: ContextShift]( diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index 35031207..07238482 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -28,7 +28,7 @@ object TextExtract { raiseError(s"File `$mt` not allowed") case MimeType.pdf => - Ocr.extractPdf(in, blocker, lang, config) + Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate case mt if mt.primary == "image" => Ocr.extractImage(in, blocker, lang, config) diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 3511859c..b828b8fe 100644 --- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -38,4 +38,9 @@ object TikaMimetype { def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) + def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = + dt match { + case DataType.Exact(mt) => mt.pure[F] + case DataType.Hint(hint) => TikaMimetype.detect(data, hint) + } } diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala index a769103a..ba1784a0 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala @@ -52,7 +52,7 @@ object Context { ): F[Context[F, A]] = for { _ <- log.ftrace("Creating logger for task run") - logger <- Logger(job.id, job.info, config.logBufferSize, logSink) + logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink) _ <- log.ftrace("Logger created, instantiating context") ctx = create[F, A](job, arg, config, logger, store, blocker) } yield ctx diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala similarity index 98% rename from modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala rename to modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala index 3ac4d441..86f2e36e 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala @@ -5,7 +5,7 @@ import cats.effect.{Concurrent, Sync} import docspell.common._ import fs2.concurrent.Queue -object Logger { +object QueueLogger { def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] = new Logger[F] {