mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Configure pdf extraction; move Logger and DataType to common
This commit is contained in:
parent
3d615181e0
commit
e0682464b5
@ -1,6 +1,4 @@
|
|||||||
package docspell.extract
|
package docspell.common
|
||||||
|
|
||||||
import docspell.common.{MimeType, MimeTypeHint}
|
|
||||||
|
|
||||||
sealed trait DataType {
|
sealed trait DataType {
|
||||||
|
|
@ -4,11 +4,10 @@ import java.io.IOException
|
|||||||
import java.nio.file.attribute.BasicFileAttributes
|
import java.nio.file.attribute.BasicFileAttributes
|
||||||
import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
|
import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
|
||||||
import java.util.concurrent.atomic.AtomicInteger
|
import java.util.concurrent.atomic.AtomicInteger
|
||||||
import scala.jdk.CollectionConverters._
|
|
||||||
|
|
||||||
|
import scala.jdk.CollectionConverters._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.{Blocker, ContextShift, Resource, Sync}
|
||||||
import fs2.Stream
|
|
||||||
|
|
||||||
object File {
|
object File {
|
||||||
|
|
||||||
@ -40,6 +39,9 @@ object File {
|
|||||||
count.get
|
count.get
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
|
||||||
|
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
|
||||||
|
|
||||||
def deleteFile[F[_]: Sync](file: Path): F[Unit] =
|
def deleteFile[F[_]: Sync](file: Path): F[Unit] =
|
||||||
Sync[F].delay(Files.deleteIfExists(file)).map(_ => ())
|
Sync[F].delay(Files.deleteIfExists(file)).map(_ => ())
|
||||||
|
|
||||||
@ -47,10 +49,8 @@ object File {
|
|||||||
if (Files.isDirectory(path)) deleteDirectory(path)
|
if (Files.isDirectory(path)) deleteDirectory(path)
|
||||||
else deleteFile(path).map(_ => 1)
|
else deleteFile(path).map(_ => 1)
|
||||||
|
|
||||||
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)(
|
def withTempDir[F[_]: Sync](parent: Path, prefix: String): Resource[F, Path] =
|
||||||
f: Path => Stream[F, A]
|
Resource.make(mkTempDir(parent, prefix))(p => delete(p).map(_ => ()))
|
||||||
): Stream[F, A] =
|
|
||||||
Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f)
|
|
||||||
|
|
||||||
def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
|
def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
|
||||||
val javaList =
|
val javaList =
|
||||||
@ -58,4 +58,6 @@ object File {
|
|||||||
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) =
|
||||||
|
fs2.io.file.readAll(file, blocker, chunkSize)
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
package docspell.common
|
package docspell.common
|
||||||
|
|
||||||
|
import cats.effect.Sync
|
||||||
|
import docspell.common.syntax.all._
|
||||||
|
import org.log4s.{Logger => Log4sLogger}
|
||||||
|
|
||||||
trait Logger[F[_]] {
|
trait Logger[F[_]] {
|
||||||
|
|
||||||
def trace(msg: => String): F[Unit]
|
def trace(msg: => String): F[Unit]
|
||||||
@ -10,3 +14,28 @@ trait Logger[F[_]] {
|
|||||||
def error(msg: => String): F[Unit]
|
def error(msg: => String): F[Unit]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
object Logger {
|
||||||
|
|
||||||
|
|
||||||
|
def log4s[F[_]: Sync](log: Log4sLogger): Logger[F] = new Logger[F] {
|
||||||
|
def trace(msg: => String): F[Unit] =
|
||||||
|
log.ftrace(msg)
|
||||||
|
|
||||||
|
def debug(msg: => String): F[Unit] =
|
||||||
|
log.fdebug(msg)
|
||||||
|
|
||||||
|
def info(msg: => String): F[Unit] =
|
||||||
|
log.finfo(msg)
|
||||||
|
|
||||||
|
def warn(msg: => String): F[Unit] =
|
||||||
|
log.fwarn(msg)
|
||||||
|
|
||||||
|
def error(ex: Throwable)(msg: => String): F[Unit] =
|
||||||
|
log.ferror(ex)(msg)
|
||||||
|
|
||||||
|
def error(msg: => String): F[Unit] =
|
||||||
|
log.ferror(msg)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -2,4 +2,4 @@ package docspell.extract
|
|||||||
|
|
||||||
import docspell.extract.ocr.OcrConfig
|
import docspell.extract.ocr.OcrConfig
|
||||||
|
|
||||||
case class ExtractConfig(ocr: OcrConfig)
|
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
||||||
|
@ -29,14 +29,10 @@ object Extraction {
|
|||||||
dataType: DataType,
|
dataType: DataType,
|
||||||
lang: Language
|
lang: Language
|
||||||
): F[ExtractResult] = {
|
): F[ExtractResult] = {
|
||||||
val mime = dataType match {
|
TikaMimetype.resolve(dataType, data).flatMap {
|
||||||
case DataType.Exact(mt) => mt.pure[F]
|
|
||||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
|
||||||
}
|
|
||||||
mime.flatMap {
|
|
||||||
case MimeType.pdf =>
|
case MimeType.pdf =>
|
||||||
PdfExtract
|
PdfExtract
|
||||||
.get(data, blocker, lang, 5, cfg.ocr, logger)
|
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||||
.map(ExtractResult.fromEither)
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case PoiType(mt) =>
|
case PoiType(mt) =>
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
package docspell.extract
|
||||||
|
|
||||||
|
case class PdfConfig (minTextLen: Int)
|
@ -43,7 +43,7 @@ object PdfExtract {
|
|||||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
if (str.length >= stripMinLen) str.pure[F].attempt
|
||||||
else
|
else
|
||||||
logger
|
logger
|
||||||
.info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *>
|
.info(s"Stripped text from PDF is small (${str.length}). Trying with OCR.") *>
|
||||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
||||||
)
|
)
|
||||||
} yield res
|
} yield res
|
||||||
|
@ -17,13 +17,15 @@ object Ocr {
|
|||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscript(pdf, config, wd, blocker)
|
runGhostscript(pdf, config, wd, blocker)
|
||||||
.flatMap({ tmpImg =>
|
.flatMap({ tmpImg =>
|
||||||
runTesseractFile(tmpImg, blocker, lang, config)
|
runTesseractFile(tmpImg, blocker, lang, config)
|
||||||
})
|
})
|
||||||
.fold1(_ + "\n\n\n" + _)
|
.fold1(_ + "\n\n\n" + _).
|
||||||
|
compile.
|
||||||
|
last
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Extract the text from the given image file
|
/** Extract the text from the given image file
|
||||||
@ -41,13 +43,15 @@ object Ocr {
|
|||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): F[Option[String]] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd =>
|
||||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
||||||
.flatMap({ tif =>
|
.flatMap({ tif =>
|
||||||
runTesseractFile(tif, blocker, lang, config)
|
runTesseractFile(tif, blocker, lang, config)
|
||||||
})
|
})
|
||||||
.fold1(_ + "\n\n\n" + _)
|
.fold1(_ + "\n\n\n" + _).
|
||||||
|
compile.
|
||||||
|
last
|
||||||
}
|
}
|
||||||
|
|
||||||
def extractImageFile[F[_]: Sync: ContextShift](
|
def extractImageFile[F[_]: Sync: ContextShift](
|
||||||
|
@ -28,7 +28,7 @@ object TextExtract {
|
|||||||
raiseError(s"File `$mt` not allowed")
|
raiseError(s"File `$mt` not allowed")
|
||||||
|
|
||||||
case MimeType.pdf =>
|
case MimeType.pdf =>
|
||||||
Ocr.extractPdf(in, blocker, lang, config)
|
Stream.eval(Ocr.extractPdf(in, blocker, lang, config)).unNoneTerminate
|
||||||
|
|
||||||
case mt if mt.primary == "image" =>
|
case mt if mt.primary == "image" =>
|
||||||
Ocr.extractImage(in, blocker, lang, config)
|
Ocr.extractImage(in, blocker, lang, config)
|
||||||
|
@ -38,4 +38,9 @@ object TikaMimetype {
|
|||||||
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
|
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
|
||||||
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
|
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
|
||||||
|
|
||||||
|
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
|
||||||
|
dt match {
|
||||||
|
case DataType.Exact(mt) => mt.pure[F]
|
||||||
|
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,7 +52,7 @@ object Context {
|
|||||||
): F[Context[F, A]] =
|
): F[Context[F, A]] =
|
||||||
for {
|
for {
|
||||||
_ <- log.ftrace("Creating logger for task run")
|
_ <- log.ftrace("Creating logger for task run")
|
||||||
logger <- Logger(job.id, job.info, config.logBufferSize, logSink)
|
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
|
||||||
_ <- log.ftrace("Logger created, instantiating context")
|
_ <- log.ftrace("Logger created, instantiating context")
|
||||||
ctx = create[F, A](job, arg, config, logger, store, blocker)
|
ctx = create[F, A](job, arg, config, logger, store, blocker)
|
||||||
} yield ctx
|
} yield ctx
|
||||||
|
@ -5,7 +5,7 @@ import cats.effect.{Concurrent, Sync}
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import fs2.concurrent.Queue
|
import fs2.concurrent.Queue
|
||||||
|
|
||||||
object Logger {
|
object QueueLogger {
|
||||||
|
|
||||||
def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] =
|
def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] =
|
||||||
new Logger[F] {
|
new Logger[F] {
|
Loading…
x
Reference in New Issue
Block a user