mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Add new convert module and sketch its integration
This commit is contained in:
@ -0,0 +1,24 @@
|
||||
package docspell.convert
|
||||
|
||||
import fs2._
|
||||
import cats.effect._
|
||||
import docspell.common._
|
||||
|
||||
/** Algebra for converting file contents, expressed as fs2 byte pipes.
  *
  * @tparam F the effect type in which the byte streams are evaluated
  */
trait Conversion[F[_]] {

  /** Creates a pipe that consumes the bytes of a file whose type is
    * `inType` and emits bytes — presumably the PDF rendition of that
    * file, as the name suggests (TODO confirm once implemented).
    *
    * @param inType the mime type of the incoming bytes
    * @return a byte-to-byte pipe running in `F`
    */
  def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
}
|
||||
|
||||
object Conversion {

  /** Creates a [[Conversion]] wrapped in a `Resource`.
    *
    * The conversion is only sketched: the pipe returned by `toPDF` fails
    * with a `NotImplementedError` as soon as the resulting stream is run.
    * The failure is raised inside the stream rather than thrown while the
    * pipe is being built, so obtaining a pipe is free of side effects and
    * the error can be handled with the usual stream/effect combinators.
    *
    * @param cfg the settings of the convert module
    * @return a resource yielding the conversion; no acquisition or release
    *         action is attached since the value is lifted with `Resource.pure`
    */
  def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] =
    Resource.pure(new Conversion[F] {

      def toPDF(inType: MimeType): Pipe[F, Byte, Byte] =
        // The input is never pulled; the stream fails immediately when run.
        // Same exception type that `???` would throw, so callers matching on
        // NotImplementedError keep working.
        _ =>
          Stream.raiseError[F](
            new NotImplementedError(
              s"Conversion of $inType to PDF is not implemented yet (config: $cfg)"
            )
          )
    })
}
|
@ -0,0 +1,3 @@
|
||||
package docspell.convert
|
||||
|
||||
/** Settings for the convert module, handed to `Conversion.create`.
  *
  * It declares no fields yet.
  */
case class ConvertConfig()
|
@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri}
|
||||
import docspell.joex.scheduler.SchedulerConfig
|
||||
import docspell.store.JdbcConfig
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
import docspell.convert.ConvertConfig
|
||||
|
||||
case class Config(
|
||||
appId: Ident,
|
||||
@ -11,17 +12,10 @@ case class Config(
|
||||
bind: Config.Bind,
|
||||
jdbc: JdbcConfig,
|
||||
scheduler: SchedulerConfig,
|
||||
extraction: OcrConfig
|
||||
extraction: OcrConfig,
|
||||
convert: ConvertConfig
|
||||
)
|
||||
|
||||
object Config {
|
||||
val postgres =
|
||||
JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev")
|
||||
val h2 = JdbcConfig(
|
||||
LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"),
|
||||
"sa",
|
||||
""
|
||||
)
|
||||
|
||||
case class Bind(address: String, port: Int)
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ object JoexAppImpl {
|
||||
.withTask(
|
||||
JobTask.json(
|
||||
ProcessItemArgs.taskName,
|
||||
ItemHandler[F](cfg.extraction),
|
||||
ItemHandler[F](cfg),
|
||||
ItemHandler.onCancel[F]
|
||||
)
|
||||
)
|
||||
|
@ -7,6 +7,7 @@ import cats.effect._
|
||||
import cats.data.OptionT
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert._
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records._
|
||||
|
||||
@ -27,17 +28,17 @@ import docspell.store.records._
|
||||
object ConvertPdf {
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
cfg: ConvertConfig,
|
||||
item: ItemData
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
|
||||
// get mimetype
|
||||
// try to convert
|
||||
// save to db
|
||||
// update file_id of RAttachment
|
||||
|
||||
def convert(ra: RAttachment) =
|
||||
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
|
||||
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
|
||||
|
||||
for {
|
||||
ras <- item.attachments.traverse(convert)
|
||||
@ -51,10 +52,12 @@ object ConvertPdf {
|
||||
.getOrElse(Mimetype.`application/octet-stream`)
|
||||
|
||||
def convertSafe[F[_]: Sync](
|
||||
cfg: ConvertConfig,
|
||||
ctx: Context[F, ProcessItemArgs]
|
||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
|
||||
|
||||
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
|
||||
map(_ => ra)
|
||||
}
|
||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
|
||||
Conversion.create[F](cfg).use { conv =>
|
||||
ctx.logger
|
||||
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
|
||||
.map(_ => ra)
|
||||
}
|
||||
}
|
||||
|
@ -3,16 +3,16 @@ package docspell.joex.process
|
||||
import cats.implicits._
|
||||
import cats.effect.{ContextShift, Sync}
|
||||
import docspell.common.{ItemState, ProcessItemArgs}
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.queries.QItem
|
||||
import docspell.store.records.{RItem, RJob}
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
|
||||
object ItemHandler {
|
||||
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
|
||||
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] =
|
||||
def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] =
|
||||
CreateItem[F]
|
||||
.flatMap(itemStateTask(ItemState.Processing))
|
||||
.flatMap(safeProcess[F](cfg))
|
||||
@ -30,7 +30,7 @@ object ItemHandler {
|
||||
} yield last
|
||||
|
||||
def safeProcess[F[_]: Sync: ContextShift](
|
||||
cfg: OcrConfig
|
||||
cfg: Config
|
||||
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
|
||||
case true =>
|
||||
|
@ -3,15 +3,15 @@ package docspell.joex.process
|
||||
import cats.effect.{ContextShift, Sync}
|
||||
import docspell.common.ProcessItemArgs
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
import docspell.joex.Config
|
||||
|
||||
object ProcessItem {
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
cfg: OcrConfig
|
||||
cfg: Config
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
ConvertPdf(item)
|
||||
.flatMap(TextExtraction(cfg, _))
|
||||
ConvertPdf(cfg.convert, item)
|
||||
.flatMap(TextExtraction(cfg.extraction, _))
|
||||
.flatMap(Task.setProgress(25))
|
||||
.flatMap(TextAnalysis[F])
|
||||
.flatMap(Task.setProgress(50))
|
||||
|
Reference in New Issue
Block a user