Add new convert module and sketch its integration

This commit is contained in:
Eike Kettner
2020-02-11 00:33:52 +01:00
parent 3be90d64d5
commit ce22b727b1
9 changed files with 81 additions and 25 deletions

View File

@ -0,0 +1,24 @@
package docspell.convert
import fs2._
import cats.effect._
import docspell.common._
/** Byte-level file conversion, abstracted over the effect type `F`.
  *
  * Instances are obtained through the companion's `create`.
  */
trait Conversion[F[_]] {
/** Returns a pipe that takes the bytes of a file whose mime type is
  * `inType` and emits bytes.
  *
  * NOTE(review): judging by the name, the emitted bytes are meant to be
  * the PDF rendition of the input; the exact contract (supported mime
  * types, failure behaviour) is not specified here -- confirm.
  */
def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
}
object Conversion {

  /** Creates a [[Conversion]] wrapped in a `Resource`.
    *
    * The implementation is still a placeholder: the pipe returned by
    * `toPDF` does not convert anything yet. Instead of throwing while the
    * pipe is being built (and printing to stdout), it yields a stream that
    * fails with a `NotImplementedError` once it is run, so the failure is
    * raised inside `F` and can be handled with the usual stream/effect
    * error combinators.
    *
    * @param cfg the conversion settings; currently only echoed in the
    *            error message
    * @return a resource holding the conversion; no acquisition or release
    *         work is performed
    */
  def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] =
    Resource.pure(new Conversion[F] {
      def toPDF(inType: MimeType): Pipe[F, Byte, Byte] =
        // The input is intentionally ignored until a real converter exists.
        // `cfg` is referenced in the message so the parameter stays in use.
        _ =>
          Stream.raiseError[F](
            new NotImplementedError(
              s"Conversion of $inType to PDF is not implemented yet (config: $cfg)"
            )
          )
    })
}

View File

@ -0,0 +1,3 @@
package docspell.convert
/** Settings for the `docspell.convert` module.
  *
  * It declares no fields at the moment.
  */
case class ConvertConfig()

View File

@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri}
import docspell.joex.scheduler.SchedulerConfig
import docspell.store.JdbcConfig
import docspell.text.ocr.{Config => OcrConfig}
import docspell.convert.ConvertConfig
case class Config(
appId: Ident,
@ -11,17 +12,10 @@ case class Config(
bind: Config.Bind,
jdbc: JdbcConfig,
scheduler: SchedulerConfig,
extraction: OcrConfig
extraction: OcrConfig,
convert: ConvertConfig
)
object Config {
val postgres =
JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev")
val h2 = JdbcConfig(
LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"),
"sa",
""
)
case class Bind(address: String, port: Int)
}

View File

@ -55,7 +55,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
ProcessItemArgs.taskName,
ItemHandler[F](cfg.extraction),
ItemHandler[F](cfg),
ItemHandler.onCancel[F]
)
)

View File

@ -7,6 +7,7 @@ import cats.effect._
import cats.data.OptionT
import docspell.common._
import docspell.convert._
import docspell.joex.scheduler._
import docspell.store.records._
@ -27,17 +28,17 @@ import docspell.store.records._
object ConvertPdf {
def apply[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
// get mimetype
// try to convert
// save to db
// update file_id of RAttachment
def convert(ra: RAttachment) =
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
for {
ras <- item.attachments.traverse(convert)
@ -51,10 +52,12 @@ object ConvertPdf {
.getOrElse(Mimetype.`application/octet-stream`)
def convertSafe[F[_]: Sync](
cfg: ConvertConfig,
ctx: Context[F, ProcessItemArgs]
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
map(_ => ra)
}
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
Conversion.create[F](cfg).use { conv =>
ctx.logger
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
.map(_ => ra)
}
}

View File

@ -3,16 +3,16 @@ package docspell.joex.process
import cats.implicits._
import cats.effect.{ContextShift, Sync}
import docspell.common.{ItemState, ProcessItemArgs}
import docspell.joex.Config
import docspell.joex.scheduler.{Context, Task}
import docspell.store.queries.QItem
import docspell.store.records.{RItem, RJob}
import docspell.text.ocr.{Config => OcrConfig}
object ItemHandler {
/** Task that first logs a warning about the cancellation and then runs
  * `deleteByFileIds`; the task's result is that of `deleteByFileIds`.
  */
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] =
def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg))
@ -30,7 +30,7 @@ object ItemHandler {
} yield last
def safeProcess[F[_]: Sync: ContextShift](
cfg: OcrConfig
cfg: Config
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
case true =>

View File

@ -3,15 +3,15 @@ package docspell.joex.process
import cats.effect.{ContextShift, Sync}
import docspell.common.ProcessItemArgs
import docspell.joex.scheduler.Task
import docspell.text.ocr.{Config => OcrConfig}
import docspell.joex.Config
object ProcessItem {
def apply[F[_]: Sync: ContextShift](
cfg: OcrConfig
cfg: Config
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ConvertPdf(item)
.flatMap(TextExtraction(cfg, _))
ConvertPdf(cfg.convert, item)
.flatMap(TextExtraction(cfg.extraction, _))
.flatMap(Task.setProgress(25))
.flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50))