mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-28 17:55:06 +00:00
Add new convert module and sketch its integration
This commit is contained in:
parent
3be90d64d5
commit
ce22b727b1
14
build.sbt
14
build.sbt
@ -173,6 +173,17 @@ val text = project.in(file("modules/text")).
|
||||
Dependencies.tika ++
|
||||
Dependencies.stanfordNlpCore
|
||||
).dependsOn(common)
|
||||
|
||||
// The `convert` module: lives in modules/convert, pulls in pdfbox and
// flexmark, and builds on top of the `common` module.
val convert = project
  .in(file("modules/convert"))
  .disablePlugins(RevolverPlugin)
  .settings(sharedSettings)
  .settings(testSettings)
  .settings(
    name := "docspell-convert",
    libraryDependencies ++= Dependencies.pdfbox ++ Dependencies.flexmark
  )
  .dependsOn(common)
|
||||
|
||||
val restapi = project.in(file("modules/restapi")).
|
||||
disablePlugins(RevolverPlugin).
|
||||
@ -226,7 +237,7 @@ val joex = project.in(file("modules/joex")).
|
||||
addCompilerPlugin(Dependencies.betterMonadicFor),
|
||||
buildInfoPackage := "docspell.joex",
|
||||
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
|
||||
).dependsOn(store, text, joexapi, restapi)
|
||||
).dependsOn(store, text, convert, joexapi, restapi)
|
||||
|
||||
val backend = project.in(file("modules/backend")).
|
||||
disablePlugins(RevolverPlugin).
|
||||
@ -357,6 +368,7 @@ val root = project.in(file(".")).
|
||||
).
|
||||
aggregate(common
|
||||
, text
|
||||
, convert
|
||||
, store
|
||||
, joexapi
|
||||
, joex
|
||||
|
@ -0,0 +1,24 @@
|
||||
package docspell.convert
|
||||
|
||||
import fs2._
|
||||
import cats.effect._
|
||||
import docspell.common._
|
||||
|
||||
/** Algebra for turning the bytes of a file into the bytes of a PDF.
  *
  * @tparam F the effect type in which the byte streams are evaluated
  */
trait Conversion[F[_]] {

  /** Builds a pipe from the raw bytes of an input file to PDF bytes.
    *
    * @param inType the mime type of the bytes fed into the pipe
    * @return a pipe that is meant to emit the PDF rendition of its input
    */
  def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
}
|
||||
|
||||
object Conversion {

  /** Creates a [[Conversion]] wrapped in a `Resource`.
    *
    * The returned implementation is still a placeholder: no conversion is
    * implemented. Instead of printing the configuration to stdout and
    * throwing as soon as `toPDF` is called, the pipe it returns fails with a
    * `NotImplementedError` inside `F` when the stream is run, so callers can
    * handle the failure like any other stream error.
    *
    * @param cfg the conversion settings; currently only reported in the
    *            error message
    * @return a resource yielding the (placeholder) conversion
    */
  def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] =
    Resource.pure[F, Conversion[F]](new Conversion[F] {

      def toPDF(inType: MimeType): Pipe[F, Byte, Byte] =
        // The input is intentionally not consumed; the resulting stream
        // fails immediately when evaluated.
        _ =>
          Stream.raiseError[F](
            new NotImplementedError(
              s"Conversion of $inType to PDF is not implemented yet (config: $cfg)"
            )
          )
    })
}
|
@ -0,0 +1,3 @@
|
||||
package docspell.convert
|
||||
|
||||
/** Settings for the convert module. It carries no fields yet. */
final case class ConvertConfig()
|
@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri}
|
||||
import docspell.joex.scheduler.SchedulerConfig
|
||||
import docspell.store.JdbcConfig
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
import docspell.convert.ConvertConfig
|
||||
|
||||
case class Config(
|
||||
appId: Ident,
|
||||
@ -11,17 +12,10 @@ case class Config(
|
||||
bind: Config.Bind,
|
||||
jdbc: JdbcConfig,
|
||||
scheduler: SchedulerConfig,
|
||||
extraction: OcrConfig
|
||||
extraction: OcrConfig,
|
||||
convert: ConvertConfig
|
||||
)
|
||||
|
||||
object Config {
|
||||
val postgres =
|
||||
JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev")
|
||||
val h2 = JdbcConfig(
|
||||
LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"),
|
||||
"sa",
|
||||
""
|
||||
)
|
||||
|
||||
case class Bind(address: String, port: Int)
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ object JoexAppImpl {
|
||||
.withTask(
|
||||
JobTask.json(
|
||||
ProcessItemArgs.taskName,
|
||||
ItemHandler[F](cfg.extraction),
|
||||
ItemHandler[F](cfg),
|
||||
ItemHandler.onCancel[F]
|
||||
)
|
||||
)
|
||||
|
@ -7,6 +7,7 @@ import cats.effect._
|
||||
import cats.data.OptionT
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert._
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records._
|
||||
|
||||
@ -27,17 +28,17 @@ import docspell.store.records._
|
||||
object ConvertPdf {
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
cfg: ConvertConfig,
|
||||
item: ItemData
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
|
||||
// get mimetype
|
||||
// try to convert
|
||||
// save to db
|
||||
// update file_id of RAttachment
|
||||
|
||||
def convert(ra: RAttachment) =
|
||||
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
|
||||
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
|
||||
|
||||
for {
|
||||
ras <- item.attachments.traverse(convert)
|
||||
@ -51,10 +52,12 @@ object ConvertPdf {
|
||||
.getOrElse(Mimetype.`application/octet-stream`)
|
||||
|
||||
def convertSafe[F[_]: Sync](
|
||||
cfg: ConvertConfig,
|
||||
ctx: Context[F, ProcessItemArgs]
|
||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
|
||||
|
||||
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
|
||||
map(_ => ra)
|
||||
}
|
||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
|
||||
Conversion.create[F](cfg).use { conv =>
|
||||
ctx.logger
|
||||
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
|
||||
.map(_ => ra)
|
||||
}
|
||||
}
|
||||
|
@ -3,16 +3,16 @@ package docspell.joex.process
|
||||
import cats.implicits._
|
||||
import cats.effect.{ContextShift, Sync}
|
||||
import docspell.common.{ItemState, ProcessItemArgs}
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.queries.QItem
|
||||
import docspell.store.records.{RItem, RJob}
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
|
||||
object ItemHandler {
|
||||
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
|
||||
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] =
|
||||
def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] =
|
||||
CreateItem[F]
|
||||
.flatMap(itemStateTask(ItemState.Processing))
|
||||
.flatMap(safeProcess[F](cfg))
|
||||
@ -30,7 +30,7 @@ object ItemHandler {
|
||||
} yield last
|
||||
|
||||
def safeProcess[F[_]: Sync: ContextShift](
|
||||
cfg: OcrConfig
|
||||
cfg: Config
|
||||
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
|
||||
case true =>
|
||||
|
@ -3,15 +3,15 @@ package docspell.joex.process
|
||||
import cats.effect.{ContextShift, Sync}
|
||||
import docspell.common.ProcessItemArgs
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
import docspell.joex.Config
|
||||
|
||||
object ProcessItem {
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
cfg: OcrConfig
|
||||
cfg: Config
|
||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||
ConvertPdf(item)
|
||||
.flatMap(TextExtraction(cfg, _))
|
||||
ConvertPdf(cfg.convert, item)
|
||||
.flatMap(TextExtraction(cfg.extraction, _))
|
||||
.flatMap(Task.setProgress(25))
|
||||
.flatMap(TextAnalysis[F])
|
||||
.flatMap(Task.setProgress(50))
|
||||
|
@ -11,6 +11,7 @@ object Dependencies {
|
||||
val DoobieVersion = "0.8.8"
|
||||
val EmilVersion = "0.2.0"
|
||||
val FastparseVersion = "2.1.3"
|
||||
val FlexmarkVersion = "0.60.2"
|
||||
val FlywayVersion = "6.2.2"
|
||||
val Fs2Version = "2.2.2"
|
||||
val H2Version = "1.4.200"
|
||||
@ -20,6 +21,7 @@ object Dependencies {
|
||||
val LogbackVersion = "1.2.3"
|
||||
val MariaDbVersion = "2.5.4"
|
||||
val MiniTestVersion = "2.7.0"
|
||||
val PdfboxVersion = "2.0.18"
|
||||
val PostgresVersion = "42.2.10"
|
||||
val PureConfigVersion = "0.12.2"
|
||||
val SqliteVersion = "3.30.1"
|
||||
@ -30,6 +32,24 @@ object Dependencies {
|
||||
val SemanticUIVersion = "2.4.1"
|
||||
val JQueryVersion = "3.4.1"
|
||||
val ViewerJSVersion = "0.5.8"
|
||||
|
||||
// https://github.com/vsch/flexmark-java
|
||||
// BSD 2-Clause
|
||||
val flexmark = {
  // Exclusion rules applied to every flexmark artifact listed below.
  val exclusions = Seq(
    ExclusionRule("junit"),
    ExclusionRule("hamcrest-core")
  )
  Seq(
    "flexmark",
    "flexmark-ext-tables",
    "flexmark-ext-gfm-strikethrough"
  ).map { artifact =>
    ("com.vladsch.flexmark" % artifact % FlexmarkVersion).excludeAll(exclusions: _*)
  }
}
|
||||
|
||||
// Apache PDFBox, with the commons-logging and org.bouncycastle
// exclusion rules applied.
val pdfbox = Seq(
  ("org.apache.pdfbox" % "pdfbox" % PdfboxVersion)
    .excludeAll(
      ExclusionRule("commons-logging"),
      ExclusionRule("org.bouncycastle")
    )
)
|
||||
|
||||
val emil = Seq(
|
||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||
|
Loading…
x
Reference in New Issue
Block a user