mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-13 01:39:33 +00:00
Add new convert module and sketch its integration
This commit is contained in:
parent
3be90d64d5
commit
ce22b727b1
14
build.sbt
14
build.sbt
@ -174,6 +174,17 @@ val text = project.in(file("modules/text")).
|
|||||||
Dependencies.stanfordNlpCore
|
Dependencies.stanfordNlpCore
|
||||||
).dependsOn(common)
|
).dependsOn(common)
|
||||||
|
|
||||||
|
// The `convert` module (artifact name "docspell-convert"): lives in
// modules/convert, builds on `common`, and pulls in the pdfbox and flexmark
// library sets. The Revolver plugin is switched off for this module.
val convert = project
  .in(file("modules/convert"))
  .disablePlugins(RevolverPlugin)
  .settings(sharedSettings)
  .settings(testSettings)
  .settings(
    name := "docspell-convert",
    libraryDependencies ++= Dependencies.pdfbox ++ Dependencies.flexmark
  )
  .dependsOn(common)
|
||||||
|
|
||||||
val restapi = project.in(file("modules/restapi")).
|
val restapi = project.in(file("modules/restapi")).
|
||||||
disablePlugins(RevolverPlugin).
|
disablePlugins(RevolverPlugin).
|
||||||
enablePlugins(OpenApiSchema).
|
enablePlugins(OpenApiSchema).
|
||||||
@ -226,7 +237,7 @@ val joex = project.in(file("modules/joex")).
|
|||||||
addCompilerPlugin(Dependencies.betterMonadicFor),
|
addCompilerPlugin(Dependencies.betterMonadicFor),
|
||||||
buildInfoPackage := "docspell.joex",
|
buildInfoPackage := "docspell.joex",
|
||||||
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
|
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
|
||||||
).dependsOn(store, text, joexapi, restapi)
|
).dependsOn(store, text, convert, joexapi, restapi)
|
||||||
|
|
||||||
val backend = project.in(file("modules/backend")).
|
val backend = project.in(file("modules/backend")).
|
||||||
disablePlugins(RevolverPlugin).
|
disablePlugins(RevolverPlugin).
|
||||||
@ -357,6 +368,7 @@ val root = project.in(file(".")).
|
|||||||
).
|
).
|
||||||
aggregate(common
|
aggregate(common
|
||||||
, text
|
, text
|
||||||
|
, convert
|
||||||
, store
|
, store
|
||||||
, joexapi
|
, joexapi
|
||||||
, joex
|
, joex
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import fs2._
|
||||||
|
import cats.effect._
|
||||||
|
import docspell.common._
|
||||||
|
|
||||||
|
/** Algebra for turning file content into PDF.
  *
  * @tparam F the effect type in which the byte streams are evaluated
  */
trait Conversion[F[_]] {

  /** Creates a pipe that consumes the bytes of an input file and emits bytes.
    *
    * NOTE(review): presumably the emitted bytes are the PDF rendering of the
    * input; the contract is only sketched here, confirm once implemented.
    *
    * @param inType the mime type of the incoming bytes
    * @return a pipe from input bytes to output bytes
    */
  def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
}
|
||||||
|
|
||||||
|
object Conversion {

  /** Wraps a [[Conversion]] instance in a `Resource`.
    *
    * The instance is a placeholder for now: calling `toPDF` prints the given
    * configuration to stdout and then fails with `NotImplementedError` (`???`).
    *
    * @param cfg the conversion settings; currently only printed
    * @return a resource holding the (not yet implemented) conversion
    */
  def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] = {
    val stub: Conversion[F] = new Conversion[F] {
      def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = {
        // Keeps `cfg` referenced until the real implementation arrives.
        println(cfg)
        ???
      }
    }

    Resource.pure[F, Conversion[F]](stub)
  }
}
|
@ -0,0 +1,3 @@
|
|||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
/** Settings for the convert module.
  *
  * It has no fields yet; it exists so that the configuration can be threaded
  * through to `Conversion.create` while the module is being sketched.
  * Declared `final` because case classes should not be extended.
  */
final case class ConvertConfig()
|
@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri}
|
|||||||
import docspell.joex.scheduler.SchedulerConfig
|
import docspell.joex.scheduler.SchedulerConfig
|
||||||
import docspell.store.JdbcConfig
|
import docspell.store.JdbcConfig
|
||||||
import docspell.text.ocr.{Config => OcrConfig}
|
import docspell.text.ocr.{Config => OcrConfig}
|
||||||
|
import docspell.convert.ConvertConfig
|
||||||
|
|
||||||
case class Config(
|
case class Config(
|
||||||
appId: Ident,
|
appId: Ident,
|
||||||
@ -11,17 +12,10 @@ case class Config(
|
|||||||
bind: Config.Bind,
|
bind: Config.Bind,
|
||||||
jdbc: JdbcConfig,
|
jdbc: JdbcConfig,
|
||||||
scheduler: SchedulerConfig,
|
scheduler: SchedulerConfig,
|
||||||
extraction: OcrConfig
|
extraction: OcrConfig,
|
||||||
|
convert: ConvertConfig
|
||||||
)
|
)
|
||||||
|
|
||||||
object Config {
|
object Config {
|
||||||
val postgres =
|
|
||||||
JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev")
|
|
||||||
val h2 = JdbcConfig(
|
|
||||||
LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"),
|
|
||||||
"sa",
|
|
||||||
""
|
|
||||||
)
|
|
||||||
|
|
||||||
case class Bind(address: String, port: Int)
|
case class Bind(address: String, port: Int)
|
||||||
}
|
}
|
||||||
|
@ -55,7 +55,7 @@ object JoexAppImpl {
|
|||||||
.withTask(
|
.withTask(
|
||||||
JobTask.json(
|
JobTask.json(
|
||||||
ProcessItemArgs.taskName,
|
ProcessItemArgs.taskName,
|
||||||
ItemHandler[F](cfg.extraction),
|
ItemHandler[F](cfg),
|
||||||
ItemHandler.onCancel[F]
|
ItemHandler.onCancel[F]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -7,6 +7,7 @@ import cats.effect._
|
|||||||
import cats.data.OptionT
|
import cats.data.OptionT
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert._
|
||||||
import docspell.joex.scheduler._
|
import docspell.joex.scheduler._
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
|
|
||||||
@ -27,17 +28,17 @@ import docspell.store.records._
|
|||||||
object ConvertPdf {
|
object ConvertPdf {
|
||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
|
cfg: ConvertConfig,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
): Task[F, ProcessItemArgs, ItemData] =
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
|
|
||||||
// get mimetype
|
// get mimetype
|
||||||
// try to convert
|
// try to convert
|
||||||
// save to db
|
// save to db
|
||||||
// update file_id of RAttachment
|
// update file_id of RAttachment
|
||||||
|
|
||||||
def convert(ra: RAttachment) =
|
def convert(ra: RAttachment) =
|
||||||
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
|
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
|
||||||
|
|
||||||
for {
|
for {
|
||||||
ras <- item.attachments.traverse(convert)
|
ras <- item.attachments.traverse(convert)
|
||||||
@ -51,10 +52,12 @@ object ConvertPdf {
|
|||||||
.getOrElse(Mimetype.`application/octet-stream`)
|
.getOrElse(Mimetype.`application/octet-stream`)
|
||||||
|
|
||||||
def convertSafe[F[_]: Sync](
|
def convertSafe[F[_]: Sync](
|
||||||
|
cfg: ConvertConfig,
|
||||||
ctx: Context[F, ProcessItemArgs]
|
ctx: Context[F, ProcessItemArgs]
|
||||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
|
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
|
||||||
|
Conversion.create[F](cfg).use { conv =>
|
||||||
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
|
ctx.logger
|
||||||
map(_ => ra)
|
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
|
||||||
|
.map(_ => ra)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,16 +3,16 @@ package docspell.joex.process
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.{ContextShift, Sync}
|
import cats.effect.{ContextShift, Sync}
|
||||||
import docspell.common.{ItemState, ProcessItemArgs}
|
import docspell.common.{ItemState, ProcessItemArgs}
|
||||||
|
import docspell.joex.Config
|
||||||
import docspell.joex.scheduler.{Context, Task}
|
import docspell.joex.scheduler.{Context, Task}
|
||||||
import docspell.store.queries.QItem
|
import docspell.store.queries.QItem
|
||||||
import docspell.store.records.{RItem, RJob}
|
import docspell.store.records.{RItem, RJob}
|
||||||
import docspell.text.ocr.{Config => OcrConfig}
|
|
||||||
|
|
||||||
object ItemHandler {
|
object ItemHandler {
|
||||||
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
|
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
|
||||||
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
|
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
|
||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] =
|
def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] =
|
||||||
CreateItem[F]
|
CreateItem[F]
|
||||||
.flatMap(itemStateTask(ItemState.Processing))
|
.flatMap(itemStateTask(ItemState.Processing))
|
||||||
.flatMap(safeProcess[F](cfg))
|
.flatMap(safeProcess[F](cfg))
|
||||||
@ -30,7 +30,7 @@ object ItemHandler {
|
|||||||
} yield last
|
} yield last
|
||||||
|
|
||||||
def safeProcess[F[_]: Sync: ContextShift](
|
def safeProcess[F[_]: Sync: ContextShift](
|
||||||
cfg: OcrConfig
|
cfg: Config
|
||||||
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
|
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
|
||||||
case true =>
|
case true =>
|
||||||
|
@ -3,15 +3,15 @@ package docspell.joex.process
|
|||||||
import cats.effect.{ContextShift, Sync}
|
import cats.effect.{ContextShift, Sync}
|
||||||
import docspell.common.ProcessItemArgs
|
import docspell.common.ProcessItemArgs
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.text.ocr.{Config => OcrConfig}
|
import docspell.joex.Config
|
||||||
|
|
||||||
object ProcessItem {
|
object ProcessItem {
|
||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
cfg: OcrConfig
|
cfg: Config
|
||||||
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
ConvertPdf(item)
|
ConvertPdf(cfg.convert, item)
|
||||||
.flatMap(TextExtraction(cfg, _))
|
.flatMap(TextExtraction(cfg.extraction, _))
|
||||||
.flatMap(Task.setProgress(25))
|
.flatMap(Task.setProgress(25))
|
||||||
.flatMap(TextAnalysis[F])
|
.flatMap(TextAnalysis[F])
|
||||||
.flatMap(Task.setProgress(50))
|
.flatMap(Task.setProgress(50))
|
||||||
|
@ -11,6 +11,7 @@ object Dependencies {
|
|||||||
val DoobieVersion = "0.8.8"
|
val DoobieVersion = "0.8.8"
|
||||||
val EmilVersion = "0.2.0"
|
val EmilVersion = "0.2.0"
|
||||||
val FastparseVersion = "2.1.3"
|
val FastparseVersion = "2.1.3"
|
||||||
|
val FlexmarkVersion = "0.60.2"
|
||||||
val FlywayVersion = "6.2.2"
|
val FlywayVersion = "6.2.2"
|
||||||
val Fs2Version = "2.2.2"
|
val Fs2Version = "2.2.2"
|
||||||
val H2Version = "1.4.200"
|
val H2Version = "1.4.200"
|
||||||
@ -20,6 +21,7 @@ object Dependencies {
|
|||||||
val LogbackVersion = "1.2.3"
|
val LogbackVersion = "1.2.3"
|
||||||
val MariaDbVersion = "2.5.4"
|
val MariaDbVersion = "2.5.4"
|
||||||
val MiniTestVersion = "2.7.0"
|
val MiniTestVersion = "2.7.0"
|
||||||
|
val PdfboxVersion = "2.0.18"
|
||||||
val PostgresVersion = "42.2.10"
|
val PostgresVersion = "42.2.10"
|
||||||
val PureConfigVersion = "0.12.2"
|
val PureConfigVersion = "0.12.2"
|
||||||
val SqliteVersion = "3.30.1"
|
val SqliteVersion = "3.30.1"
|
||||||
@ -31,6 +33,24 @@ object Dependencies {
|
|||||||
val JQueryVersion = "3.4.1"
|
val JQueryVersion = "3.4.1"
|
||||||
val ViewerJSVersion = "0.5.8"
|
val ViewerJSVersion = "0.5.8"
|
||||||
|
|
||||||
|
// https://github.com/vsch/flexmark-java
// BSD 2-Clause
// The flexmark core plus the tables and strikethrough extensions, each with
// junit and hamcrest-core excluded from its transitive dependencies.
val flexmark = {
  val excluded = Seq(ExclusionRule("junit"), ExclusionRule("hamcrest-core"))

  Seq("flexmark", "flexmark-ext-tables", "flexmark-ext-gfm-strikethrough")
    .map(artifact => "com.vladsch.flexmark" % artifact % FlexmarkVersion)
    .map(module => module.excludeAll(excluded: _*))
}
|
||||||
|
|
||||||
|
// Apache PDFBox, with commons-logging and the org.bouncycastle artifacts
// excluded from its transitive dependencies.
val pdfbox = Seq(
  ("org.apache.pdfbox" % "pdfbox" % PdfboxVersion)
    .excludeAll(
      ExclusionRule("commons-logging"),
      ExclusionRule("org.bouncycastle")
    )
)
|
||||||
|
|
||||||
val emil = Seq(
|
val emil = Seq(
|
||||||
"com.github.eikek" %% "emil-common" % EmilVersion,
|
"com.github.eikek" %% "emil-common" % EmilVersion,
|
||||||
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
"com.github.eikek" %% "emil-javamail" % EmilVersion
|
||||||
|
Loading…
x
Reference in New Issue
Block a user