Add new convert module and sketch its integration

This commit is contained in:
Eike Kettner 2020-02-11 00:33:52 +01:00
parent 3be90d64d5
commit ce22b727b1
9 changed files with 81 additions and 25 deletions

View File

@ -173,6 +173,17 @@ val text = project.in(file("modules/text")).
Dependencies.tika ++
Dependencies.stanfordNlpCore
).dependsOn(common)
// Build definition of the `convert` module (sources under modules/convert).
// It reuses the shared and test settings, adds the pdfbox and flexmark
// dependency sets and depends on the `common` project.
val convert = project
  .in(file("modules/convert"))
  .disablePlugins(RevolverPlugin)
  .settings(sharedSettings)
  .settings(testSettings)
  .settings(
    name := "docspell-convert",
    libraryDependencies ++= Dependencies.pdfbox ++ Dependencies.flexmark
  )
  .dependsOn(common)
val restapi = project.in(file("modules/restapi")).
disablePlugins(RevolverPlugin).
@ -226,7 +237,7 @@ val joex = project.in(file("modules/joex")).
addCompilerPlugin(Dependencies.betterMonadicFor),
buildInfoPackage := "docspell.joex",
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
).dependsOn(store, text, joexapi, restapi)
).dependsOn(store, text, convert, joexapi, restapi)
val backend = project.in(file("modules/backend")).
disablePlugins(RevolverPlugin).
@ -357,6 +368,7 @@ val root = project.in(file(".")).
).
aggregate(common
, text
, convert
, store
, joexapi
, joex

View File

@ -0,0 +1,24 @@
package docspell.convert
import fs2._
import cats.effect._
import docspell.common._
/** Conversion of a byte stream, running in the effect type `F`. */
trait Conversion[F[_]] {
/** Returns a byte-to-byte pipe for input whose mime type is `inType`.
  *
  * NOTE(review): judging by the name the output is meant to be a PDF; the
  * only implementation visible here is still a stub, so confirm the exact
  * contract once it is implemented.
  */
def toPDF(inType: MimeType): Pipe[F, Byte, Byte]
}
object Conversion {

  /** Wraps a [[Conversion]] for the given configuration in a `Resource`.
    *
    * The instance is a placeholder: calling `toPDF` prints `cfg` to stdout
    * and then fails with `NotImplementedError` (via `???`).
    */
  def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] = {
    val stub: Conversion[F] = new Conversion[F] {
      def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = {
        // not implemented yet; `cfg` is only printed here
        println(cfg)
        ???
      }
    }
    Resource.pure[F, Conversion[F]](stub)
  }
}

View File

@ -0,0 +1,3 @@
package docspell.convert
/** Configuration of the convert module; it has no fields at this point. */
case class ConvertConfig()

View File

@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri}
import docspell.joex.scheduler.SchedulerConfig
import docspell.store.JdbcConfig
import docspell.text.ocr.{Config => OcrConfig}
import docspell.convert.ConvertConfig
case class Config(
appId: Ident,
@ -11,17 +12,10 @@ case class Config(
bind: Config.Bind,
jdbc: JdbcConfig,
scheduler: SchedulerConfig,
extraction: OcrConfig
extraction: OcrConfig,
convert: ConvertConfig
)
object Config {
val postgres =
JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev")
val h2 = JdbcConfig(
LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"),
"sa",
""
)
case class Bind(address: String, port: Int)
}

View File

@ -55,7 +55,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
ProcessItemArgs.taskName,
ItemHandler[F](cfg.extraction),
ItemHandler[F](cfg),
ItemHandler.onCancel[F]
)
)

View File

@ -7,6 +7,7 @@ import cats.effect._
import cats.data.OptionT
import docspell.common._
import docspell.convert._
import docspell.joex.scheduler._
import docspell.store.records._
@ -27,17 +28,17 @@ import docspell.store.records._
object ConvertPdf {
def apply[F[_]: Sync: ContextShift](
cfg: ConvertConfig,
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
// get mimetype
// try to convert
// save to db
// update file_id of RAttachment
def convert(ra: RAttachment) =
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
for {
ras <- item.attachments.traverse(convert)
@ -51,10 +52,12 @@ object ConvertPdf {
.getOrElse(Mimetype.`application/octet-stream`)
def convertSafe[F[_]: Sync](
cfg: ConvertConfig,
ctx: Context[F, ProcessItemArgs]
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
map(_ => ra)
}
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
Conversion.create[F](cfg).use { conv =>
ctx.logger
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
.map(_ => ra)
}
}

View File

@ -3,16 +3,16 @@ package docspell.joex.process
import cats.implicits._
import cats.effect.{ContextShift, Sync}
import docspell.common.{ItemState, ProcessItemArgs}
import docspell.joex.Config
import docspell.joex.scheduler.{Context, Task}
import docspell.store.queries.QItem
import docspell.store.records.{RItem, RJob}
import docspell.text.ocr.{Config => OcrConfig}
object ItemHandler {
def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] =
logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds)
def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] =
def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] =
CreateItem[F]
.flatMap(itemStateTask(ItemState.Processing))
.flatMap(safeProcess[F](cfg))
@ -30,7 +30,7 @@ object ItemHandler {
} yield last
def safeProcess[F[_]: Sync: ContextShift](
cfg: OcrConfig
cfg: Config
)(data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task(isLastRetry[F, ProcessItemArgs] _).flatMap {
case true =>

View File

@ -3,15 +3,15 @@ package docspell.joex.process
import cats.effect.{ContextShift, Sync}
import docspell.common.ProcessItemArgs
import docspell.joex.scheduler.Task
import docspell.text.ocr.{Config => OcrConfig}
import docspell.joex.Config
object ProcessItem {
def apply[F[_]: Sync: ContextShift](
cfg: OcrConfig
cfg: Config
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
ConvertPdf(item)
.flatMap(TextExtraction(cfg, _))
ConvertPdf(cfg.convert, item)
.flatMap(TextExtraction(cfg.extraction, _))
.flatMap(Task.setProgress(25))
.flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50))

View File

@ -11,6 +11,7 @@ object Dependencies {
val DoobieVersion = "0.8.8"
val EmilVersion = "0.2.0"
val FastparseVersion = "2.1.3"
val FlexmarkVersion = "0.60.2"
val FlywayVersion = "6.2.2"
val Fs2Version = "2.2.2"
val H2Version = "1.4.200"
@ -20,6 +21,7 @@ object Dependencies {
val LogbackVersion = "1.2.3"
val MariaDbVersion = "2.5.4"
val MiniTestVersion = "2.7.0"
val PdfboxVersion = "2.0.18"
val PostgresVersion = "42.2.10"
val PureConfigVersion = "0.12.2"
val SqliteVersion = "3.30.1"
@ -30,6 +32,24 @@ object Dependencies {
val SemanticUIVersion = "2.4.1"
val JQueryVersion = "3.4.1"
val ViewerJSVersion = "0.5.8"
// https://github.com/vsch/flexmark-java
// BSD 2-Clause
//
// The flexmark core plus the tables and gfm-strikethrough extensions, all at
// FlexmarkVersion. JUnit and hamcrest-core are excluded from every module.
val flexmark = Seq(
  "com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
  "com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
  "com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
).map(_.excludeAll(
  ExclusionRule("junit"),
  // The one-argument ExclusionRule matches on the organization, so the
  // hamcrest-core artifact must be addressed as organization + name.
  ExclusionRule("org.hamcrest", "hamcrest-core")
))
// Apache PDFBox at PdfboxVersion, with everything from the commons-logging
// and org.bouncycastle organizations excluded.
val pdfbox = Seq(
  ("org.apache.pdfbox" % "pdfbox" % PdfboxVersion).excludeAll(
    ExclusionRule("commons-logging"),
    ExclusionRule("org.bouncycastle")
  )
)
val emil = Seq(
"com.github.eikek" %% "emil-common" % EmilVersion,