From ce22b727b171948952c61dd6a2cdcc0f9511b0b3 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 11 Feb 2020 00:33:52 +0100 Subject: [PATCH] Add new convert module and sketch its integration --- build.sbt | 14 ++++++++++- .../scala/docspell/convert/Conversion.scala | 24 +++++++++++++++++++ .../docspell/convert/ConvertConfig.scala | 3 +++ .../src/main/scala/docspell/joex/Config.scala | 12 +++------- .../scala/docspell/joex/JoexAppImpl.scala | 2 +- .../docspell/joex/process/ConvertPdf.scala | 17 +++++++------ .../docspell/joex/process/ItemHandler.scala | 6 ++--- .../docspell/joex/process/ProcessItem.scala | 8 +++---- project/Dependencies.scala | 20 ++++++++++++++++ 9 files changed, 81 insertions(+), 25 deletions(-) create mode 100644 modules/convert/src/main/scala/docspell/convert/Conversion.scala create mode 100644 modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala diff --git a/build.sbt b/build.sbt index 84c52fe6..945a5098 100644 --- a/build.sbt +++ b/build.sbt @@ -173,6 +173,17 @@ val text = project.in(file("modules/text")). Dependencies.tika ++ Dependencies.stanfordNlpCore ).dependsOn(common) + +val convert = project.in(file("modules/convert")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). + settings( + name := "docspell-convert", + libraryDependencies ++= + Dependencies.pdfbox ++ + Dependencies.flexmark + ).dependsOn(common) val restapi = project.in(file("modules/restapi")). disablePlugins(RevolverPlugin). @@ -226,7 +237,7 @@ val joex = project.in(file("modules/joex")). addCompilerPlugin(Dependencies.betterMonadicFor), buildInfoPackage := "docspell.joex", reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}") - ).dependsOn(store, text, joexapi, restapi) + ).dependsOn(store, text, convert, joexapi, restapi) val backend = project.in(file("modules/backend")). disablePlugins(RevolverPlugin). @@ -357,6 +368,7 @@ val root = project.in(file(".")). ). aggregate(common , text + , convert , store , joexapi , joex diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala new file mode 100644 index 00000000..96a0776d --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -0,0 +1,24 @@ +package docspell.convert + +import fs2._ +import cats.effect._ +import docspell.common._ + +trait Conversion[F[_]] { + + def toPDF(inType: MimeType): Pipe[F, Byte, Byte] + +} + +object Conversion { + + def create[F[_]: Sync](cfg: ConvertConfig): Resource[F, Conversion[F]] = + Resource.pure(new Conversion[F] { + + def toPDF(inType: MimeType): Pipe[F, Byte, Byte] = { + println(cfg) + ??? + } + + }) +} diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala new file mode 100644 index 00000000..40f2924e --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -0,0 +1,3 @@ +package docspell.convert + +case class ConvertConfig() diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index b8f6b7ff..4cdd9391 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -4,6 +4,7 @@ import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig import docspell.text.ocr.{Config => OcrConfig} +import docspell.convert.ConvertConfig case class Config( appId: Ident, @@ -11,17 +12,10 @@ case class Config( bind: Config.Bind, jdbc: JdbcConfig, scheduler: SchedulerConfig, - extraction: OcrConfig + extraction: OcrConfig, + convert: ConvertConfig ) object Config { - val postgres = - JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev") - val h2 = JdbcConfig( - LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"), - "sa", - "" - ) - case class Bind(address: String, port: Int) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 374cf396..34270987 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -55,7 +55,7 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler[F](cfg.extraction), + ItemHandler[F](cfg), ItemHandler.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 03bdba4a..829e36fc 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -7,6 +7,7 @@ import cats.effect._ import cats.data.OptionT import docspell.common._ +import docspell.convert._ import docspell.joex.scheduler._ import docspell.store.records._ @@ -27,17 +28,17 @@ import docspell.store.records._ object ConvertPdf { def apply[F[_]: Sync: ContextShift]( + cfg: ConvertConfig, item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - // get mimetype // try to convert // save to db // update file_id of RAttachment def convert(ra: RAttachment) = - findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m)) + findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m)) for { ras <- item.attachments.traverse(convert) @@ -51,10 +52,12 @@ object ConvertPdf { .getOrElse(Mimetype.`application/octet-stream`) def convertSafe[F[_]: Sync]( + cfg: ConvertConfig, ctx: Context[F, ProcessItemArgs] - )(ra: RAttachment, mime: Mimetype): F[RAttachment] = { - - ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}"). - map(_ => ra) - } + )(ra: RAttachment, mime: Mimetype): F[RAttachment] = + Conversion.create[F](cfg).use { conv => + ctx.logger + .info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv") + .map(_ => ra) + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 0d7dda6d..66104e96 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -3,16 +3,16 @@ package docspell.joex.process import cats.implicits._ import cats.effect.{ContextShift, Sync} import docspell.common.{ItemState, ProcessItemArgs} +import docspell.joex.Config import docspell.joex.scheduler.{Context, Task} import docspell.store.queries.QItem import docspell.store.records.{RItem, RJob} -import docspell.text.ocr.{Config => OcrConfig} object ItemHandler { def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] = logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds) - def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] = + def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) .flatMap(safeProcess[F](cfg)) @@ -30,7 +30,7 @@ object ItemHandler { } yield last def safeProcess[F[_]: Sync: ContextShift]( - cfg: OcrConfig + cfg: Config )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = Task(isLastRetry[F, ProcessItemArgs] _).flatMap { case true => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 679625a2..bb67fe03 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -3,15 +3,15 @@ package docspell.joex.process import cats.effect.{ContextShift, Sync} import docspell.common.ProcessItemArgs import docspell.joex.scheduler.Task -import docspell.text.ocr.{Config => OcrConfig} +import docspell.joex.Config object ProcessItem { def apply[F[_]: Sync: ContextShift]( - cfg: OcrConfig + cfg: Config )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - ConvertPdf(item) - .flatMap(TextExtraction(cfg, _)) + ConvertPdf(cfg.convert, item) + .flatMap(TextExtraction(cfg.extraction, _)) .flatMap(Task.setProgress(25)) .flatMap(TextAnalysis[F]) .flatMap(Task.setProgress(50)) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index cc9ae861..13dbb3a4 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -11,6 +11,7 @@ object Dependencies { val DoobieVersion = "0.8.8" val EmilVersion = "0.2.0" val FastparseVersion = "2.1.3" + val FlexmarkVersion = "0.60.2" val FlywayVersion = "6.2.2" val Fs2Version = "2.2.2" val H2Version = "1.4.200" @@ -20,6 +21,7 @@ object Dependencies { val LogbackVersion = "1.2.3" val MariaDbVersion = "2.5.4" val MiniTestVersion = "2.7.0" + val PdfboxVersion = "2.0.18" val PostgresVersion = "42.2.10" val PureConfigVersion = "0.12.2" val SqliteVersion = "3.30.1" @@ -30,6 +32,24 @@ object Dependencies { val SemanticUIVersion = "2.4.1" val JQueryVersion = "3.4.1" val ViewerJSVersion = "0.5.8" + + // https://github.com/vsch/flexmark-java + // BSD 2-Clause + val flexmark = Seq( + "com.vladsch.flexmark" % "flexmark" % FlexmarkVersion, + "com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion, + "com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion + ).map(_.excludeAll( + ExclusionRule("junit"), + ExclusionRule("hamcrest-core") + )) + + val pdfbox = Seq( + "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll( + ExclusionRule("commons-logging"), + ExclusionRule("org.bouncycastle") + ) + ) val emil = Seq( "com.github.eikek" %% "emil-common" % EmilVersion,