mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Convert some files to pdf
This commit is contained in:
parent
5869e2ee6e
commit
9b1349734e
@ -152,7 +152,7 @@ val files = project.in(file("modules/files")).
|
|||||||
settings(
|
settings(
|
||||||
name := "docspell-files",
|
name := "docspell-files",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.tika ,
|
Dependencies.tika,
|
||||||
Test / sourceGenerators += Def.task {
|
Test / sourceGenerators += Def.task {
|
||||||
val base = (Test/resourceDirectory).value
|
val base = (Test/resourceDirectory).value
|
||||||
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
|
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
|
||||||
@ -204,6 +204,7 @@ val extract = project.in(file("modules/extract")).
|
|||||||
name := "docspell-extract",
|
name := "docspell-extract",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.fs2 ++
|
Dependencies.fs2 ++
|
||||||
|
Dependencies.twelvemonkeys ++
|
||||||
Dependencies.pdfbox ++
|
Dependencies.pdfbox ++
|
||||||
Dependencies.poi ++
|
Dependencies.poi ++
|
||||||
Dependencies.commonsIO ++
|
Dependencies.commonsIO ++
|
||||||
@ -217,7 +218,8 @@ val convert = project.in(file("modules/convert")).
|
|||||||
settings(
|
settings(
|
||||||
name := "docspell-convert",
|
name := "docspell-convert",
|
||||||
libraryDependencies ++=
|
libraryDependencies ++=
|
||||||
Dependencies.flexmark
|
Dependencies.flexmark ++
|
||||||
|
Dependencies.twelvemonkeys
|
||||||
).dependsOn(common, files % "compile->compile;test->test")
|
).dependsOn(common, files % "compile->compile;test->test")
|
||||||
|
|
||||||
val analysis = project.in(file("modules/analysis")).
|
val analysis = project.in(file("modules/analysis")).
|
||||||
|
@ -6,8 +6,9 @@ import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
|
|||||||
import java.util.concurrent.atomic.AtomicInteger
|
import java.util.concurrent.atomic.AtomicInteger
|
||||||
|
|
||||||
import scala.jdk.CollectionConverters._
|
import scala.jdk.CollectionConverters._
|
||||||
|
import fs2.Stream
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.{Blocker, ContextShift, Resource, Sync}
|
import cats.effect._
|
||||||
|
|
||||||
object File {
|
object File {
|
||||||
|
|
||||||
@ -42,6 +43,9 @@ object File {
|
|||||||
count.get
|
count.get
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Tells whether `file` exists. The file-system lookup is suspended in `F`
  * and only performed when the resulting effect is run.
  */
def exists[F[_]: Sync](file: Path): F[Boolean] =
  Sync[F].delay {
    Files.exists(file)
  }
|
||||||
|
|
||||||
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
|
def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] =
|
||||||
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
|
Sync[F].delay(Files.exists(file) && Files.size(file) > minSize)
|
||||||
|
|
||||||
@ -61,6 +65,11 @@ object File {
|
|||||||
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
||||||
}
|
}
|
||||||
|
|
||||||
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int) =
|
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] =
|
||||||
fs2.io.file.readAll(file, blocker, chunkSize)
|
fs2.io.file.readAll(file, blocker, chunkSize)
|
||||||
|
|
||||||
|
/** Reads the complete file and returns its content decoded as UTF-8 text.
  *
  * The bytes are obtained through `readAll` with a chunk size of 8192 and the
  * decoded string pieces are concatenated into a single string.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = {
  val bytes = readAll[F](file, blocker, 8192)
  val text  = bytes.through(fs2.text.utf8Decode)
  text.compile.foldMonoid
}
|
||||||
}
|
}
|
||||||
|
@ -1,24 +1,112 @@
|
|||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets
|
||||||
|
|
||||||
import fs2._
|
import fs2._
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf}
|
||||||
|
import docspell.convert.flexmark.Markdown
|
||||||
|
import docspell.files.{ImageSize, TikaMimetype}
|
||||||
|
|
||||||
trait Conversion[F[_]] {
|
trait Conversion[F[_]] {
|
||||||
|
|
||||||
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]]
|
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object Conversion {
|
object Conversion {
|
||||||
|
|
||||||
def create[F[_]: Sync: ContextShift](cfg: ConvertConfig, blocker: Blocker, logger: Logger[F]): Resource[F, Conversion[F]] =
|
def create[F[_]: Sync: ContextShift](
|
||||||
|
cfg: ConvertConfig,
|
||||||
|
blocker: Blocker,
|
||||||
|
logger: Logger[F]
|
||||||
|
): Resource[F, Conversion[F]] =
|
||||||
Resource.pure(new Conversion[F] {
|
Resource.pure(new Conversion[F] {
|
||||||
|
|
||||||
def toPDF[A](in: Stream[F, Byte], dataType: DataType, handler: Pipe[F, Byte, A]): F[ConversionResult[F]] = {
|
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||||
println(s"$cfg $blocker $logger")
|
TikaMimetype.resolve(dataType, in).flatMap {
|
||||||
???
|
case MimeType.pdf =>
|
||||||
}
|
handler.run(ConversionResult.successPdf(in))
|
||||||
|
|
||||||
|
case MimeType.html =>
|
||||||
|
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
|
|
||||||
|
case Texts(_) =>
|
||||||
|
Markdown.toHtml(in, cfg.markdown).flatMap { html =>
|
||||||
|
val bytes = Stream
|
||||||
|
.chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8)))
|
||||||
|
.covary[F]
|
||||||
|
WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler)
|
||||||
|
}
|
||||||
|
|
||||||
|
case Images(mt) =>
|
||||||
|
ImageSize.get(in).flatMap {
|
||||||
|
case Some(dim) =>
|
||||||
|
if (dim.product > cfg.maxImageSize) {
|
||||||
|
logger
|
||||||
|
.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||||
|
handler.run(
|
||||||
|
ConversionResult.inputMalformed(
|
||||||
|
mt,
|
||||||
|
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})."
|
||||||
|
)
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
|
}
|
||||||
|
|
||||||
|
case None =>
|
||||||
|
logger.info(
|
||||||
|
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||||
|
) *>
|
||||||
|
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
|
}
|
||||||
|
|
||||||
|
case Office(_) =>
|
||||||
|
Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
|
|
||||||
|
case mt =>
|
||||||
|
handler.run(ConversionResult.unsupportedFormat(mt))
|
||||||
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/** Extractor that matches the image mime types listed in `all`. */
object Images {

  val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff)

  /** Yields the given mime type when it is contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
|
||||||
|
|
||||||
|
/** Extractor that matches every mime type whose primary part is "text". */
object Texts {

  def unapply(m: MimeType): Option[MimeType] =
    if (m.primary == "text") Some(m) else None
}
|
||||||
|
|
||||||
|
/** Extractor that matches the office document mime types listed in `all`. */
object Office {

  // OpenDocument text / spreadsheet and their "x-" prefixed variants
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  // generic container types as named by tika
  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")

  // specific Microsoft formats and rtf
  val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls  = MimeType.application("vnd.ms-excel")
  val doc  = MimeType.application("msword")
  val rtf  = MimeType.application("rtf")

  // without a filename, tika returns application/zip for odt/ods files, since
  // they are just zip files
  val odfContainer = MimeType.zip

  val all =
    Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer)

  /** Yields the given mime type when it is contained in `all`. */
  def unapply(m: MimeType): Option[MimeType] =
    if (all.contains(m)) Some(m) else None
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,53 @@
|
|||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import cats.data.Kleisli
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.common.MimeType
|
||||||
|
|
||||||
|
/** Outcome of an attempt to convert some input into a PDF.
  *
  * The possible outcomes are the case classes in the companion object.
  */
sealed trait ConversionResult[F[_]] {

  /** The bytes of the resulting PDF. Every outcome that is not a success
    * returns an empty stream here.
    */
  def pdfData: Stream[F, Byte]

}

object ConversionResult {

  /** The conversion is done by external tools that write files to the
    * file system. These are temporary files and they will be deleted
    * once the process finishes. This handler is used to do something
    * relevant with the resulting files.
    */
  type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A]

  // Constructors that widen the result to `ConversionResult[F]`, so callers
  // do not have to add type ascriptions themselves.

  def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] =
    UnsupportedFormat[F](mime)

  def failure[F[_]](ex: Throwable): ConversionResult[F] =
    Failure[F](ex)

  def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] =
    SuccessPdf[F](pdf)

  def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] =
    SuccessPdfTxt[F](pdf, txt)

  def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] =
    InputMalformed[F](mimeType, reason)

  /** The input has a mime type for which no conversion is available. */
  final case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  /** The conversion was attempted but ended with the given error. */
  final case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }

  /** The conversion produced a PDF. */
  final case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  /** The conversion produced a PDF and, additionally, a text. */
  final case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String])
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = pdf
  }

  /** The input was rejected before conversion; `reason` says why. */
  final case class InputMalformed[F[_]](mimeType: MimeType, reason: String)
      extends ConversionResult[F] {
    val pdfData: Stream[F, Byte] = Stream.empty
  }
}
|
@ -1,5 +1,11 @@
|
|||||||
package docspell.convert
|
package docspell.convert
|
||||||
|
|
||||||
|
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||||
import docspell.convert.flexmark.MarkdownConfig
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
|
||||||
case class ConvertConfig(markdown: MarkdownConfig)
|
case class ConvertConfig(chunkSize: Int,
|
||||||
|
maxImageSize: Int,
|
||||||
|
markdown: MarkdownConfig,
|
||||||
|
wkhtmlpdf: WkHtmlPdfConfig,
|
||||||
|
tesseract: TesseractConfig,
|
||||||
|
unoconv: UnoconvConfig)
|
||||||
|
@ -2,30 +2,34 @@ package docspell.convert.extern
|
|||||||
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import cats.implicits._
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import fs2.{Pipe, Stream}
|
import fs2.{Pipe, Stream}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult
|
||||||
|
import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt}
|
||||||
|
|
||||||
object ExternConv {
|
private[extern] object ExternConv {
|
||||||
|
|
||||||
def toPDF[F[_]: Sync: ContextShift](
|
def toPDF[F[_]: Sync: ContextShift, A](
|
||||||
name: String,
|
name: String,
|
||||||
cmdCfg: SystemCommand.Config,
|
cmdCfg: SystemCommand.Config,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
chunkSize: Int,
|
|
||||||
useStdin: Boolean,
|
useStdin: Boolean,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F]
|
logger: Logger[F],
|
||||||
): Pipe[F, Byte, Byte] =
|
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
||||||
in =>
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
||||||
val inFile = dir.resolve("infile").toAbsolutePath.normalize
|
val inFile = dir.resolve("infile").toAbsolutePath.normalize
|
||||||
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
||||||
val sysCfg =
|
val sysCfg =
|
||||||
cmdCfg.replace(
|
cmdCfg.replace(
|
||||||
Map("{{outfile}}" -> out.toString) ++
|
Map(
|
||||||
|
"{{outfile}}" -> out.toString
|
||||||
|
) ++
|
||||||
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
(if (!useStdin) Map("{{infile}}" -> inFile.toString)
|
||||||
else Map.empty)
|
else Map.empty)
|
||||||
)
|
)
|
||||||
|
|
||||||
val createInput: Pipe[F, Byte, Unit] =
|
val createInput: Pipe[F, Byte, Unit] =
|
||||||
@ -35,41 +39,66 @@ object ExternConv {
|
|||||||
in.through(createInput).flatMap { _ =>
|
in.through(createInput).flatMap { _ =>
|
||||||
SystemCommand
|
SystemCommand
|
||||||
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
|
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
|
||||||
.flatMap(result =>
|
.evalMap(result =>
|
||||||
logResult(name, result, logger) ++ readResult[F](
|
logResult(name, result, logger).
|
||||||
out,
|
flatMap(_ => reader(out, result)).
|
||||||
result,
|
flatMap(handler.run)
|
||||||
blocker,
|
|
||||||
chunkSize,
|
|
||||||
logger
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}.compile.lastOrError
|
||||||
|
|
||||||
def readResult[F[_]: Sync: ContextShift](
|
def readResult[F[_]: Sync: ContextShift](
|
||||||
out: Path,
|
|
||||||
result: SystemCommand.Result,
|
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Byte] =
|
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] =
|
||||||
Stream.eval(File.existsNonEmpty[F](out)).flatMap {
|
File.existsNonEmpty[F](out).flatMap {
|
||||||
case true =>
|
case true =>
|
||||||
if (result.rc == 0) File.readAll(out, blocker, chunkSize)
|
if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||||
else
|
else
|
||||||
Stream
|
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||||
.eval(logger.warn(s"Command not successful (rc=${result.rc}), but file exists."))
|
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||||
.drain ++
|
|
||||||
File.readAll(out, blocker, chunkSize)
|
|
||||||
|
|
||||||
case false =>
|
case false =>
|
||||||
Stream.raiseError[F](
|
ConversionResult.failure[F](
|
||||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||||
)
|
).pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
private def storeDataToFile[F[_]: Sync: ContextShift](name: String, blocker: Blocker, logger: Logger[F], inFile: Path): Pipe[F, Byte, Unit] =
|
/** Creates the function that turns the outcome of a tesseract run into a
  * [[ConversionResult]].
  *
  * The result files are looked up next to `out`, as `<outPrefix>.pdf` and
  * `<outPrefix>.txt`.
  *
  *  - If the pdf exists and is non-empty and the return code is 0, the result
  *    is a pdf, together with the text if the txt file exists.
  *  - If the pdf exists but the return code is not 0, a warning is logged and
  *    only the pdf is returned.
  *  - If the pdf is missing or empty, the result is a failure.
  *
  * @param outPrefix base name (without extension) of the files written by the command
  * @param blocker   used for reading the result files
  * @param chunkSize chunk size for reading the pdf
  * @param logger    receives the warning about a non-zero return code
  */
def readResultTesseract[F[_]: Sync: ContextShift](
    outPrefix: String,
    blocker: Blocker,
    chunkSize: Int,
    logger: Logger[F]
)(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = {
  val outPdf = out.resolveSibling(s"$outPrefix.pdf")
  File.existsNonEmpty[F](outPdf).flatMap {
    case true =>
      val outTxt = out.resolveSibling(s"$outPrefix.txt")
      File.exists(outTxt).flatMap { txtExists =>
        // Read the file whose existence was verified above. Reading `out`
        // instead is only correct when the prefix happens to be "out".
        val pdfData = File.readAll(outPdf, blocker, chunkSize)
        if (result.rc == 0) {
          if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
          else successPdf(pdfData).pure[F]
        } else {
          logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
            successPdf(pdfData).pure[F]
        }
      }

    case false =>
      ConversionResult
        .failure[F](
          new Exception(s"Command result=${result.rc}. No output file found.")
        )
        .pure[F]
  }
}
|
||||||
|
|
||||||
|
private def storeDataToFile[F[_]: Sync: ContextShift](
|
||||||
|
name: String,
|
||||||
|
blocker: Blocker,
|
||||||
|
logger: Logger[F],
|
||||||
|
inFile: Path
|
||||||
|
): Pipe[F, Byte, Unit] =
|
||||||
in =>
|
in =>
|
||||||
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++
|
||||||
Stream.eval(storeFile(in, inFile, blocker))
|
Stream.eval(storeFile(in, inFile, blocker))
|
||||||
@ -78,12 +107,12 @@ object ExternConv {
|
|||||||
name: String,
|
name: String,
|
||||||
result: SystemCommand.Result,
|
result: SystemCommand.Result,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): Stream[F, Nothing] =
|
): F[Unit] =
|
||||||
Stream.eval(logger.debug(s"$name stdout: ${result.stdout}")).drain ++
|
logger.debug(s"$name stdout: ${result.stdout}") *>
|
||||||
Stream.eval(logger.debug(s"$name stderr: ${result.stderr}")).drain
|
logger.debug(s"$name stderr: ${result.stderr}")
|
||||||
|
|
||||||
private def storeFile[F[_]: Sync: ContextShift](
|
private def storeFile[F[_]: Sync: ContextShift](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
target: Path,
|
target: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker
|
||||||
): F[Unit] =
|
): F[Unit] =
|
||||||
|
@ -1,5 +1,26 @@
|
|||||||
package docspell.convert.extern
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
|
||||||
object Tesseract {
|
object Tesseract {
|
||||||
|
|
||||||
|
/** Converts the input to PDF by running the configured tesseract command and
  * passes the result to `handler`.
  *
  * The base name of the result files is taken from the second command
  * argument; "out" is used when there is no such argument.
  *
  * @param cfg       the command to run and its working directory
  * @param chunkSize chunk size used when reading the resulting pdf
  * @param blocker   passed on to the external-conversion helper
  * @param logger    passed on to the external-conversion helper
  * @param in        the input data
  * @param handler   consumes the conversion result
  */
def toPDF[F[_]: Sync: ContextShift, A](
    cfg: TesseractConfig,
    chunkSize: Int,
    blocker: Blocker,
    logger: Logger[F]
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
  // `drop(1)` instead of `tail`: `tail` throws on an empty argument list and
  // the fallback below would never be reached.
  val outBase = cfg.cmd.args.drop(1).headOption.getOrElse("out")
  val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
    ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)

  // The `false` is the useStdin flag: the input is handed over as a file.
  ExternConv.toPDF[F, A](
    "tesseract",
    cfg.cmd,
    cfg.workingDir,
    false,
    blocker,
    logger,
    reader
  )(in, handler)
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
7
modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
vendored
Normal file
7
modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
|
import docspell.common.SystemCommand
|
||||||
|
|
||||||
|
/** Settings for the tesseract conversion: the system command to run and the
  * directory that is handed to the external-conversion helper as working dir.
  */
case class TesseractConfig(
    cmd: SystemCommand.Config,
    workingDir: Path
)
|
@ -1,18 +1,25 @@
|
|||||||
package docspell.convert.extern
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import fs2.Pipe
|
import fs2.Stream
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
|
||||||
object Unoconv {
|
object Unoconv {
|
||||||
|
|
||||||
def toPDF[F[_]: Sync: ContextShift](
|
def toPDF[F[_]: Sync: ContextShift, A](
|
||||||
cfg: UnoconvConfig,
|
cfg: UnoconvConfig,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F],
|
logger: Logger[F]
|
||||||
): Pipe[F, Byte, Byte] =
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
ExternConv.toPDF[F]("unoconv", cfg.cmd, cfg.workingDir, chunkSize, false, blocker, logger)
|
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||||
|
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||||
|
|
||||||
|
ExternConv.toPDF[F, A]("unoconv", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,18 +1,25 @@
|
|||||||
package docspell.convert.extern
|
package docspell.convert.extern
|
||||||
|
|
||||||
|
import java.nio.file.Path
|
||||||
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import fs2.Pipe
|
import fs2.Stream
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
|
||||||
object WkHtmlPdf {
|
object WkHtmlPdf {
|
||||||
|
|
||||||
def toPDF[F[_]: Sync: ContextShift](
|
def toPDF[F[_]: Sync: ContextShift, A](
|
||||||
cfg: WkHtmlPdfConfig,
|
cfg: WkHtmlPdfConfig,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
): Pipe[F, Byte, Byte] =
|
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||||
ExternConv.toPDF[F]("wkhtmltopdf", cfg.cmd, cfg.workingDir, chunkSize, true, blocker, logger)
|
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||||
|
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||||
|
|
||||||
|
ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.cmd, cfg.workingDir, true, blocker, logger, reader)(in, handler)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,160 @@
|
|||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import java.nio.file.Paths
|
||||||
|
|
||||||
|
import cats.data.Kleisli
|
||||||
|
import cats.implicits._
|
||||||
|
import cats.effect.IO
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||||
|
import docspell.convert.flexmark.MarkdownConfig
|
||||||
|
import docspell.files.{ExampleFiles, TestFiles}
|
||||||
|
import minitest.SimpleTestSuite
|
||||||
|
|
||||||
|
/** Runs [[Conversion]] against the example files.
  *
  * All tests are ignored unless the three external programs named in
  * `convertConfig` are found.
  */
object ConversionTest extends SimpleTestSuite with FileChecks {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  val logger = Logger.log4s[IO](org.log4s.getLogger)
  val target = Paths.get("target")

  val convertConfig = ConvertConfig(
    chunkSize = 8192,
    maxImageSize = 3000 * 3000,
    markdown = MarkdownConfig("body { padding: 2em 5em; }"),
    wkhtmlpdf = WkHtmlPdfConfig(
      SystemCommand.Config(
        "wkhtmltopdf",
        Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"),
        Duration.seconds(20)
      ),
      target
    ),
    tesseract = TesseractConfig(
      SystemCommand.Config(
        "tesseract",
        Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
        Duration.seconds(20)
      ),
      target
    ),
    unoconv = UnoconvConfig(
      SystemCommand.Config(
        "unoconv",
        Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"),
        Duration.seconds(20)
      ),
      target
    )
  )

  val conversion = Conversion.create[IO](convertConfig, blocker, logger)

  /** Images that must be rejected as malformed input. */
  val bombs = List(
    ExampleFiles.bombs_20K_gray_jpeg,
    ExampleFiles.bombs_20K_gray_png,
    ExampleFiles.bombs_20K_rgb_jpeg,
    ExampleFiles.bombs_20K_rgb_png
  )

  /** Files whose conversion is expected to yield a pdf. Each file is listed
    * once; a duplicate would only run the same conversion again.
    */
  val pdfOnly = List(
    ExampleFiles.examples_sample_ods,
    ExampleFiles.examples_sample_doc,
    ExampleFiles.examples_sample_docx,
    ExampleFiles.examples_sample_odt,
    ExampleFiles.examples_sample_rtf,
    ExampleFiles.examples_sample_xls,
    ExampleFiles.examples_sample_xlsx,
    ExampleFiles.letter_de_md,
    ExampleFiles.letter_de_txt,
    ExampleFiles.letter_en_txt,
    ExampleFiles.letter_de_html
  )

  /** Files whose conversion is expected to yield a pdf and a text file. */
  val pdfAndTxt = List(
    ExampleFiles.camera_letter_en_jpg,
    ExampleFiles.camera_letter_en_png,
    ExampleFiles.camera_letter_en_tiff,
    ExampleFiles.scanner_jfif_jpg
  )

  test("convert to pdf") {
    if (!commandsExist) ignore("At least one of the conversion programs not found")
    else
      File
        .withTempDir[IO](target, "convpdf")
        .use { dir =>
          conversion.use { conv =>
            def check(n: Long): Handler[IO, Unit] =
              storePdfHandler(dir.resolve(s"test-$n.pdf")).map { p =>
                assert(p.isNonEmpty && p.isPDF)
              }

            runConversion(pdfOnly, check, conv).compile.drain
          }
        }
        .unsafeRunSync()
  }

  test("convert image to pdf and txt") {
    if (!commandsExist) ignore("At least one of the conversion programs not found")
    else
      File
        .withTempDir[IO](target, "convimgpdf")
        .use { dir =>
          conversion.use { conv =>
            def check(n: Long): Handler[IO, Unit] =
              storePdfTxtHandler(dir.resolve(s"test-$n.pdf"), dir.resolve(s"test-$n.txt"))
                .map {
                  case (p, t) =>
                    assert(p.isNonEmpty && p.isPDF)
                    assert(t.isNonEmpty && t.isPlainText)
                }

            runConversion(pdfAndTxt, check, conv).compile.drain
          }
        }
        .unsafeRunSync()
  }

  test("do not convert image bombs") {
    if (!commandsExist) ignore("At least one of the conversion programs not found")
    else
      conversion
        .use { conv =>
          // Only an InputMalformed result is accepted here.
          def check: Handler[IO, Unit] =
            Kleisli({
              case ConversionResult.InputMalformed(_, _) =>
                ().pure[IO]
              case cr =>
                IO.raiseError(new Exception(s"Unexpected result: $cr"))
            })

          runConversion(bombs, _ => check, conv).compile.drain
        }
        .unsafeRunSync()
  }

  /** Converts every file in `uris`, one after the other.
    *
    * @param uris    the files to convert
    * @param handler creates the result handler from the position of the file in `uris`
    * @param conv    the conversion under test
    * @return a stream of the values produced by the handlers
    */
  def runConversion[A](
      uris: List[LenientUri],
      handler: Long => Handler[IO, A],
      conv: Conversion[IO]
  ): Stream[IO, A] =
    Stream
      .emits(uris)
      .covary[IO]
      .zipWithIndex
      .evalMap({
        case (uri, index) =>
          val load     = uri.readURL[IO](8192, blocker)
          val dataType = DataType.filename(uri.path.segments.last)
          logger.info(s"Processing file ${uri.path.asString}") *>
            conv.toPDF(dataType, handler(index))(load)
      })

  /** True when all three configured conversion programs are found. */
  def commandsExist: Boolean =
    commandExists(convertConfig.unoconv.cmd.program) &&
      commandExists(convertConfig.wkhtmlpdf.cmd.program) &&
      commandExists(convertConfig.tesseract.cmd.program)
}
|
@ -0,0 +1,59 @@
|
|||||||
|
package docspell.convert
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets
|
||||||
|
import java.nio.file.{Files, Path}
|
||||||
|
|
||||||
|
import cats.data.Kleisli
|
||||||
|
import cats.effect.IO
|
||||||
|
import fs2.{Pipe, Stream}
|
||||||
|
import docspell.common.MimeType
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
import docspell.files.TikaMimetype
|
||||||
|
|
||||||
|
/** Helpers for tests that need to store conversion results and check the
  * resulting files.
  */
trait FileChecks {

  /** Adds file checks to `Path` values. */
  implicit class FileCheckOps(p: Path) {

    /** True if the file exists and has more than zero bytes. */
    def isNonEmpty: Boolean =
      Files.exists(p) && Files.size(p) > 0

    /** True if the mime type detected for the file equals `mime`. */
    def isType(mime: MimeType): Boolean =
      TikaMimetype.detect[IO](p).map(_ == mime).unsafeRunSync()

    def isPDF: Boolean =
      isType(MimeType.pdf)

    def isPlainText: Boolean =
      isType(MimeType.text("plain"))
  }

  /** A pipe that collects all incoming bytes, writes them to `file` and emits
    * the path returned by the write.
    */
  def storeFile(file: Path): Pipe[IO, Byte, Path] =
    in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))

  /** A handler that stores the pdf of a successful result to `file`. A text
    * part, if present, goes to the sibling file "unexpected.txt".
    */
  def storePdfHandler(file: Path): Handler[IO, Path] =
    storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1)

  /** A handler that stores the pdf to `filePdf` and the text, if the result
    * has one, to `fileTxt`. Every result that is not a success makes the
    * returned effect fail.
    */
  def storePdfTxtHandler(filePdf: Path, fileTxt: Path): Handler[IO, (Path, Path)] =
    Kleisli({
      case ConversionResult.SuccessPdfTxt(pdf, txt) =>
        for {
          pout <- pdf.through(storeFile(filePdf)).compile.lastOrError
          str  <- txt
          tout <- IO(Files.write(fileTxt, str.getBytes(StandardCharsets.UTF_8)))
        } yield (pout, tout)

      case ConversionResult.SuccessPdf(pdf) =>
        pdf.through(storeFile(filePdf)).compile.lastOrError.map(p => (p, fileTxt))

      // Errors are raised inside IO rather than thrown, so that they surface
      // when the effect is run, no matter where the handler function is called.
      case ConversionResult.Failure(ex) =>
        IO.raiseError(new Exception(s"Unexpected result (failure: ${ex.getMessage})", ex))

      case cr =>
        IO.raiseError(new Exception(s"Unexpected result: $cr"))
    })

  /** Checks via `which` whether the given program can be found. Returns false
    * as well if `which` itself cannot be started.
    */
  def commandExists(cmd: String): Boolean =
    scala.util
      .Try(Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0)
      .getOrElse(false)

}
|
@ -1,22 +1,20 @@
|
|||||||
package docspell.convert.extern
|
package docspell.convert.extern
|
||||||
|
|
||||||
import java.nio.file.{Files, Path, Paths}
|
import java.nio.file.{Path, Paths}
|
||||||
|
|
||||||
import fs2.Stream
|
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.convert.FileChecks
|
||||||
import docspell.files.{ExampleFiles, TestFiles}
|
import docspell.files.{ExampleFiles, TestFiles}
|
||||||
import fs2.Pipe
|
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
|
|
||||||
object ExternConvTest extends SimpleTestSuite {
|
object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||||
val blocker = TestFiles.blocker
|
val blocker = TestFiles.blocker
|
||||||
implicit val CS = TestFiles.CS
|
implicit val CS = TestFiles.CS
|
||||||
|
|
||||||
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
val logger = Logger.log4s[IO](org.log4s.getLogger)
|
||||||
val target = Paths.get("target")
|
val target = Paths.get("target")
|
||||||
|
|
||||||
|
|
||||||
test("convert html to pdf") {
|
test("convert html to pdf") {
|
||||||
val cfg = SystemCommand.Config(
|
val cfg = SystemCommand.Config(
|
||||||
"wkhtmltopdf",
|
"wkhtmltopdf",
|
||||||
@ -28,18 +26,20 @@ object ExternConvTest extends SimpleTestSuite {
|
|||||||
else {
|
else {
|
||||||
File
|
File
|
||||||
.withTempDir[IO](target, "wkhtmltopdf")
|
.withTempDir[IO](target, "wkhtmltopdf")
|
||||||
.use(dir => IO {
|
.use(dir =>
|
||||||
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
IO {
|
||||||
val p = ExampleFiles.letter_de_html
|
val wkCfg = WkHtmlPdfConfig(cfg, target)
|
||||||
.readURL[IO](8192, blocker)
|
val p =
|
||||||
.through(WkHtmlPdf.toPDF[IO](wkCfg, 8192, blocker, logger))
|
WkHtmlPdf
|
||||||
.through(storeFile(dir.resolve("test.pdf")))
|
.toPDF[IO, Path](wkCfg, 8192, blocker, logger)(
|
||||||
.compile
|
ExampleFiles.letter_de_html.readURL[IO](8192, blocker),
|
||||||
.lastOrError
|
storePdfHandler(dir.resolve("test.pdf"))
|
||||||
.unsafeRunSync()
|
)
|
||||||
|
.unsafeRunSync()
|
||||||
|
|
||||||
assert(Files.exists(p) && Files.size(p) > 0)
|
assert(p.isNonEmpty && p.isPDF)
|
||||||
})
|
}
|
||||||
|
)
|
||||||
.unsafeRunSync
|
.unsafeRunSync
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -55,26 +55,53 @@ object ExternConvTest extends SimpleTestSuite {
|
|||||||
else {
|
else {
|
||||||
File
|
File
|
||||||
.withTempDir[IO](target, "unoconv")
|
.withTempDir[IO](target, "unoconv")
|
||||||
.use(dir => IO {
|
.use(dir =>
|
||||||
val ucCfg = UnoconvConfig(cfg, target)
|
IO {
|
||||||
val p = ExampleFiles.examples_sample_docx
|
val ucCfg = UnoconvConfig(cfg, target)
|
||||||
.readURL[IO](8192, blocker)
|
val p =
|
||||||
.through(Unoconv.toPDF[IO](ucCfg, 8192, blocker, logger))
|
Unoconv
|
||||||
.through(storeFile(dir.resolve("test.pdf")))
|
.toPDF[IO, Path](ucCfg, 8192, blocker, logger)(
|
||||||
.compile
|
ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker),
|
||||||
.lastOrError
|
storePdfHandler(dir.resolve("test.pdf"))
|
||||||
.unsafeRunSync()
|
)
|
||||||
|
.unsafeRunSync()
|
||||||
|
|
||||||
assert(Files.exists(p) && Files.size(p) > 0)
|
assert(p.isNonEmpty && p.isPDF)
|
||||||
})
|
}
|
||||||
|
)
|
||||||
|
.unsafeRunSync
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
test("convert image to pdf") {
|
||||||
|
val cfg = SystemCommand.Config(
|
||||||
|
"tesseract",
|
||||||
|
Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"),
|
||||||
|
Duration.seconds(20)
|
||||||
|
)
|
||||||
|
|
||||||
|
if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found")
|
||||||
|
else {
|
||||||
|
File
|
||||||
|
.withTempDir[IO](target, "tesseract")
|
||||||
|
.use(dir =>
|
||||||
|
IO {
|
||||||
|
val tessCfg = TesseractConfig(cfg, target)
|
||||||
|
val (pdf, txt) =
|
||||||
|
Tesseract
|
||||||
|
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
|
||||||
|
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
||||||
|
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
||||||
|
)
|
||||||
|
.unsafeRunSync()
|
||||||
|
|
||||||
|
assert(pdf.isNonEmpty && pdf.isPDF)
|
||||||
|
assert(txt.isNonEmpty && txt.isPlainText)
|
||||||
|
}
|
||||||
|
)
|
||||||
.unsafeRunSync
|
.unsafeRunSync
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def storeFile(file: Path): Pipe[IO, Byte, Path] =
|
|
||||||
in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes))))
|
|
||||||
|
|
||||||
def commandExists(cmd: String): Boolean =
|
|
||||||
Runtime.getRuntime().exec(Array("which", cmd)).waitFor() == 0
|
|
||||||
}
|
}
|
||||||
|
@ -2,4 +2,4 @@ package docspell.extract
|
|||||||
|
|
||||||
import docspell.extract.ocr.OcrConfig
|
import docspell.extract.ocr.OcrConfig
|
||||||
|
|
||||||
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
|
||||||
|
@ -9,6 +9,7 @@ import docspell.extract.poi.{PoiExtract, PoiType}
|
|||||||
import docspell.extract.rtf.RtfExtract
|
import docspell.extract.rtf.RtfExtract
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import docspell.files.TikaMimetype
|
import docspell.files.TikaMimetype
|
||||||
|
import docspell.files.ImageSize
|
||||||
|
|
||||||
trait Extraction[F[_]] {
|
trait Extraction[F[_]] {
|
||||||
|
|
||||||
@ -44,14 +45,29 @@ object Extraction {
|
|||||||
case OdfType(_) =>
|
case OdfType(_) =>
|
||||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case OcrType(_) =>
|
case OcrType(mt) =>
|
||||||
TextExtract
|
val doExtract = TextExtract
|
||||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.attempt
|
.attempt
|
||||||
.map(ExtractResult.fromEither)
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
|
ImageSize.get(data).flatMap {
|
||||||
|
case Some(dim) =>
|
||||||
|
if (dim.product > cfg.maxImageSize) {
|
||||||
|
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||||
|
ExtractResult.failure(new Exception(
|
||||||
|
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
|
||||||
|
).pure[F]
|
||||||
|
} else {
|
||||||
|
doExtract
|
||||||
|
}
|
||||||
|
case None =>
|
||||||
|
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
|
||||||
|
doExtract
|
||||||
|
}
|
||||||
|
|
||||||
case OdfType.container =>
|
case OdfType.container =>
|
||||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||||
|
@ -4,10 +4,10 @@ import docspell.common.MimeType
|
|||||||
|
|
||||||
object OdfType {
|
object OdfType {
|
||||||
|
|
||||||
val odt = MimeType.application("application/vnd.oasis.opendocument.text")
|
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||||
val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet")
|
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||||
val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text")
|
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
|
||||||
val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet")
|
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
|
||||||
|
|
||||||
val container = MimeType.zip
|
val container = MimeType.zip
|
||||||
|
|
||||||
|
@ -2,6 +2,8 @@ package docspell.files
|
|||||||
|
|
||||||
case class Dimension(width: Int, height: Int) {
|
case class Dimension(width: Int, height: Int) {
|
||||||
|
|
||||||
|
def product = width * height
|
||||||
|
|
||||||
def toAwtDimension: java.awt.Dimension =
|
def toAwtDimension: java.awt.Dimension =
|
||||||
new java.awt.Dimension(width, height)
|
new java.awt.Dimension(width, height)
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
package docspell.files
|
package docspell.files
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream
|
||||||
|
import java.nio.file.{Files, Path}
|
||||||
|
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -8,6 +11,8 @@ import org.apache.tika.config.TikaConfig
|
|||||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
||||||
import org.apache.tika.mime.MediaType
|
import org.apache.tika.mime.MediaType
|
||||||
|
|
||||||
|
import scala.util.Using
|
||||||
|
|
||||||
object TikaMimetype {
|
object TikaMimetype {
|
||||||
private val tika = new TikaConfig().getDetector
|
private val tika = new TikaConfig().getDetector
|
||||||
|
|
||||||
@ -43,4 +48,12 @@ object TikaMimetype {
|
|||||||
case DataType.Exact(mt) => mt.pure[F]
|
case DataType.Exact(mt) => mt.pure[F]
|
||||||
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def detect[F[_]: Sync](file: Path): F[MimeType] =
|
||||||
|
Sync[F].delay {
|
||||||
|
val hint = MimeTypeHint.filename(file.getFileName.toString)
|
||||||
|
Using(new BufferedInputStream(Files.newInputStream(file), 64))({ in =>
|
||||||
|
convert(tika.detect(in, makeMetadata(hint)))
|
||||||
|
}).toEither
|
||||||
|
}.rethrow
|
||||||
}
|
}
|
||||||
|
@ -32,6 +32,7 @@ object Dependencies {
|
|||||||
val YamuscaVersion = "0.6.1"
|
val YamuscaVersion = "0.6.1"
|
||||||
val SwaggerUIVersion = "3.25.0"
|
val SwaggerUIVersion = "3.25.0"
|
||||||
val SemanticUIVersion = "2.4.1"
|
val SemanticUIVersion = "2.4.1"
|
||||||
|
val TwelveMonkeysVersion = "3.5"
|
||||||
val JQueryVersion = "3.4.1"
|
val JQueryVersion = "3.4.1"
|
||||||
val ViewerJSVersion = "0.5.8"
|
val ViewerJSVersion = "0.5.8"
|
||||||
|
|
||||||
@ -62,10 +63,10 @@ object Dependencies {
|
|||||||
ExclusionRule("hamcrest-core")
|
ExclusionRule("hamcrest-core")
|
||||||
))
|
))
|
||||||
|
|
||||||
// val twelvemonkeys = Seq(
|
val twelvemonkeys = Seq(
|
||||||
// "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
|
"com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
|
||||||
// "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
|
"com.twelvemonkeys.imageio" % "imageio-tiff" % TwelveMonkeysVersion
|
||||||
// )
|
)
|
||||||
|
|
||||||
val pdfbox = Seq(
|
val pdfbox = Seq(
|
||||||
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
|
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
|
||||||
|
Loading…
x
Reference in New Issue
Block a user