mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Integrate support for more files into processing and upload
The restriction that only PDF files can be uploaded is removed; files of any type can now be uploaded, although the processing may not be able to handle every type. It is still possible to restrict uploads to certain file types via the configuration.
This commit is contained in:
parent
9b1349734e
commit
97305d27ff
@ -1,6 +1,10 @@
|
|||||||
package docspell.common
|
package docspell.common
|
||||||
|
|
||||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
|
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {
|
||||||
|
|
||||||
|
def withName(name: String): MimeTypeHint =
|
||||||
|
copy(filename = Some(name))
|
||||||
|
}
|
||||||
|
|
||||||
object MimeTypeHint {
|
object MimeTypeHint {
|
||||||
val none = MimeTypeHint(None, None)
|
val none = MimeTypeHint(None, None)
|
||||||
|
@ -13,7 +13,7 @@ import docspell.files.{ImageSize, TikaMimetype}
|
|||||||
|
|
||||||
trait Conversion[F[_]] {
|
trait Conversion[F[_]] {
|
||||||
|
|
||||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -26,7 +26,7 @@ object Conversion {
|
|||||||
): Resource[F, Conversion[F]] =
|
): Resource[F, Conversion[F]] =
|
||||||
Resource.pure(new Conversion[F] {
|
Resource.pure(new Conversion[F] {
|
||||||
|
|
||||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||||
TikaMimetype.resolve(dataType, in).flatMap {
|
TikaMimetype.resolve(dataType, in).flatMap {
|
||||||
case MimeType.pdf =>
|
case MimeType.pdf =>
|
||||||
handler.run(ConversionResult.successPdf(in))
|
handler.run(ConversionResult.successPdf(in))
|
||||||
@ -55,14 +55,14 @@ object Conversion {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
case None =>
|
case None =>
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||||
) *>
|
) *>
|
||||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
case Office(_) =>
|
case Office(_) =>
|
||||||
@ -109,4 +109,13 @@ object Conversion {
|
|||||||
def unapply(m: MimeType): Option[MimeType] =
|
def unapply(m: MimeType): Option[MimeType] =
|
||||||
Some(m).filter(all.contains)
|
Some(m).filter(all.contains)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def unapply(mt: MimeType): Option[MimeType] =
|
||||||
|
mt match {
|
||||||
|
case Office(_) => Some(mt)
|
||||||
|
case Texts(_) => Some(mt)
|
||||||
|
case Images(_) => Some(mt)
|
||||||
|
case MimeType.html => Some(mt)
|
||||||
|
case _ => None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@ object Tesseract {
|
|||||||
|
|
||||||
def toPDF[F[_]: Sync: ContextShift, A](
|
def toPDF[F[_]: Sync: ContextShift, A](
|
||||||
cfg: TesseractConfig,
|
cfg: TesseractConfig,
|
||||||
|
lang: Language,
|
||||||
chunkSize: Int,
|
chunkSize: Int,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
@ -20,7 +21,7 @@ object Tesseract {
|
|||||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||||
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
|
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
|
||||||
|
|
||||||
ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -150,7 +150,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
|
|||||||
val load = uri.readURL[IO](8192, blocker)
|
val load = uri.readURL[IO](8192, blocker)
|
||||||
val dataType = DataType.filename(uri.path.segments.last)
|
val dataType = DataType.filename(uri.path.segments.last)
|
||||||
logger.info(s"Processing file ${uri.path.asString}") *>
|
logger.info(s"Processing file ${uri.path.asString}") *>
|
||||||
conv.toPDF(dataType, handler(index))(load)
|
conv.toPDF(dataType, Language.German, handler(index))(load)
|
||||||
})
|
})
|
||||||
|
|
||||||
def commandsExist: Boolean =
|
def commandsExist: Boolean =
|
||||||
|
@ -89,7 +89,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
|
|||||||
val tessCfg = TesseractConfig(cfg, target)
|
val tessCfg = TesseractConfig(cfg, target)
|
||||||
val (pdf, txt) =
|
val (pdf, txt) =
|
||||||
Tesseract
|
Tesseract
|
||||||
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
|
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)(
|
||||||
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
||||||
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
||||||
)
|
)
|
||||||
|
@ -2,4 +2,4 @@ package docspell.extract
|
|||||||
|
|
||||||
import docspell.extract.ocr.OcrConfig
|
import docspell.extract.ocr.OcrConfig
|
||||||
|
|
||||||
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
|
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
||||||
|
@ -55,10 +55,10 @@ object Extraction {
|
|||||||
|
|
||||||
ImageSize.get(data).flatMap {
|
ImageSize.get(data).flatMap {
|
||||||
case Some(dim) =>
|
case Some(dim) =>
|
||||||
if (dim.product > cfg.maxImageSize) {
|
if (dim.product > cfg.ocr.maxImageSize) {
|
||||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
|
||||||
ExtractResult.failure(new Exception(
|
ExtractResult.failure(new Exception(
|
||||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
|
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
|
||||||
).pure[F]
|
).pure[F]
|
||||||
} else {
|
} else {
|
||||||
doExtract
|
doExtract
|
||||||
@ -72,6 +72,12 @@ object Extraction {
|
|||||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||||
|
|
||||||
|
case mt@MimeType("text", sub) if !sub.contains("html") =>
|
||||||
|
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||||
|
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||||
|
ExtractResult.success(txt.getOrElse("").trim)
|
||||||
|
}
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
ExtractResult.unsupportedFormat(mt).pure[F]
|
ExtractResult.unsupportedFormat(mt).pure[F]
|
||||||
|
|
||||||
|
@ -33,12 +33,12 @@ object PdfExtract {
|
|||||||
|
|
||||||
//maybe better: inspect the pdf and decide whether ocr or not
|
//maybe better: inspect the pdf and decide whether ocr or not
|
||||||
for {
|
for {
|
||||||
pdfboxRes <- PdfboxExtract.get[F](in)
|
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
|
||||||
res <- pdfboxRes.fold(
|
res <- pdfboxRes.fold(
|
||||||
ex =>
|
ex =>
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||||
) *> runOcr.attempt,
|
) >> runOcr.attempt,
|
||||||
str =>
|
str =>
|
||||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
if (str.length >= stripMinLen) str.pure[F].attempt
|
||||||
else
|
else
|
||||||
|
@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths}
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
case class OcrConfig(
|
case class OcrConfig(
|
||||||
allowedContentTypes: Set[MimeType],
|
maxImageSize: Int,
|
||||||
ghostscript: OcrConfig.Ghostscript,
|
ghostscript: OcrConfig.Ghostscript,
|
||||||
pageRange: OcrConfig.PageRange,
|
pageRange: OcrConfig.PageRange,
|
||||||
unpaper: OcrConfig.Unpaper,
|
unpaper: OcrConfig.Unpaper,
|
||||||
tesseract: OcrConfig.Tesseract
|
tesseract: OcrConfig.Tesseract
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def isAllowed(mt: MimeType): Boolean =
|
|
||||||
allowedContentTypes contains mt
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object OcrConfig {
|
object OcrConfig {
|
||||||
@ -27,12 +24,7 @@ object OcrConfig {
|
|||||||
case class Unpaper(command: SystemCommand.Config)
|
case class Unpaper(command: SystemCommand.Config)
|
||||||
|
|
||||||
val default = OcrConfig(
|
val default = OcrConfig(
|
||||||
allowedContentTypes = Set(
|
maxImageSize = 3000 * 3000,
|
||||||
MimeType.pdf,
|
|
||||||
MimeType.png,
|
|
||||||
MimeType.jpeg,
|
|
||||||
MimeType.tiff
|
|
||||||
),
|
|
||||||
pageRange = PageRange(10),
|
pageRange = PageRange(10),
|
||||||
ghostscript = Ghostscript(
|
ghostscript = Ghostscript(
|
||||||
SystemCommand.Config(
|
SystemCommand.Config(
|
||||||
|
@ -26,9 +26,6 @@ object TextExtract {
|
|||||||
Stream
|
Stream
|
||||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||||
.flatMap({
|
.flatMap({
|
||||||
case mt if !config.isAllowed(mt) =>
|
|
||||||
raiseError(s"File `$mt` not allowed")
|
|
||||||
|
|
||||||
case MimeType.pdf =>
|
case MimeType.pdf =>
|
||||||
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
||||||
|
|
||||||
|
@ -65,14 +65,26 @@ docspell.joex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Configuration of text extraction
|
# Configuration of text extraction
|
||||||
#
|
|
||||||
# Extracting text currently only work for image and pdf files. It
|
|
||||||
# will first run ghostscript to create a gray image from a pdf. Then
|
|
||||||
# unpaper is run to optimize the image for the upcoming ocr, which
|
|
||||||
# will be done by tesseract. All these programs must be available in
|
|
||||||
# your PATH or the absolute path can be specified below.
|
|
||||||
extraction {
|
extraction {
|
||||||
allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ]
|
# For PDF files, the text parts of the PDF are read first. But
|
||||||
|
# PDFs can be complex documents and may contain both text and
|
||||||
|
# images. If the returned text is shorter than the value below,
|
||||||
|
# OCR is run afterwards. Then both extracted texts are compared
|
||||||
|
# and the longer one is used.
|
||||||
|
pdf {
|
||||||
|
min-text-len = 10
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extracting text using OCR works for image and pdf files. It will
|
||||||
|
# first run ghostscript to create a gray image from a pdf. Then
|
||||||
|
# unpaper is run to optimize the image for the upcoming ocr, which
|
||||||
|
# will be done by tesseract. All these programs must be available
|
||||||
|
# in your PATH or the absolute path can be specified below.
|
||||||
|
ocr {
|
||||||
|
|
||||||
|
# Images greater than this size are skipped. Note that every
|
||||||
|
# image is loaded completely into memory for doing OCR.
|
||||||
|
max-image-size = 14000000
|
||||||
|
|
||||||
# Defines what pages to process. If a PDF with 600 pages is
|
# Defines what pages to process. If a PDF with 600 pages is
|
||||||
# submitted, it is probably not necessary to scan through all of
|
# submitted, it is probably not necessary to scan through all of
|
||||||
@ -127,4 +139,69 @@ docspell.joex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Configuration for converting files into PDFs.
|
||||||
|
#
|
||||||
|
# Most of it is delegated to external tools, which can be configured
|
||||||
|
# below. They must be available in your PATH, or their full path
|
||||||
|
# must be specified below via the `program` key.
|
||||||
|
convert {
|
||||||
|
chunk-size = 524288
|
||||||
|
|
||||||
|
max-image-size = 12000000
|
||||||
|
|
||||||
|
markdown {
|
||||||
|
internal-css = """
|
||||||
|
body { padding: 2em 5em; }
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
wkhtmlpdf {
|
||||||
|
cmd = {
|
||||||
|
program = "wkhtmltopdf"
|
||||||
|
args = [
|
||||||
|
"-s",
|
||||||
|
"A4",
|
||||||
|
"--encoding",
|
||||||
|
"UTF-8",
|
||||||
|
"-",
|
||||||
|
"{{outfile}}"
|
||||||
|
]
|
||||||
|
timeout = "20 seconds"
|
||||||
|
}
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
|
}
|
||||||
|
|
||||||
|
tesseract = {
|
||||||
|
cmd = {
|
||||||
|
program = "tesseract"
|
||||||
|
args = [
|
||||||
|
"{{infile}}",
|
||||||
|
"out",
|
||||||
|
"-l",
|
||||||
|
"{{lang}}",
|
||||||
|
"pdf",
|
||||||
|
"txt"
|
||||||
|
]
|
||||||
|
timeout = "120 seconds"
|
||||||
|
}
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
|
}
|
||||||
|
|
||||||
|
unoconv = {
|
||||||
|
cmd = {
|
||||||
|
program = "unoconv"
|
||||||
|
args = [
|
||||||
|
"-f",
|
||||||
|
"pdf",
|
||||||
|
"-o",
|
||||||
|
"{{outfile}}",
|
||||||
|
"{{infile}}"
|
||||||
|
]
|
||||||
|
timeout = "20 seconds"
|
||||||
|
}
|
||||||
|
working-dir = ${java.io.tmpdir}"/docspell-convert"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
@ -3,8 +3,8 @@ package docspell.joex
|
|||||||
import docspell.common.{Ident, LenientUri}
|
import docspell.common.{Ident, LenientUri}
|
||||||
import docspell.joex.scheduler.SchedulerConfig
|
import docspell.joex.scheduler.SchedulerConfig
|
||||||
import docspell.store.JdbcConfig
|
import docspell.store.JdbcConfig
|
||||||
import docspell.extract.ocr.{OcrConfig => OcrConfig}
|
|
||||||
import docspell.convert.ConvertConfig
|
import docspell.convert.ConvertConfig
|
||||||
|
import docspell.extract.ExtractConfig
|
||||||
|
|
||||||
case class Config(
|
case class Config(
|
||||||
appId: Ident,
|
appId: Ident,
|
||||||
@ -12,7 +12,7 @@ case class Config(
|
|||||||
bind: Config.Bind,
|
bind: Config.Bind,
|
||||||
jdbc: JdbcConfig,
|
jdbc: JdbcConfig,
|
||||||
scheduler: SchedulerConfig,
|
scheduler: SchedulerConfig,
|
||||||
extraction: OcrConfig,
|
extraction: ExtractConfig,
|
||||||
convert: ConvertConfig
|
convert: ConvertConfig
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1,15 +1,17 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
import bitpeace.Mimetype
|
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||||
|
import cats.implicits._
|
||||||
import cats.Functor
|
import cats.Functor
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.data.OptionT
|
import cats.data.{Kleisli, OptionT}
|
||||||
|
import fs2.Stream
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.convert._
|
import docspell.convert._
|
||||||
import docspell.joex.scheduler._
|
import docspell.joex.scheduler._
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
|
import docspell.convert.ConversionResult.Handler
|
||||||
|
|
||||||
/** Goes through all attachments and creates a PDF version of it where
|
/** Goes through all attachments and creates a PDF version of it where
|
||||||
* supported.
|
* supported.
|
||||||
@ -32,32 +34,92 @@ object ConvertPdf {
|
|||||||
item: ItemData
|
item: ItemData
|
||||||
): Task[F, ProcessItemArgs, ItemData] =
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
// get mimetype
|
|
||||||
// try to convert
|
|
||||||
// save to db
|
|
||||||
// update file_id of RAttachment
|
|
||||||
|
|
||||||
def convert(ra: RAttachment) =
|
def convert(ra: RAttachment) =
|
||||||
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m))
|
findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
|
||||||
|
|
||||||
for {
|
for {
|
||||||
ras <- item.attachments.traverse(convert)
|
ras <- item.attachments.traverse(convert)
|
||||||
} yield item.copy(attachments = ras)
|
nra = ras.map(_._1)
|
||||||
|
nma = ras.flatMap(_._2)
|
||||||
|
} yield item.copy(attachments = nra, metas = nma)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] =
|
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
|
||||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||||
.map(_.mimetype)
|
.map(_.mimetype)
|
||||||
.getOrElse(Mimetype.`application/octet-stream`)
|
.getOrElse(Mimetype.`application/octet-stream`)
|
||||||
|
|
||||||
def convertSafe[F[_]: Sync: ContextShift](
|
def convertSafe[F[_]: Sync: ContextShift](
|
||||||
cfg: ConvertConfig,
|
cfg: ConvertConfig,
|
||||||
ctx: Context[F, ProcessItemArgs]
|
ctx: Context[F, ProcessItemArgs],
|
||||||
)(ra: RAttachment, mime: Mimetype): F[RAttachment] =
|
item: ItemData
|
||||||
Conversion.create[F](cfg, ctx.blocker,ctx.logger).use { conv =>
|
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||||
ctx.logger
|
Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
|
||||||
.info(s"File ${ra.name} has mime ${mime.asString}. conv=$conv")
|
mime match {
|
||||||
.map(_ => ra)
|
case Mimetype.`application/pdf` =>
|
||||||
|
ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
|
||||||
|
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||||
|
|
||||||
|
case _ =>
|
||||||
|
val data = ctx.store.bitpeace
|
||||||
|
.get(ra.fileId.id)
|
||||||
|
.unNoneTerminate
|
||||||
|
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||||
|
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||||
|
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||||
|
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def conversionHandler[F[_]: Sync](
|
||||||
|
ctx: Context[F, ProcessItemArgs],
|
||||||
|
cfg: ConvertConfig,
|
||||||
|
ra: RAttachment,
|
||||||
|
item: ItemData
|
||||||
|
): Handler[F, (RAttachment, Option[RAttachmentMeta])] =
|
||||||
|
Kleisli({
|
||||||
|
case ConversionResult.SuccessPdf(pdf) =>
|
||||||
|
ctx.logger.info(s"Conversion to pdf successful. Saving file.") *>
|
||||||
|
storePDF(ctx, cfg, ra, pdf)
|
||||||
|
.map(r => (r, None))
|
||||||
|
|
||||||
|
case ConversionResult.SuccessPdfTxt(pdf, txt) =>
|
||||||
|
ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *>
|
||||||
|
storePDF(ctx, cfg, ra, pdf)
|
||||||
|
.flatMap(r =>
|
||||||
|
txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some))
|
||||||
|
)
|
||||||
|
|
||||||
|
case ConversionResult.UnsupportedFormat(mt) =>
|
||||||
|
ctx.logger.info(s"PDF conversion for type ${mt.asString} not supported!") *>
|
||||||
|
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||||
|
|
||||||
|
case ConversionResult.InputMalformed(mt, reason) =>
|
||||||
|
ctx.logger.info(
|
||||||
|
s"PDF conversion from type ${mt.asString} reported malformed input: $reason."
|
||||||
|
) *>
|
||||||
|
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||||
|
|
||||||
|
case ConversionResult.Failure(ex) =>
|
||||||
|
ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. Go without PDF file") *>
|
||||||
|
(ra, None: Option[RAttachmentMeta]).pure[F]
|
||||||
|
})
|
||||||
|
|
||||||
|
private def storePDF[F[_]: Sync](
|
||||||
|
ctx: Context[F, ProcessItemArgs],
|
||||||
|
cfg: ConvertConfig,
|
||||||
|
ra: RAttachment,
|
||||||
|
pdf: Stream[F, Byte]
|
||||||
|
) = {
|
||||||
|
val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf"))
|
||||||
|
val newName = ra.name.map(n => s"$n.pdf")
|
||||||
|
ctx.store.bitpeace
|
||||||
|
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
|
||||||
|
.compile
|
||||||
|
.lastOrError
|
||||||
|
.map(fm => Ident.unsafe(fm.id))
|
||||||
|
.flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId))
|
||||||
|
.map(fmId => ra.copy(fileId = fmId, name = newName))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import bitpeace.FileMeta
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
import cats.data.OptionT
|
import cats.data.OptionT
|
||||||
@ -22,13 +23,15 @@ object CreateItem {
|
|||||||
|
|
||||||
def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] =
|
def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
val validFiles = ctx.args.meta.validFileTypes.map(_.asString).toSet
|
def isValidFile(fm: FileMeta) =
|
||||||
|
ctx.args.meta.validFileTypes.isEmpty ||
|
||||||
|
ctx.args.meta.validFileTypes.map(_.asString).toSet.contains(fm.mimetype.baseType)
|
||||||
|
|
||||||
def fileMetas(itemId: Ident, now: Timestamp) =
|
def fileMetas(itemId: Ident, now: Timestamp) =
|
||||||
Stream
|
Stream
|
||||||
.emits(ctx.args.files)
|
.emits(ctx.args.files)
|
||||||
.flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm)))
|
.flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm)))
|
||||||
.collect({ case (f, Some(fm)) if validFiles.contains(fm.mimetype.baseType) => f })
|
.collect({ case (f, Some(fm)) if isValidFile(fm) => f })
|
||||||
.zipWithIndex
|
.zipWithIndex
|
||||||
.evalMap({
|
.evalMap({
|
||||||
case (f, index) =>
|
case (f, index) =>
|
||||||
|
@ -9,7 +9,7 @@ case class ItemData(
|
|||||||
attachments: Vector[RAttachment],
|
attachments: Vector[RAttachment],
|
||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
dateLabels: Vector[AttachmentDates],
|
dateLabels: Vector[AttachmentDates],
|
||||||
originFile: Map[Ident, Ident]
|
originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||||
@ -17,6 +17,21 @@ case class ItemData(
|
|||||||
|
|
||||||
def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] =
|
def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] =
|
||||||
dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty)
|
dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty)
|
||||||
|
|
||||||
|
def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = {
|
||||||
|
val item = changeMeta(attachId, f)
|
||||||
|
val next = metas.map(a => if (a.id == attachId) item else a)
|
||||||
|
copy(metas = next)
|
||||||
|
}
|
||||||
|
|
||||||
|
def changeMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): RAttachmentMeta =
|
||||||
|
f(findOrCreate(attachId))
|
||||||
|
|
||||||
|
def findOrCreate(attachId: Ident): RAttachmentMeta =
|
||||||
|
metas.find(_.id == attachId).getOrElse {
|
||||||
|
RAttachmentMeta.empty(attachId)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
object ItemData {
|
object ItemData {
|
||||||
|
@ -1,25 +1,25 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
import bitpeace.RangeDef
|
import bitpeace.{Mimetype, RangeDef}
|
||||||
|
import cats.data.OptionT
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{ContextShift, Sync}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
|
||||||
import docspell.joex.scheduler.{Context, Task}
|
import docspell.joex.scheduler.{Context, Task}
|
||||||
import docspell.store.Store
|
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
|
||||||
import docspell.store.records.{RAttachment, RAttachmentMeta}
|
|
||||||
import docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig}
|
|
||||||
|
|
||||||
object TextExtraction {
|
object TextExtraction {
|
||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
cfg: OcrConfig,
|
cfg: ExtractConfig,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
): Task[F, ProcessItemArgs, ItemData] =
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.info("Starting text extraction")
|
_ <- ctx.logger.info("Starting text extraction")
|
||||||
start <- Duration.stopTime[F]
|
start <- Duration.stopTime[F]
|
||||||
txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item))
|
txt <- item.attachments.traverse(extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item))
|
||||||
_ <- ctx.logger.debug("Storing extracted texts")
|
_ <- ctx.logger.debug("Storing extracted texts")
|
||||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
|
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
|
||||||
dur <- start
|
dur <- start
|
||||||
@ -27,41 +27,63 @@ object TextExtraction {
|
|||||||
} yield item.copy(metas = txt)
|
} yield item.copy(metas = txt)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||||
|
ctx: Context[F, _],
|
||||||
|
cfg: ExtractConfig,
|
||||||
|
lang: Language,
|
||||||
|
item: ItemData
|
||||||
|
)(ra: RAttachment): F[RAttachmentMeta] = {
|
||||||
|
val rm = item.findOrCreate(ra.id)
|
||||||
|
rm.content match {
|
||||||
|
case Some(_) =>
|
||||||
|
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||||
|
rm.pure[F]
|
||||||
|
case None =>
|
||||||
|
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
def extractTextToMeta[F[_]: Sync: ContextShift](
|
def extractTextToMeta[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, _],
|
||||||
cfg: OcrConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
)(ra: RAttachment): F[RAttachmentMeta] =
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}")
|
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||||
dst <- Duration.stopTime[F]
|
dst <- Duration.stopTime[F]
|
||||||
txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra))
|
txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra))
|
||||||
meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty))
|
meta = item.changeMeta(ra.id, rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)))
|
||||||
est <- dst
|
est <- dst
|
||||||
_ <- ctx.logger.debug(
|
_ <- ctx.logger.debug(
|
||||||
s"Extracting text for attachment ${ra.name} finished in ${est.formatExact}"
|
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
||||||
)
|
)
|
||||||
} yield meta
|
} yield meta
|
||||||
|
|
||||||
def extractText[F[_]: Sync: ContextShift](
|
def extractText[F[_]: Sync: ContextShift](
|
||||||
ocrConfig: OcrConfig,
|
ctx: Context[F, _],
|
||||||
lang: Language,
|
extr: Extraction[F],
|
||||||
store: Store[F],
|
lang: Language
|
||||||
blocker: Blocker,
|
)(fileId: Ident): F[ExtractResult] = {
|
||||||
logger: Logger[F]
|
val data = ctx.store.bitpeace
|
||||||
)(fileId: Ident): F[Option[String]] = {
|
|
||||||
val data = store.bitpeace
|
|
||||||
.get(fileId.id)
|
.get(fileId.id)
|
||||||
.unNoneTerminate
|
.unNoneTerminate
|
||||||
.through(store.bitpeace.fetchData2(RangeDef.all))
|
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||||
|
|
||||||
TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last
|
def findMime: F[Mimetype] =
|
||||||
|
OptionT(ctx.store.transact(RFileMeta.findById(fileId)))
|
||||||
|
.map(_.mimetype)
|
||||||
|
.getOrElse(Mimetype.`application/octet-stream`)
|
||||||
|
|
||||||
|
findMime
|
||||||
|
.flatMap(mt =>
|
||||||
|
extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
|
||||||
}
|
}
|
||||||
|
|
||||||
private def extractTextFallback[F[_]: Sync: ContextShift](
|
private def extractTextFallback[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, _],
|
||||||
ocrConfig: OcrConfig,
|
cfg: ExtractConfig,
|
||||||
|
ra: RAttachment,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
)(fileIds: List[Ident]): F[Option[String]] = {
|
)(fileIds: List[Ident]): F[Option[String]] = {
|
||||||
fileIds match {
|
fileIds match {
|
||||||
@ -69,11 +91,20 @@ object TextExtraction {
|
|||||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||||
|
|
||||||
case id :: rest =>
|
case id :: rest =>
|
||||||
extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id).
|
val extr = Extraction.create[F](ctx.blocker, ctx.logger, cfg)
|
||||||
recoverWith({
|
|
||||||
case ex =>
|
extractText[F](ctx, extr, lang)(id)
|
||||||
|
.flatMap({
|
||||||
|
case ExtractResult.Success(txt) =>
|
||||||
|
txt.some.pure[F]
|
||||||
|
|
||||||
|
case ExtractResult.UnsupportedFormat(mt) =>
|
||||||
|
ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file.").
|
||||||
|
flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||||
|
|
||||||
|
case ExtractResult.Failure(ex) =>
|
||||||
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
|
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
|
||||||
flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest))
|
flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -86,4 +117,9 @@ object TextExtraction {
|
|||||||
case Some(sid) => List(sid, ra.fileId).distinct
|
case Some(sid) => List(sid, ra.fileId).distinct
|
||||||
case None => List(ra.fileId)
|
case None => List(ra.fileId)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private def stripAttachmentName(ra: RAttachment): String =
|
||||||
|
ra.name
|
||||||
|
.map(s => if (s.endsWith(".pdf") && s.count(_ == '.') > 1) s.dropRight(4) else s)
|
||||||
|
.getOrElse("<no-name>")
|
||||||
}
|
}
|
||||||
|
@ -80,9 +80,10 @@ docspell.server {
|
|||||||
# The file content types that are considered valid. Docspell
|
# The file content types that are considered valid. Docspell
|
||||||
# will only pass these files to processing. The processing code
|
# will only pass these files to processing. The processing code
|
||||||
# itself also has checks for which files are supported and which
|
# itself also has checks for which files are supported and which
|
||||||
# not. This affects the uploading part and is a first check to
|
# not. This affects the uploading part and can be used to
|
||||||
# avoid that 'bad' files get into the system.
|
# restrict file types that should be handed over to processing.
|
||||||
valid-mime-types = [ "application/pdf" ]
|
# By default all files are allowed.
|
||||||
|
valid-mime-types = [ ]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -38,6 +38,9 @@ object RAttachment {
|
|||||||
fr"${v.id},${v.itemId},${v.fileId.id},${v.position},${v.created},${v.name}"
|
fr"${v.id},${v.itemId},${v.fileId.id},${v.position},${v.created},${v.name}"
|
||||||
).update.run
|
).update.run
|
||||||
|
|
||||||
|
def updateFileIdAndName(attachId: Ident, fId: Ident, fname: Option[String]): ConnectionIO[Int] =
|
||||||
|
updateRow(table, id.is(attachId), commas(fileId.setTo(fId), name.setTo(fname))).update.run
|
||||||
|
|
||||||
def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] =
|
def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] =
|
||||||
selectSimple(all, table, id.is(attachId)).query[RAttachment].option
|
selectSimple(all, table, id.is(attachId)).query[RAttachment].option
|
||||||
|
|
||||||
@ -108,7 +111,8 @@ object RAttachment {
|
|||||||
def delete(attachId: Ident): ConnectionIO[Int] =
|
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||||
for {
|
for {
|
||||||
n0 <- RAttachmentMeta.delete(attachId)
|
n0 <- RAttachmentMeta.delete(attachId)
|
||||||
n1 <- deleteFrom(table, id.is(attachId)).update.run
|
n1 <- RAttachmentSource.delete(attachId)
|
||||||
} yield n0 + n1
|
n2 <- deleteFrom(table, id.is(attachId)).update.run
|
||||||
|
} yield n0 + n1 + n2
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -8,11 +8,16 @@ import docspell.store.impl._
|
|||||||
import docspell.store.impl.Implicits._
|
import docspell.store.impl.Implicits._
|
||||||
|
|
||||||
case class RAttachmentMeta(
|
case class RAttachmentMeta(
|
||||||
id: Ident,
|
id: Ident, //same as RAttachment.id
|
||||||
content: Option[String],
|
content: Option[String],
|
||||||
nerlabels: List[NerLabel],
|
nerlabels: List[NerLabel],
|
||||||
proposals: MetaProposalList
|
proposals: MetaProposalList
|
||||||
) {}
|
) {
|
||||||
|
|
||||||
|
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
||||||
|
if (content.forall(_.trim.isEmpty)) copy(content = txt)
|
||||||
|
else this
|
||||||
|
}
|
||||||
|
|
||||||
object RAttachmentMeta {
|
object RAttachmentMeta {
|
||||||
def empty(attachId: Ident) = RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty)
|
def empty(attachId: Ident) = RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty)
|
||||||
|
@ -41,4 +41,6 @@ object RAttachmentSource {
|
|||||||
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
|
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
|
||||||
selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option
|
selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option
|
||||||
|
|
||||||
|
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||||
|
deleteFrom(table, id.is(attachId)).update.run
|
||||||
}
|
}
|
||||||
|
@ -35,7 +35,7 @@ type alias Settings =
|
|||||||
defaultSettings : Settings
|
defaultSettings : Settings
|
||||||
defaultSettings =
|
defaultSettings =
|
||||||
{ classList = \_ -> [ ( "ui placeholder segment", True ) ]
|
{ classList = \_ -> [ ( "ui placeholder segment", True ) ]
|
||||||
, contentTypes = [ "application/pdf" ]
|
, contentTypes = []
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -148,6 +148,10 @@ filterMime settings files =
|
|||||||
pred f =
|
pred f =
|
||||||
List.member (File.mime f) settings.contentTypes
|
List.member (File.mime f) settings.contentTypes
|
||||||
in
|
in
|
||||||
|
if settings.contentTypes == [] then
|
||||||
|
files
|
||||||
|
|
||||||
|
else
|
||||||
List.filter pred files
|
List.filter pred files
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user