From 97305d27ff5743484d765a84abdd05095633930e Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 19 Feb 2020 23:27:00 +0100 Subject: [PATCH] Integrate support for more files into processing and upload The restriction that only pdf files can be uploaded is removed. All files can now be uploaded. The processing may not process all. It is still possible to restrict file uploads by types via a configuration. --- .../scala/docspell/common/MimeTypeHint.scala | 6 +- .../scala/docspell/convert/Conversion.scala | 17 +- .../docspell/convert/extern/Tesseract.scala | 3 +- .../docspell/convert/ConversionTest.scala | 2 +- .../convert/extern/ExternConvTest.scala | 2 +- .../docspell/extract/ExtractConfig.scala | 2 +- .../scala/docspell/extract/Extraction.scala | 12 +- .../scala/docspell/extract/PdfExtract.scala | 4 +- .../docspell/extract/ocr/OcrConfig.scala | 14 +- .../docspell/extract/ocr/TextExtract.scala | 3 - .../joex/src/main/resources/reference.conf | 181 +++++++++++++----- .../src/main/scala/docspell/joex/Config.scala | 16 +- .../docspell/joex/process/ConvertPdf.scala | 96 ++++++++-- .../docspell/joex/process/CreateItem.scala | 7 +- .../docspell/joex/process/ItemData.scala | 17 +- .../joex/process/TextExtraction.scala | 98 +++++++--- .../src/main/resources/reference.conf | 7 +- .../docspell/store/records/RAttachment.scala | 8 +- .../store/records/RAttachmentMeta.scala | 9 +- .../store/records/RAttachmentSource.scala | 2 + modules/webapp/src/main/elm/Comp/Dropzone.elm | 8 +- 21 files changed, 366 insertions(+), 148 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala index 7b98a92a..4199a29f 100644 --- a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala +++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala @@ -1,6 +1,10 @@ package docspell.common -case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} +case 
class MimeTypeHint(filename: Option[String], advertised: Option[String]) { + + def withName(name: String): MimeTypeHint = + copy(filename = Some(name)) +} object MimeTypeHint { val none = MimeTypeHint(None, None) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index e106b844..18d62517 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -13,7 +13,7 @@ import docspell.files.{ImageSize, TikaMimetype} trait Conversion[F[_]] { - def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] } @@ -26,7 +26,7 @@ object Conversion { ): Resource[F, Conversion[F]] = Resource.pure(new Conversion[F] { - def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = TikaMimetype.resolve(dataType, in).flatMap { case MimeType.pdf => handler.run(ConversionResult.successPdf(in)) @@ -55,14 +55,14 @@ object Conversion { ) ) } else { - Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) } case None => logger.info( s"Cannot read image when determining size for ${mt.asString}. Converting anyways." 
) *> - Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) } case Office(_) => @@ -109,4 +109,13 @@ object Conversion { def unapply(m: MimeType): Option[MimeType] = Some(m).filter(all.contains) } + + def unapply(mt: MimeType): Option[MimeType] = + mt match { + case Office(_) => Some(mt) + case Texts(_) => Some(mt) + case Images(_) => Some(mt) + case MimeType.html => Some(mt) + case _ => None + } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala index 7c04608e..233cfa96 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -12,6 +12,7 @@ object Tesseract { def toPDF[F[_]: Sync: ContextShift, A]( cfg: TesseractConfig, + lang: Language, chunkSize: Int, blocker: Blocker, logger: Logger[F] @@ -20,7 +21,7 @@ object Tesseract { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger) - ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler) + ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler) } } diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index dc158a31..294ce4db 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -150,7 +150,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks { val load = uri.readURL[IO](8192, blocker) val dataType = DataType.filename(uri.path.segments.last) logger.info(s"Processing 
file ${uri.path.asString}") *> - conv.toPDF(dataType, handler(index))(load) + conv.toPDF(dataType, Language.German, handler(index))(load) }) def commandsExist: Boolean = diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 9db0588c..a2f496ec 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -89,7 +89,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks { val tessCfg = TesseractConfig(cfg, target) val (pdf, txt) = Tesseract - .toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)( + .toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)( ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker), storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt")) ) diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala index ae35cb5f..b4951686 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -2,4 +2,4 @@ package docspell.extract import docspell.extract.ocr.OcrConfig -case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig) +case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 81a61e7a..02ca0502 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -55,10 +55,10 @@ object Extraction { ImageSize.get(data).flatMap { case Some(dim) => - if (dim.product > cfg.maxImageSize) { - logger.info(s"Image size (${dim.product}) is too 
large (max ${cfg.maxImageSize}).") *> + if (dim.product > cfg.ocr.maxImageSize) { + logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *> ExtractResult.failure(new Exception( - s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).") + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).") ).pure[F] } else { doExtract @@ -72,6 +72,12 @@ object Extraction { logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *> OdfExtract.get(data).map(ExtractResult.fromEither) + case mt@MimeType("text", sub) if !sub.contains("html") => + logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> + data.through(fs2.text.utf8Decode).compile.last.map { txt => + ExtractResult.success(txt.getOrElse("").trim) + } + case mt => ExtractResult.unsupportedFormat(mt).pure[F] diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 2058e072..51c1fbcb 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -33,12 +33,12 @@ object PdfExtract { //maybe better: inspect the pdf and decide whether ocr or not for { - pdfboxRes <- PdfboxExtract.get[F](in) + pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in) res <- pdfboxRes.fold( ex => logger.info( s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. 
" - ) *> runOcr.attempt, + ) >> runOcr.attempt, str => if (str.length >= stripMinLen) str.pure[F].attempt else diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala index b08f46ba..739b0149 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala @@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths} import docspell.common._ case class OcrConfig( - allowedContentTypes: Set[MimeType], - ghostscript: OcrConfig.Ghostscript, + maxImageSize: Int, + ghostscript: OcrConfig.Ghostscript, pageRange: OcrConfig.PageRange, unpaper: OcrConfig.Unpaper, tesseract: OcrConfig.Tesseract ) { - - def isAllowed(mt: MimeType): Boolean = - allowedContentTypes contains mt } object OcrConfig { @@ -27,12 +24,7 @@ object OcrConfig { case class Unpaper(command: SystemCommand.Config) val default = OcrConfig( - allowedContentTypes = Set( - MimeType.pdf, - MimeType.png, - MimeType.jpeg, - MimeType.tiff - ), + maxImageSize = 3000 * 3000, pageRange = PageRange(10), ghostscript = Ghostscript( SystemCommand.Config( diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index c2fd1678..7246bb7c 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -26,9 +26,6 @@ object TextExtract { Stream .eval(TikaMimetype.detect(in, MimeTypeHint.none)) .flatMap({ - case mt if !config.isAllowed(mt) => - raiseError(s"File `$mt` not allowed") - case MimeType.pdf => Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index a6a4ee60..35e48dc5 100644 --- 
a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -65,66 +65,143 @@ docspell.joex { } # Configuration of text extraction - # - # Extracting text currently only work for image and pdf files. It - # will first run ghostscript to create a gray image from a pdf. Then - # unpaper is run to optimize the image for the upcoming ocr, which - # will be done by tesseract. All these programs must be available in - # your PATH or the absolute path can be specified below. extraction { - allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ] - - # Defines what pages to process. If a PDF with 600 pages is - # submitted, it is probably not necessary to scan through all of - # them. This would take a long time and occupy resources for no - # value. The first few pages should suffice. The default is first - # 10 pages. - # - # If you want all pages being processed, set this number to -1. - # - # Note: if you change the ghostscript command below, be aware that - # this setting (if not -1) will add another parameter to the - # beginning of the command. - page-range { - begin = 10 + # For PDF files, the first attempt is to read the text parts of the + # PDF. But PDFs can be complex documents and they may contain text + # and images. If the returned text is shorter than the value + # below, OCR is run afterwards. Then both extracted texts are + # compared and the longer one will be used. + pdf { + min-text-len = 10 } - # The ghostscript command. - ghostscript { - command { - program = "gs" - args = [ "-dNOPAUSE" - , "-dBATCH" - , "-dSAFER" - , "-sDEVICE=tiffscaled8" - , "-sOutputFile={{outfile}}" - , "{{infile}}" - ] - timeout = "5 minutes" + # Extracting text using OCR works for image and pdf files. It will + # first run ghostscript to create a gray image from a pdf. Then + # unpaper is run to optimize the image for the upcoming ocr, which + # will be done by tesseract.
All these programs must be available + # in your PATH or the absolute path can be specified below. + ocr { + + # Images greater than this size are skipped. Note that every + # image is loaded completely into memory for doing OCR. + max-image-size = 14000000 + + # Defines what pages to process. If a PDF with 600 pages is + # submitted, it is probably not necessary to scan through all of + # them. This would take a long time and occupy resources for no + # value. The first few pages should suffice. The default is first + # 10 pages. + # + # If you want all pages being processed, set this number to -1. + # + # Note: if you change the ghostscript command below, be aware that + # this setting (if not -1) will add another parameter to the + # beginning of the command. + page-range { + begin = 10 } - working-dir = ${java.io.tmpdir}"/docspell-extraction" - } - # The unpaper command. - unpaper { - command { - program = "unpaper" - args = [ "{{infile}}", "{{outfile}}" ] - timeout = "5 minutes" + # The ghostscript command. + ghostscript { + command { + program = "gs" + args = [ "-dNOPAUSE" + , "-dBATCH" + , "-dSAFER" + , "-sDEVICE=tiffscaled8" + , "-sOutputFile={{outfile}}" + , "{{infile}}" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-extraction" } - } - # The tesseract command. - tesseract { - command { - program = "tesseract" - args = ["{{file}}" - , "stdout" - , "-l" - , "{{lang}}" - ] - timeout = "5 minutes" + # The unpaper command. + unpaper { + command { + program = "unpaper" + args = [ "{{infile}}", "{{outfile}}" ] + timeout = "5 minutes" + } + } + + # The tesseract command. + tesseract { + command { + program = "tesseract" + args = ["{{file}}" + , "stdout" + , "-l" + , "{{lang}}" + ] + timeout = "5 minutes" + } } } } + + # Configuration for converting files into PDFs. + # + # Most of it is delegated to external tools, which can be configured + # below. 
They must be available in your PATH, or the full + # path must be specified below via the `program` key. + convert { + chunk-size = 524288 + + max-image-size = 12000000 + + markdown { + internal-css = """ + body { padding: 2em 5em; } + """ + } + + wkhtmlpdf { + cmd = { + program = "wkhtmltopdf" + args = [ + "-s", + "A4", + "--encoding", + "UTF-8", + "-", + "{{outfile}}" + ] + timeout = "20 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + tesseract = { + cmd = { + program = "tesseract" + args = [ + "{{infile}}", + "out", + "-l", + "{{lang}}", + "pdf", + "txt" + ] + timeout = "120 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + unoconv = { + cmd = { + program = "unoconv" + args = [ + "-f", + "pdf", + "-o", + "{{outfile}}", + "{{infile}}" + ] + timeout = "20 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + } } \ No newline at end of file diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 7e4de76c..62ad3aad 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -3,17 +3,17 @@ package docspell.joex import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig -import docspell.extract.ocr.{OcrConfig => OcrConfig} import docspell.convert.ConvertConfig +import docspell.extract.ExtractConfig case class Config( - appId: Ident, - baseUrl: LenientUri, - bind: Config.Bind, - jdbc: JdbcConfig, - scheduler: SchedulerConfig, - extraction: OcrConfig, - convert: ConvertConfig + appId: Ident, + baseUrl: LenientUri, + bind: Config.Bind, + jdbc: JdbcConfig, + scheduler: SchedulerConfig, + extraction: ExtractConfig, + convert: ConvertConfig ) object Config { diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index
23de73de..a1035da1 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -1,15 +1,17 @@ package docspell.joex.process -import bitpeace.Mimetype +import bitpeace.{Mimetype, MimetypeHint, RangeDef} +import cats.implicits._ import cats.Functor import cats.implicits._ import cats.effect._ -import cats.data.OptionT - +import cats.data.{Kleisli, OptionT} +import fs2.Stream import docspell.common._ import docspell.convert._ import docspell.joex.scheduler._ import docspell.store.records._ +import docspell.convert.ConversionResult.Handler /** Goes through all attachments and creates a PDF version of it where * supported. @@ -32,32 +34,92 @@ object ConvertPdf { item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - // get mimetype - // try to convert - // save to db - // update file_id of RAttachment - def convert(ra: RAttachment) = - findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m)) + findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m)) for { ras <- item.attachments.traverse(convert) - } yield item.copy(attachments = ras) + nra = ras.map(_._1) + nma = ras.flatMap(_._2) + } yield item.copy(attachments = nra, metas = nma) } - def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] = + def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] = OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId))) .map(_.mimetype) .getOrElse(Mimetype.`application/octet-stream`) def convertSafe[F[_]: Sync: ContextShift]( cfg: ConvertConfig, - ctx: Context[F, ProcessItemArgs] - )(ra: RAttachment, mime: Mimetype): F[RAttachment] = - Conversion.create[F](cfg, ctx.blocker,ctx.logger).use { conv => - ctx.logger - .info(s"File ${ra.name} has mime ${mime.asString}. 
conv=$conv") - .map(_ => ra) + ctx: Context[F, ProcessItemArgs], + item: ItemData + )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = + Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv => + mime match { + case Mimetype.`application/pdf` => + ctx.logger.info("Not going to convert a PDF file into a PDF.") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case _ => + val data = ctx.store.bitpeace + .get(ra.fileId.id) + .unNoneTerminate + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + val handler = conversionHandler[F](ctx, cfg, ra, item) + ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> + conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data) + } } + + private def conversionHandler[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs], + cfg: ConvertConfig, + ra: RAttachment, + item: ItemData + ): Handler[F, (RAttachment, Option[RAttachmentMeta])] = + Kleisli({ + case ConversionResult.SuccessPdf(pdf) => + ctx.logger.info(s"Conversion to pdf successful. Saving file.") *> + storePDF(ctx, cfg, ra, pdf) + .map(r => (r, None)) + + case ConversionResult.SuccessPdfTxt(pdf, txt) => + ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *> + storePDF(ctx, cfg, ra, pdf) + .flatMap(r => + txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some)) + ) + + case ConversionResult.UnsupportedFormat(mt) => + ctx.logger.info(s"PDF conversion for type ${mt.asString} not supported!") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case ConversionResult.InputMalformed(mt, reason) => + ctx.logger.info( + s"PDF conversion from type ${mt.asString} reported malformed input: $reason." + ) *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case ConversionResult.Failure(ex) => + ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. 
Go without PDF file") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + }) + + private def storePDF[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs], + cfg: ConvertConfig, + ra: RAttachment, + pdf: Stream[F, Byte] + ) = { + val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) + val newName = ra.name.map(n => s"$n.pdf") + ctx.store.bitpeace + .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised)) + .compile + .lastOrError + .map(fm => Ident.unsafe(fm.id)) + .flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)) + .map(fmId => ra.copy(fileId = fmId, name = newName)) + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 1ac90139..6eada36d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -1,5 +1,6 @@ package docspell.joex.process +import bitpeace.FileMeta import cats.implicits._ import cats.effect.Sync import cats.data.OptionT @@ -22,13 +23,15 @@ object CreateItem { def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] = Task { ctx => - val validFiles = ctx.args.meta.validFileTypes.map(_.asString).toSet + def isValidFile(fm: FileMeta) = + ctx.args.meta.validFileTypes.isEmpty || + ctx.args.meta.validFileTypes.map(_.asString).toSet.contains(fm.mimetype.baseType) def fileMetas(itemId: Ident, now: Timestamp) = Stream .emits(ctx.args.files) .flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm))) - .collect({ case (f, Some(fm)) if validFiles.contains(fm.mimetype.baseType) => f }) + .collect({ case (f, Some(fm)) if isValidFile(fm) => f }) .zipWithIndex .evalMap({ case (f, index) => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 
c5f474a5..b9fd22c4 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -9,7 +9,7 @@ case class ItemData( attachments: Vector[RAttachment], metas: Vector[RAttachmentMeta], dateLabels: Vector[AttachmentDates], - originFile: Map[Ident, Ident] + originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = @@ -17,6 +17,21 @@ case class ItemData( def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] = dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty) + + def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = { + val item = changeMeta(attachId, f) + val next = metas.map(a => if (a.id == attachId) item else a) + copy(metas = next) + } + + def changeMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): RAttachmentMeta = + f(findOrCreate(attachId)) + + def findOrCreate(attachId: Ident): RAttachmentMeta = + metas.find(_.id == attachId).getOrElse { + RAttachmentMeta.empty(attachId) + } + } object ItemData { diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 6ab1c451..6f72836d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -1,25 +1,25 @@ package docspell.joex.process -import bitpeace.RangeDef +import bitpeace.{Mimetype, RangeDef} +import cats.data.OptionT import cats.implicits._ -import cats.effect.{Blocker, ContextShift, Sync} +import cats.effect.{ContextShift, Sync} import docspell.common._ +import docspell.extract.{ExtractConfig, ExtractResult, Extraction} import docspell.joex.scheduler.{Context, Task} -import docspell.store.Store -import docspell.store.records.{RAttachment, RAttachmentMeta} -import 
docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig} +import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta} object TextExtraction { def apply[F[_]: Sync: ContextShift]( - cfg: OcrConfig, - item: ItemData + cfg: ExtractConfig, + item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Starting text extraction") start <- Duration.stopTime[F] - txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item)) + txt <- item.attachments.traverse(extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)) _ <- ctx.logger.debug("Storing extracted texts") _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) dur <- start @@ -27,53 +27,84 @@ object TextExtraction { } yield item.copy(metas = txt) } + def extractTextIfEmpty[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + cfg: ExtractConfig, + lang: Language, + item: ItemData + )(ra: RAttachment): F[RAttachmentMeta] = { + val rm = item.findOrCreate(ra.id) + rm.content match { + case Some(_) => + ctx.logger.info("TextExtraction skipped, since text is already available.") *> + rm.pure[F] + case None => + extractTextToMeta[F](ctx, cfg, lang, item)(ra) + } + } + def extractTextToMeta[F[_]: Sync: ContextShift]( ctx: Context[F, _], - cfg: OcrConfig, - lang: Language, - item: ItemData + cfg: ExtractConfig, + lang: Language, + item: ItemData )(ra: RAttachment): F[RAttachmentMeta] = for { - _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}") + _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") dst <- Duration.stopTime[F] - txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra)) - meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty)) + txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra)) + meta = item.changeMeta(ra.id, rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))) est <- dst _ <- 
ctx.logger.debug( - s"Extracting text for attachment ${ra.name} finished in ${est.formatExact}" + s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" ) } yield meta def extractText[F[_]: Sync: ContextShift]( - ocrConfig: OcrConfig, - lang: Language, - store: Store[F], - blocker: Blocker, - logger: Logger[F] - )(fileId: Ident): F[Option[String]] = { - val data = store.bitpeace + ctx: Context[F, _], + extr: Extraction[F], + lang: Language + )(fileId: Ident): F[ExtractResult] = { + val data = ctx.store.bitpeace .get(fileId.id) .unNoneTerminate - .through(store.bitpeace.fetchData2(RangeDef.all)) + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last + def findMime: F[Mimetype] = + OptionT(ctx.store.transact(RFileMeta.findById(fileId))) + .map(_.mimetype) + .getOrElse(Mimetype.`application/octet-stream`) + + findMime + .flatMap(mt => + extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang)) } private def extractTextFallback[F[_]: Sync: ContextShift]( - ctx: Context[F, _], - ocrConfig: OcrConfig, - lang: Language, + ctx: Context[F, _], + cfg: ExtractConfig, + ra: RAttachment, + lang: Language, )(fileIds: List[Ident]): F[Option[String]] = { fileIds match { case Nil => ctx.logger.error(s"Cannot extract text").map(_ => None) case id :: rest => - extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id). - recoverWith({ - case ex => + val extr = Extraction.create[F](ctx.blocker, ctx.logger, cfg) + + extractText[F](ctx, extr, lang)(id) + .flatMap({ + case ExtractResult.Success(txt) => + txt.some.pure[F] + + case ExtractResult.UnsupportedFormat(mt) => + ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file."). 
+ flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) + + case ExtractResult.Failure(ex) => ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file"). - flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest)) + flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) }) } } @@ -86,4 +117,9 @@ object TextExtraction { case Some(sid) => List(sid, ra.fileId).distinct case None => List(ra.fileId) } + + private def stripAttachmentName(ra: RAttachment): String = + ra.name + .map(s => if (s.endsWith(".pdf") && s.count(_ == '.') > 1) s.dropRight(4) else s) + .getOrElse("") } diff --git a/modules/restserver/src/main/resources/reference.conf b/modules/restserver/src/main/resources/reference.conf index 182bb0e4..4e165dc5 100644 --- a/modules/restserver/src/main/resources/reference.conf +++ b/modules/restserver/src/main/resources/reference.conf @@ -80,9 +80,10 @@ docspell.server { # The file content types that are considered valid. Docspell # will only pass these files to processing. The processing code # itself has also checks for which files are supported and which - # not. This affects the uploading part and is a first check to - # avoid that 'bad' files get into the system. - valid-mime-types = [ "application/pdf" ] + # not. This affects the uploading part and can be used to + # restrict file types that should be handed over to processing. + # By default all files are allowed. 
+ valid-mime-types = [ ] } } } \ No newline at end of file diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index 22ab8e89..728abc95 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -38,6 +38,9 @@ object RAttachment { fr"${v.id},${v.itemId},${v.fileId.id},${v.position},${v.created},${v.name}" ).update.run + def updateFileIdAndName(attachId: Ident, fId: Ident, fname: Option[String]): ConnectionIO[Int] = + updateRow(table, id.is(attachId), commas(fileId.setTo(fId), name.setTo(fname))).update.run + def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = selectSimple(all, table, id.is(attachId)).query[RAttachment].option @@ -108,7 +111,8 @@ object RAttachment { def delete(attachId: Ident): ConnectionIO[Int] = for { n0 <- RAttachmentMeta.delete(attachId) - n1 <- deleteFrom(table, id.is(attachId)).update.run - } yield n0 + n1 + n1 <- RAttachmentSource.delete(attachId) + n2 <- deleteFrom(table, id.is(attachId)).update.run + } yield n0 + n1 + n2 } diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index f1887399..9de923e2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -8,11 +8,16 @@ import docspell.store.impl._ import docspell.store.impl.Implicits._ case class RAttachmentMeta( - id: Ident, + id: Ident, //same as RAttachment.id content: Option[String], nerlabels: List[NerLabel], proposals: MetaProposalList -) {} +) { + + def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = + if (content.forall(_.trim.isEmpty)) copy(content = txt) + else this +} object RAttachmentMeta { def empty(attachId: Ident) = 
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty) diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala index 447af3aa..052be661 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala @@ -41,4 +41,6 @@ object RAttachmentSource { def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] = selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option + def delete(attachId: Ident): ConnectionIO[Int] = + deleteFrom(table, id.is(attachId)).update.run } diff --git a/modules/webapp/src/main/elm/Comp/Dropzone.elm b/modules/webapp/src/main/elm/Comp/Dropzone.elm index 3ef12443..e551209e 100644 --- a/modules/webapp/src/main/elm/Comp/Dropzone.elm +++ b/modules/webapp/src/main/elm/Comp/Dropzone.elm @@ -35,7 +35,7 @@ type alias Settings = defaultSettings : Settings defaultSettings = { classList = \_ -> [ ( "ui placeholder segment", True ) ] - , contentTypes = [ "application/pdf" ] + , contentTypes = [] } @@ -148,7 +148,11 @@ filterMime settings files = pred f = List.member (File.mime f) settings.contentTypes in - List.filter pred files + if settings.contentTypes == [] then + files + + else + List.filter pred files dropDecoder : D.Decoder Msg