diff --git a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala index 7b98a92a..4199a29f 100644 --- a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala +++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala @@ -1,6 +1,10 @@ package docspell.common -case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} +case class MimeTypeHint(filename: Option[String], advertised: Option[String]) { + + def withName(name: String): MimeTypeHint = + copy(filename = Some(name)) +} object MimeTypeHint { val none = MimeTypeHint(None, None) diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala index e106b844..18d62517 100644 --- a/modules/convert/src/main/scala/docspell/convert/Conversion.scala +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -13,7 +13,7 @@ import docspell.files.{ImageSize, TikaMimetype} trait Conversion[F[_]] { - def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] } @@ -26,7 +26,7 @@ object Conversion { ): Resource[F, Conversion[F]] = Resource.pure(new Conversion[F] { - def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = TikaMimetype.resolve(dataType, in).flatMap { case MimeType.pdf => handler.run(ConversionResult.successPdf(in)) @@ -55,14 +55,14 @@ object Conversion { ) ) } else { - Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) } case None => logger.info( s"Cannot read image when determining size for ${mt.asString}. Converting anyways." 
) *> - Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler) + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) } case Office(_) => @@ -109,4 +109,13 @@ object Conversion { def unapply(m: MimeType): Option[MimeType] = Some(m).filter(all.contains) } + + def unapply(mt: MimeType): Option[MimeType] = + mt match { + case Office(_) => Some(mt) + case Texts(_) => Some(mt) + case Images(_) => Some(mt) + case MimeType.html => Some(mt) + case _ => None + } } diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala index 7c04608e..233cfa96 100644 --- a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -12,6 +12,7 @@ object Tesseract { def toPDF[F[_]: Sync: ContextShift, A]( cfg: TesseractConfig, + lang: Language, chunkSize: Int, blocker: Blocker, logger: Logger[F] @@ -20,7 +21,7 @@ object Tesseract { val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger) - ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler) + ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler) } } diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala index dc158a31..294ce4db 100644 --- a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -150,7 +150,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks { val load = uri.readURL[IO](8192, blocker) val dataType = DataType.filename(uri.path.segments.last) logger.info(s"Processing 
file ${uri.path.asString}") *> - conv.toPDF(dataType, handler(index))(load) + conv.toPDF(dataType, Language.German, handler(index))(load) }) def commandsExist: Boolean = diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala index 9db0588c..a2f496ec 100644 --- a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -89,7 +89,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks { val tessCfg = TesseractConfig(cfg, target) val (pdf, txt) = Tesseract - .toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)( + .toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)( ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker), storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt")) ) diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala index ae35cb5f..b4951686 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -2,4 +2,4 @@ package docspell.extract import docspell.extract.ocr.OcrConfig -case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig) +case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig) diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index 81a61e7a..02ca0502 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -55,10 +55,10 @@ object Extraction { ImageSize.get(data).flatMap { case Some(dim) => - if (dim.product > cfg.maxImageSize) { - logger.info(s"Image size (${dim.product}) is too 
large (max ${cfg.maxImageSize}).") *> + if (dim.product > cfg.ocr.maxImageSize) { + logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *> ExtractResult.failure(new Exception( - s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).") + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).") ).pure[F] } else { doExtract @@ -72,6 +72,12 @@ object Extraction { logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *> OdfExtract.get(data).map(ExtractResult.fromEither) + case mt@MimeType("text", sub) if !sub.contains("html") => + logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> + data.through(fs2.text.utf8Decode).compile.foldMonoid.map { txt => /* utf8Decode emits one string per chunk, so all of them are concatenated; `last` would keep only the final chunk. An empty stream still yields "". */ + ExtractResult.success(txt.trim) + } + case mt => ExtractResult.unsupportedFormat(mt).pure[F] diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 2058e072..51c1fbcb 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -33,12 +33,12 @@ object PdfExtract { //maybe better: inspect the pdf and decide whether ocr or not for { - pdfboxRes <- PdfboxExtract.get[F](in) + pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in) res <- pdfboxRes.fold( ex => logger.info( s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. 
" - ) *> runOcr.attempt, + ) >> runOcr.attempt, str => if (str.length >= stripMinLen) str.pure[F].attempt else diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala index b08f46ba..739b0149 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala @@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths} import docspell.common._ case class OcrConfig( - allowedContentTypes: Set[MimeType], - ghostscript: OcrConfig.Ghostscript, + maxImageSize: Int, + ghostscript: OcrConfig.Ghostscript, pageRange: OcrConfig.PageRange, unpaper: OcrConfig.Unpaper, tesseract: OcrConfig.Tesseract ) { - - def isAllowed(mt: MimeType): Boolean = - allowedContentTypes contains mt } object OcrConfig { @@ -27,12 +24,7 @@ object OcrConfig { case class Unpaper(command: SystemCommand.Config) val default = OcrConfig( - allowedContentTypes = Set( - MimeType.pdf, - MimeType.png, - MimeType.jpeg, - MimeType.tiff - ), + maxImageSize = 3000 * 3000, pageRange = PageRange(10), ghostscript = Ghostscript( SystemCommand.Config( diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index c2fd1678..7246bb7c 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -26,9 +26,6 @@ object TextExtract { Stream .eval(TikaMimetype.detect(in, MimeTypeHint.none)) .flatMap({ - case mt if !config.isAllowed(mt) => - raiseError(s"File `$mt` not allowed") - case MimeType.pdf => Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index a6a4ee60..35e48dc5 100644 --- 
a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -65,66 +65,143 @@ docspell.joex { } # Configuration of text extraction - # - # Extracting text currently only work for image and pdf files. It - # will first run ghostscript to create a gray image from a pdf. Then - # unpaper is run to optimize the image for the upcoming ocr, which - # will be done by tesseract. All these programs must be available in - # your PATH or the absolute path can be specified below. extraction { - allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ] - - # Defines what pages to process. If a PDF with 600 pages is - # submitted, it is probably not necessary to scan through all of - # them. This would take a long time and occupy resources for no - # value. The first few pages should suffice. The default is first - # 10 pages. - # - # If you want all pages being processed, set this number to -1. - # - # Note: if you change the ghostscript command below, be aware that - # this setting (if not -1) will add another parameter to the - # beginning of the command. - page-range { - begin = 10 + # For PDF files it is first tried to read the text parts of the + # PDF. But PDFs can be complex documents and they may contain text + # and images. If the returned text is shorter than the value + # below, OCR is run afterwards. Then both extracted texts are + # compared and the longer will be used. + pdf { + min-text-len = 10 } - # The ghostscript command. - ghostscript { - command { - program = "gs" - args = [ "-dNOPAUSE" - , "-dBATCH" - , "-dSAFER" - , "-sDEVICE=tiffscaled8" - , "-sOutputFile={{outfile}}" - , "{{infile}}" - ] - timeout = "5 minutes" + # Extracting text using OCR works for image and pdf files. It will + # first run ghostscript to create a gray image from a pdf. Then + # unpaper is run to optimize the image for the upcoming ocr, which + # will be done by tesseract. 
All these programs must be available + # in your PATH or the absolute path can be specified below. + ocr { + + # Images greater than this size are skipped. Note that every + # image is loaded completely into memory for doing OCR. + max-image-size = 14000000 + + # Defines what pages to process. If a PDF with 600 pages is + # submitted, it is probably not necessary to scan through all of + # them. This would take a long time and occupy resources for no + # value. The first few pages should suffice. The default is first + # 10 pages. + # + # If you want all pages being processed, set this number to -1. + # + # Note: if you change the ghostscript command below, be aware that + # this setting (if not -1) will add another parameter to the + # beginning of the command. + page-range { + begin = 10 } - working-dir = ${java.io.tmpdir}"/docspell-extraction" - } - # The unpaper command. - unpaper { - command { - program = "unpaper" - args = [ "{{infile}}", "{{outfile}}" ] - timeout = "5 minutes" + # The ghostscript command. + ghostscript { + command { + program = "gs" + args = [ "-dNOPAUSE" + , "-dBATCH" + , "-dSAFER" + , "-sDEVICE=tiffscaled8" + , "-sOutputFile={{outfile}}" + , "{{infile}}" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-extraction" } - } - # The tesseract command. - tesseract { - command { - program = "tesseract" - args = ["{{file}}" - , "stdout" - , "-l" - , "{{lang}}" - ] - timeout = "5 minutes" + # The unpaper command. + unpaper { + command { + program = "unpaper" + args = [ "{{infile}}", "{{outfile}}" ] + timeout = "5 minutes" + } + } + + # The tesseract command. + tesseract { + command { + program = "tesseract" + args = ["{{file}}" + , "stdout" + , "-l" + , "{{lang}}" + ] + timeout = "5 minutes" + } } } } + + # Configuration for converting files into PDFs. + # + # Most of it is delegated to external tools, which can be configured + # below. 
They must be in the PATH environment or specify the full + # path below via the `program` key. + convert { + chunk-size = 524288 + + max-image-size = 12000000 + + markdown { + internal-css = """ + body { padding: 2em 5em; } + """ + } + + wkhtmlpdf { + cmd = { + program = "wkhtmltopdf" + args = [ + "-s", + "A4", + "--encoding", + "UTF-8", + "-", + "{{outfile}}" + ] + timeout = "20 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + tesseract = { + cmd = { + program = "tesseract" + args = [ + "{{infile}}", + "out", + "-l", + "{{lang}}", + "pdf", + "txt" + ] + timeout = "120 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + unoconv = { + cmd = { + program = "unoconv" + args = [ + "-f", + "pdf", + "-o", + "{{outfile}}", + "{{infile}}" + ] + timeout = "20 seconds" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + } } \ No newline at end of file diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 7e4de76c..62ad3aad 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -3,17 +3,17 @@ package docspell.joex import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig -import docspell.extract.ocr.{OcrConfig => OcrConfig} import docspell.convert.ConvertConfig +import docspell.extract.ExtractConfig case class Config( - appId: Ident, - baseUrl: LenientUri, - bind: Config.Bind, - jdbc: JdbcConfig, - scheduler: SchedulerConfig, - extraction: OcrConfig, - convert: ConvertConfig + appId: Ident, + baseUrl: LenientUri, + bind: Config.Bind, + jdbc: JdbcConfig, + scheduler: SchedulerConfig, + extraction: ExtractConfig, + convert: ConvertConfig ) object Config { diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 
23de73de..a1035da1 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -1,15 +1,17 @@ package docspell.joex.process -import bitpeace.Mimetype +import bitpeace.{Mimetype, MimetypeHint, RangeDef} +import cats.implicits._ import cats.Functor import cats.implicits._ import cats.effect._ -import cats.data.OptionT - +import cats.data.{Kleisli, OptionT} +import fs2.Stream import docspell.common._ import docspell.convert._ import docspell.joex.scheduler._ import docspell.store.records._ +import docspell.convert.ConversionResult.Handler /** Goes through all attachments and creates a PDF version of it where * supported. @@ -32,32 +34,92 @@ object ConvertPdf { item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - // get mimetype - // try to convert - // save to db - // update file_id of RAttachment - def convert(ra: RAttachment) = - findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx)(ra, m)) + findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m)) for { ras <- item.attachments.traverse(convert) - } yield item.copy(attachments = ras) + nra = ras.map(_._1) + nma = ras.flatMap(_._2) + } yield item.copy(attachments = nra, metas = nma) } - def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] = + def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] = OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId))) .map(_.mimetype) .getOrElse(Mimetype.`application/octet-stream`) def convertSafe[F[_]: Sync: ContextShift]( cfg: ConvertConfig, - ctx: Context[F, ProcessItemArgs] - )(ra: RAttachment, mime: Mimetype): F[RAttachment] = - Conversion.create[F](cfg, ctx.blocker,ctx.logger).use { conv => - ctx.logger - .info(s"File ${ra.name} has mime ${mime.asString}. 
conv=$conv") - .map(_ => ra) + ctx: Context[F, ProcessItemArgs], + item: ItemData + )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] = + Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv => + mime match { + case Mimetype.`application/pdf` => + ctx.logger.info("Not going to convert a PDF file into a PDF.") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case _ => + val data = ctx.store.bitpeace + .get(ra.fileId.id) + .unNoneTerminate + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + val handler = conversionHandler[F](ctx, cfg, ra, item) + ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *> + conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data) + } } + + private def conversionHandler[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs], + cfg: ConvertConfig, + ra: RAttachment, + item: ItemData + ): Handler[F, (RAttachment, Option[RAttachmentMeta])] = + Kleisli({ + case ConversionResult.SuccessPdf(pdf) => + ctx.logger.info(s"Conversion to pdf successful. Saving file.") *> + storePDF(ctx, cfg, ra, pdf) + .map(r => (r, None)) + + case ConversionResult.SuccessPdfTxt(pdf, txt) => + ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *> + storePDF(ctx, cfg, ra, pdf) + .flatMap(r => + txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some)) + ) + + case ConversionResult.UnsupportedFormat(mt) => + ctx.logger.info(s"PDF conversion for type ${mt.asString} not supported!") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case ConversionResult.InputMalformed(mt, reason) => + ctx.logger.info( + s"PDF conversion from type ${mt.asString} reported malformed input: $reason." + ) *> + (ra, None: Option[RAttachmentMeta]).pure[F] + + case ConversionResult.Failure(ex) => + ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. 
Go without PDF file") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + }) + + private def storePDF[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs], + cfg: ConvertConfig, + ra: RAttachment, + pdf: Stream[F, Byte] + ) = { + val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) + val newName = ra.name.map(n => s"$n.pdf") + ctx.store.bitpeace + .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised)) + .compile + .lastOrError + .map(fm => Ident.unsafe(fm.id)) + .flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)) + .map(fmId => ra.copy(fileId = fmId, name = newName)) + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 1ac90139..6eada36d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -1,5 +1,6 @@ package docspell.joex.process +import bitpeace.FileMeta import cats.implicits._ import cats.effect.Sync import cats.data.OptionT @@ -22,13 +23,15 @@ object CreateItem { def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] = Task { ctx => - val validFiles = ctx.args.meta.validFileTypes.map(_.asString).toSet + def isValidFile(fm: FileMeta) = + ctx.args.meta.validFileTypes.isEmpty || + ctx.args.meta.validFileTypes.map(_.asString).toSet.contains(fm.mimetype.baseType) def fileMetas(itemId: Ident, now: Timestamp) = Stream .emits(ctx.args.files) .flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm))) - .collect({ case (f, Some(fm)) if validFiles.contains(fm.mimetype.baseType) => f }) + .collect({ case (f, Some(fm)) if isValidFile(fm) => f }) .zipWithIndex .evalMap({ case (f, index) => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 
c5f474a5..b9fd22c4 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -9,7 +9,7 @@ case class ItemData( attachments: Vector[RAttachment], metas: Vector[RAttachmentMeta], dateLabels: Vector[AttachmentDates], - originFile: Map[Ident, Ident] + originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = @@ -17,6 +17,21 @@ case class ItemData( def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] = dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty) + + def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = { /* Returns a copy of this item where `f` has been applied to the meta of the given attachment. */ + val item = changeMeta(attachId, f) /* changeMeta starts from an empty meta when none exists for attachId */ + val next = if (metas.exists(_.id == attachId)) metas.map(a => if (a.id == attachId) item else a) else metas :+ item /* append a newly created meta instead of silently dropping it */ + copy(metas = next) + } + + def changeMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): RAttachmentMeta = + f(findOrCreate(attachId)) + + def findOrCreate(attachId: Ident): RAttachmentMeta = + metas.find(_.id == attachId).getOrElse { + RAttachmentMeta.empty(attachId) + } + } object ItemData { diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 6ab1c451..6f72836d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -1,25 +1,25 @@ package docspell.joex.process -import bitpeace.RangeDef +import bitpeace.{Mimetype, RangeDef} +import cats.data.OptionT import cats.implicits._ -import cats.effect.{Blocker, ContextShift, Sync} +import cats.effect.{ContextShift, Sync} import docspell.common._ +import docspell.extract.{ExtractConfig, ExtractResult, Extraction} import docspell.joex.scheduler.{Context, Task} -import docspell.store.Store -import docspell.store.records.{RAttachment, RAttachmentMeta} -import 
docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig} +import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta} object TextExtraction { def apply[F[_]: Sync: ContextShift]( - cfg: OcrConfig, - item: ItemData + cfg: ExtractConfig, + item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Starting text extraction") start <- Duration.stopTime[F] - txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item)) + txt <- item.attachments.traverse(extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)) _ <- ctx.logger.debug("Storing extracted texts") _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) dur <- start @@ -27,53 +27,84 @@ object TextExtraction { } yield item.copy(metas = txt) } + def extractTextIfEmpty[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + cfg: ExtractConfig, + lang: Language, + item: ItemData + )(ra: RAttachment): F[RAttachmentMeta] = { + val rm = item.findOrCreate(ra.id) + rm.content match { + case Some(_) => + ctx.logger.info("TextExtraction skipped, since text is already available.") *> + rm.pure[F] + case None => + extractTextToMeta[F](ctx, cfg, lang, item)(ra) + } + } + def extractTextToMeta[F[_]: Sync: ContextShift]( ctx: Context[F, _], - cfg: OcrConfig, - lang: Language, - item: ItemData + cfg: ExtractConfig, + lang: Language, + item: ItemData )(ra: RAttachment): F[RAttachmentMeta] = for { - _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}") + _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") dst <- Duration.stopTime[F] - txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra)) - meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty)) + txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra)) + meta = item.changeMeta(ra.id, rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))) est <- dst _ <- 
ctx.logger.debug( - s"Extracting text for attachment ${ra.name} finished in ${est.formatExact}" + s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" ) } yield meta def extractText[F[_]: Sync: ContextShift]( - ocrConfig: OcrConfig, - lang: Language, - store: Store[F], - blocker: Blocker, - logger: Logger[F] - )(fileId: Ident): F[Option[String]] = { - val data = store.bitpeace + ctx: Context[F, _], + extr: Extraction[F], + lang: Language + )(fileId: Ident): F[ExtractResult] = { + val data = ctx.store.bitpeace .get(fileId.id) .unNoneTerminate - .through(store.bitpeace.fetchData2(RangeDef.all)) + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - TextExtract.extract(data, blocker, logger, lang.iso3, ocrConfig).compile.last + def findMime: F[Mimetype] = + OptionT(ctx.store.transact(RFileMeta.findById(fileId))) + .map(_.mimetype) + .getOrElse(Mimetype.`application/octet-stream`) + + findMime + .flatMap(mt => + extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang)) } private def extractTextFallback[F[_]: Sync: ContextShift]( - ctx: Context[F, _], - ocrConfig: OcrConfig, - lang: Language, + ctx: Context[F, _], + cfg: ExtractConfig, + ra: RAttachment, + lang: Language, )(fileIds: List[Ident]): F[Option[String]] = { fileIds match { case Nil => ctx.logger.error(s"Cannot extract text").map(_ => None) case id :: rest => - extractText[F](ocrConfig, lang, ctx.store, ctx.blocker, ctx.logger)(id). - recoverWith({ - case ex => + val extr = Extraction.create[F](ctx.blocker, ctx.logger, cfg) + + extractText[F](ctx, extr, lang)(id) + .flatMap({ + case ExtractResult.Success(txt) => + txt.some.pure[F] + + case ExtractResult.UnsupportedFormat(mt) => + ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file."). 
+ flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) + + case ExtractResult.Failure(ex) => ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file"). - flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest)) + flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) }) } } @@ -86,4 +117,9 @@ object TextExtraction { case Some(sid) => List(sid, ra.fileId).distinct case None => List(ra.fileId) } + + private def stripAttachmentName(ra: RAttachment): String = + ra.name + .map(s => if (s.endsWith(".pdf") && s.count(_ == '.') > 1) s.dropRight(4) else s) + .getOrElse("") } diff --git a/modules/restserver/src/main/resources/reference.conf b/modules/restserver/src/main/resources/reference.conf index 182bb0e4..4e165dc5 100644 --- a/modules/restserver/src/main/resources/reference.conf +++ b/modules/restserver/src/main/resources/reference.conf @@ -80,9 +80,10 @@ docspell.server { # The file content types that are considered valid. Docspell # will only pass these files to processing. The processing code # itself has also checks for which files are supported and which - # not. This affects the uploading part and is a first check to - # avoid that 'bad' files get into the system. - valid-mime-types = [ "application/pdf" ] + # not. This affects the uploading part and can be used to + # restrict file types that should be handed over to processing. + # By default all files are allowed. 
+ valid-mime-types = [ ] } } } \ No newline at end of file diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index 22ab8e89..728abc95 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -38,6 +38,9 @@ object RAttachment { fr"${v.id},${v.itemId},${v.fileId.id},${v.position},${v.created},${v.name}" ).update.run + def updateFileIdAndName(attachId: Ident, fId: Ident, fname: Option[String]): ConnectionIO[Int] = + updateRow(table, id.is(attachId), commas(fileId.setTo(fId), name.setTo(fname))).update.run + def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = selectSimple(all, table, id.is(attachId)).query[RAttachment].option @@ -108,7 +111,8 @@ object RAttachment { def delete(attachId: Ident): ConnectionIO[Int] = for { n0 <- RAttachmentMeta.delete(attachId) - n1 <- deleteFrom(table, id.is(attachId)).update.run - } yield n0 + n1 + n1 <- RAttachmentSource.delete(attachId) + n2 <- deleteFrom(table, id.is(attachId)).update.run + } yield n0 + n1 + n2 } diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index f1887399..9de923e2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -8,11 +8,16 @@ import docspell.store.impl._ import docspell.store.impl.Implicits._ case class RAttachmentMeta( - id: Ident, + id: Ident, //same as RAttachment.id content: Option[String], nerlabels: List[NerLabel], proposals: MetaProposalList -) {} +) { + + def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = + if (content.forall(_.trim.isEmpty)) copy(content = txt) + else this +} object RAttachmentMeta { def empty(attachId: Ident) = 
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty) diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala index 447af3aa..052be661 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala @@ -41,4 +41,6 @@ object RAttachmentSource { def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] = selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option + def delete(attachId: Ident): ConnectionIO[Int] = + deleteFrom(table, id.is(attachId)).update.run } diff --git a/modules/webapp/src/main/elm/Comp/Dropzone.elm b/modules/webapp/src/main/elm/Comp/Dropzone.elm index 3ef12443..e551209e 100644 --- a/modules/webapp/src/main/elm/Comp/Dropzone.elm +++ b/modules/webapp/src/main/elm/Comp/Dropzone.elm @@ -35,7 +35,7 @@ type alias Settings = defaultSettings : Settings defaultSettings = { classList = \_ -> [ ( "ui placeholder segment", True ) ] - , contentTypes = [ "application/pdf" ] + , contentTypes = [] } @@ -148,7 +148,11 @@ filterMime settings files = pred f = List.member (File.mime f) settings.contentTypes in - List.filter pred files + if settings.contentTypes == [] then + files + + else + List.filter pred files dropDecoder : D.Decoder Msg