Convert some files to pdf

This commit is contained in:
Eike Kettner
2020-02-18 21:32:21 +01:00
parent 5869e2ee6e
commit 9b1349734e
19 changed files with 605 additions and 98 deletions

View File

@ -2,4 +2,4 @@ package docspell.extract
import docspell.extract.ocr.OcrConfig
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
/** Settings read by the extraction code (accessed there as `cfg`).
  *
  * @param maxImageSize upper limit that an image's `dim.product` is compared against
  *                     before OCR is attempted; when the value is exceeded, a failure
  *                     result is returned instead of running OCR
  *                     (presumably a pixel count, i.e. width * height — TODO confirm
  *                     against `ImageSize`)
  * @param ocr          OCR settings, handed to `TextExtract.extractOCR`
  * @param pdf          PDF-related settings (usage not visible in this excerpt)
  */
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)

View File

@ -9,6 +9,7 @@ import docspell.extract.poi.{PoiExtract, PoiType}
import docspell.extract.rtf.RtfExtract
import fs2.Stream
import docspell.files.TikaMimetype
import docspell.files.ImageSize
trait Extraction[F[_]] {
@ -44,14 +45,29 @@ object Extraction {
case OdfType(_) =>
OdfExtract.get(data).map(ExtractResult.fromEither)
case OcrType(_) =>
TextExtract
case OcrType(mt) =>
val doExtract = TextExtract
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.compile
.lastOrError
.attempt
.map(ExtractResult.fromEither)
ImageSize.get(data).flatMap {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
ExtractResult.failure(new Exception(
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
).pure[F]
} else {
doExtract
}
case None =>
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
doExtract
}
case OdfType.container =>
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
OdfExtract.get(data).map(ExtractResult.fromEither)

View File

@ -4,10 +4,10 @@ import docspell.common.MimeType
object OdfType {
val odt = MimeType.application("application/vnd.oasis.opendocument.text")
val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet")
val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text")
val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet")
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
val container = MimeType.zip