Integrate support for more files into processing and upload

The restriction that only PDF files can be uploaded is removed. All
files can now be uploaded, although processing may not support every
file type. It is still possible to restrict uploads by file type via
the configuration.
This commit is contained in:
Eike Kettner
2020-02-19 23:27:00 +01:00
parent 9b1349734e
commit 97305d27ff
21 changed files with 366 additions and 148 deletions

View File

@ -2,4 +2,4 @@ package docspell.extract
import docspell.extract.ocr.OcrConfig
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)

View File

@ -55,10 +55,10 @@ object Extraction {
ImageSize.get(data).flatMap {
case Some(dim) =>
if (dim.product > cfg.maxImageSize) {
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
if (dim.product > cfg.ocr.maxImageSize) {
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
ExtractResult.failure(new Exception(
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
).pure[F]
} else {
doExtract
@ -72,6 +72,12 @@ object Extraction {
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
OdfExtract.get(data).map(ExtractResult.fromEither)
case mt@MimeType("text", sub) if !sub.contains("html") =>
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim)
}
case mt =>
ExtractResult.unsupportedFormat(mt).pure[F]

View File

@ -33,12 +33,12 @@ object PdfExtract {
//maybe better: inspect the pdf and decide whether ocr or not
for {
pdfboxRes <- PdfboxExtract.get[F](in)
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
res <- pdfboxRes.fold(
ex =>
logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) *> runOcr.attempt,
) >> runOcr.attempt,
str =>
if (str.length >= stripMinLen) str.pure[F].attempt
else

View File

@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths}
import docspell.common._
case class OcrConfig(
allowedContentTypes: Set[MimeType],
ghostscript: OcrConfig.Ghostscript,
maxImageSize: Int,
ghostscript: OcrConfig.Ghostscript,
pageRange: OcrConfig.PageRange,
unpaper: OcrConfig.Unpaper,
tesseract: OcrConfig.Tesseract
) {
def isAllowed(mt: MimeType): Boolean =
allowedContentTypes contains mt
}
object OcrConfig {
@ -27,12 +24,7 @@ object OcrConfig {
case class Unpaper(command: SystemCommand.Config)
val default = OcrConfig(
allowedContentTypes = Set(
MimeType.pdf,
MimeType.png,
MimeType.jpeg,
MimeType.tiff
),
maxImageSize = 3000 * 3000,
pageRange = PageRange(10),
ghostscript = Ghostscript(
SystemCommand.Config(

View File

@ -26,9 +26,6 @@ object TextExtract {
Stream
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
.flatMap({
case mt if !config.isAllowed(mt) =>
raiseError(s"File `$mt` not allowed")
case MimeType.pdf =>
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate