mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Integrate support for more files into processing and upload
The restriction that only PDF files can be uploaded has been removed; files of any type can now be uploaded. Processing may not be able to handle every file type, however. It is still possible to restrict file uploads by type via the configuration.
This commit is contained in:
@ -2,4 +2,4 @@ package docspell.extract
|
||||
|
||||
import docspell.extract.ocr.OcrConfig
|
||||
|
||||
case class ExtractConfig(maxImageSize: Int, ocr: OcrConfig, pdf: PdfConfig)
|
||||
case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
|
||||
|
@ -55,10 +55,10 @@ object Extraction {
|
||||
|
||||
ImageSize.get(data).flatMap {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.maxImageSize) {
|
||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *>
|
||||
if (dim.product > cfg.ocr.maxImageSize) {
|
||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
|
||||
ExtractResult.failure(new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize}).")
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
|
||||
).pure[F]
|
||||
} else {
|
||||
doExtract
|
||||
@ -72,6 +72,12 @@ object Extraction {
|
||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt@MimeType("text", sub) if !sub.contains("html") =>
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
}
|
||||
|
||||
case mt =>
|
||||
ExtractResult.unsupportedFormat(mt).pure[F]
|
||||
|
||||
|
@ -33,12 +33,12 @@ object PdfExtract {
|
||||
|
||||
//maybe better: inspect the pdf and decide whether ocr or not
|
||||
for {
|
||||
pdfboxRes <- PdfboxExtract.get[F](in)
|
||||
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||
) *> runOcr.attempt,
|
||||
) >> runOcr.attempt,
|
||||
str =>
|
||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
||||
else
|
||||
|
@ -5,15 +5,12 @@ import java.nio.file.{Path, Paths}
|
||||
import docspell.common._
|
||||
|
||||
case class OcrConfig(
|
||||
allowedContentTypes: Set[MimeType],
|
||||
ghostscript: OcrConfig.Ghostscript,
|
||||
maxImageSize: Int,
|
||||
ghostscript: OcrConfig.Ghostscript,
|
||||
pageRange: OcrConfig.PageRange,
|
||||
unpaper: OcrConfig.Unpaper,
|
||||
tesseract: OcrConfig.Tesseract
|
||||
) {
|
||||
|
||||
def isAllowed(mt: MimeType): Boolean =
|
||||
allowedContentTypes contains mt
|
||||
}
|
||||
|
||||
object OcrConfig {
|
||||
@ -27,12 +24,7 @@ object OcrConfig {
|
||||
case class Unpaper(command: SystemCommand.Config)
|
||||
|
||||
val default = OcrConfig(
|
||||
allowedContentTypes = Set(
|
||||
MimeType.pdf,
|
||||
MimeType.png,
|
||||
MimeType.jpeg,
|
||||
MimeType.tiff
|
||||
),
|
||||
maxImageSize = 3000 * 3000,
|
||||
pageRange = PageRange(10),
|
||||
ghostscript = Ghostscript(
|
||||
SystemCommand.Config(
|
||||
|
@ -26,9 +26,6 @@ object TextExtract {
|
||||
Stream
|
||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||
.flatMap({
|
||||
case mt if !config.isAllowed(mt) =>
|
||||
raiseError(s"File `$mt` not allowed")
|
||||
|
||||
case MimeType.pdf =>
|
||||
Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate
|
||||
|
||||
|
Reference in New Issue
Block a user