Integrate support for more files into processing and upload

The restriction that only PDF files can be uploaded is removed; files of
any type can now be uploaded. Processing may not be able to handle every
file type. It is still possible to restrict uploads by file type via the
configuration.
This commit is contained in:
Eike Kettner
2020-02-19 23:27:00 +01:00
parent 9b1349734e
commit 97305d27ff
21 changed files with 366 additions and 148 deletions

View File

@ -13,7 +13,7 @@ import docspell.files.{ImageSize, TikaMimetype}
trait Conversion[F[_]] {
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
}
@ -26,7 +26,7 @@ object Conversion {
): Resource[F, Conversion[F]] =
Resource.pure(new Conversion[F] {
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
TikaMimetype.resolve(dataType, in).flatMap {
case MimeType.pdf =>
handler.run(ConversionResult.successPdf(in))
@ -55,14 +55,14 @@ object Conversion {
)
)
} else {
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
}
case None =>
logger.info(
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
) *>
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
}
case Office(_) =>
@ -109,4 +109,13 @@ object Conversion {
/** Extractor: yields the given mime type only when it is a member of `all`,
  * otherwise `None`.
  */
def unapply(m: MimeType): Option[MimeType] =
  if (all.contains(m)) Some(m) else None
}
/** Extractor that accepts a mime type when it is matched by the `Office`,
  * `Texts` or `Images` extractor (tried in that order) or equals
  * `MimeType.html`; any other mime type yields `None`.
  */
def unapply(mt: MimeType): Option[MimeType] =
  mt match {
    case Office(_) | Texts(_) | Images(_) | MimeType.html => Some(mt)
    case _                                                => None
  }
}

View File

@ -12,6 +12,7 @@ object Tesseract {
def toPDF[F[_]: Sync: ContextShift, A](
cfg: TesseractConfig,
lang: Language,
chunkSize: Int,
blocker: Blocker,
logger: Logger[F]
@ -20,7 +21,7 @@ object Tesseract {
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
}
}

View File

@ -150,7 +150,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
val load = uri.readURL[IO](8192, blocker)
val dataType = DataType.filename(uri.path.segments.last)
logger.info(s"Processing file ${uri.path.asString}") *>
conv.toPDF(dataType, handler(index))(load)
conv.toPDF(dataType, Language.German, handler(index))(load)
})
def commandsExist: Boolean =

View File

@ -89,7 +89,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
val tessCfg = TesseractConfig(cfg, target)
val (pdf, txt) =
Tesseract
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)(
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
)