mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Integrate support for more files into processing and upload
The restriction that only PDF files can be uploaded has been removed; files of any type can now be uploaded. Note that processing may not be able to handle every file type. It is still possible to restrict uploads to certain file types via the configuration.
This commit is contained in:
@ -13,7 +13,7 @@ import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
trait Conversion[F[_]] {
|
||||
|
||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||
|
||||
}
|
||||
|
||||
@ -26,7 +26,7 @@ object Conversion {
|
||||
): Resource[F, Conversion[F]] =
|
||||
Resource.pure(new Conversion[F] {
|
||||
|
||||
def toPDF[A](dataType: DataType, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.pdf =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
@ -55,14 +55,14 @@ object Conversion {
|
||||
)
|
||||
)
|
||||
} else {
|
||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
}
|
||||
|
||||
case None =>
|
||||
logger.info(
|
||||
s"Cannot read image when determining size for ${mt.asString}. Converting anyways."
|
||||
) *>
|
||||
Tesseract.toPDF(cfg.tesseract, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler)
|
||||
}
|
||||
|
||||
case Office(_) =>
|
||||
@ -109,4 +109,13 @@ object Conversion {
|
||||
def unapply(m: MimeType): Option[MimeType] =
|
||||
Some(m).filter(all.contains)
|
||||
}
|
||||
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
mt match {
|
||||
case Office(_) => Some(mt)
|
||||
case Texts(_) => Some(mt)
|
||||
case Images(_) => Some(mt)
|
||||
case MimeType.html => Some(mt)
|
||||
case _ => None
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ object Tesseract {
|
||||
|
||||
def toPDF[F[_]: Sync: ContextShift, A](
|
||||
cfg: TesseractConfig,
|
||||
lang: Language,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F]
|
||||
@ -20,7 +21,7 @@ object Tesseract {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("tesseract", cfg.cmd, cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||
ExternConv.toPDF[F, A]("tesseract", cfg.cmd.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -150,7 +150,7 @@ object ConversionTest extends SimpleTestSuite with FileChecks {
|
||||
val load = uri.readURL[IO](8192, blocker)
|
||||
val dataType = DataType.filename(uri.path.segments.last)
|
||||
logger.info(s"Processing file ${uri.path.asString}") *>
|
||||
conv.toPDF(dataType, handler(index))(load)
|
||||
conv.toPDF(dataType, Language.German, handler(index))(load)
|
||||
})
|
||||
|
||||
def commandsExist: Boolean =
|
||||
|
@ -89,7 +89,7 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
val tessCfg = TesseractConfig(cfg, target)
|
||||
val (pdf, txt) =
|
||||
Tesseract
|
||||
.toPDF[IO, (Path, Path)](tessCfg, 8192, blocker, logger)(
|
||||
.toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)(
|
||||
ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker),
|
||||
storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt"))
|
||||
)
|
||||
|
Reference in New Issue
Block a user