Starting to support more file types

First, files are converted to PDF for archiving. A PDF also makes it
easier to create a preview. The conversion is done via the `ConvertPdf` processing
task (which is not yet implemented).

Text extraction is then tried first on the original file. If that
fails, OCR is done on the (potentially) converted PDF file.

To not lose information from the original file, it is saved using the
table `attachment_source`. If the original file is already a PDF, or
the conversion did not succeed, the `attachment` and
`attachment_source` records point to the same file.
This commit is contained in:
Eike Kettner
2020-02-09 19:42:49 +01:00
parent 57ec8eec53
commit ba3865ef5e
11 changed files with 220 additions and 19 deletions

View File

@ -27,7 +27,7 @@ object MimeType {
MimeType("image", partFromString(sub).throwLeft)
private[this] val validChars: Set[Char] =
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.").toSet
def parse(str: String): Either[String, MimeType] =
str.indexOf('/') match {
@ -44,7 +44,7 @@ object MimeType {
private def partFromString(s: String): Either[String, String] =
if (s.forall(validChars.contains)) Right(s)
else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}")
else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
val octetStream = application("octet-stream")
val pdf = application("pdf")