Starting to support more file types

First, files are converted to PDF for archiving. This also makes it easier to create a preview. The conversion is done via the `ConvertPdf` processing task (which is not yet implemented). Text extraction then first tries the original file. If that fails, OCR is done on the (potentially) converted PDF file. To avoid losing information from the original file, it is saved using the table `attachment_source`. If the original file is already a PDF, or the conversion did not succeed, the `attachment` and `attachment_source` records point to the same file.
2025-09-15 21:46:53 +00:00 · 2020-02-09 19:42:49 +01:00
parent 57ec8eec53
commit ba3865ef5e
11 changed files with 220 additions and 19 deletions
--- a/modules/common/src/main/scala/docspell/common/MimeType.scala
+++ b/modules/common/src/main/scala/docspell/common/MimeType.scala
@@ -27,7 +27,7 @@ object MimeType {
    MimeType("image", partFromString(sub).throwLeft)

  private[this] val validChars: Set[Char] =
-    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet
+    (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.").toSet

  def parse(str: String): Either[String, MimeType] =
    str.indexOf('/') match {
@@ -44,7 +44,7 @@ object MimeType {

  private def partFromString(s: String): Either[String, String] =
    if (s.forall(validChars.contains)) Right(s)
-    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}")
+    else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")

  val octetStream = application("octet-stream")
  val pdf         = application("pdf")