Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf, which also adds the extracted text as another layer to the pdf.
- Although not used yet, the external conversion routine now checks for an existing text file that is named like the pdf file but with the extension `.txt`. If present, it is included in the conversion result and is used as the extracted text.
- Text extraction for pdf files now happens on the converted file, because it may already contain the text from the conversion step; this avoids running OCR twice.
- Errors during conversion are not fatal; processing continues without a converted file.
2025-08-05 02:24:52 +00:00 · 2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions
--- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala
@ -3,8 +3,10 @@ package docspell.store.records
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
+import docspell.store.syntax.MimeTypes._

 import bitpeace.FileMeta
+import bitpeace.Mimetype
 import doobie._
 import doobie.implicits._

@ -30,4 +32,13 @@ object RFileMeta {

    selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
  }
+
+  def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = { // Looks up only the mime type stored for the file-meta row whose id is `fid`.
+    import bitpeace.sql._ // NOTE(review): presumably supplies the doobie instances needed to read a bitpeace `Mimetype` -- confirm

+    selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid)) // select just the mimetype column, restricted to the given id
+      .query[Mimetype] // decode the single selected column as bitpeace's `Mimetype`
+      .option // wrap the result in an Option (empty when no row matches)
+      .map(_.map(_.toLocal)) // convert bitpeace `Mimetype` to the local `MimeType`; `toLocal` presumably comes from `MimeTypes._` -- TODO confirm
+  }
 }