Use ocrmypdf tool to create pdf/a during conversion

- Use another external tool to convert pdf to pdf which also adds the
  extracted text as another layer into the pdf

- Although not currently used, the external conversion routine now checks
  for an existing text file that has the same name as the pdf file but
  with the extension `.txt`. If present, it is included in the conversion
  result and used as the extracted text.

- Text extraction for pdf files now happens on the converted file,
  because it may already contain the text from the conversion step;
  this avoids running OCR twice.

- Errors during conversion are never fatal; processing continues
  without a converted file.
This commit is contained in:
Eike Kettner
2020-07-18 12:48:41 +02:00
parent 99210365ce
commit 3d49ceaab5
16 changed files with 316 additions and 21 deletions

View File

@ -3,8 +3,10 @@ package docspell.store.records
import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._
import docspell.store.syntax.MimeTypes._
import bitpeace.FileMeta
import bitpeace.Mimetype
import doobie._
import doobie.implicits._
@ -30,4 +32,13 @@ object RFileMeta {
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
}
/** Looks up the mime type stored for the file with the given id.
  *
  * Only the `mimetype` column is selected. The query yields an optional
  * result, and a value that is present is converted from bitpeace's
  * `Mimetype` to `MimeType` via `toLocal`.
  *
  * @param fid the id matched against the `id` column
  * @return a `ConnectionIO` producing the converted mime type, if any
  */
def findMime(fid: Ident): ConnectionIO[Option[MimeType]] = {
  // Kept local as in the original; presumably supplies the doobie
  // instances needed to read a bitpeace `Mimetype` -- verify.
  import bitpeace.sql._

  val storedMime: ConnectionIO[Option[Mimetype]] =
    selectSimple(Seq(Columns.mimetype), table, Columns.id.is(fid))
      .query[Mimetype]
      .option

  storedMime.map(found => found.map(mt => mt.toLocal))
}
}