Starting to support more file types

First, files are converted to PDF for archiving. This also makes it
easier to create a preview. This is done via the `ConvertPdf` processing
task (which is not yet implemented).

Text extraction then tries first with the original file. If that
fails, OCR is done on the (potentially) converted pdf file.

To not lose information of the original file, it is saved using the
table `attachment_source`. If the original file is already a pdf, or
the conversion did not succeed, the `attachment` and
`attachment_source` records point to the same file.
This commit is contained in:
Eike Kettner
2020-02-09 19:42:49 +01:00
parent 57ec8eec53
commit ba3865ef5e
11 changed files with 220 additions and 19 deletions

View File

@ -0,0 +1,11 @@
-- Holds the original (source) file of an attachment. The primary key
-- `id` is also a foreign key to `attachment`.`attachid`, so there is at
-- most one source row per attachment.
CREATE TABLE `attachment_source` (
  `id` varchar(254) not null primary key,
  `file_id` varchar(254) not null,
  `filename` varchar(254),
  `created` timestamp not null,
  foreign key (`file_id`) references `filemeta`(`id`),
  foreign key (`id`) references `attachment`(`attachid`)
);

-- Seed one source row per existing attachment, pointing to the same
-- file as the attachment itself. The target columns are listed
-- explicitly so the statement does not depend on the physical column
-- order of the table.
INSERT INTO `attachment_source` (`id`, `file_id`, `filename`, `created`)
  SELECT `attachid`, `filemetaid`, `name`, `created` FROM `attachment`;

View File

@ -0,0 +1,11 @@
-- Holds the original (source) file of an attachment. The primary key
-- "id" is also a foreign key to "attachment"."attachid", so there is at
-- most one source row per attachment.
CREATE TABLE "attachment_source" (
  "id" varchar(254) not null primary key,
  "file_id" varchar(254) not null,
  "filename" varchar(254),
  "created" timestamp not null,
  foreign key ("file_id") references "filemeta"("id"),
  foreign key ("id") references "attachment"("attachid")
);

-- Seed one source row per existing attachment, pointing to the same
-- file as the attachment itself. The target columns are listed
-- explicitly so the statement does not depend on the physical column
-- order of the table.
INSERT INTO "attachment_source" ("id", "file_id", "filename", "created")
  SELECT "attachid", "filemetaid", "name", "created" FROM "attachment";

View File

@ -41,6 +41,20 @@ object RAttachment {
/** Selects the attachment row whose id equals `attachId`, if any. */
def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = {
  val byId = id.is(attachId)
  selectSimple(all, table, byId).query[RAttachment].option
}
/** Loads the file metadata belonging to the given attachment.
  *
  * Joins the attachment table (alias `a`) with the file-meta table
  * (alias `m`) on the attachment's file id and selects the file-meta
  * columns of the row whose attachment id equals `attachId`.
  */
def findMeta(attachId: Ident): ConnectionIO[Option[FileMeta]] = {
  import bitpeace.sql._

  val metaId   = RFileMeta.Columns.id.prefix("m")
  val joinOn   = fileId.prefix("a").is(metaId)
  val joined   = table ++ fr"a INNER JOIN" ++ RFileMeta.table ++ fr"m ON" ++ joinOn
  val metaCols = RFileMeta.Columns.all.map(_.prefix("m"))
  val byAttach = id.prefix("a").is(attachId)

  selectSimple(metaCols, joined, byAttach).query[FileMeta].option
}
def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] =
selectSimple(
all.map(_.prefix("a")),

View File

@ -0,0 +1,44 @@
package docspell.store.records
import doobie._
import doobie.implicits._
import docspell.common._
import docspell.store.impl._
import docspell.store.impl.Implicits._
/** The origin file of an attachment. The `id` is shared with the
  * attachment, to create a 1-1 (or 0..1-1) relationship.
  *
  * @param id      identifier of this record; the same value as the id
  *                of the corresponding `RAttachment`
  * @param fileId  identifier of the file that holds the original
  *                content (stored in column `file_id`)
  * @param name    optional file name (stored in column `filename`)
  * @param created timestamp stored with the record
  */
case class RAttachmentSource(
id: Ident, //same as RAttachment.id
fileId: Ident,
name: Option[String],
created: Timestamp
)
/** Queries and helpers for the `attachment_source` table. */
object RAttachmentSource {

  /** SQL fragment naming the table. */
  val table = fr"attachment_source"

  /** Column definitions of the table. */
  object Columns {
    val id      = Column("id")
    val fileId  = Column("file_id")
    val name    = Column("filename")
    val created = Column("created")

    // Same order as the fields of the case class.
    val all = List(id, fileId, name, created)
  }

  import Columns._

  /** Creates a source record that carries over the id, file id, name
    * and creation time of the given attachment.
    */
  def of(ra: RAttachment): RAttachmentSource =
    RAttachmentSource(ra.id, ra.fileId, ra.name, ra.created)

  /** Inserts the given record as a new row. */
  def insert(v: RAttachmentSource): ConnectionIO[Int] = {
    val values = fr"${v.id},${v.fileId},${v.name},${v.created}"
    insertRow(table, all, values).update.run
  }

  /** Selects the source record whose id equals `attachId`, if any. */
  def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] = {
    val byId = id.is(attachId)
    selectSimple(all, table, byId).query[RAttachmentSource].option
  }
}

View File

@ -1,7 +1,12 @@
package docspell.store.records
import bitpeace.FileMeta
import doobie._
import doobie.implicits._
import docspell.common._
import docspell.store.impl._
import docspell.store.impl.Implicits._
object RFileMeta {
@ -19,4 +24,10 @@ object RFileMeta {
val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize)
}
/** Selects the file-meta row whose id equals `fid`, if any. */
def findById(fid: Ident): ConnectionIO[Option[FileMeta]] = {
  import bitpeace.sql._

  val byId = Columns.id.is(fid)
  selectSimple(Columns.all, table, byId).query[FileMeta].option
}
}