Starting to support more file types

First, files are converted to PDF for archiving. It is also easier
to create a preview. This is done via the `ConvertPdf` processing
task (which is not yet implemented).

Text extraction then tries first with the original file. If that
fails, OCR is done on the (potentially) converted pdf file.

To not lose information of the original file, it is saved using the
table `attachment_source`. If the original file is already a pdf, or
the conversion did not succeed, the `attachment` and
`attachment_source` records point to the same file.
This commit is contained in:
Eike Kettner 2020-02-09 19:42:49 +01:00
parent 57ec8eec53
commit ba3865ef5e
11 changed files with 220 additions and 19 deletions

View File

@ -27,7 +27,7 @@ object MimeType {
MimeType("image", partFromString(sub).throwLeft) MimeType("image", partFromString(sub).throwLeft)
private[this] val validChars: Set[Char] = private[this] val validChars: Set[Char] =
(('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.").toSet
def parse(str: String): Either[String, MimeType] = def parse(str: String): Either[String, MimeType] =
str.indexOf('/') match { str.indexOf('/') match {
@ -44,7 +44,7 @@ object MimeType {
private def partFromString(s: String): Either[String, String] = private def partFromString(s: String): Either[String, String] =
if (s.forall(validChars.contains)) Right(s) if (s.forall(validChars.contains)) Right(s)
else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}") else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.toList.sorted.mkString}")
val octetStream = application("octet-stream") val octetStream = application("octet-stream")
val pdf = application("pdf") val pdf = application("pdf")

View File

@ -0,0 +1,60 @@
package docspell.joex.process
import bitpeace.Mimetype
import cats.Functor
import cats.implicits._
import cats.effect._
import cats.data.OptionT
import docspell.common._
import docspell.joex.scheduler._
import docspell.store.records._
/** Goes through all attachments and creates a PDF version of it where
* supported.
*
* The `attachment` record is updated with the PDF version while the
* original file has been stored in the `attachment_source` record.
*
* If pdf conversion is not possible or if the input is already a
* pdf, both files are identical. That is, the `file_id`s point to
* the same file. Since the name of an attachment may be changed by
* the user, the `attachment_origin` record keeps that, too.
*
* This step assumes an existing premature item, it traverses its
* attachments.
*/
object ConvertPdf {
def apply[F[_]: Sync: ContextShift](
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
// get mimetype
// try to convert
// save to db
// update file_id of RAttachment
def convert(ra: RAttachment) =
findMime(ctx)(ra).flatMap(m => convertSafe(ctx)(ra, m))
for {
ras <- item.attachments.traverse(convert)
} yield item.copy(attachments = ras)
}
def findMime[F[_]: Functor](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Mimetype] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.`application/octet-stream`)
def convertSafe[F[_]: Sync](
ctx: Context[F, ProcessItemArgs]
)(ra: RAttachment, mime: Mimetype): F[RAttachment] = {
ctx.logger.info(s"File ${ra.name} has mime ${mime.asString}").
map(_ => ra)
}
}

View File

@ -2,11 +2,12 @@ package docspell.joex.process
import cats.implicits._ import cats.implicits._
import cats.effect.Sync import cats.effect.Sync
import cats.data.OptionT
import fs2.Stream import fs2.Stream
import docspell.common._ import docspell.common._
import docspell.joex.scheduler.{Context, Task} import docspell.joex.scheduler.{Context, Task}
import docspell.store.queries.QItem import docspell.store.queries.QItem
import docspell.store.records.{RAttachment, RItem} import docspell.store.records.{RAttachment, RAttachmentSource, RItem}
/** /**
* Task that creates the item. * Task that creates the item.
@ -53,13 +54,21 @@ object CreateItem {
n <- ctx.store.transact(RItem.insert(it)) n <- ctx.store.transact(RItem.insert(it))
_ <- if (n != 1) storeItemError[F](ctx) else ().pure[F] _ <- if (n != 1) storeItemError[F](ctx) else ().pure[F]
fm <- fileMetas(it.id, it.created) fm <- fileMetas(it.id, it.created)
k <- fm.traverse(a => ctx.store.transact(RAttachment.insert(a))) k <- fm.traverse(insertAttachment(ctx))
_ <- logDifferences(ctx, fm, k.sum) _ <- logDifferences(ctx, fm, k.sum)
dur <- time dur <- time
_ <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}") _ <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}")
} yield ItemData(it, fm, Vector.empty, Vector.empty) } yield ItemData(it, fm, Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap)
} }
def insertAttachment[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Int] = {
val rs = RAttachmentSource.of(ra)
ctx.store.transact(for {
n <- RAttachment.insert(ra)
_ <- RAttachmentSource.insert(rs)
} yield n)
}
def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] = def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] =
Task { ctx => Task { ctx =>
for { for {
@ -69,12 +78,18 @@ object CreateItem {
ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid)) ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
_ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments") _ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
else ().pure[F] else ().pure[F]
rms <- cand.headOption.traverse(ri => rms <- OptionT(
ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) cand.headOption.traverse(ri =>
) ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
} yield cand.headOption.map(ri => )
ItemData(ri, rms.getOrElse(Vector.empty), Vector.empty, Vector.empty) ).getOrElse(Vector.empty)
) orig <- rms.traverse(a =>
ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
)
origMap = orig
.map(originFileTuple)
.toMap
} yield cand.headOption.map(ri => ItemData(ri, rms, Vector.empty, Vector.empty, origMap))
} }
private def logDifferences[F[_]: Sync]( private def logDifferences[F[_]: Sync](
@ -94,4 +109,8 @@ object CreateItem {
val msg = "Inserting item failed. DB returned 0 update count!" val msg = "Inserting item failed. DB returned 0 update count!"
ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg)) ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg))
} }
//TODO if no source is present, it must be saved!
private def originFileTuple(t: (RAttachment, Option[RAttachmentSource])): (Ident, Ident) =
t._2.map(s => s.id -> s.fileId).getOrElse(t._1.id -> t._1.fileId)
} }

View File

@ -8,7 +8,8 @@ case class ItemData(
item: RItem, item: RItem,
attachments: Vector[RAttachment], attachments: Vector[RAttachment],
metas: Vector[RAttachmentMeta], metas: Vector[RAttachmentMeta],
dateLabels: Vector[AttachmentDates] dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident]
) { ) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] = def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -10,7 +10,8 @@ object ProcessItem {
def apply[F[_]: Sync: ContextShift]( def apply[F[_]: Sync: ContextShift](
cfg: OcrConfig cfg: OcrConfig
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] = )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextExtraction(cfg, item) ConvertPdf(item)
.flatMap(TextExtraction(cfg, _))
.flatMap(Task.setProgress(25)) .flatMap(Task.setProgress(25))
.flatMap(TextAnalysis[F]) .flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50)) .flatMap(Task.setProgress(50))

View File

@ -3,7 +3,7 @@ package docspell.joex.process
import bitpeace.RangeDef import bitpeace.RangeDef
import cats.implicits._ import cats.implicits._
import cats.effect.{Blocker, ContextShift, Sync} import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common.{Duration, Language, ProcessItemArgs} import docspell.common._
import docspell.joex.scheduler.{Context, Task} import docspell.joex.scheduler.{Context, Task}
import docspell.store.Store import docspell.store.Store
import docspell.store.records.{RAttachment, RAttachmentMeta} import docspell.store.records.{RAttachment, RAttachmentMeta}
@ -19,7 +19,7 @@ object TextExtraction {
for { for {
_ <- ctx.logger.info("Starting text extraction") _ <- ctx.logger.info("Starting text extraction")
start <- Duration.stopTime[F] start <- Duration.stopTime[F]
txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language)) txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language, item))
_ <- ctx.logger.debug("Storing extracted texts") _ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm)))
dur <- start dur <- start
@ -30,12 +30,13 @@ object TextExtraction {
def extractTextToMeta[F[_]: Sync: ContextShift]( def extractTextToMeta[F[_]: Sync: ContextShift](
ctx: Context[F, _], ctx: Context[F, _],
cfg: OcrConfig, cfg: OcrConfig,
lang: Language lang: Language,
item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] = )(ra: RAttachment): F[RAttachmentMeta] =
for { for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}") _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}")
dst <- Duration.stopTime[F] dst <- Duration.stopTime[F]
txt <- extractText(cfg, lang, ctx.store, ctx.blocker)(ra) txt <- extractTextFallback(ctx, cfg, lang)(filesToExtract(item, ra))
meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty)) meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty))
est <- dst est <- dst
_ <- ctx.logger.debug( _ <- ctx.logger.debug(
@ -48,12 +49,40 @@ object TextExtraction {
lang: Language, lang: Language,
store: Store[F], store: Store[F],
blocker: Blocker blocker: Blocker
)(ra: RAttachment): F[Option[String]] = { )(fileId: Ident): F[Option[String]] = {
val data = store.bitpeace val data = store.bitpeace
.get(ra.fileId.id) .get(fileId.id)
.unNoneTerminate .unNoneTerminate
.through(store.bitpeace.fetchData2(RangeDef.all)) .through(store.bitpeace.fetchData2(RangeDef.all))
TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last
} }
private def extractTextFallback[F[_]: Sync: ContextShift](
ctx: Context[F, _],
ocrConfig: OcrConfig,
lang: Language,
)(fileIds: List[Ident]): F[Option[String]] = {
fileIds match {
case Nil =>
ctx.logger.error(s"Cannot extract text").map(_ => None)
case id :: rest =>
extractText[F](ocrConfig, lang, ctx.store, ctx.blocker)(id).
recoverWith({
case ex =>
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
flatMap(_ => extractTextFallback[F](ctx, ocrConfig, lang)(rest))
})
}
}
/** Returns the fileIds to extract text from. First, the source file
* is tried. If that fails, the converted file is tried.
*/
private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] =
item.originFile.get(ra.id) match {
case Some(sid) => List(sid, ra.fileId).distinct
case None => List(ra.fileId)
}
} }

View File

@ -0,0 +1,11 @@
CREATE TABLE `attachment_source` (
`id` varchar(254) not null primary key,
`file_id` varchar(254) not null,
`filename` varchar(254),
`created` timestamp not null,
foreign key (`file_id`) references `filemeta`(`id`),
foreign key (`id`) references `attachment`(`attachid`)
);
INSERT INTO `attachment_source`
SELECT `attachid`,`filemetaid`,`name`,`created` FROM `attachment`;

View File

@ -0,0 +1,11 @@
CREATE TABLE "attachment_source" (
"id" varchar(254) not null primary key,
"file_id" varchar(254) not null,
"filename" varchar(254),
"created" timestamp not null,
foreign key ("file_id") references "filemeta"("id"),
foreign key ("id") references "attachment"("attachid")
);
INSERT INTO "attachment_source"
SELECT "attachid","filemetaid","name","created" FROM "attachment";

View File

@ -41,6 +41,20 @@ object RAttachment {
def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] =
selectSimple(all, table, id.is(attachId)).query[RAttachment].option selectSimple(all, table, id.is(attachId)).query[RAttachment].option
def findMeta(attachId: Ident): ConnectionIO[Option[FileMeta]] = {
import bitpeace.sql._
val cols = RFileMeta.Columns.all.map(_.prefix("m"))
val aId = id.prefix("a")
val aFileMeta = fileId.prefix("a")
val mId = RFileMeta.Columns.id.prefix("m")
val from = table ++ fr"a INNER JOIN" ++ RFileMeta.table ++ fr"m ON" ++ aFileMeta.is(mId)
val cond = aId.is(attachId)
selectSimple(cols, from, cond).query[FileMeta].option
}
def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] = def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] =
selectSimple( selectSimple(
all.map(_.prefix("a")), all.map(_.prefix("a")),

View File

@ -0,0 +1,44 @@
package docspell.store.records
import doobie._
import doobie.implicits._
import docspell.common._
import docspell.store.impl._
import docspell.store.impl.Implicits._
/** The origin file of an attachment. The `id` is shared with the
* attachment, to create a 1-1 (or 0..1-1) relationship.
*/
case class RAttachmentSource(
id: Ident, //same as RAttachment.id
fileId: Ident,
name: Option[String],
created: Timestamp
)
object RAttachmentSource {
val table = fr"attachment_source"
object Columns {
val id = Column("id")
val fileId = Column("file_id")
val name = Column("filename")
val created = Column("created")
val all = List(id, fileId, name, created)
}
import Columns._
def of(ra: RAttachment): RAttachmentSource =
RAttachmentSource(ra.id, ra.fileId, ra.name, ra.created)
def insert(v: RAttachmentSource): ConnectionIO[Int] =
insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option
}

View File

@ -1,7 +1,12 @@
package docspell.store.records package docspell.store.records
import bitpeace.FileMeta
import doobie._
import doobie.implicits._ import doobie.implicits._
import docspell.common._
import docspell.store.impl._ import docspell.store.impl._
import docspell.store.impl.Implicits._
object RFileMeta { object RFileMeta {
@ -19,4 +24,10 @@ object RFileMeta {
val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize) val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize)
} }
def findById(fid: Ident): ConnectionIO[Option[FileMeta]] = {
import bitpeace.sql._
selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option
}
} }