mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 21:42:52 +00:00
Merge pull request #299 from eikek/fix-mariadb-text-column
Fix mariadb text column
This commit is contained in:
commit
bcf86d97f5
BIN
modules/files/src/test/resources/large-file.pdf
Normal file
BIN
modules/files/src/test/resources/large-file.pdf
Normal file
Binary file not shown.
@ -38,10 +38,20 @@ object ConvertPdf {
|
|||||||
item: ItemData
|
item: ItemData
|
||||||
): Task[F, ProcessItemArgs, ItemData] =
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
def convert(ra: RAttachment) =
|
def convert(ra: RAttachment): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||||
findMime(ctx)(ra).flatMap(m =>
|
isConverted(ctx)(ra).flatMap {
|
||||||
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
|
case true =>
|
||||||
)
|
ctx.logger.info(
|
||||||
|
s"Conversion to pdf already done for attachment ${ra.name}."
|
||||||
|
) *>
|
||||||
|
ctx.store
|
||||||
|
.transact(RAttachmentMeta.findById(ra.id))
|
||||||
|
.map(rmOpt => (ra, rmOpt))
|
||||||
|
case false =>
|
||||||
|
findMime(ctx)(ra).flatMap(m =>
|
||||||
|
convertSafe(cfg, JsoupSanitizer.clean, ctx, item)(ra, m)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
ras <- item.attachments.traverse(convert)
|
ras <- item.attachments.traverse(convert)
|
||||||
@ -51,6 +61,11 @@ object ConvertPdf {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Tells whether the given attachment has already been converted.
  *
  * Delegates to `RAttachmentSource.isConverted` with the attachment's id and
  * runs that query through `ctx.store.transact`. `convert` uses the result to
  * skip a second conversion and only look up the existing attachment meta.
  */
def isConverted[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(
|
||||||
|
ra: RAttachment
|
||||||
|
): F[Boolean] =
|
||||||
|
ctx.store.transact(RAttachmentSource.isConverted(ra.id))
|
||||||
|
|
||||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
|
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
|
||||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||||
.map(_.mimetype)
|
.map(_.mimetype)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
import cats.data.OptionT
|
import cats.data.OptionT
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
@ -125,21 +126,31 @@ object CreateItem {
|
|||||||
for {
|
for {
|
||||||
cand <- ctx.store.transact(QItem.findByFileIds(fileMetaIds.toSeq))
|
cand <- ctx.store.transact(QItem.findByFileIds(fileMetaIds.toSeq))
|
||||||
_ <-
|
_ <-
|
||||||
if (cand.nonEmpty) ctx.logger.warn("Found existing item with these files.")
|
if (cand.nonEmpty)
|
||||||
|
ctx.logger.warn(s"Found ${cand.size} existing item with these files.")
|
||||||
else ().pure[F]
|
else ().pure[F]
|
||||||
ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
|
ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid))
|
||||||
_ <-
|
_ <-
|
||||||
if (ht.sum > 0)
|
if (ht.sum > 0)
|
||||||
ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
|
ctx.logger.warn(s"Removed ${ht.sum} items with same attachments")
|
||||||
else ().pure[F]
|
else ().pure[F]
|
||||||
rms <- OptionT(
|
rms <- OptionT
|
||||||
//load attachments but only those mentioned in the task's arguments
|
.fromOption[F](NonEmptyList.fromList(fileMetaIds.toList))
|
||||||
cand.headOption.traverse(ri =>
|
.flatMap(fids =>
|
||||||
ctx.store
|
OptionT(
|
||||||
.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid))
|
//load attachments but only those mentioned in the task's arguments
|
||||||
.map(_.filter(r => fileMetaIds.contains(r.fileId)))
|
cand.headOption.traverse(ri =>
|
||||||
|
ctx.store
|
||||||
|
.transact(RAttachment.findByItemCollectiveSource(ri.id, ri.cid, fids))
|
||||||
|
.flatTap(ats =>
|
||||||
|
ctx.logger.debug(
|
||||||
|
s"Found ${ats.size} attachments. Use only those from task args: ${fileMetaIds}"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
).getOrElse(Vector.empty)
|
.getOrElse(Vector.empty)
|
||||||
orig <- rms.traverse(a =>
|
orig <- rms.traverse(a =>
|
||||||
ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
|
ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s))
|
||||||
)
|
)
|
||||||
|
@ -7,6 +7,7 @@ import docspell.common._
|
|||||||
import docspell.joex.scheduler.{Context, Task}
|
import docspell.joex.scheduler.{Context, Task}
|
||||||
import docspell.store.queries.QItem
|
import docspell.store.queries.QItem
|
||||||
import docspell.store.records.RFileMeta
|
import docspell.store.records.RFileMeta
|
||||||
|
import docspell.store.records.RJob
|
||||||
|
|
||||||
import bitpeace.FileMeta
|
import bitpeace.FileMeta
|
||||||
import doobie._
|
import doobie._
|
||||||
@ -17,7 +18,13 @@ object DuplicateCheck {
|
|||||||
def apply[F[_]: Sync]: Task[F, Args, Args] =
|
def apply[F[_]: Sync]: Task[F, Args, Args] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
if (ctx.args.meta.skipDuplicate)
|
if (ctx.args.meta.skipDuplicate)
|
||||||
ctx.logger.debug("Checking for duplicate files") *> removeDuplicates(ctx)
|
for {
|
||||||
|
retries <- getRetryCount(ctx)
|
||||||
|
res <-
|
||||||
|
if (retries == 0)
|
||||||
|
ctx.logger.debug("Checking for duplicate files") *> removeDuplicates(ctx)
|
||||||
|
else ctx.args.pure[F]
|
||||||
|
} yield res
|
||||||
else ctx.logger.debug("Not checking for duplicates") *> ctx.args.pure[F]
|
else ctx.logger.debug("Not checking for duplicates") *> ctx.args.pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,6 +37,9 @@ object DuplicateCheck {
|
|||||||
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
|
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/** Reads the retry count of the currently running job.
  *
  * Queries `RJob.getRetries` with `ctx.jobId`; when the query yields no value
  * the count falls back to 0.
  */
private def getRetryCount[F[_]: Sync](ctx: Context[F, Args]): F[Int] =
|
||||||
|
ctx.store.transact(RJob.getRetries(ctx.jobId)).map(_.getOrElse(0))
|
||||||
|
|
||||||
private def deleteDuplicate[F[_]: Sync](
|
private def deleteDuplicate[F[_]: Sync](
|
||||||
ctx: Context[F, Args]
|
ctx: Context[F, Args]
|
||||||
)(fd: FileMetaDupes): F[Unit] = {
|
)(fd: FileMetaDupes): F[Unit] = {
|
||||||
|
@ -20,7 +20,9 @@ object TextExtraction {
|
|||||||
): Task[F, ProcessItemArgs, ItemData] =
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.info("Starting text extraction")
|
_ <- ctx.logger.info(
|
||||||
|
s"Starting text extraction for ${item.attachments.size} files"
|
||||||
|
)
|
||||||
start <- Duration.stopTime[F]
|
start <- Duration.stopTime[F]
|
||||||
txt <- item.attachments.traverse(
|
txt <- item.attachments.traverse(
|
||||||
extractTextIfEmpty(
|
extractTextIfEmpty(
|
||||||
@ -31,9 +33,10 @@ object TextExtraction {
|
|||||||
item
|
item
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
_ <- ctx.logger.debug("Storing extracted texts")
|
_ <- ctx.logger.debug("Storing extracted texts …")
|
||||||
_ <-
|
_ <-
|
||||||
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
|
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
|
||||||
|
_ <- ctx.logger.debug(s"Extracted text stored.")
|
||||||
idxItem = TextData.item(
|
idxItem = TextData.item(
|
||||||
item.item.id,
|
item.item.id,
|
||||||
ctx.args.meta.collective,
|
ctx.args.meta.collective,
|
||||||
|
@ -0,0 +1,14 @@
|
|||||||
|
-- Migration: redefine several text columns with larger text types.
--   attachmentmeta.content, .nerlabels, .itemproposals -> longtext
--   job.args, joblog.message                           -> mediumtext
-- Presumably the previous column types were too small for large documents
-- (TODO confirm against the earlier schema).
-- NOTE(review): MODIFY COLUMN restates the whole column definition; any
-- NOT NULL / DEFAULT on the previous definition is not repeated here, so
-- confirm that nullable columns without defaults are intended.
ALTER TABLE `attachmentmeta`
|
||||||
|
MODIFY COLUMN `content` longtext;
|
||||||
|
|
||||||
|
ALTER TABLE `attachmentmeta`
|
||||||
|
MODIFY COLUMN `nerlabels` longtext;
|
||||||
|
|
||||||
|
ALTER TABLE `attachmentmeta`
|
||||||
|
MODIFY COLUMN `itemproposals` longtext;
|
||||||
|
|
||||||
|
ALTER TABLE `job`
|
||||||
|
MODIFY COLUMN `args` mediumtext;
|
||||||
|
|
||||||
|
ALTER TABLE `joblog`
|
||||||
|
MODIFY COLUMN `message` mediumtext;
|
@ -44,6 +44,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
|
|||||||
def isNot[A: Put](value: A): Fragment =
|
def isNot[A: Put](value: A): Fragment =
|
||||||
f ++ fr"<> $value"
|
f ++ fr"<> $value"
|
||||||
|
|
||||||
|
/** Inequality against another column: this column's fragment `f`, the SQL
  * operator `<>`, then the other column's fragment `c.f`. In contrast to the
  * value-taking `isNot` overload, the right-hand side is a column reference
  * rather than an interpolated value.
  */
def isNot(c: Column): Fragment =
|
||||||
|
f ++ fr"<>" ++ c.f
|
||||||
|
|
||||||
def isNull: Fragment =
|
def isNull: Fragment =
|
||||||
f ++ fr"is null"
|
f ++ fr"is null"
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
@ -158,6 +159,36 @@ object RAttachment {
|
|||||||
q.query[RAttachment].to[Vector]
|
q.query[RAttachment].to[Vector]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Loads the attachments of one item that are related to any of the given
  * file ids.
  *
  * An attachment row matches when the item id equals `id`, the item's
  * collective equals `coll`, and at least one of these file ids is contained
  * in `fileIds`: the attachment's own file, the file of its source record, or
  * the file of its archive record.
  *
  * @param id      item id to match
  * @param coll    collective id (compared with `RItem.Columns.cid`)
  * @param fileIds non-empty list of file ids to look for
  * @return all matching attachment rows as a `Vector`
  */
def findByItemCollectiveSource(
|
||||||
|
id: Ident,
|
||||||
|
coll: Ident,
|
||||||
|
fileIds: NonEmptyList[Ident]
|
||||||
|
): ConnectionIO[Vector[RAttachment]] = {
|
||||||
|
|
||||||
|
// Column references qualified with the table aliases used in the join
// below: i = item, a = attachment, s = attachment source, r = attachment archive.
val iId = RItem.Columns.id.prefix("i")
|
||||||
|
val iColl = RItem.Columns.cid.prefix("i")
|
||||||
|
val aItem = Columns.itemId.prefix("a")
|
||||||
|
val aId = Columns.id.prefix("a")
|
||||||
|
val aFile = Columns.fileId.prefix("a")
|
||||||
|
val sId = RAttachmentSource.Columns.id.prefix("s")
|
||||||
|
val sFile = RAttachmentSource.Columns.fileId.prefix("s")
|
||||||
|
val rId = RAttachmentArchive.Columns.id.prefix("r")
|
||||||
|
val rFile = RAttachmentArchive.Columns.fileId.prefix("r")
|
||||||
|
|
||||||
|
// The item is INNER JOINed; source and archive are LEFT JOINed on the
// attachment id, so an attachment lacking either record is still returned
// when its own file id matches.
val from = table ++ fr"a INNER JOIN" ++
|
||||||
|
RItem.table ++ fr"i ON" ++ iId.is(aItem) ++ fr"LEFT JOIN" ++
|
||||||
|
RAttachmentSource.table ++ fr"s ON" ++ sId.is(aId) ++ fr"LEFT JOIN" ++
|
||||||
|
RAttachmentArchive.table ++ fr"r ON" ++ rId.is(aId)
|
||||||
|
|
||||||
|
// Item and collective must both match; for the file id one of the three
// candidates (attachment, source, archive) is enough.
val cond = and(
|
||||||
|
iId.is(id),
|
||||||
|
iColl.is(coll),
|
||||||
|
or(aFile.isIn(fileIds), sFile.isIn(fileIds), rFile.isIn(fileIds))
|
||||||
|
)
|
||||||
|
|
||||||
|
// Only the attachment table's columns (alias "a") are selected.
selectSimple(all.map(_.prefix("a")), from, cond).query[RAttachment].to[Vector]
|
||||||
|
}
|
||||||
|
|
||||||
def findByItemAndCollectiveWithMeta(
|
def findByItemAndCollectiveWithMeta(
|
||||||
id: Ident,
|
id: Ident,
|
||||||
coll: Ident
|
coll: Ident
|
||||||
|
@ -46,6 +46,9 @@ object RAttachmentMeta {
|
|||||||
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
||||||
selectCount(id, table, id.is(attachId)).query[Int].unique.map(_ > 0)
|
selectCount(id, table, id.is(attachId)).query[Int].unique.map(_ > 0)
|
||||||
|
|
||||||
|
/** Selects the attachment-meta row whose `id` column equals `attachId`;
  * the result is `None` when no such row exists.
  */
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentMeta]] =
|
||||||
|
selectSimple(all, table, id.is(attachId)).query[RAttachmentMeta].option
|
||||||
|
|
||||||
def upsert(v: RAttachmentMeta): ConnectionIO[Int] =
|
def upsert(v: RAttachmentMeta): ConnectionIO[Int] =
|
||||||
for {
|
for {
|
||||||
n0 <- update(v)
|
n0 <- update(v)
|
||||||
|
@ -48,6 +48,21 @@ object RAttachmentSource {
|
|||||||
.unique
|
.unique
|
||||||
.map(_ > 0)
|
.map(_ > 0)
|
||||||
|
|
||||||
|
/** Tells whether the attachment's file differs from its source file.
  *
  * Joins the source table (alias `s`) with the attachment table (alias `a`)
  * on equal ids and counts the rows for `attachId` where the attachment's
  * file id is not equal to the source's file id. The result is `true` when
  * that count is greater than zero.
  *
  * NOTE(review): presumably a differing file id means the attachment now
  * points at the converted file while the source keeps the original one;
  * confirm against the conversion step.
  */
def isConverted(attachId: Ident): ConnectionIO[Boolean] = {
|
||||||
|
val sId = Columns.id.prefix("s")
|
||||||
|
val sFile = Columns.fileId.prefix("s")
|
||||||
|
val aId = RAttachment.Columns.id.prefix("a")
|
||||||
|
val aFile = RAttachment.Columns.fileId.prefix("a")
|
||||||
|
|
||||||
|
val from = table ++ fr"s INNER JOIN" ++
|
||||||
|
RAttachment.table ++ fr"a ON" ++ aId.is(sId)
|
||||||
|
|
||||||
|
// Column-to-column comparison: a.fileId <> s.fileId
selectCount(aId, from, and(aId.is(attachId), aFile.isNot(sFile)))
|
||||||
|
.query[Int]
|
||||||
|
.unique
|
||||||
|
.map(_ > 0)
|
||||||
|
}
|
||||||
|
|
||||||
def delete(attachId: Ident): ConnectionIO[Int] =
|
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||||
deleteFrom(table, id.is(attachId)).update.run
|
deleteFrom(table, id.is(attachId)).update.run
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user