mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Refactoring for migrating to binny library
This commit is contained in:
@ -468,7 +468,7 @@ Docpell Update Check
|
||||
|
||||
# The chunk size used when storing files. This should be the same
|
||||
# as used with the rest server.
|
||||
chunk-size = 524288
|
||||
chunk-size = ${docspell.joex.files.chunk-size}
|
||||
|
||||
# A string used to change the filename of the converted pdf file.
|
||||
# If empty, the original file name is used for the pdf file ( the
|
||||
|
@ -122,12 +122,12 @@ object JoexAppImpl {
|
||||
for {
|
||||
httpClient <- BlazeClientBuilder[F](clientEC).resource
|
||||
client = JoexClient(httpClient)
|
||||
store <- Store.create(cfg.jdbc, connectEC)
|
||||
store <- Store.create(cfg.jdbc, cfg.files.chunkSize, connectEC)
|
||||
queue <- JobQueue(store)
|
||||
pstore <- PeriodicTaskStore.create(store)
|
||||
nodeOps <- ONode(store)
|
||||
joex <- OJoex(client, store)
|
||||
upload <- OUpload(store, queue, cfg.files, joex)
|
||||
upload <- OUpload(store, queue, joex)
|
||||
fts <- createFtsClient(cfg)(httpClient)
|
||||
createIndex <- CreateIndex.resource(fts, store)
|
||||
itemOps <- OItem(store, fts, createIndex, queue, joex)
|
||||
@ -212,7 +212,7 @@ object JoexAppImpl {
|
||||
.withTask(
|
||||
JobTask.json(
|
||||
MakePreviewArgs.taskName,
|
||||
MakePreviewTask[F](cfg.convert, cfg.extraction.preview),
|
||||
MakePreviewTask[F](cfg.extraction.preview),
|
||||
MakePreviewTask.onCancel[F]
|
||||
)
|
||||
)
|
||||
|
@ -17,8 +17,6 @@ import docspell.common._
|
||||
import docspell.store.Store
|
||||
import docspell.store.records.RClassifierModel
|
||||
|
||||
import bitpeace.RangeDef
|
||||
|
||||
object Classify {
|
||||
|
||||
def apply[F[_]: Async](
|
||||
@ -33,11 +31,7 @@ object Classify {
|
||||
_ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name} …"))
|
||||
model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))
|
||||
.flatTapNone(logger.debug("No classifier model found."))
|
||||
modelData =
|
||||
store.bitpeace
|
||||
.get(model.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(store.bitpeace.fetchData2(RangeDef.all))
|
||||
modelData = store.fileStore.getBytes(model.fileId)
|
||||
cls <- OptionT(File.withTempDir(workingDir, "classify").use { dir =>
|
||||
val modelFile = dir.resolve("model.ser.gz")
|
||||
modelData
|
||||
|
@ -90,8 +90,8 @@ object LearnClassifierTask {
|
||||
)
|
||||
n <- ctx.store.transact(RClassifierModel.deleteAll(list.map(_.id)))
|
||||
_ <- list
|
||||
.map(_.fileId.id)
|
||||
.traverse(id => ctx.store.bitpeace.delete(id).compile.drain)
|
||||
.map(_.fileId)
|
||||
.traverse(id => ctx.store.fileStore.delete(id))
|
||||
_ <- ctx.logger.debug(s"Deleted $n model files.")
|
||||
} yield ()
|
||||
|
||||
|
@ -16,8 +16,6 @@ import docspell.joex.scheduler._
|
||||
import docspell.store.Store
|
||||
import docspell.store.records.RClassifierModel
|
||||
|
||||
import bitpeace.MimetypeHint
|
||||
|
||||
object StoreClassifierModel {
|
||||
|
||||
def handleModel[F[_]: Async](
|
||||
@ -43,16 +41,16 @@ object StoreClassifierModel {
|
||||
)
|
||||
_ <- logger.debug(s"Storing new trained model for: ${modelName.name}")
|
||||
fileData = Files[F].readAll(trainedModel.model)
|
||||
newFile <-
|
||||
store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
|
||||
newFileId <-
|
||||
fileData.through(store.fileStore.save(MimeTypeHint.none)).compile.lastOrError
|
||||
_ <- store.transact(
|
||||
RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id))
|
||||
RClassifierModel.updateFile(collective, modelName.name, newFileId)
|
||||
)
|
||||
_ <- logger.debug(s"New model stored at file ${newFile.id}")
|
||||
_ <- logger.debug(s"New model stored at file ${newFileId.id}")
|
||||
_ <- oldFile match {
|
||||
case Some(fid) =>
|
||||
logger.debug(s"Deleting old model file ${fid.id}") *>
|
||||
store.bitpeace.delete(fid.id).compile.drain
|
||||
store.fileStore.delete(fid)
|
||||
case None => ().pure[F]
|
||||
}
|
||||
} yield ()
|
||||
|
@ -19,10 +19,6 @@ import docspell.joex.Config
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.records._
|
||||
|
||||
import bitpeace.FileMeta
|
||||
import bitpeace.Mimetype
|
||||
import bitpeace.MimetypeHint
|
||||
import bitpeace.RangeDef
|
||||
import io.circe.generic.semiauto._
|
||||
import io.circe.{Decoder, Encoder}
|
||||
|
||||
@ -55,8 +51,11 @@ object PdfConvTask {
|
||||
// --- Helper
|
||||
|
||||
// check if file exists and if it is pdf and if source id is the same and if ocrmypdf is enabled
|
||||
def checkInputs[F[_]: Sync](cfg: Config, ctx: Context[F, Args]): F[Option[FileMeta]] = {
|
||||
val none: Option[FileMeta] = None
|
||||
def checkInputs[F[_]: Sync](
|
||||
cfg: Config,
|
||||
ctx: Context[F, Args]
|
||||
): F[Option[RFileMeta]] = {
|
||||
val none: Option[RFileMeta] = None
|
||||
val checkSameFiles =
|
||||
(for {
|
||||
ra <- OptionT(ctx.store.transact(RAttachment.findById(ctx.args.attachId)))
|
||||
@ -67,7 +66,7 @@ object PdfConvTask {
|
||||
val existsPdf =
|
||||
for {
|
||||
meta <- ctx.store.transact(RAttachment.findMeta(ctx.args.attachId))
|
||||
res = meta.filter(_.mimetype.matches(Mimetype.applicationPdf))
|
||||
res = meta.filter(_.mimetype.matches(MimeType.pdf))
|
||||
_ <-
|
||||
if (res.isEmpty)
|
||||
ctx.logger.info(
|
||||
@ -91,12 +90,10 @@ object PdfConvTask {
|
||||
def convert[F[_]: Async](
|
||||
cfg: Config,
|
||||
ctx: Context[F, Args],
|
||||
in: FileMeta
|
||||
in: RFileMeta
|
||||
): F[Unit] = {
|
||||
val bp = ctx.store.bitpeace
|
||||
val data = Stream
|
||||
.emit(in)
|
||||
.through(bp.fetchData2(RangeDef.all))
|
||||
val fs = ctx.store.fileStore
|
||||
val data = fs.getBytes(in.id)
|
||||
|
||||
val storeResult: ConversionResult.Handler[F, Unit] =
|
||||
Kleisli {
|
||||
@ -122,7 +119,7 @@ object PdfConvTask {
|
||||
OcrMyPdf.toPDF[F, Unit](
|
||||
cfg.convert.ocrmypdf,
|
||||
lang,
|
||||
in.chunksize,
|
||||
cfg.files.chunkSize,
|
||||
ctx.logger
|
||||
)(data, storeResult)
|
||||
|
||||
@ -140,18 +137,13 @@ object PdfConvTask {
|
||||
|
||||
def storeToAttachment[F[_]: Sync](
|
||||
ctx: Context[F, Args],
|
||||
meta: FileMeta,
|
||||
meta: RFileMeta,
|
||||
newFile: Stream[F, Byte]
|
||||
): F[Unit] = {
|
||||
val mimeHint = MimetypeHint.advertised(meta.mimetype.asString)
|
||||
val mimeHint = MimeTypeHint.advertised(meta.mimetype)
|
||||
for {
|
||||
time <- Timestamp.current[F]
|
||||
fid <- Ident.randomId[F]
|
||||
_ <-
|
||||
ctx.store.bitpeace
|
||||
.saveNew(newFile, meta.chunksize, mimeHint, Some(fid.id), time.value)
|
||||
.compile
|
||||
.lastOrError
|
||||
fid <-
|
||||
newFile.through(ctx.store.fileStore.save(mimeHint)).compile.lastOrError
|
||||
_ <- ctx.store.transact(RAttachment.updateFileId(ctx.args.attachId, fid))
|
||||
} yield ()
|
||||
}
|
||||
|
@ -10,7 +10,6 @@ import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert.ConvertConfig
|
||||
import docspell.extract.pdfbox.PdfboxPreview
|
||||
import docspell.extract.pdfbox.PreviewConfig
|
||||
import docspell.joex.process.AttachmentPreview
|
||||
@ -23,7 +22,7 @@ object MakePreviewTask {
|
||||
|
||||
type Args = MakePreviewArgs
|
||||
|
||||
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] =
|
||||
def apply[F[_]: Sync](pcfg: PreviewConfig): Task[F, Args, Unit] =
|
||||
Task { ctx =>
|
||||
for {
|
||||
exists <- previewExists(ctx)
|
||||
@ -36,7 +35,7 @@ object MakePreviewTask {
|
||||
else
|
||||
ctx.logger.info(
|
||||
s"Generating preview image for attachment ${ctx.args.attachment}"
|
||||
) *> generatePreview(ctx, preview, cfg)
|
||||
) *> generatePreview(ctx, preview)
|
||||
} yield ()
|
||||
}
|
||||
|
||||
@ -45,13 +44,12 @@ object MakePreviewTask {
|
||||
|
||||
private def generatePreview[F[_]: Sync](
|
||||
ctx: Context[F, Args],
|
||||
preview: PdfboxPreview[F],
|
||||
cfg: ConvertConfig
|
||||
preview: PdfboxPreview[F]
|
||||
): F[Unit] =
|
||||
for {
|
||||
ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment))
|
||||
_ <- ra
|
||||
.map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize))
|
||||
.map(AttachmentPreview.createPreview(ctx, preview))
|
||||
.getOrElse(
|
||||
ctx.logger.error(s"No attachment found with id: ${ctx.args.attachment}")
|
||||
)
|
||||
|
@ -18,9 +18,6 @@ import docspell.extract.pdfbox.PdfboxExtract
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records.RAttachment
|
||||
import docspell.store.records._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, RangeDef}
|
||||
|
||||
/** Goes through all attachments that must be already converted into a pdf. If it is a
|
||||
* pdf, the number of pages are retrieved and stored in the attachment metadata.
|
||||
@ -100,13 +97,8 @@ object AttachmentPageCount {
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
|
||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||
.map(_.mimetype)
|
||||
.getOrElse(Mimetype.applicationOctetStream)
|
||||
.map(_.toLocal)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
|
||||
def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
|
||||
ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
|
||||
ctx.store.fileStore.getBytes(ra.fileId)
|
||||
}
|
||||
|
@ -13,16 +13,12 @@ import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common._
|
||||
import docspell.convert._
|
||||
import docspell.extract.pdfbox.PdfboxPreview
|
||||
import docspell.extract.pdfbox.PreviewConfig
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.queries.QAttachment
|
||||
import docspell.store.records.RAttachment
|
||||
import docspell.store.records._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||
|
||||
/** Goes through all attachments that must be already converted into a pdf. If it is a
|
||||
* pdf, the first page is converted into a small preview png image and linked to the
|
||||
@ -30,7 +26,7 @@ import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||
*/
|
||||
object AttachmentPreview {
|
||||
|
||||
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig)(
|
||||
def apply[F[_]: Sync](pcfg: PreviewConfig)(
|
||||
item: ItemData
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
@ -40,7 +36,7 @@ object AttachmentPreview {
|
||||
)
|
||||
preview <- PdfboxPreview(pcfg)
|
||||
_ <- item.attachments
|
||||
.traverse(createPreview(ctx, preview, cfg.chunkSize))
|
||||
.traverse(createPreview(ctx, preview))
|
||||
.attempt
|
||||
.flatMap {
|
||||
case Right(_) => ().pure[F]
|
||||
@ -54,8 +50,7 @@ object AttachmentPreview {
|
||||
|
||||
def createPreview[F[_]: Sync](
|
||||
ctx: Context[F, _],
|
||||
preview: PdfboxPreview[F],
|
||||
chunkSize: Int
|
||||
preview: PdfboxPreview[F]
|
||||
)(
|
||||
ra: RAttachment
|
||||
): F[Option[RAttachmentPreview]] =
|
||||
@ -64,7 +59,7 @@ object AttachmentPreview {
|
||||
preview.previewPNG(loadFile(ctx)(ra)).flatMap {
|
||||
case Some(out) =>
|
||||
ctx.logger.debug("Preview generated, saving to database…") *>
|
||||
createRecord(ctx, out, ra, chunkSize).map(_.some)
|
||||
createRecord(ctx, out, ra).map(_.some)
|
||||
case None =>
|
||||
ctx.logger
|
||||
.info(s"Preview could not be generated. Maybe the pdf has no pages?") *>
|
||||
@ -79,23 +74,20 @@ object AttachmentPreview {
|
||||
private def createRecord[F[_]: Sync](
|
||||
ctx: Context[F, _],
|
||||
png: Stream[F, Byte],
|
||||
ra: RAttachment,
|
||||
chunkSize: Int
|
||||
ra: RAttachment
|
||||
): F[RAttachmentPreview] = {
|
||||
val name = ra.name
|
||||
.map(FileName.apply)
|
||||
.map(_.withPart("preview", '_').withExtension("png"))
|
||||
for {
|
||||
fileMeta <- ctx.store.bitpeace
|
||||
.saveNew(
|
||||
png,
|
||||
chunkSize,
|
||||
MimetypeHint(name.map(_.fullName), Some("image/png"))
|
||||
fileId <- png
|
||||
.through(
|
||||
ctx.store.fileStore.save(MimeTypeHint(name.map(_.fullName), Some("image/png")))
|
||||
)
|
||||
.compile
|
||||
.lastOrError
|
||||
now <- Timestamp.current[F]
|
||||
rp = RAttachmentPreview(ra.id, Ident.unsafe(fileMeta.id), name.map(_.fullName), now)
|
||||
rp = RAttachmentPreview(ra.id, fileId, name.map(_.fullName), now)
|
||||
_ <- QAttachment.deletePreview(ctx.store)(ra.id)
|
||||
_ <- ctx.store.transact(RAttachmentPreview.insert(rp))
|
||||
} yield rp
|
||||
@ -104,13 +96,8 @@ object AttachmentPreview {
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
|
||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||
.map(_.mimetype)
|
||||
.getOrElse(Mimetype.applicationOctetStream)
|
||||
.map(_.toLocal)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
|
||||
def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
|
||||
ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
|
||||
ctx.store.fileStore.getBytes(ra.fileId)
|
||||
}
|
||||
|
@ -19,9 +19,6 @@ import docspell.convert._
|
||||
import docspell.joex.extract.JsoupSanitizer
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||
|
||||
/** Goes through all attachments and creates a PDF version of it where supported.
|
||||
*
|
||||
@ -69,24 +66,21 @@ object ConvertPdf {
|
||||
): F[Boolean] =
|
||||
ctx.store.transact(RAttachmentSource.isConverted(ra.id))
|
||||
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
|
||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||
.map(_.mimetype)
|
||||
.getOrElse(Mimetype.applicationOctetStream)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
|
||||
def convertSafe[F[_]: Async](
|
||||
cfg: ConvertConfig,
|
||||
sanitizeHtml: SanitizeHtml,
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
item: ItemData
|
||||
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
)(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] =
|
||||
Conversion.create[F](cfg, sanitizeHtml, ctx.logger).use { conv =>
|
||||
mime.toLocal match {
|
||||
mime match {
|
||||
case mt =>
|
||||
val data = ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val data = ctx.store.fileStore.getBytes(ra.fileId)
|
||||
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
|
||||
@ -154,11 +148,11 @@ object ConvertPdf {
|
||||
.map(FileName.apply)
|
||||
.map(_.withExtension("pdf").withPart(cfg.convertedFilenamePart, '.'))
|
||||
.map(_.fullName)
|
||||
ctx.store.bitpeace
|
||||
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
|
||||
|
||||
pdf
|
||||
.through(ctx.store.fileStore.save(MimeTypeHint(hint.filename, hint.advertised)))
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(fm => Ident.unsafe(fm.id))
|
||||
.flatMap(fmId => updateAttachment[F](ctx, ra, fmId, newName).map(_ => fmId))
|
||||
.map(fmId => ra.copy(fileId = fmId, name = newName))
|
||||
}
|
||||
@ -184,10 +178,8 @@ object ConvertPdf {
|
||||
if (sameFile) ().pure[F]
|
||||
else
|
||||
ctx.logger.info("Deleting previous attachment file") *>
|
||||
ctx.store.bitpeace
|
||||
.delete(raPrev.fileId.id)
|
||||
.compile
|
||||
.drain
|
||||
ctx.store.fileStore
|
||||
.delete(raPrev.fileId)
|
||||
.attempt
|
||||
.flatMap {
|
||||
case Right(_) => ().pure[F]
|
||||
|
@ -15,9 +15,7 @@ import fs2.Stream
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.queries.QItem
|
||||
import docspell.store.records.{RAttachment, RAttachmentSource, RItem}
|
||||
|
||||
import bitpeace.FileMeta
|
||||
import docspell.store.records._
|
||||
|
||||
/** Task that creates the item.
|
||||
*/
|
||||
@ -31,12 +29,10 @@ object CreateItem {
|
||||
|
||||
def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
def isValidFile(fm: FileMeta) =
|
||||
def isValidFile(fm: RFileMeta) =
|
||||
ctx.args.meta.validFileTypes.isEmpty ||
|
||||
ctx.args.meta.validFileTypes
|
||||
.map(_.asString)
|
||||
.toSet
|
||||
.contains(fm.mimetype.baseType)
|
||||
ctx.args.meta.validFileTypes.toSet
|
||||
.contains(fm.mimetype)
|
||||
|
||||
def fileMetas(itemId: Ident, now: Timestamp) =
|
||||
Stream
|
||||
@ -44,7 +40,9 @@ object CreateItem {
|
||||
.flatMap { offset =>
|
||||
Stream
|
||||
.emits(ctx.args.files)
|
||||
.flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm)))
|
||||
.evalMap(f =>
|
||||
ctx.store.fileStore.findMeta(f.fileMetaId).value.map(fm => (f, fm))
|
||||
)
|
||||
.collect { case (f, Some(fm)) if isValidFile(fm) => f }
|
||||
.zipWithIndex
|
||||
.evalMap { case (f, index) =>
|
||||
|
@ -15,7 +15,6 @@ import docspell.store.queries.QItem
|
||||
import docspell.store.records.RFileMeta
|
||||
import docspell.store.records.RJob
|
||||
|
||||
import bitpeace.FileMeta
|
||||
import doobie._
|
||||
|
||||
object DuplicateCheck {
|
||||
@ -40,7 +39,7 @@ object DuplicateCheck {
|
||||
_ <- fileMetas.traverse(deleteDuplicate(ctx))
|
||||
ids = fileMetas.filter(_.exists).map(_.fm.id).toSet
|
||||
} yield ctx.args.copy(files =
|
||||
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
|
||||
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId))
|
||||
)
|
||||
|
||||
private def getRetryCount[F[_]: Sync](ctx: Context[F, Args]): F[Int] =
|
||||
@ -49,13 +48,11 @@ object DuplicateCheck {
|
||||
private def deleteDuplicate[F[_]: Sync](
|
||||
ctx: Context[F, Args]
|
||||
)(fd: FileMetaDupes): F[Unit] = {
|
||||
val fname = ctx.args.files.find(_.fileMetaId.id == fd.fm.id).flatMap(_.name)
|
||||
val fname = ctx.args.files.find(_.fileMetaId == fd.fm.id).flatMap(_.name)
|
||||
if (fd.exists)
|
||||
ctx.logger
|
||||
.info(s"Deleting duplicate file $fname!") *> ctx.store.bitpeace
|
||||
.info(s"Deleting duplicate file $fname!") *> ctx.store.fileStore
|
||||
.delete(fd.fm.id)
|
||||
.compile
|
||||
.drain
|
||||
else ().pure[F]
|
||||
}
|
||||
|
||||
@ -69,12 +66,12 @@ object DuplicateCheck {
|
||||
|
||||
private def checkDuplicate[F[_]](
|
||||
ctx: Context[F, Args]
|
||||
)(fm: FileMeta): ConnectionIO[FileMetaDupes] = {
|
||||
)(fm: RFileMeta): ConnectionIO[FileMetaDupes] = {
|
||||
val excludes = ctx.args.files.map(_.fileMetaId).toSet
|
||||
QItem
|
||||
.findByChecksum(fm.checksum, ctx.args.meta.collective, excludes)
|
||||
.findByChecksum(fm.checksum.toHex, ctx.args.meta.collective, excludes)
|
||||
.map(v => FileMetaDupes(fm, v.nonEmpty))
|
||||
}
|
||||
|
||||
case class FileMetaDupes(fm: FileMeta, exists: Boolean)
|
||||
case class FileMetaDupes(fm: RFileMeta, exists: Boolean)
|
||||
}
|
||||
|
@ -20,9 +20,7 @@ import docspell.files.Zip
|
||||
import docspell.joex.mail._
|
||||
import docspell.joex.scheduler._
|
||||
import docspell.store.records._
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
|
||||
import emil.Mail
|
||||
|
||||
/** Goes through all attachments and extracts archive files, like zip files. The process
|
||||
@ -84,16 +82,16 @@ object ExtractArchive {
|
||||
if (extract.archives.isEmpty) extract
|
||||
else extract.updatePositions
|
||||
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
|
||||
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
|
||||
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
|
||||
.map(_.mimetype)
|
||||
.getOrElse(Mimetype.applicationOctetStream)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
|
||||
def extractSafe[F[_]: Async](
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
archive: Option[RAttachmentArchive]
|
||||
)(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] =
|
||||
mime.toLocal match {
|
||||
)(ra: RAttachment, pos: Int, mime: MimeType): F[Extracted] =
|
||||
mime match {
|
||||
case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) =>
|
||||
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
|
||||
extractZip(ctx, archive)(ra, pos)
|
||||
@ -122,7 +120,7 @@ object ExtractArchive {
|
||||
)
|
||||
_ <- ctx.store.transact(RAttachmentArchive.delete(ra.id))
|
||||
_ <- ctx.store.transact(RAttachment.delete(ra.id))
|
||||
_ <- ctx.store.bitpeace.delete(ra.fileId.id).compile.drain
|
||||
_ <- ctx.store.fileStore.delete(ra.fileId)
|
||||
} yield extracted
|
||||
case None =>
|
||||
for {
|
||||
@ -137,11 +135,8 @@ object ExtractArchive {
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
archive: Option[RAttachmentArchive]
|
||||
)(ra: RAttachment, pos: Int): F[Extracted] = {
|
||||
val zipData = ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
|
||||
val zipData = ctx.store.fileStore.getBytes(ra.fileId)
|
||||
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
|
||||
ctx.logger.debug(s"Filtering zip entries with '${glob.asString}'") *>
|
||||
zipData
|
||||
.through(Zip.unzipP[F](8192, glob))
|
||||
@ -156,10 +151,7 @@ object ExtractArchive {
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
archive: Option[RAttachmentArchive]
|
||||
)(ra: RAttachment, pos: Int): F[Extracted] = {
|
||||
val email: Stream[F, Byte] = ctx.store.bitpeace
|
||||
.get(ra.fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val email: Stream[F, Byte] = ctx.store.fileStore.getBytes(ra.fileId)
|
||||
|
||||
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
|
||||
val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
|
||||
@ -200,15 +192,16 @@ object ExtractArchive {
|
||||
tentry: (Binary[F], Long)
|
||||
): Stream[F, Extracted] = {
|
||||
val (entry, subPos) = tentry
|
||||
val mimeHint = MimetypeHint.filename(entry.name).withAdvertised(entry.mime.asString)
|
||||
val fileMeta = ctx.store.bitpeace.saveNew(entry.data, 8192, mimeHint)
|
||||
val mimeHint = MimeTypeHint.filename(entry.name).withAdvertised(entry.mime.asString)
|
||||
val fileId = entry.data.through(ctx.store.fileStore.save(mimeHint))
|
||||
|
||||
Stream.eval(ctx.logger.debug(s"Extracted ${entry.name}. Storing as attachment.")) >>
|
||||
fileMeta.evalMap { fm =>
|
||||
fileId.evalMap { fid =>
|
||||
Ident.randomId.map { id =>
|
||||
val nra = RAttachment(
|
||||
id,
|
||||
ra.itemId,
|
||||
Ident.unsafe(fm.id),
|
||||
fid,
|
||||
pos,
|
||||
ra.created,
|
||||
Option(entry.name).map(_.trim).filter(_.nonEmpty)
|
||||
|
@ -132,8 +132,8 @@ object ItemHandler {
|
||||
Task(ctx =>
|
||||
ctx.logger.info("Deleting input files …") *>
|
||||
Stream
|
||||
.emits(ctx.args.files.map(_.fileMetaId.id))
|
||||
.flatMap(id => ctx.store.bitpeace.delete(id).attempt.drain)
|
||||
.emits(ctx.args.files.map(_.fileMetaId))
|
||||
.evalMap(id => ctx.store.fileStore.delete(id).attempt)
|
||||
.compile
|
||||
.drain
|
||||
)
|
||||
|
@ -62,7 +62,7 @@ object ProcessItem {
|
||||
ConvertPdf(cfg.convert, item)
|
||||
.flatMap(Task.setProgress(progress._1))
|
||||
.flatMap(TextExtraction(cfg.extraction, fts))
|
||||
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
|
||||
.flatMap(AttachmentPreview(cfg.extraction.preview))
|
||||
.flatMap(AttachmentPageCount())
|
||||
.flatMap(Task.setProgress(progress._2))
|
||||
.flatMap(analysisOnly[F](cfg, analyser, regexNer))
|
||||
|
@ -15,9 +15,6 @@ import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
|
||||
import docspell.ftsclient.{FtsClient, TextData}
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
|
||||
import docspell.store.syntax.MimeTypes._
|
||||
|
||||
import bitpeace.{Mimetype, RangeDef}
|
||||
|
||||
object TextExtraction {
|
||||
|
||||
@ -130,18 +127,15 @@ object TextExtraction {
|
||||
extr: Extraction[F],
|
||||
lang: Language
|
||||
)(fileId: Ident): F[ExtractResult] = {
|
||||
val data = ctx.store.bitpeace
|
||||
.get(fileId.id)
|
||||
.unNoneTerminate
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val data = ctx.store.fileStore.getBytes(fileId)
|
||||
|
||||
def findMime: F[Mimetype] =
|
||||
def findMime: F[MimeType] =
|
||||
OptionT(ctx.store.transact(RFileMeta.findById(fileId)))
|
||||
.map(_.mimetype)
|
||||
.getOrElse(Mimetype.applicationOctetStream)
|
||||
.getOrElse(MimeType.octetStream)
|
||||
|
||||
findMime
|
||||
.flatMap(mt => extr.extractText(data, DataType(mt.toLocal), lang))
|
||||
.flatMap(mt => extr.extractText(data, DataType(mt), lang))
|
||||
}
|
||||
|
||||
private def extractTextFallback[F[_]: Async](
|
||||
|
Reference in New Issue
Block a user