Refactoring for migrating to binny library

This commit is contained in:
eikek
2021-09-22 00:28:47 +02:00
parent 1f98d948b0
commit 20a829cf7a
45 changed files with 485 additions and 344 deletions

View File

@ -468,7 +468,7 @@ Docspell Update Check
# The chunk size used when storing files. This should be the same
# as used with the rest server.
chunk-size = 524288
chunk-size = ${docspell.joex.files.chunk-size}
# A string used to change the filename of the converted pdf file.
# If empty, the original file name is used for the pdf file (the

View File

@ -122,12 +122,12 @@ object JoexAppImpl {
for {
httpClient <- BlazeClientBuilder[F](clientEC).resource
client = JoexClient(httpClient)
store <- Store.create(cfg.jdbc, connectEC)
store <- Store.create(cfg.jdbc, cfg.files.chunkSize, connectEC)
queue <- JobQueue(store)
pstore <- PeriodicTaskStore.create(store)
nodeOps <- ONode(store)
joex <- OJoex(client, store)
upload <- OUpload(store, queue, cfg.files, joex)
upload <- OUpload(store, queue, joex)
fts <- createFtsClient(cfg)(httpClient)
createIndex <- CreateIndex.resource(fts, store)
itemOps <- OItem(store, fts, createIndex, queue, joex)
@ -212,7 +212,7 @@ object JoexAppImpl {
.withTask(
JobTask.json(
MakePreviewArgs.taskName,
MakePreviewTask[F](cfg.convert, cfg.extraction.preview),
MakePreviewTask[F](cfg.extraction.preview),
MakePreviewTask.onCancel[F]
)
)

View File

@ -17,8 +17,6 @@ import docspell.common._
import docspell.store.Store
import docspell.store.records.RClassifierModel
import bitpeace.RangeDef
object Classify {
def apply[F[_]: Async](
@ -33,11 +31,7 @@ object Classify {
_ <- OptionT.liftF(logger.info(s"Guessing label for ${cname.name}"))
model <- OptionT(store.transact(RClassifierModel.findByName(coll, cname.name)))
.flatTapNone(logger.debug("No classifier model found."))
modelData =
store.bitpeace
.get(model.fileId.id)
.unNoneTerminate
.through(store.bitpeace.fetchData2(RangeDef.all))
modelData = store.fileStore.getBytes(model.fileId)
cls <- OptionT(File.withTempDir(workingDir, "classify").use { dir =>
val modelFile = dir.resolve("model.ser.gz")
modelData

View File

@ -90,8 +90,8 @@ object LearnClassifierTask {
)
n <- ctx.store.transact(RClassifierModel.deleteAll(list.map(_.id)))
_ <- list
.map(_.fileId.id)
.traverse(id => ctx.store.bitpeace.delete(id).compile.drain)
.map(_.fileId)
.traverse(id => ctx.store.fileStore.delete(id))
_ <- ctx.logger.debug(s"Deleted $n model files.")
} yield ()

View File

@ -16,8 +16,6 @@ import docspell.joex.scheduler._
import docspell.store.Store
import docspell.store.records.RClassifierModel
import bitpeace.MimetypeHint
object StoreClassifierModel {
def handleModel[F[_]: Async](
@ -43,16 +41,16 @@ object StoreClassifierModel {
)
_ <- logger.debug(s"Storing new trained model for: ${modelName.name}")
fileData = Files[F].readAll(trainedModel.model)
newFile <-
store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
newFileId <-
fileData.through(store.fileStore.save(MimeTypeHint.none)).compile.lastOrError
_ <- store.transact(
RClassifierModel.updateFile(collective, modelName.name, Ident.unsafe(newFile.id))
RClassifierModel.updateFile(collective, modelName.name, newFileId)
)
_ <- logger.debug(s"New model stored at file ${newFile.id}")
_ <- logger.debug(s"New model stored at file ${newFileId.id}")
_ <- oldFile match {
case Some(fid) =>
logger.debug(s"Deleting old model file ${fid.id}") *>
store.bitpeace.delete(fid.id).compile.drain
store.fileStore.delete(fid)
case None => ().pure[F]
}
} yield ()

View File

@ -19,10 +19,6 @@ import docspell.joex.Config
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records._
import bitpeace.FileMeta
import bitpeace.Mimetype
import bitpeace.MimetypeHint
import bitpeace.RangeDef
import io.circe.generic.semiauto._
import io.circe.{Decoder, Encoder}
@ -55,8 +51,11 @@ object PdfConvTask {
// --- Helper
// Check that the file exists, that it is a pdf, that the source id is the same, and that ocrmypdf is enabled.
def checkInputs[F[_]: Sync](cfg: Config, ctx: Context[F, Args]): F[Option[FileMeta]] = {
val none: Option[FileMeta] = None
def checkInputs[F[_]: Sync](
cfg: Config,
ctx: Context[F, Args]
): F[Option[RFileMeta]] = {
val none: Option[RFileMeta] = None
val checkSameFiles =
(for {
ra <- OptionT(ctx.store.transact(RAttachment.findById(ctx.args.attachId)))
@ -67,7 +66,7 @@ object PdfConvTask {
val existsPdf =
for {
meta <- ctx.store.transact(RAttachment.findMeta(ctx.args.attachId))
res = meta.filter(_.mimetype.matches(Mimetype.applicationPdf))
res = meta.filter(_.mimetype.matches(MimeType.pdf))
_ <-
if (res.isEmpty)
ctx.logger.info(
@ -91,12 +90,10 @@ object PdfConvTask {
def convert[F[_]: Async](
cfg: Config,
ctx: Context[F, Args],
in: FileMeta
in: RFileMeta
): F[Unit] = {
val bp = ctx.store.bitpeace
val data = Stream
.emit(in)
.through(bp.fetchData2(RangeDef.all))
val fs = ctx.store.fileStore
val data = fs.getBytes(in.id)
val storeResult: ConversionResult.Handler[F, Unit] =
Kleisli {
@ -122,7 +119,7 @@ object PdfConvTask {
OcrMyPdf.toPDF[F, Unit](
cfg.convert.ocrmypdf,
lang,
in.chunksize,
cfg.files.chunkSize,
ctx.logger
)(data, storeResult)
@ -140,18 +137,13 @@ object PdfConvTask {
def storeToAttachment[F[_]: Sync](
ctx: Context[F, Args],
meta: FileMeta,
meta: RFileMeta,
newFile: Stream[F, Byte]
): F[Unit] = {
val mimeHint = MimetypeHint.advertised(meta.mimetype.asString)
val mimeHint = MimeTypeHint.advertised(meta.mimetype)
for {
time <- Timestamp.current[F]
fid <- Ident.randomId[F]
_ <-
ctx.store.bitpeace
.saveNew(newFile, meta.chunksize, mimeHint, Some(fid.id), time.value)
.compile
.lastOrError
fid <-
newFile.through(ctx.store.fileStore.save(mimeHint)).compile.lastOrError
_ <- ctx.store.transact(RAttachment.updateFileId(ctx.args.attachId, fid))
} yield ()
}

View File

@ -10,7 +10,6 @@ import cats.effect._
import cats.implicits._
import docspell.common._
import docspell.convert.ConvertConfig
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.process.AttachmentPreview
@ -23,7 +22,7 @@ object MakePreviewTask {
type Args = MakePreviewArgs
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig): Task[F, Args, Unit] =
def apply[F[_]: Sync](pcfg: PreviewConfig): Task[F, Args, Unit] =
Task { ctx =>
for {
exists <- previewExists(ctx)
@ -36,7 +35,7 @@ object MakePreviewTask {
else
ctx.logger.info(
s"Generating preview image for attachment ${ctx.args.attachment}"
) *> generatePreview(ctx, preview, cfg)
) *> generatePreview(ctx, preview)
} yield ()
}
@ -45,13 +44,12 @@ object MakePreviewTask {
private def generatePreview[F[_]: Sync](
ctx: Context[F, Args],
preview: PdfboxPreview[F],
cfg: ConvertConfig
preview: PdfboxPreview[F]
): F[Unit] =
for {
ra <- ctx.store.transact(RAttachment.findById(ctx.args.attachment))
_ <- ra
.map(AttachmentPreview.createPreview(ctx, preview, cfg.chunkSize))
.map(AttachmentPreview.createPreview(ctx, preview))
.getOrElse(
ctx.logger.error(s"No attachment found with id: ${ctx.args.attachment}")
)

View File

@ -18,9 +18,6 @@ import docspell.extract.pdfbox.PdfboxExtract
import docspell.joex.scheduler._
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, RangeDef}
/** Goes through all attachments that must already be converted into a pdf. If an
* attachment is a pdf, its number of pages is retrieved and stored in the attachment metadata.
@ -100,13 +97,8 @@ object AttachmentPageCount {
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.applicationOctetStream)
.map(_.toLocal)
.getOrElse(MimeType.octetStream)
def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
ctx.store.fileStore.getBytes(ra.fileId)
}

View File

@ -13,16 +13,12 @@ import cats.implicits._
import fs2.Stream
import docspell.common._
import docspell.convert._
import docspell.extract.pdfbox.PdfboxPreview
import docspell.extract.pdfbox.PreviewConfig
import docspell.joex.scheduler._
import docspell.store.queries.QAttachment
import docspell.store.records.RAttachment
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
/** Goes through all attachments that must already be converted into a pdf. If it is a
* pdf, the first page is converted into a small preview png image and linked to the
@ -30,7 +26,7 @@ import bitpeace.{Mimetype, MimetypeHint, RangeDef}
*/
object AttachmentPreview {
def apply[F[_]: Sync](cfg: ConvertConfig, pcfg: PreviewConfig)(
def apply[F[_]: Sync](pcfg: PreviewConfig)(
item: ItemData
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
@ -40,7 +36,7 @@ object AttachmentPreview {
)
preview <- PdfboxPreview(pcfg)
_ <- item.attachments
.traverse(createPreview(ctx, preview, cfg.chunkSize))
.traverse(createPreview(ctx, preview))
.attempt
.flatMap {
case Right(_) => ().pure[F]
@ -54,8 +50,7 @@ object AttachmentPreview {
def createPreview[F[_]: Sync](
ctx: Context[F, _],
preview: PdfboxPreview[F],
chunkSize: Int
preview: PdfboxPreview[F]
)(
ra: RAttachment
): F[Option[RAttachmentPreview]] =
@ -64,7 +59,7 @@ object AttachmentPreview {
preview.previewPNG(loadFile(ctx)(ra)).flatMap {
case Some(out) =>
ctx.logger.debug("Preview generated, saving to database…") *>
createRecord(ctx, out, ra, chunkSize).map(_.some)
createRecord(ctx, out, ra).map(_.some)
case None =>
ctx.logger
.info(s"Preview could not be generated. Maybe the pdf has no pages?") *>
@ -79,23 +74,20 @@ object AttachmentPreview {
private def createRecord[F[_]: Sync](
ctx: Context[F, _],
png: Stream[F, Byte],
ra: RAttachment,
chunkSize: Int
ra: RAttachment
): F[RAttachmentPreview] = {
val name = ra.name
.map(FileName.apply)
.map(_.withPart("preview", '_').withExtension("png"))
for {
fileMeta <- ctx.store.bitpeace
.saveNew(
png,
chunkSize,
MimetypeHint(name.map(_.fullName), Some("image/png"))
fileId <- png
.through(
ctx.store.fileStore.save(MimeTypeHint(name.map(_.fullName), Some("image/png")))
)
.compile
.lastOrError
now <- Timestamp.current[F]
rp = RAttachmentPreview(ra.id, Ident.unsafe(fileMeta.id), name.map(_.fullName), now)
rp = RAttachmentPreview(ra.id, fileId, name.map(_.fullName), now)
_ <- QAttachment.deletePreview(ctx.store)(ra.id)
_ <- ctx.store.transact(RAttachmentPreview.insert(rp))
} yield rp
@ -104,13 +96,8 @@ object AttachmentPreview {
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.applicationOctetStream)
.map(_.toLocal)
.getOrElse(MimeType.octetStream)
def loadFile[F[_]](ctx: Context[F, _])(ra: RAttachment): Stream[F, Byte] =
ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
ctx.store.fileStore.getBytes(ra.fileId)
}

View File

@ -19,9 +19,6 @@ import docspell.convert._
import docspell.joex.extract.JsoupSanitizer
import docspell.joex.scheduler._
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
/** Goes through all attachments and creates a PDF version of each one where supported.
*
@ -69,24 +66,21 @@ object ConvertPdf {
): F[Boolean] =
ctx.store.transact(RAttachmentSource.isConverted(ra.id))
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.applicationOctetStream)
.getOrElse(MimeType.octetStream)
def convertSafe[F[_]: Async](
cfg: ConvertConfig,
sanitizeHtml: SanitizeHtml,
ctx: Context[F, ProcessItemArgs],
item: ItemData
)(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
)(ra: RAttachment, mime: MimeType): F[(RAttachment, Option[RAttachmentMeta])] =
Conversion.create[F](cfg, sanitizeHtml, ctx.logger).use { conv =>
mime.toLocal match {
mime match {
case mt =>
val data = ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val data = ctx.store.fileStore.getBytes(ra.fileId)
val handler = conversionHandler[F](ctx, cfg, ra, item)
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
conv.toPDF(DataType(mt), ctx.args.meta.language, handler)(
@ -154,11 +148,11 @@ object ConvertPdf {
.map(FileName.apply)
.map(_.withExtension("pdf").withPart(cfg.convertedFilenamePart, '.'))
.map(_.fullName)
ctx.store.bitpeace
.saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised))
pdf
.through(ctx.store.fileStore.save(MimeTypeHint(hint.filename, hint.advertised)))
.compile
.lastOrError
.map(fm => Ident.unsafe(fm.id))
.flatMap(fmId => updateAttachment[F](ctx, ra, fmId, newName).map(_ => fmId))
.map(fmId => ra.copy(fileId = fmId, name = newName))
}
@ -184,10 +178,8 @@ object ConvertPdf {
if (sameFile) ().pure[F]
else
ctx.logger.info("Deleting previous attachment file") *>
ctx.store.bitpeace
.delete(raPrev.fileId.id)
.compile
.drain
ctx.store.fileStore
.delete(raPrev.fileId)
.attempt
.flatMap {
case Right(_) => ().pure[F]

View File

@ -15,9 +15,7 @@ import fs2.Stream
import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.queries.QItem
import docspell.store.records.{RAttachment, RAttachmentSource, RItem}
import bitpeace.FileMeta
import docspell.store.records._
/** Task that creates the item.
*/
@ -31,12 +29,10 @@ object CreateItem {
def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] =
Task { ctx =>
def isValidFile(fm: FileMeta) =
def isValidFile(fm: RFileMeta) =
ctx.args.meta.validFileTypes.isEmpty ||
ctx.args.meta.validFileTypes
.map(_.asString)
.toSet
.contains(fm.mimetype.baseType)
ctx.args.meta.validFileTypes.toSet
.contains(fm.mimetype)
def fileMetas(itemId: Ident, now: Timestamp) =
Stream
@ -44,7 +40,9 @@ object CreateItem {
.flatMap { offset =>
Stream
.emits(ctx.args.files)
.flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm)))
.evalMap(f =>
ctx.store.fileStore.findMeta(f.fileMetaId).value.map(fm => (f, fm))
)
.collect { case (f, Some(fm)) if isValidFile(fm) => f }
.zipWithIndex
.evalMap { case (f, index) =>

View File

@ -15,7 +15,6 @@ import docspell.store.queries.QItem
import docspell.store.records.RFileMeta
import docspell.store.records.RJob
import bitpeace.FileMeta
import doobie._
object DuplicateCheck {
@ -40,7 +39,7 @@ object DuplicateCheck {
_ <- fileMetas.traverse(deleteDuplicate(ctx))
ids = fileMetas.filter(_.exists).map(_.fm.id).toSet
} yield ctx.args.copy(files =
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId.id))
ctx.args.files.filterNot(f => ids.contains(f.fileMetaId))
)
private def getRetryCount[F[_]: Sync](ctx: Context[F, Args]): F[Int] =
@ -49,13 +48,11 @@ object DuplicateCheck {
private def deleteDuplicate[F[_]: Sync](
ctx: Context[F, Args]
)(fd: FileMetaDupes): F[Unit] = {
val fname = ctx.args.files.find(_.fileMetaId.id == fd.fm.id).flatMap(_.name)
val fname = ctx.args.files.find(_.fileMetaId == fd.fm.id).flatMap(_.name)
if (fd.exists)
ctx.logger
.info(s"Deleting duplicate file $fname!") *> ctx.store.bitpeace
.info(s"Deleting duplicate file $fname!") *> ctx.store.fileStore
.delete(fd.fm.id)
.compile
.drain
else ().pure[F]
}
@ -69,12 +66,12 @@ object DuplicateCheck {
private def checkDuplicate[F[_]](
ctx: Context[F, Args]
)(fm: FileMeta): ConnectionIO[FileMetaDupes] = {
)(fm: RFileMeta): ConnectionIO[FileMetaDupes] = {
val excludes = ctx.args.files.map(_.fileMetaId).toSet
QItem
.findByChecksum(fm.checksum, ctx.args.meta.collective, excludes)
.findByChecksum(fm.checksum.toHex, ctx.args.meta.collective, excludes)
.map(v => FileMetaDupes(fm, v.nonEmpty))
}
case class FileMetaDupes(fm: FileMeta, exists: Boolean)
case class FileMetaDupes(fm: RFileMeta, exists: Boolean)
}

View File

@ -20,9 +20,7 @@ import docspell.files.Zip
import docspell.joex.mail._
import docspell.joex.scheduler._
import docspell.store.records._
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, MimetypeHint, RangeDef}
import emil.Mail
/** Goes through all attachments and extracts archive files, like zip files. The process
@ -84,16 +82,16 @@ object ExtractArchive {
if (extract.archives.isEmpty) extract
else extract.updatePositions
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[MimeType] =
OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.applicationOctetStream)
.getOrElse(MimeType.octetStream)
def extractSafe[F[_]: Async](
ctx: Context[F, ProcessItemArgs],
archive: Option[RAttachmentArchive]
)(ra: RAttachment, pos: Int, mime: Mimetype): F[Extracted] =
mime.toLocal match {
)(ra: RAttachment, pos: Int, mime: MimeType): F[Extracted] =
mime match {
case MimeType.ZipMatch(_) if ra.name.exists(_.endsWith(".zip")) =>
ctx.logger.info(s"Extracting zip archive ${ra.name.getOrElse("<noname>")}.") *>
extractZip(ctx, archive)(ra, pos)
@ -122,7 +120,7 @@ object ExtractArchive {
)
_ <- ctx.store.transact(RAttachmentArchive.delete(ra.id))
_ <- ctx.store.transact(RAttachment.delete(ra.id))
_ <- ctx.store.bitpeace.delete(ra.fileId.id).compile.drain
_ <- ctx.store.fileStore.delete(ra.fileId)
} yield extracted
case None =>
for {
@ -137,11 +135,8 @@ object ExtractArchive {
ctx: Context[F, ProcessItemArgs],
archive: Option[RAttachmentArchive]
)(ra: RAttachment, pos: Int): F[Extracted] = {
val zipData = ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
val zipData = ctx.store.fileStore.getBytes(ra.fileId)
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
ctx.logger.debug(s"Filtering zip entries with '${glob.asString}'") *>
zipData
.through(Zip.unzipP[F](8192, glob))
@ -156,10 +151,7 @@ object ExtractArchive {
ctx: Context[F, ProcessItemArgs],
archive: Option[RAttachmentArchive]
)(ra: RAttachment, pos: Int): F[Extracted] = {
val email: Stream[F, Byte] = ctx.store.bitpeace
.get(ra.fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val email: Stream[F, Byte] = ctx.store.fileStore.getBytes(ra.fileId)
val glob = ctx.args.meta.fileFilter.getOrElse(Glob.all)
val attachOnly = ctx.args.meta.attachmentsOnly.getOrElse(false)
@ -200,15 +192,16 @@ object ExtractArchive {
tentry: (Binary[F], Long)
): Stream[F, Extracted] = {
val (entry, subPos) = tentry
val mimeHint = MimetypeHint.filename(entry.name).withAdvertised(entry.mime.asString)
val fileMeta = ctx.store.bitpeace.saveNew(entry.data, 8192, mimeHint)
val mimeHint = MimeTypeHint.filename(entry.name).withAdvertised(entry.mime.asString)
val fileId = entry.data.through(ctx.store.fileStore.save(mimeHint))
Stream.eval(ctx.logger.debug(s"Extracted ${entry.name}. Storing as attachment.")) >>
fileMeta.evalMap { fm =>
fileId.evalMap { fid =>
Ident.randomId.map { id =>
val nra = RAttachment(
id,
ra.itemId,
Ident.unsafe(fm.id),
fid,
pos,
ra.created,
Option(entry.name).map(_.trim).filter(_.nonEmpty)

View File

@ -132,8 +132,8 @@ object ItemHandler {
Task(ctx =>
ctx.logger.info("Deleting input files …") *>
Stream
.emits(ctx.args.files.map(_.fileMetaId.id))
.flatMap(id => ctx.store.bitpeace.delete(id).attempt.drain)
.emits(ctx.args.files.map(_.fileMetaId))
.evalMap(id => ctx.store.fileStore.delete(id).attempt)
.compile
.drain
)

View File

@ -62,7 +62,7 @@ object ProcessItem {
ConvertPdf(cfg.convert, item)
.flatMap(Task.setProgress(progress._1))
.flatMap(TextExtraction(cfg.extraction, fts))
.flatMap(AttachmentPreview(cfg.convert, cfg.extraction.preview))
.flatMap(AttachmentPreview(cfg.extraction.preview))
.flatMap(AttachmentPageCount())
.flatMap(Task.setProgress(progress._2))
.flatMap(analysisOnly[F](cfg, analyser, regexNer))

View File

@ -15,9 +15,6 @@ import docspell.extract.{ExtractConfig, ExtractResult, Extraction}
import docspell.ftsclient.{FtsClient, TextData}
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta}
import docspell.store.syntax.MimeTypes._
import bitpeace.{Mimetype, RangeDef}
object TextExtraction {
@ -130,18 +127,15 @@ object TextExtraction {
extr: Extraction[F],
lang: Language
)(fileId: Ident): F[ExtractResult] = {
val data = ctx.store.bitpeace
.get(fileId.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
val data = ctx.store.fileStore.getBytes(fileId)
def findMime: F[Mimetype] =
def findMime: F[MimeType] =
OptionT(ctx.store.transact(RFileMeta.findById(fileId)))
.map(_.mimetype)
.getOrElse(Mimetype.applicationOctetStream)
.getOrElse(MimeType.octetStream)
findMime
.flatMap(mt => extr.extractText(data, DataType(mt.toLocal), lang))
.flatMap(mt => extr.extractText(data, DataType(mt), lang))
}
private def extractTextFallback[F[_]: Async](