mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Use keywords in pdfs to search for existing tags
During processing, keywords stored in the PDF metadata are looked up in the tag database, and any matching existing tags are associated with the item. See #175
This commit is contained in:
parent
da68405f9b
commit
209c068436
@ -6,6 +6,7 @@ import cats.implicits._
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.ftsclient.FtsClient
|
import docspell.ftsclient.FtsClient
|
||||||
|
import docspell.store.UpdateResult
|
||||||
import docspell.store.queries.{QAttachment, QItem}
|
import docspell.store.queries.{QAttachment, QItem}
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
import docspell.store.{AddResult, Store}
|
import docspell.store.{AddResult, Store}
|
||||||
@ -22,6 +23,9 @@ trait OItem[F[_]] {
|
|||||||
/** Create a new tag and add it to the item. */
|
/** Create a new tag and add it to the item. */
|
||||||
def addNewTag(item: Ident, tag: RTag): F[AddResult]
|
def addNewTag(item: Ident, tag: RTag): F[AddResult]
|
||||||
|
|
||||||
|
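/** Link the given tags to the item. Each entry may be a tag ID or a tag name; entries that match no existing tag are ignored, and tags already linked to the item are left as they are. */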
|
||||||
|
def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
|
||||||
|
|
||||||
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
|
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
|
||||||
|
|
||||||
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
|
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
|
||||||
@ -90,6 +94,27 @@ object OItem {
|
|||||||
.attempt
|
.attempt
|
||||||
.map(AddResult.fromUpdate)
|
.map(AddResult.fromUpdate)
|
||||||
|
|
||||||
|
/** Links already existing tags to an item.
  *
  * Each entry of `tags` may be a tag id or a tag name. Entries are
  * de-duplicated first; an empty list short-circuits to success without
  * touching the database. Otherwise, inside a single transaction, the item
  * is checked to belong to `collective` (yielding `notFound` if it does
  * not), the matching tags are looked up, and only those not yet linked to
  * the item are inserted.
  */
def linkTags(
    item: Ident,
    tags: List[String],
    collective: Ident
): F[UpdateResult] = {
  val wanted = tags.distinct
  if (wanted.isEmpty) UpdateResult.success.pure[F]
  else {
    val program =
      OptionT(RItem.checkByIdAndCollective(item, collective))
        .semiflatMap { _ =>
          for {
            found  <- RTag.findAllByNameOrId(wanted, collective)
            linked <- RTagItem.findAllIn(item, found.map(_.tagId))
            // insert only the links that are still missing
            _ <- RTagItem.setAllTags(item, found.map(_.tagId).diff(linked.map(_.tagId)))
          } yield UpdateResult.success
        }
        .getOrElse(UpdateResult.notFound)

    store.transact(program)
  }
}
|
||||||
|
|
||||||
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
|
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
|
||||||
val db = for {
|
val db = for {
|
||||||
cid <- RItem.getCollective(item)
|
cid <- RItem.getCollective(item)
|
||||||
|
@ -1,39 +1,41 @@
|
|||||||
package docspell.extract
|
package docspell.extract
|
||||||
|
|
||||||
import scala.util.Try
|
|
||||||
|
|
||||||
import docspell.common.MimeType
|
import docspell.common.MimeType
|
||||||
|
import docspell.extract.pdfbox.PdfMetaData
|
||||||
|
|
||||||
sealed trait ExtractResult {
|
sealed trait ExtractResult {
|
||||||
|
|
||||||
def textOption: Option[String]
|
def textOption: Option[String]
|
||||||
|
|
||||||
|
def pdfMeta: Option[PdfMetaData]
|
||||||
}
|
}
|
||||||
|
|
||||||
object ExtractResult {
|
object ExtractResult {
|
||||||
|
|
||||||
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
|
val pdfMeta = None
|
||||||
}
|
}
|
||||||
def unsupportedFormat(mt: MimeType): ExtractResult =
|
def unsupportedFormat(mt: MimeType): ExtractResult =
|
||||||
UnsupportedFormat(mt)
|
UnsupportedFormat(mt)
|
||||||
|
|
||||||
case class Failure(ex: Throwable) extends ExtractResult {
|
case class Failure(ex: Throwable) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
|
val pdfMeta = None
|
||||||
}
|
}
|
||||||
def failure(ex: Throwable): ExtractResult =
|
def failure(ex: Throwable): ExtractResult =
|
||||||
Failure(ex)
|
Failure(ex)
|
||||||
|
|
||||||
case class Success(text: String) extends ExtractResult {
|
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
|
||||||
val textOption = Some(text)
|
val textOption = Some(text)
|
||||||
}
|
}
|
||||||
def success(text: String): ExtractResult =
|
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
|
||||||
Success(text)
|
Success(text, pdfMeta)
|
||||||
|
|
||||||
def fromTry(r: Try[String]): ExtractResult =
|
|
||||||
r.fold(Failure.apply, Success.apply)
|
|
||||||
|
|
||||||
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
||||||
e.fold(failure, success)
|
e.fold(failure, str => success(str, None))
|
||||||
|
|
||||||
|
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
|
||||||
|
e.fold(failure, r => success(r.txt.value, r.meta))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -40,8 +40,7 @@ object Extraction {
|
|||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
PdfExtract
|
PdfExtract
|
||||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||||
.map(_.map(_.value))
|
.map(ExtractResult.fromEitherResult)
|
||||||
.map(ExtractResult.fromEither)
|
|
||||||
|
|
||||||
case PoiType(mt) =>
|
case PoiType(mt) =>
|
||||||
PoiExtract
|
PoiExtract
|
||||||
@ -103,7 +102,7 @@ object Extraction {
|
|||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||||
ExtractResult.success(Text(txt).value)
|
ExtractResult.success(Text(txt).value, None)
|
||||||
}
|
}
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
|
@ -7,9 +7,15 @@ import fs2.Stream
|
|||||||
import docspell.common.{Language, Logger}
|
import docspell.common.{Language, Logger}
|
||||||
import docspell.extract.internal.Text
|
import docspell.extract.internal.Text
|
||||||
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||||
|
import docspell.extract.pdfbox.PdfMetaData
|
||||||
import docspell.extract.pdfbox.PdfboxExtract
|
import docspell.extract.pdfbox.PdfboxExtract
|
||||||
|
|
||||||
object PdfExtract {
|
object PdfExtract {
|
||||||
|
/** The extracted text of a PDF together with its metadata, if any was found. */
final case class Result(txt: Text, meta: Option[PdfMetaData])

object Result {

  /** Creates a `Result` from a `(text, metadata)` pair. */
  def apply(t: (Text, Option[PdfMetaData])): Result = {
    val (text, metaData) = t
    Result(text, metaData)
  }
}
|
||||||
|
|
||||||
def get[F[_]: Sync: ContextShift](
|
def get[F[_]: Sync: ContextShift](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
@ -18,39 +24,39 @@ object PdfExtract {
|
|||||||
stripMinLen: Int,
|
stripMinLen: Int,
|
||||||
ocrCfg: OcrConfig,
|
ocrCfg: OcrConfig,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): F[Either[Throwable, Text]] = {
|
): F[Either[Throwable, Result]] = {
|
||||||
|
|
||||||
val runOcr =
|
val runOcr =
|
||||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||||
|
|
||||||
def chooseResult(ocrStr: Text, strippedStr: Text) =
|
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
|
||||||
if (ocrStr.length > strippedStr.length)
|
if (ocrStr.length > strippedRes._1.length)
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
|
||||||
) *> ocrStr.pure[F]
|
) *> Result(ocrStr, strippedRes._2).pure[F]
|
||||||
else
|
else
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
|
s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
|
||||||
) *> strippedStr.pure[F]
|
) *> Result(strippedRes).pure[F]
|
||||||
|
|
||||||
//maybe better: inspect the pdf and decide whether ocr or not
|
//maybe better: inspect the pdf and decide whether ocr or not
|
||||||
for {
|
for {
|
||||||
pdfboxRes <-
|
pdfboxRes <-
|
||||||
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
||||||
.getText[F](in)
|
PdfboxExtract.getTextAndMetaData[F](in)
|
||||||
res <- pdfboxRes.fold(
|
res <- pdfboxRes.fold(
|
||||||
ex =>
|
ex =>
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||||
) >> runOcr.attempt,
|
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
||||||
str =>
|
pair =>
|
||||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
|
||||||
else
|
else
|
||||||
logger
|
logger
|
||||||
.info(
|
.info(
|
||||||
s"Stripped text from PDF is small (${str.length}). Trying with OCR."
|
s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
|
||||||
) *>
|
) *>
|
||||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
|
||||||
)
|
)
|
||||||
} yield res
|
} yield res
|
||||||
}
|
}
|
||||||
|
@ -9,17 +9,17 @@ import cats.effect.Sync
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common.Timestamp
|
||||||
import docspell.extract.internal.Text
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
import org.apache.pdfbox.text.PDFTextStripper
|
import org.apache.pdfbox.text.PDFTextStripper
|
||||||
import docspell.common.Timestamp
|
|
||||||
|
|
||||||
object PdfboxExtract {
|
object PdfboxExtract {
|
||||||
|
|
||||||
def getTextAndMetaData[F[_]: Sync](
|
def getTextAndMetaData[F[_]: Sync](
|
||||||
data: Stream[F, Byte]
|
data: Stream[F, Byte]
|
||||||
): F[Either[Throwable, (Text, PdfMetaData)]] =
|
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
||||||
data.compile
|
data.compile
|
||||||
.to(Array)
|
.to(Array)
|
||||||
.map(bytes =>
|
.map(bytes =>
|
||||||
@ -27,7 +27,7 @@ object PdfboxExtract {
|
|||||||
for {
|
for {
|
||||||
txt <- readText(doc)
|
txt <- readText(doc)
|
||||||
md <- readMetaData(doc)
|
md <- readMetaData(doc)
|
||||||
} yield (txt, md)
|
} yield (txt, Some(md).filter(_.nonEmpty))
|
||||||
}.toEither.flatten
|
}.toEither.flatten
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -107,7 +107,8 @@ object CreateItem {
|
|||||||
Vector.empty,
|
Vector.empty,
|
||||||
Vector.empty,
|
Vector.empty,
|
||||||
fm.map(a => a.id -> a.fileId).toMap,
|
fm.map(a => a.id -> a.fileId).toMap,
|
||||||
MetaProposalList.empty
|
MetaProposalList.empty,
|
||||||
|
Nil
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,7 +149,15 @@ object CreateItem {
|
|||||||
.map(originFileTuple)
|
.map(originFileTuple)
|
||||||
.toMap
|
.toMap
|
||||||
} yield cand.headOption.map(ri =>
|
} yield cand.headOption.map(ri =>
|
||||||
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
|
ItemData(
|
||||||
|
ri,
|
||||||
|
rms,
|
||||||
|
Vector.empty,
|
||||||
|
Vector.empty,
|
||||||
|
origMap,
|
||||||
|
MetaProposalList.empty,
|
||||||
|
Nil
|
||||||
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,8 @@ case class ItemData(
|
|||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
dateLabels: Vector[AttachmentDates],
|
dateLabels: Vector[AttachmentDates],
|
||||||
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||||
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
|
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
||||||
|
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||||
|
@ -17,6 +17,12 @@ object SetGivenData {
|
|||||||
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
|
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
|
||||||
.map(_ => data)
|
.map(_ => data)
|
||||||
else
|
else
|
||||||
|
setFolder(data, ops).flatMap(d => setTags[F](d, ops))
|
||||||
|
|
||||||
|
private def setFolder[F[_]: Sync](
|
||||||
|
data: ItemData,
|
||||||
|
ops: OItem[F]
|
||||||
|
): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
val itemId = data.item.id
|
val itemId = data.item.id
|
||||||
val folderId = ctx.args.meta.folderId
|
val folderId = ctx.args.meta.folderId
|
||||||
@ -32,4 +38,20 @@ object SetGivenData {
|
|||||||
} yield data
|
} yield data
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Links the tags collected in `data.tags` to the item.
  *
  * This is best effort: an error raised by `linkTags` is logged as a
  * warning and otherwise ignored. The item data is returned unchanged in
  * either case.
  */
private def setTags[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val logger = ctx.logger
    val link   = ops.linkTags(data.item.id, data.tags, ctx.args.meta.collective).attempt

    logger
      .info(s"Set tags from given data: ${data.tags}")
      .flatMap(_ => link)
      .flatMap {
        case Left(ex) => logger.warn(s"Error setting tags: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
      .map(_ => data)
  }
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,8 @@ object TextExtraction {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
_ <- ctx.logger.debug("Storing extracted texts")
|
_ <- ctx.logger.debug("Storing extracted texts")
|
||||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
|
_ <-
|
||||||
|
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
|
||||||
idxItem = TextData.item(
|
idxItem = TextData.item(
|
||||||
item.item.id,
|
item.item.id,
|
||||||
ctx.args.meta.collective,
|
ctx.args.meta.collective,
|
||||||
@ -40,22 +41,26 @@ object TextExtraction {
|
|||||||
item.item.name.some,
|
item.item.name.some,
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
|
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
|
||||||
dur <- start
|
dur <- start
|
||||||
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
||||||
} yield item.copy(metas = txt.map(_._1))
|
} yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- helpers
|
||||||
|
|
||||||
|
// Per-attachment extraction outcome: the attachment meta record (`am`), the text data handed to the full-text index (`td`), and tag candidates (`tags`, empty by default).
case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
|
||||||
|
|
||||||
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, ProcessItemArgs],
|
ctx: Context[F, ProcessItemArgs],
|
||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
collective: Ident,
|
collective: Ident,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
|
)(ra: RAttachment): F[Result] = {
|
||||||
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
|
def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
|
||||||
(
|
Result(
|
||||||
rm,
|
pair._1,
|
||||||
TextData.attachment(
|
TextData.attachment(
|
||||||
item.item.id,
|
item.item.id,
|
||||||
ra.id,
|
ra.id,
|
||||||
@ -63,15 +68,16 @@ object TextExtraction {
|
|||||||
ctx.args.meta.folderId,
|
ctx.args.meta.folderId,
|
||||||
lang,
|
lang,
|
||||||
ra.name,
|
ra.name,
|
||||||
rm.content
|
pair._1.content
|
||||||
)
|
),
|
||||||
|
pair._2
|
||||||
)
|
)
|
||||||
|
|
||||||
val rm = item.findOrCreate(ra.id)
|
val rm = item.findOrCreate(ra.id)
|
||||||
rm.content match {
|
rm.content match {
|
||||||
case Some(_) =>
|
case Some(_) =>
|
||||||
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||||
makeTextData(rm).pure[F]
|
makeTextData((rm, Nil)).pure[F]
|
||||||
case None =>
|
case None =>
|
||||||
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||||
.map(makeTextData)
|
.map(makeTextData)
|
||||||
@ -83,21 +89,22 @@ object TextExtraction {
|
|||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
)(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||||
dst <- Duration.stopTime[F]
|
dst <- Duration.stopTime[F]
|
||||||
fids <- filesToExtract(ctx)(item, ra)
|
fids <- filesToExtract(ctx)(item, ra)
|
||||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||||
meta = item.changeMeta(
|
meta = item.changeMeta(
|
||||||
ra.id,
|
ra.id,
|
||||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
|
||||||
)
|
)
|
||||||
|
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
||||||
est <- dst
|
est <- dst
|
||||||
_ <- ctx.logger.info(
|
_ <- ctx.logger.info(
|
||||||
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
||||||
)
|
)
|
||||||
} yield meta
|
} yield (meta, tags)
|
||||||
|
|
||||||
def extractText[F[_]: Sync: ContextShift](
|
def extractText[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, _],
|
||||||
@ -123,7 +130,7 @@ object TextExtraction {
|
|||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
ra: RAttachment,
|
ra: RAttachment,
|
||||||
lang: Language
|
lang: Language
|
||||||
)(fileIds: List[Ident]): F[Option[String]] =
|
)(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
|
||||||
fileIds match {
|
fileIds match {
|
||||||
case Nil =>
|
case Nil =>
|
||||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||||
@ -133,8 +140,8 @@ object TextExtraction {
|
|||||||
|
|
||||||
extractText[F](ctx, extr, lang)(id)
|
extractText[F](ctx, extr, lang)(id)
|
||||||
.flatMap({
|
.flatMap({
|
||||||
case ExtractResult.Success(txt) =>
|
case res @ ExtractResult.Success(_, _) =>
|
||||||
txt.some.pure[F]
|
res.some.pure[F]
|
||||||
|
|
||||||
case ExtractResult.UnsupportedFormat(mt) =>
|
case ExtractResult.UnsupportedFormat(mt) =>
|
||||||
ctx.logger
|
ctx.logger
|
||||||
|
@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
|
|||||||
def isIn[A: Put](values: NonEmptyList[A]): Fragment =
|
def isIn[A: Put](values: NonEmptyList[A]): Fragment =
|
||||||
isIn(values.map(a => sql"$a").toList)
|
isIn(values.map(a => sql"$a").toList)
|
||||||
|
|
||||||
|
/** Creates a `lower(<column>) IN (v1, v2, …)` condition.
  *
  * Only the column is wrapped in `lower()`; the values are bound as given,
  * so callers are expected to pass them already lower-cased.
  */
def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
  val params = commas(values.toList.map(a => sql"$a"))
  fr"lower(" ++ f ++ fr") IN (" ++ params ++ fr")"
}
|
||||||
|
|
||||||
def isIn(frag: Fragment): Fragment =
|
def isIn(frag: Fragment): Fragment =
|
||||||
f ++ fr"IN (" ++ frag ++ fr")"
|
f ++ fr"IN (" ++ frag ++ fr")"
|
||||||
|
|
||||||
|
@ -314,6 +314,9 @@ object RItem {
|
|||||||
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
|
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
|
||||||
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
|
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
|
||||||
|
|
||||||
|
/** Selects only the id column of the item matching both the given id and
  * collective; the result is empty when no such row exists.
  */
def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
  val ownedByCollective = and(id.is(itemId), cid.is(coll))
  selectSimple(Seq(id), table, ownedByCollective).query[Ident].option
}
|
||||||
|
|
||||||
def removeFolder(folderId: Ident): ConnectionIO[Int] = {
|
def removeFolder(folderId: Ident): ConnectionIO[Int] = {
|
||||||
val empty: Option[Ident] = None
|
val empty: Option[Ident] = None
|
||||||
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
|
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.store.impl.Implicits._
|
import docspell.store.impl.Implicits._
|
||||||
import docspell.store.impl._
|
import docspell.store.impl._
|
||||||
@ -101,6 +104,21 @@ object RTag {
|
|||||||
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
|
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Finds all tags of a collective whose id or name matches one of the
  * given strings.
  *
  * Every string is tried both as a tag id (if it parses as an `Ident`) and
  * as a tag name. Names are compared case-insensitively: the input is
  * lower-cased here and compared against `lower(name)` in the database.
  * An empty input yields an empty result without running a query.
  */
def findAllByNameOrId(
    nameOrIds: List[String],
    coll: Ident
): ConnectionIO[Vector[RTag]] = {
  val idList =
    NonEmptyList.fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption)).toSeq
  // Locale.ROOT keeps the lower-casing independent of the JVM default
  // locale (e.g. a Turkish locale would map "I" to a dotless "ı").
  val nameList =
    NonEmptyList.fromList(nameOrIds.map(_.toLowerCase(java.util.Locale.ROOT))).toSeq

  val cond = idList.map(ids => tid.isIn(ids)) ++
    nameList.map(ns => name.isLowerIn(ns))

  if (cond.isEmpty) Vector.empty.pure[ConnectionIO]
  else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector]
}
|
||||||
|
|
||||||
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
||||||
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
|
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -43,4 +44,28 @@ object RTagItem {
|
|||||||
|
|
||||||
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
|
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
|
||||||
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
|
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
|
||||||
|
|
||||||
|
/** Returns the tag links of `item` whose tag id is among `tags`.
  *
  * An empty `tags` sequence yields an empty result without running a query.
  */
def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
  NonEmptyList
    .fromList(tags.toList)
    .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { ids =>
      selectSimple(all, table, and(itemId.is(item), tagId.isIn(ids)))
        .query[RTagItem]
        .to[Vector]
    }
|
||||||
|
|
||||||
|
/** Inserts one link row per given tag id for `item`, each with a fresh
  * random id, and returns the row count reported by the insert.
  *
  * Despite the name, nothing is removed or replaced here: rows are only
  * inserted. An empty `tags` sequence returns 0 without running a statement.
  */
def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
  tags.toList match {
    case Nil =>
      0.pure[ConnectionIO]
    case ids =>
      ids
        .traverse(tid => Ident.randomId[ConnectionIO].map(rid => RTagItem(rid, item, tid)))
        .flatMap { rows =>
          insertRows(
            table,
            all,
            rows.map(r => fr"${r.tagItemId},${r.itemId},${r.tagId}")
          ).update.run
        }
  }
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user