mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Use keywords in PDFs to search for existing tags
During processing, keywords stored in the PDF metadata are looked up in the tag database, and any matching existing tags are associated with the item. See #175
This commit is contained in:
@ -107,7 +107,8 @@ object CreateItem {
|
||||
Vector.empty,
|
||||
Vector.empty,
|
||||
fm.map(a => a.id -> a.fileId).toMap,
|
||||
MetaProposalList.empty
|
||||
MetaProposalList.empty,
|
||||
Nil
|
||||
)
|
||||
}
|
||||
|
||||
@ -148,7 +149,15 @@ object CreateItem {
|
||||
.map(originFileTuple)
|
||||
.toMap
|
||||
} yield cand.headOption.map(ri =>
|
||||
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
|
||||
ItemData(
|
||||
ri,
|
||||
rms,
|
||||
Vector.empty,
|
||||
Vector.empty,
|
||||
origMap,
|
||||
MetaProposalList.empty,
|
||||
Nil
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,8 @@ case class ItemData(
|
||||
metas: Vector[RAttachmentMeta],
|
||||
dateLabels: Vector[AttachmentDates],
|
||||
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
|
||||
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
||||
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
|
||||
) {
|
||||
|
||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||
|
@ -17,19 +17,41 @@ object SetGivenData {
|
||||
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
|
||||
.map(_ => data)
|
||||
else
|
||||
Task { ctx =>
|
||||
val itemId = data.item.id
|
||||
val folderId = ctx.args.meta.folderId
|
||||
val collective = ctx.args.meta.collective
|
||||
for {
|
||||
_ <- ctx.logger.info("Starting setting given data")
|
||||
_ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
|
||||
e <- ops.setFolder(itemId, folderId, collective).attempt
|
||||
_ <- e.fold(
|
||||
ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
|
||||
_ => ().pure[F]
|
||||
)
|
||||
} yield data
|
||||
}
|
||||
setFolder(data, ops).flatMap(d => setTags[F](d, ops))
|
||||
|
||||
/** Builds a task that sets the folder given in the job arguments
  * (`ctx.args.meta.folderId`) on the item of `data`.
  *
  * The call to `ops.setFolder` is wrapped in `attempt`, so a failure is
  * only logged as a warning and does not fail the task. The task always
  * yields the `data` value it was given, unchanged.
  *
  * @param data the item data whose item id is used for the update
  * @param ops  the item operations used to perform the update
  * @return a task yielding the unmodified `data`
  */
private def setFolder[F[_]: Sync](
|
||||
data: ItemData,
|
||||
ops: OItem[F]
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
val itemId = data.item.id
|
||||
val folderId = ctx.args.meta.folderId
|
||||
val collective = ctx.args.meta.collective
|
||||
for {
|
||||
_ <- ctx.logger.info("Starting setting given data")
|
||||
_ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
|
||||
e <- ops.setFolder(itemId, folderId, collective).attempt
|
||||
_ <- e.fold(
|
||||
ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
|
||||
_ => ().pure[F]
|
||||
)
|
||||
} yield data
|
||||
}
|
||||
|
||||
/** Builds a task that links the tags collected in `data.tags` to the
  * item of `data`, scoped to the collective from the job arguments.
  *
  * The call to `ops.linkTags` is wrapped in `attempt`, so a failure is
  * only logged as a warning and does not fail the task. The task always
  * yields the `data` value it was given, unchanged.
  *
  * @param data the item data providing the item id and the tags to link
  * @param ops  the item operations used to link the tags
  * @return a task yielding the unmodified `data`
  */
private def setTags[F[_]: Sync](
|
||||
data: ItemData,
|
||||
ops: OItem[F]
|
||||
): Task[F, ProcessItemArgs, ItemData] =
|
||||
Task { ctx =>
|
||||
val itemId = data.item.id
|
||||
val collective = ctx.args.meta.collective
|
||||
for {
|
||||
_ <- ctx.logger.info(s"Set tags from given data: ${data.tags}")
|
||||
e <- ops.linkTags(itemId, data.tags, collective).attempt
|
||||
_ <- e.fold(
|
||||
ex => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}"),
|
||||
_ => ().pure[F]
|
||||
)
|
||||
} yield data
|
||||
}
|
||||
}
|
||||
|
@ -32,7 +32,8 @@ object TextExtraction {
|
||||
)
|
||||
)
|
||||
_ <- ctx.logger.debug("Storing extracted texts")
|
||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
|
||||
_ <-
|
||||
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
|
||||
idxItem = TextData.item(
|
||||
item.item.id,
|
||||
ctx.args.meta.collective,
|
||||
@ -40,22 +41,26 @@ object TextExtraction {
|
||||
item.item.name.some,
|
||||
None
|
||||
)
|
||||
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
|
||||
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
|
||||
dur <- start
|
||||
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
||||
} yield item.copy(metas = txt.map(_._1))
|
||||
} yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
|
||||
}
|
||||
|
||||
// -- helpers
|
||||
|
||||
/** Outcome of text extraction for a single attachment.
  *
  * @param am   the attachment meta record to store
  * @param td   the text data to send to the full-text index
  * @param tags tag candidates found for the attachment; defaults to `Nil`
  */
case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
|
||||
|
||||
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||
ctx: Context[F, ProcessItemArgs],
|
||||
cfg: ExtractConfig,
|
||||
lang: Language,
|
||||
collective: Ident,
|
||||
item: ItemData
|
||||
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
|
||||
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
|
||||
(
|
||||
rm,
|
||||
)(ra: RAttachment): F[Result] = {
|
||||
def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
|
||||
Result(
|
||||
pair._1,
|
||||
TextData.attachment(
|
||||
item.item.id,
|
||||
ra.id,
|
||||
@ -63,15 +68,16 @@ object TextExtraction {
|
||||
ctx.args.meta.folderId,
|
||||
lang,
|
||||
ra.name,
|
||||
rm.content
|
||||
)
|
||||
pair._1.content
|
||||
),
|
||||
pair._2
|
||||
)
|
||||
|
||||
val rm = item.findOrCreate(ra.id)
|
||||
rm.content match {
|
||||
case Some(_) =>
|
||||
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||
makeTextData(rm).pure[F]
|
||||
makeTextData((rm, Nil)).pure[F]
|
||||
case None =>
|
||||
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||
.map(makeTextData)
|
||||
@ -83,21 +89,22 @@ object TextExtraction {
|
||||
cfg: ExtractConfig,
|
||||
lang: Language,
|
||||
item: ItemData
|
||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
||||
)(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
|
||||
for {
|
||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||
dst <- Duration.stopTime[F]
|
||||
fids <- filesToExtract(ctx)(item, ra)
|
||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||
meta = item.changeMeta(
|
||||
ra.id,
|
||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
||||
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
|
||||
)
|
||||
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
||||
est <- dst
|
||||
_ <- ctx.logger.info(
|
||||
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
||||
)
|
||||
} yield meta
|
||||
} yield (meta, tags)
|
||||
|
||||
def extractText[F[_]: Sync: ContextShift](
|
||||
ctx: Context[F, _],
|
||||
@ -123,7 +130,7 @@ object TextExtraction {
|
||||
cfg: ExtractConfig,
|
||||
ra: RAttachment,
|
||||
lang: Language
|
||||
)(fileIds: List[Ident]): F[Option[String]] =
|
||||
)(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
|
||||
fileIds match {
|
||||
case Nil =>
|
||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||
@ -133,8 +140,8 @@ object TextExtraction {
|
||||
|
||||
extractText[F](ctx, extr, lang)(id)
|
||||
.flatMap({
|
||||
case ExtractResult.Success(txt) =>
|
||||
txt.some.pure[F]
|
||||
case res @ ExtractResult.Success(_, _) =>
|
||||
res.some.pure[F]
|
||||
|
||||
case ExtractResult.UnsupportedFormat(mt) =>
|
||||
ctx.logger
|
||||
|
Reference in New Issue
Block a user