Use keywords in PDFs to search for existing tags

During processing, keywords stored in the PDF metadata are looked up
in the tag database, and any matching tags that already exist are
associated with the item.

See #175
This commit is contained in:
Eike Kettner
2020-07-19 00:28:04 +02:00
parent da68405f9b
commit 209c068436
14 changed files with 184 additions and 64 deletions

View File

@ -107,7 +107,8 @@ object CreateItem {
Vector.empty,
Vector.empty,
fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty
MetaProposalList.empty,
Nil
)
}
@ -148,7 +149,15 @@ object CreateItem {
.map(originFileTuple)
.toMap
} yield cand.headOption.map(ri =>
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
ItemData(
ri,
rms,
Vector.empty,
Vector.empty,
origMap,
MetaProposalList.empty,
Nil
)
)
}

View File

@ -22,7 +22,8 @@ case class ItemData(
metas: Vector[RAttachmentMeta],
dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -17,19 +17,41 @@ object SetGivenData {
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
.map(_ => data)
else
Task { ctx =>
val itemId = data.item.id
val folderId = ctx.args.meta.folderId
val collective = ctx.args.meta.collective
for {
_ <- ctx.logger.info("Starting setting given data")
_ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
e <- ops.setFolder(itemId, folderId, collective).attempt
_ <- e.fold(
ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
_ => ().pure[F]
)
} yield data
}
setFolder(data, ops).flatMap(d => setTags[F](d, ops))
/** Builds a task that assigns the folder given in the job arguments to the item.
  *
  * The outcome of the folder update is captured with `attempt`; a failure is
  * only logged as a warning. The task yields the incoming `data` unchanged.
  */
private def setFolder[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val targetItem   = data.item.id
    val targetFolder = ctx.args.meta.folderId
    val owner        = ctx.args.meta.collective

    for {
      _       <- ctx.logger.info("Starting setting given data")
      _       <- ctx.logger.debug(s"Set item folder: '${targetFolder.map(_.id)}'")
      outcome <- ops.setFolder(targetItem, targetFolder, owner).attempt
      _ <- outcome match {
        // best effort: report the problem, but do not fail the task
        case Left(ex) => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
    } yield data
  }
/** Builds a task that links the tags carried in `data.tags` to the item.
  *
  * The outcome of `ops.linkTags` is captured with `attempt`; a failure is only
  * logged as a warning. The task yields the incoming `data` unchanged.
  */
private def setTags[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val targetItem = data.item.id
    val owner      = ctx.args.meta.collective

    for {
      _       <- ctx.logger.info(s"Set tags from given data: ${data.tags}")
      outcome <- ops.linkTags(targetItem, data.tags, owner).attempt
      _ <- outcome match {
        // best effort: report the problem, but do not fail the task
        case Left(ex) => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
    } yield data
  }
}

View File

@ -32,7 +32,8 @@ object TextExtraction {
)
)
_ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
_ <-
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
idxItem = TextData.item(
item.item.id,
ctx.args.meta.collective,
@ -40,22 +41,26 @@ object TextExtraction {
item.item.name.some,
None
)
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1))
} yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
}
// -- helpers
/** Outcome of processing a single attachment.
  *
  * @param am   the attachment's meta record
  * @param td   the text data passed on to the full-text index
  * @param tags tag strings collected for the attachment; empty by default
  */
case class Result(
    am: RAttachmentMeta,
    td: TextData,
    tags: List[String] = Nil
)
def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, ProcessItemArgs],
cfg: ExtractConfig,
lang: Language,
collective: Ident,
item: ItemData
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
(
rm,
)(ra: RAttachment): F[Result] = {
def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
Result(
pair._1,
TextData.attachment(
item.item.id,
ra.id,
@ -63,15 +68,16 @@ object TextExtraction {
ctx.args.meta.folderId,
lang,
ra.name,
rm.content
)
pair._1.content
),
pair._2
)
val rm = item.findOrCreate(ra.id)
rm.content match {
case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
makeTextData(rm).pure[F]
makeTextData((rm, Nil)).pure[F]
case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData)
@ -83,21 +89,22 @@ object TextExtraction {
cfg: ExtractConfig,
lang: Language,
item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] =
)(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F]
fids <- filesToExtract(ctx)(item, ra)
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta(
ra.id,
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
)
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
est <- dst
_ <- ctx.logger.info(
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
)
} yield meta
} yield (meta, tags)
def extractText[F[_]: Sync: ContextShift](
ctx: Context[F, _],
@ -123,7 +130,7 @@ object TextExtraction {
cfg: ExtractConfig,
ra: RAttachment,
lang: Language
)(fileIds: List[Ident]): F[Option[String]] =
)(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
fileIds match {
case Nil =>
ctx.logger.error(s"Cannot extract text").map(_ => None)
@ -133,8 +140,8 @@ object TextExtraction {
extractText[F](ctx, extr, lang)(id)
.flatMap({
case ExtractResult.Success(txt) =>
txt.some.pure[F]
case res @ ExtractResult.Success(_, _) =>
res.some.pure[F]
case ExtractResult.UnsupportedFormat(mt) =>
ctx.logger