Use keywords in PDFs to search for existing tags

During processing, keywords stored in the PDF metadata are looked up
in the tag database, and any matching existing tags are associated
with the item.

See #175
This commit is contained in:
Eike Kettner 2020-07-19 00:28:04 +02:00
parent da68405f9b
commit 209c068436
14 changed files with 184 additions and 64 deletions

View File

@ -6,6 +6,7 @@ import cats.implicits._
import docspell.common._ import docspell.common._
import docspell.ftsclient.FtsClient import docspell.ftsclient.FtsClient
import docspell.store.UpdateResult
import docspell.store.queries.{QAttachment, QItem} import docspell.store.queries.{QAttachment, QItem}
import docspell.store.records._ import docspell.store.records._
import docspell.store.{AddResult, Store} import docspell.store.{AddResult, Store}
@ -22,6 +23,9 @@ trait OItem[F[_]] {
/** Create a new tag and add it to the item. */ /** Create a new tag and add it to the item. */
def addNewTag(item: Ident, tag: RTag): F[AddResult] def addNewTag(item: Ident, tag: RTag): F[AddResult]
/** Apply all tags to the given item. Tags must exist, but can be IDs or names. */
def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult] def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult] def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
@ -90,6 +94,27 @@ object OItem {
.attempt .attempt
.map(AddResult.fromUpdate) .map(AddResult.fromUpdate)
/** Links all existing tags referenced by `tags` (names or ids) to the item.
  *
  * Duplicate entries are dropped first; an empty list is a successful no-op.
  * Yields `UpdateResult.notFound` if the item cannot be found for the given
  * collective. Tags that are already linked to the item are not inserted again.
  */
def linkTags(
    item: Ident,
    tags: List[String],
    collective: Ident
): F[UpdateResult] = {
  val keywords = tags.distinct
  if (keywords.isEmpty) UpdateResult.success.pure[F]
  else {
    // Only runs once the item has been confirmed for the collective.
    val linkMissing =
      for {
        wanted <- RTag.findAllByNameOrId(keywords, collective)
        linked <- RTagItem.findAllIn(item, wanted.map(_.tagId))
        _ <- RTagItem.setAllTags(
          item,
          wanted.map(_.tagId).diff(linked.map(_.tagId))
        )
      } yield UpdateResult.success
    val db =
      OptionT(RItem.checkByIdAndCollective(item, collective))
        .flatMap(_ => OptionT.liftF(linkMissing))
        .getOrElse(UpdateResult.notFound)
    store.transact(db)
  }
}
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = { def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
val db = for { val db = for {
cid <- RItem.getCollective(item) cid <- RItem.getCollective(item)

View File

@ -1,39 +1,41 @@
package docspell.extract package docspell.extract
import scala.util.Try
import docspell.common.MimeType import docspell.common.MimeType
import docspell.extract.pdfbox.PdfMetaData
sealed trait ExtractResult { sealed trait ExtractResult {
def textOption: Option[String] def textOption: Option[String]
def pdfMeta: Option[PdfMetaData]
} }
object ExtractResult { object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult { case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None val textOption = None
val pdfMeta = None
} }
def unsupportedFormat(mt: MimeType): ExtractResult = def unsupportedFormat(mt: MimeType): ExtractResult =
UnsupportedFormat(mt) UnsupportedFormat(mt)
case class Failure(ex: Throwable) extends ExtractResult { case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None val textOption = None
val pdfMeta = None
} }
def failure(ex: Throwable): ExtractResult = def failure(ex: Throwable): ExtractResult =
Failure(ex) Failure(ex)
case class Success(text: String) extends ExtractResult { case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
val textOption = Some(text) val textOption = Some(text)
} }
def success(text: String): ExtractResult = def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
Success(text) Success(text, pdfMeta)
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
def fromEither(e: Either[Throwable, String]): ExtractResult = def fromEither(e: Either[Throwable, String]): ExtractResult =
e.fold(failure, success) e.fold(failure, str => success(str, None))
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
e.fold(failure, r => success(r.txt.value, r.meta))
} }

View File

@ -40,8 +40,7 @@ object Extraction {
case MimeType.PdfMatch(_) => case MimeType.PdfMatch(_) =>
PdfExtract PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(_.map(_.value)) .map(ExtractResult.fromEitherResult)
.map(ExtractResult.fromEither)
case PoiType(mt) => case PoiType(mt) =>
PoiExtract PoiExtract
@ -103,7 +102,7 @@ object Extraction {
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(Text(txt).value) ExtractResult.success(Text(txt).value, None)
} }
case mt => case mt =>

View File

@ -7,9 +7,15 @@ import fs2.Stream
import docspell.common.{Language, Logger} import docspell.common.{Language, Logger}
import docspell.extract.internal.Text import docspell.extract.internal.Text
import docspell.extract.ocr.{OcrConfig, TextExtract} import docspell.extract.ocr.{OcrConfig, TextExtract}
import docspell.extract.pdfbox.PdfMetaData
import docspell.extract.pdfbox.PdfboxExtract import docspell.extract.pdfbox.PdfboxExtract
object PdfExtract { object PdfExtract {
/** Outcome of extracting a PDF: the text plus the PDF metadata, if any. */
final case class Result(txt: Text, meta: Option[PdfMetaData])
object Result {

  /** Builds a `Result` from a `(text, metadata)` pair. */
  def apply(t: (Text, Option[PdfMetaData])): Result =
    t match {
      case (text, metadata) => Result(text, metadata)
    }
}
def get[F[_]: Sync: ContextShift]( def get[F[_]: Sync: ContextShift](
in: Stream[F, Byte], in: Stream[F, Byte],
@ -18,39 +24,39 @@ object PdfExtract {
stripMinLen: Int, stripMinLen: Int,
ocrCfg: OcrConfig, ocrCfg: OcrConfig,
logger: Logger[F] logger: Logger[F]
): F[Either[Throwable, Text]] = { ): F[Either[Throwable, Result]] = {
val runOcr = val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: Text, strippedStr: Text) = def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
if (ocrStr.length > strippedStr.length) if (ocrStr.length > strippedRes._1.length)
logger.info( logger.info(
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
) *> ocrStr.pure[F] ) *> Result(ocrStr, strippedRes._2).pure[F]
else else
logger.info( logger.info(
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
) *> strippedStr.pure[F] ) *> Result(strippedRes).pure[F]
//maybe better: inspect the pdf and decide whether ocr or not //maybe better: inspect the pdf and decide whether ocr or not
for { for {
pdfboxRes <- pdfboxRes <-
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract logger.debug("Trying to strip text from pdf using pdfbox.") *>
.getText[F](in) PdfboxExtract.getTextAndMetaData[F](in)
res <- pdfboxRes.fold( res <- pdfboxRes.fold(
ex => ex =>
logger.info( logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) >> runOcr.attempt, ) >> runOcr.map(txt => Result(txt, None)).attempt,
str => pair =>
if (str.length >= stripMinLen) str.pure[F].attempt if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
else else
logger logger
.info( .info(
s"Stripped text from PDF is small (${str.length}). Trying with OCR." s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
) *> ) *>
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
) )
} yield res } yield res
} }

View File

@ -9,17 +9,17 @@ import cats.effect.Sync
import cats.implicits._ import cats.implicits._
import fs2.Stream import fs2.Stream
import docspell.common.Timestamp
import docspell.extract.internal.Text import docspell.extract.internal.Text
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper import org.apache.pdfbox.text.PDFTextStripper
import docspell.common.Timestamp
object PdfboxExtract { object PdfboxExtract {
def getTextAndMetaData[F[_]: Sync]( def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte] data: Stream[F, Byte]
): F[Either[Throwable, (Text, PdfMetaData)]] = ): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
data.compile data.compile
.to(Array) .to(Array)
.map(bytes => .map(bytes =>
@ -27,7 +27,7 @@ object PdfboxExtract {
for { for {
txt <- readText(doc) txt <- readText(doc)
md <- readMetaData(doc) md <- readMetaData(doc)
} yield (txt, md) } yield (txt, Some(md).filter(_.nonEmpty))
}.toEither.flatten }.toEither.flatten
) )

View File

@ -107,7 +107,8 @@ object CreateItem {
Vector.empty, Vector.empty,
Vector.empty, Vector.empty,
fm.map(a => a.id -> a.fileId).toMap, fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty MetaProposalList.empty,
Nil
) )
} }
@ -148,7 +149,15 @@ object CreateItem {
.map(originFileTuple) .map(originFileTuple)
.toMap .toMap
} yield cand.headOption.map(ri => } yield cand.headOption.map(ri =>
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty) ItemData(
ri,
rms,
Vector.empty,
Vector.empty,
origMap,
MetaProposalList.empty,
Nil
)
) )
} }

View File

@ -22,7 +22,8 @@ case class ItemData(
metas: Vector[RAttachmentMeta], metas: Vector[RAttachmentMeta],
dateLabels: Vector[AttachmentDates], dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList // given meta data not associated to a specific attachment givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
) { ) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] = def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -17,6 +17,12 @@ object SetGivenData {
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item")) .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
.map(_ => data) .map(_ => data)
else else
setFolder(data, ops).flatMap(d => setTags[F](d, ops))
private def setFolder[F[_]: Sync](
data: ItemData,
ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
Task { ctx => Task { ctx =>
val itemId = data.item.id val itemId = data.item.id
val folderId = ctx.args.meta.folderId val folderId = ctx.args.meta.folderId
@ -32,4 +38,20 @@ object SetGivenData {
} yield data } yield data
} }
/** Links the tags collected in `data.tags` to the item via `ops.linkTags`.
  *
  * An error raised while linking is logged as a warning and does not fail the
  * task; the item data is returned unchanged in either case.
  */
private def setTags[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val itemId     = data.item.id
    val collective = ctx.args.meta.collective
    ctx.logger
      .info(s"Set tags from given data: ${data.tags}")
      .flatMap(_ => ops.linkTags(itemId, data.tags, collective).attempt)
      .flatMap {
        case Left(ex) => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
      .map(_ => data)
  }
} }

View File

@ -32,7 +32,8 @@ object TextExtraction {
) )
) )
_ <- ctx.logger.debug("Storing extracted texts") _ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) _ <-
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
idxItem = TextData.item( idxItem = TextData.item(
item.item.id, item.item.id,
ctx.args.meta.collective, ctx.args.meta.collective,
@ -40,22 +41,26 @@ object TextExtraction {
item.item.name.some, item.item.name.some,
None None
) )
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*) _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
dur <- start dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1)) } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
} }
// -- helpers
/** Extraction outcome for a single attachment.
  *
  * @param am   the attachment meta record (carries the extracted content)
  * @param td   the text data handed to the full-text index
  * @param tags tag candidates found for this attachment; empty by default
  */
case class Result(
    am: RAttachmentMeta,
    td: TextData,
    tags: List[String] = Nil
)
def extractTextIfEmpty[F[_]: Sync: ContextShift]( def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, ProcessItemArgs], ctx: Context[F, ProcessItemArgs],
cfg: ExtractConfig, cfg: ExtractConfig,
lang: Language, lang: Language,
collective: Ident, collective: Ident,
item: ItemData item: ItemData
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = { )(ra: RAttachment): F[Result] = {
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) = def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
( Result(
rm, pair._1,
TextData.attachment( TextData.attachment(
item.item.id, item.item.id,
ra.id, ra.id,
@ -63,15 +68,16 @@ object TextExtraction {
ctx.args.meta.folderId, ctx.args.meta.folderId,
lang, lang,
ra.name, ra.name,
rm.content pair._1.content
) ),
pair._2
) )
val rm = item.findOrCreate(ra.id) val rm = item.findOrCreate(ra.id)
rm.content match { rm.content match {
case Some(_) => case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *> ctx.logger.info("TextExtraction skipped, since text is already available.") *>
makeTextData(rm).pure[F] makeTextData((rm, Nil)).pure[F]
case None => case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra) extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData) .map(makeTextData)
@ -83,21 +89,22 @@ object TextExtraction {
cfg: ExtractConfig, cfg: ExtractConfig,
lang: Language, lang: Language,
item: ItemData item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] = )(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
for { for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F] dst <- Duration.stopTime[F]
fids <- filesToExtract(ctx)(item, ra) fids <- filesToExtract(ctx)(item, ra)
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids) res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta( meta = item.changeMeta(
ra.id, ra.id,
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)) rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
) )
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
est <- dst est <- dst
_ <- ctx.logger.info( _ <- ctx.logger.info(
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
) )
} yield meta } yield (meta, tags)
def extractText[F[_]: Sync: ContextShift]( def extractText[F[_]: Sync: ContextShift](
ctx: Context[F, _], ctx: Context[F, _],
@ -123,7 +130,7 @@ object TextExtraction {
cfg: ExtractConfig, cfg: ExtractConfig,
ra: RAttachment, ra: RAttachment,
lang: Language lang: Language
)(fileIds: List[Ident]): F[Option[String]] = )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
fileIds match { fileIds match {
case Nil => case Nil =>
ctx.logger.error(s"Cannot extract text").map(_ => None) ctx.logger.error(s"Cannot extract text").map(_ => None)
@ -133,8 +140,8 @@ object TextExtraction {
extractText[F](ctx, extr, lang)(id) extractText[F](ctx, extr, lang)(id)
.flatMap({ .flatMap({
case ExtractResult.Success(txt) => case res @ ExtractResult.Success(_, _) =>
txt.some.pure[F] res.some.pure[F]
case ExtractResult.UnsupportedFormat(mt) => case ExtractResult.UnsupportedFormat(mt) =>
ctx.logger ctx.logger

View File

@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
def isIn[A: Put](values: NonEmptyList[A]): Fragment = def isIn[A: Put](values: NonEmptyList[A]): Fragment =
isIn(values.map(a => sql"$a").toList) isIn(values.map(a => sql"$a").toList)
/** Builds `lower(<column>) IN (<values>)`.
  *
  * Only the column side is lower-cased in SQL; the values are bound as given
  * and are not lower-cased here.
  */
def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
  val params = values.toList.map(a => sql"$a")
  fr"lower(" ++ f ++ fr") IN (" ++ commas(params) ++ fr")"
}
def isIn(frag: Fragment): Fragment = def isIn(frag: Fragment): Fragment =
f ++ fr"IN (" ++ frag ++ fr")" f ++ fr"IN (" ++ frag ++ fr")"

View File

@ -314,6 +314,9 @@ object RItem {
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] = def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
/** Looks up an item by id within a collective, selecting only the id column.
  *
  * Yields the id if a matching row exists and `None` otherwise.
  */
def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
  val itemInCollective = and(id.is(itemId), cid.is(coll))
  selectSimple(Seq(id), table, itemInCollective).query[Ident].option
}
def removeFolder(folderId: Ident): ConnectionIO[Int] = { def removeFolder(folderId: Ident): ConnectionIO[Int] = {
val empty: Option[Ident] = None val empty: Option[Ident] = None
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run

View File

@ -1,5 +1,8 @@
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._
import docspell.common._ import docspell.common._
import docspell.store.impl.Implicits._ import docspell.store.impl.Implicits._
import docspell.store.impl._ import docspell.store.impl._
@ -101,6 +104,21 @@ object RTag {
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector] ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
} }
/** Finds all tags of a collective whose id or (case-insensitive) name matches
  * one of the given strings.
  *
  * Each string is tried as an id (only if it parses as an `Ident`) and as a
  * lower-cased name. An empty input yields an empty result without querying.
  */
def findAllByNameOrId(
    nameOrIds: List[String],
    coll: Ident
): ConnectionIO[Vector[RTag]] = {
  val byId =
    NonEmptyList
      .fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption))
      .map(ids => tid.isIn(ids))
  val byName =
    NonEmptyList
      .fromList(nameOrIds.map(_.toLowerCase))
      .map(names => name.isLowerIn(names))
  val cond = byId.toSeq ++ byName.toSeq

  if (cond.isEmpty) Vector.empty.pure[ConnectionIO]
  else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector]
}
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
} }

View File

@ -1,5 +1,6 @@
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._ import cats.implicits._
import docspell.common._ import docspell.common._
@ -43,4 +44,28 @@ object RTagItem {
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] = def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector] selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
/** Returns the links between the item and any of the given tags.
  *
  * An empty tag list yields an empty result without running a query.
  */
def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
  NonEmptyList
    .fromList(tags.toList)
    .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { nel =>
      selectSimple(all, table, and(itemId.is(item), tagId.isIn(nel)))
        .query[RTagItem]
        .to[Vector]
    }
/** Inserts one link row per given tag for the item, each with a fresh random
  * id, and yields the update count of the insert.
  *
  * An empty tag list yields 0 without running a statement. Existing links are
  * not checked here.
  */
def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
  tags.toList match {
    case Nil =>
      0.pure[ConnectionIO]
    case ids =>
      ids
        .traverse(tag => Ident.randomId[ConnectionIO].map(id => RTagItem(id, item, tag)))
        .flatMap { links =>
          val rows = links.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}")
          insertRows(table, all, rows).update.run
        }
  }
} }