Use keywords in PDFs to search for existing tags

During processing, keywords stored in the PDF metadata are looked up in the tag database, and any matching existing tags are associated with the item. See #175
2025-10-31 17:50:11 +00:00 · 2020-07-19 00:28:04 +02:00
parent da68405f9b
commit 209c068436
14 changed files with 184 additions and 64 deletions
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
@@ -6,6 +6,7 @@ import cats.implicits._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.store.UpdateResult
 import docspell.store.queries.{QAttachment, QItem}
 import docspell.store.records._
 import docspell.store.{AddResult, Store}
@@ -22,6 +23,9 @@ trait OItem[F[_]] {
  /** Create a new tag and add it to the item. */
  def addNewTag(item: Ident, tag: RTag): F[AddResult]
  /** Apply all tags to the given item. Tags must exist, but can be IDs or names. */
  def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
  def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
  def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
@@ -90,6 +94,27 @@ object OItem {
            .attempt
            .map(AddResult.fromUpdate)
        /** Links the given tags to the item. Entries in `tags` may be tag IDs or
          * names; only tags that are found for the collective are linked, and
          * tags already linked to the item are not inserted again. Yields
          * `UpdateResult.notFound` if the item is not found for the collective.
          */
        def linkTags(
            item: Ident,
            tags: List[String],
            collective: Ident
        ): F[UpdateResult] = {
          val wanted = tags.distinct
          if (wanted.isEmpty) UpdateResult.success.pure[F]
          else {
            // Everything runs in one transaction: check the item, resolve the
            // tags, then insert only the links that are still missing.
            val linkMissing =
              OptionT(RItem.checkByIdAndCollective(item, collective))
                .semiflatMap { _ =>
                  for {
                    found <- RTag.findAllByNameOrId(wanted, collective)
                    foundIds = found.map(_.tagId)
                    linked <- RTagItem.findAllIn(item, foundIds)
                    _      <- RTagItem.setAllTags(item, foundIds.diff(linked.map(_.tagId)))
                  } yield UpdateResult.success
                }
                .getOrElse(UpdateResult.notFound)
            store.transact(linkMissing)
          }
        }
        def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
          val db = for {
            cid <- RItem.getCollective(item)
--- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
@@ -1,39 +1,41 @@
 package docspell.extract
 import scala.util.Try
 import docspell.common.MimeType
 import docspell.extract.pdfbox.PdfMetaData
 /** Outcome of a text extraction: success, failure or unsupported format. */
 sealed trait ExtractResult {
  /** The extracted text; only a successful result carries one. */
  def textOption: Option[String]
  /** Metadata read from a PDF, if any; failures and unsupported formats have none. */
  def pdfMeta: Option[PdfMetaData]
 }
 object ExtractResult {
  case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
    val textOption = None
    val pdfMeta    = None
  }
  def unsupportedFormat(mt: MimeType): ExtractResult =
    UnsupportedFormat(mt)
  case class Failure(ex: Throwable) extends ExtractResult {
    val textOption = None
    val pdfMeta    = None
  }
  def failure(ex: Throwable): ExtractResult =
    Failure(ex)
-  case class Success(text: String) extends ExtractResult {
+  case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
    val textOption = Some(text)
  }
-  def success(text: String): ExtractResult =
+  def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
-    Success(text)
+    Success(text, pdfMeta)
  /** Converts the outcome of a plain text extraction into an [[ExtractResult]].
    * A `Try[String]` carries no PDF metadata, so a successful result is built
    * with `pdfMeta = None`.
    */
  def fromTry(r: Try[String]): ExtractResult =
    // `Success` takes two arguments, so `Success.apply` cannot be passed as the
    // `String => ExtractResult` function that `fold` requires.
    r.fold(failure, text => success(text, None))
  def fromEither(e: Either[Throwable, String]): ExtractResult =
-    e.fold(failure, success)
+    e.fold(failure, str => success(str, None))
  /** Converts a PDF extraction outcome into an [[ExtractResult]], keeping the
    * metadata that came with the text.
    */
  def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
    e match {
      case Right(res) => success(res.txt.value, res.meta)
      case Left(err)  => failure(err)
    }
 }
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -40,8 +40,7 @@ object Extraction {
          case MimeType.PdfMatch(_) =>
            PdfExtract
              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
-              .map(_.map(_.value))
+              .map(ExtractResult.fromEitherResult)
              .map(ExtractResult.fromEither)
          case PoiType(mt) =>
            PoiExtract
@@ -103,7 +102,7 @@ object Extraction {
            val cs = mt.charsetOrUtf8
            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
              data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
-                ExtractResult.success(Text(txt).value)
+                ExtractResult.success(Text(txt).value, None)
              }
          case mt =>
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -7,9 +7,15 @@ import fs2.Stream
 import docspell.common.{Language, Logger}
 import docspell.extract.internal.Text
 import docspell.extract.ocr.{OcrConfig, TextExtract}
 import docspell.extract.pdfbox.PdfMetaData
 import docspell.extract.pdfbox.PdfboxExtract
 object PdfExtract {
  /** Text extracted from a PDF together with its metadata, if any. */
  final case class Result(txt: Text, meta: Option[PdfMetaData])
  object Result {

    /** Builds a result from a `(text, metadata)` pair. */
    def apply(t: (Text, Option[PdfMetaData])): Result = {
      val (text, metaData) = t
      Result(text, metaData)
    }
  }
  def get[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
@@ -18,39 +24,39 @@ object PdfExtract {
      stripMinLen: Int,
      ocrCfg: OcrConfig,
      logger: Logger[F]
-  ): F[Either[Throwable, Text]] = {
+  ): F[Either[Throwable, Result]] = {
    val runOcr =
      TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
-    def chooseResult(ocrStr: Text, strippedStr: Text) =
+    def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
-      if (ocrStr.length > strippedStr.length)
+      if (ocrStr.length > strippedRes._1.length)
        logger.info(
-          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
+          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
-        ) *> ocrStr.pure[F]
+        ) *> Result(ocrStr, strippedRes._2).pure[F]
      else
        logger.info(
-          s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
+          s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
-        ) *> strippedStr.pure[F]
+        ) *> Result(strippedRes).pure[F]
    //maybe better: inspect the pdf and decide whether ocr or not
    for {
      pdfboxRes <-
-        logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
+        logger.debug("Trying to strip text from pdf using pdfbox.") *>
-          .getText[F](in)
+          PdfboxExtract.getTextAndMetaData[F](in)
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
            s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
-          ) >> runOcr.attempt,
+          ) >> runOcr.map(txt => Result(txt, None)).attempt,
-        str =>
+        pair =>
-          if (str.length >= stripMinLen) str.pure[F].attempt
+          if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
          else
            logger
              .info(
-                s"Stripped text from PDF is small (${str.length}). Trying with OCR."
+                s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
              ) *>
-              runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
+              runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
      )
    } yield res
  }
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -9,17 +9,17 @@ import cats.effect.Sync
 import cats.implicits._
 import fs2.Stream
 import docspell.common.Timestamp
 import docspell.extract.internal.Text
 import org.apache.pdfbox.pdmodel.PDDocument
 import org.apache.pdfbox.text.PDFTextStripper
 import docspell.common.Timestamp
 object PdfboxExtract {
  def getTextAndMetaData[F[_]: Sync](
      data: Stream[F, Byte]
-  ): F[Either[Throwable, (Text, PdfMetaData)]] =
+  ): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
    data.compile
      .to(Array)
      .map(bytes =>
@@ -27,7 +27,7 @@ object PdfboxExtract {
          for {
            txt <- readText(doc)
            md  <- readMetaData(doc)
-          } yield (txt, md)
+          } yield (txt, Some(md).filter(_.nonEmpty))
        }.toEither.flatten
      )
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -107,7 +107,8 @@ object CreateItem {
        Vector.empty,
        Vector.empty,
        fm.map(a => a.id -> a.fileId).toMap,
-        MetaProposalList.empty
+        MetaProposalList.empty,
        Nil
      )
    }
@@ -148,7 +149,15 @@ object CreateItem {
            .map(originFileTuple)
            .toMap
      } yield cand.headOption.map(ri =>
-        ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
+        ItemData(
          ri,
          rms,
          Vector.empty,
          Vector.empty,
          origMap,
          MetaProposalList.empty,
          Nil
        )
      )
    }
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -22,7 +22,8 @@ case class ItemData(
    metas: Vector[RAttachmentMeta],
    dateLabels: Vector[AttachmentDates],
    originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
-    givenMeta: MetaProposalList    // given meta data not associated to a specific attachment
+    givenMeta: MetaProposalList,   // given meta data not associated to a specific attachment
    tags: List[String]             // a list of tags (names or ids) attached to the item if they exist
 ) {
  def findMeta(attachId: Ident): Option[RAttachmentMeta] =
--- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
@@ -17,6 +17,12 @@ object SetGivenData {
        .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
        .map(_ => data)
    else
      setFolder(data, ops).flatMap(d => setTags[F](d, ops))
  private def setFolder[F[_]: Sync](
      data: ItemData,
      ops: OItem[F]
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      val itemId     = data.item.id
      val folderId   = ctx.args.meta.folderId
@@ -32,4 +38,20 @@ object SetGivenData {
      } yield data
    }
  /** Links the tags collected in `data` to the item. This is best effort: an
    * error raised while linking is logged as a warning and the item data is
    * returned unchanged.
    */
  private def setTags[F[_]: Sync](
      data: ItemData,
      ops: OItem[F]
  ): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      ctx.logger
        .info(s"Set tags from given data: ${data.tags}")
        .flatMap(_ =>
          ops.linkTags(data.item.id, data.tags, ctx.args.meta.collective).attempt
        )
        .flatMap {
          case Left(ex) => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}")
          case Right(_) => ().pure[F]
        }
        .map(_ => data)
    }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -32,7 +32,8 @@ object TextExtraction {
          )
        )
        _ <- ctx.logger.debug("Storing extracted texts")
-        _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
+        _ <-
          txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
        idxItem = TextData.item(
          item.item.id,
          ctx.args.meta.collective,
@@ -40,22 +41,26 @@ object TextExtraction {
          item.item.name.some,
          None
        )
-        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
+        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
        dur <- start
        _   <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
-      } yield item.copy(metas = txt.map(_._1))
+      } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
    }
  // --  helpers
  /** Outcome of the text extraction for one attachment.
    *
    * @param am   the attachment meta record that gets stored
    * @param td   the text data handed to the full-text index
    * @param tags tag names or IDs collected for the attachment (empty by default)
    */
  case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
  def extractTextIfEmpty[F[_]: Sync: ContextShift](
      ctx: Context[F, ProcessItemArgs],
      cfg: ExtractConfig,
      lang: Language,
      collective: Ident,
      item: ItemData
-  )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
+  )(ra: RAttachment): F[Result] = {
-    def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
+    def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
-      (
+      Result(
-        rm,
+        pair._1,
        TextData.attachment(
          item.item.id,
          ra.id,
@@ -63,15 +68,16 @@ object TextExtraction {
          ctx.args.meta.folderId,
          lang,
          ra.name,
-          rm.content
+          pair._1.content
-        )
+        ),
        pair._2
      )
    val rm = item.findOrCreate(ra.id)
    rm.content match {
      case Some(_) =>
        ctx.logger.info("TextExtraction skipped, since text is already available.") *>
-          makeTextData(rm).pure[F]
+          makeTextData((rm, Nil)).pure[F]
      case None =>
        extractTextToMeta[F](ctx, cfg, lang, item)(ra)
          .map(makeTextData)
@@ -83,21 +89,22 @@ object TextExtraction {
      cfg: ExtractConfig,
      lang: Language,
      item: ItemData
-  )(ra: RAttachment): F[RAttachmentMeta] =
+  )(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
    for {
      _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
      dst  <- Duration.stopTime[F]
      fids <- filesToExtract(ctx)(item, ra)
-      txt  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
+      res  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
      meta = item.changeMeta(
        ra.id,
-        rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
+        rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
      )
      tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
      est <- dst
      _ <- ctx.logger.info(
        s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
      )
-    } yield meta
+    } yield (meta, tags)
  def extractText[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
@@ -123,7 +130,7 @@ object TextExtraction {
      cfg: ExtractConfig,
      ra: RAttachment,
      lang: Language
-  )(fileIds: List[Ident]): F[Option[String]] =
+  )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
    fileIds match {
      case Nil =>
        ctx.logger.error(s"Cannot extract text").map(_ => None)
@@ -133,8 +140,8 @@ object TextExtraction {
        extractText[F](ctx, extr, lang)(id)
          .flatMap({
-            case ExtractResult.Success(txt) =>
+            case res @ ExtractResult.Success(_, _) =>
-              txt.some.pure[F]
+              res.some.pure[F]
            case ExtractResult.UnsupportedFormat(mt) =>
              ctx.logger
--- a/modules/store/src/main/scala/docspell/store/impl/Column.scala
+++ b/modules/store/src/main/scala/docspell/store/impl/Column.scala
@@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
  /** Builds an `IN` condition by turning each value into an interpolated fragment. */
  def isIn[A: Put](values: NonEmptyList[A]): Fragment = {
    val params = values.toList.map(v => sql"$v")
    isIn(params)
  }
  /** Builds an `IN` condition that compares the lower-cased column against the
    * given values. The values are used as given, so callers are expected to
    * pass them already lower-cased.
    */
  def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
    val params = commas(values.toList.map(v => sql"$v"))
    fr"lower(" ++ f ++ fr") IN (" ++ params ++ fr")"
  }
  /** Builds an `IN` condition whose parenthesised content is the given fragment. */
  def isIn(frag: Fragment): Fragment = {
    val open  = fr"IN ("
    val close = fr")"
    f ++ open ++ frag ++ close
  }
--- a/modules/store/src/main/scala/docspell/store/records/RItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala
@@ -314,6 +314,9 @@ object RItem {
  /** Loads the item with the given id if it belongs to the given collective. */
  def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] = {
    val ownedItem = and(id.is(itemId), cid.is(coll))
    selectSimple(all, table, ownedItem).query[RItem].option
  }
  /** Checks that the item belongs to the given collective by selecting only
    * its id; yields `None` if there is no such item.
    */
  def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
    val ownedItem = and(id.is(itemId), cid.is(coll))
    selectSimple(Seq(id), table, ownedItem).query[Ident].option
  }
  def removeFolder(folderId: Ident): ConnectionIO[Int] = {
    val empty: Option[Ident] = None
    updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
--- a/modules/store/src/main/scala/docspell/store/records/RTag.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala
@@ -1,5 +1,8 @@
 package docspell.store.records
 import cats.data.NonEmptyList
 import cats.implicits._
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
@@ -101,6 +104,21 @@ object RTag {
    ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
  }
  /** Finds all tags of the collective whose id or lower-cased name matches one
    * of the given strings. Every string is tried as a name; those that parse
    * as an `Ident` are tried as an id as well. An empty input yields an empty
    * result without running a query.
    */
  def findAllByNameOrId(
      nameOrIds: List[String],
      coll: Ident
  ): ConnectionIO[Vector[RTag]] = {
    val byId = NonEmptyList
      .fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption))
      .map(ids => tid.isIn(ids))
    val byName = NonEmptyList
      .fromList(nameOrIds.map(_.toLowerCase))
      .map(names => name.isLowerIn(names))

    (byId.toList ++ byName.toList) match {
      case Nil =>
        Vector.empty[RTag].pure[ConnectionIO]
      case conds =>
        selectSimple(all, table, and(cid.is(coll), or(conds))).query[RTag].to[Vector]
    }
  }
  /** Deletes the tag with the given id within the given collective and yields
    * the result of the update.
    */
  def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = {
    val ownedTag = and(tid.is(tagId), cid.is(coll))
    deleteFrom(table, ownedTag).update.run
  }
 }
--- a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
@@ -1,5 +1,6 @@
 package docspell.store.records
 import cats.data.NonEmptyList
 import cats.implicits._
 import docspell.common._
@@ -43,4 +44,28 @@ object RTagItem {
  /** Loads all tag links of the given item. */
  def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] = {
    val ofItem = itemId.is(item)
    selectSimple(all, table, ofItem).query[RTagItem].to[Vector]
  }
  /** Loads the links between the item and any of the given tags. No query is
    * run when `tags` is empty.
    */
  def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
    NonEmptyList
      .fromList(tags.toList)
      .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { ids =>
        selectSimple(all, table, and(itemId.is(item), tagId.isIn(ids)))
          .query[RTagItem]
          .to[Vector]
      }
  /** Inserts one link row per given tag for the item, each with a fresh random
    * id, and yields the result of the insert. Existing links are not touched;
    * an empty tag list yields 0 without running a statement.
    */
  def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
    tags.toList match {
      case Nil =>
        0.pure[ConnectionIO]
      case wanted =>
        wanted
          .traverse(tag =>
            Ident.randomId[ConnectionIO].map(linkId => RTagItem(linkId, item, tag))
          )
          .map(_.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}"))
          .flatMap(rows => insertRows(table, all, rows).update.run)
    }
 }