Merge pull request #183 from eikek/pdf-metadata

Pdf metadata
2025-10-30 05:10:11 +00:00 · 2020-07-18 23:19:21 +00:00
parent fe27c0656f cec4948710
commit 185a103942
17 changed files with 315 additions and 69 deletions
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
@@ -6,6 +6,7 @@ import cats.implicits._

 import docspell.common._
 import docspell.ftsclient.FtsClient
+import docspell.store.UpdateResult
 import docspell.store.queries.{QAttachment, QItem}
 import docspell.store.records._
 import docspell.store.{AddResult, Store}
@@ -22,6 +23,9 @@ trait OItem[F[_]] {
  /** Create a new tag and add it to the item. */
  def addNewTag(item: Ident, tag: RTag): F[AddResult]

+  /** Apply all tags to the given item. Tags must exist, but can be IDs or names. */
+  def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
+
  def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]

  def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
@@ -90,6 +94,27 @@ object OItem {
            .attempt
            .map(AddResult.fromUpdate)

+        /** Links existing tags to an item, leaving already linked tags untouched.
+          *
+          * Duplicate entries in `tags` are removed first; if nothing remains,
+          * `UpdateResult.success` is returned without running any query. Otherwise
+          * the item is looked up by id and collective, the entries are resolved to
+          * tags via `RTag.findAllByNameOrId`, and links are inserted only for those
+          * tags that `RTagItem.findAllIn` does not report as linked. Entries that
+          * resolve to no tag are ignored. All steps form one program that is passed
+          * to `store.transact`. Yields `UpdateResult.notFound` if the item lookup
+          * returns nothing.
+          */
+        def linkTags(
+            item: Ident,
+            tags: List[String],
+            collective: Ident
+        ): F[UpdateResult] = {
+          val wanted = tags.distinct
+          if (wanted.isEmpty) UpdateResult.success.pure[F]
+          else {
+            val itemCheck = OptionT(RItem.checkByIdAndCollective(item, collective))
+            val linking = itemCheck.semiflatMap { _ =>
+              for {
+                found <- RTag.findAllByNameOrId(wanted, collective)
+                tagIds = found.map(_.tagId)
+                present <- RTagItem.findAllIn(item, tagIds)
+                // insert only the links that are missing
+                _ <- RTagItem.setAllTags(item, tagIds.diff(present.map(_.tagId)))
+              } yield UpdateResult.success
+            }
+
+            store.transact(linking.getOrElse(UpdateResult.notFound))
+          }
+        }
+
        def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
          val db = for {
            cid <- RItem.getCollective(item)
--- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
@@ -1,39 +1,47 @@
 package docspell.extract

-import scala.util.Try
-
 import docspell.common.MimeType
+import docspell.extract.pdfbox.PdfMetaData

 sealed trait ExtractResult {

  def textOption: Option[String]

+  def pdfMeta: Option[PdfMetaData]
 }

 object ExtractResult {

  case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
    val textOption = None
+    val pdfMeta    = None
  }
  def unsupportedFormat(mt: MimeType): ExtractResult =
    UnsupportedFormat(mt)

  case class Failure(ex: Throwable) extends ExtractResult {
    val textOption = None
+    val pdfMeta    = None
  }
  def failure(ex: Throwable): ExtractResult =
    Failure(ex)

-  case class Success(text: String) extends ExtractResult {
+  case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
    val textOption = Some(text)
+    def appendPdfMetaToText: Success =
+      pdfMeta.flatMap(_.asText) match {
+        case Some(m) =>
+          copy(text = text + "\n\n" + m)
+        case None => this
+      }
  }
-  def success(text: String): ExtractResult =
-    Success(text)
-
-  def fromTry(r: Try[String]): ExtractResult =
-    r.fold(Failure.apply, Success.apply)
+  def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
+    Success(text, pdfMeta)

  def fromEither(e: Either[Throwable, String]): ExtractResult =
-    e.fold(failure, success)
+    e.fold(failure, str => success(str, None))
+
+  def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
+    e.fold(failure, r => success(r.txt.value, r.meta))

 }
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -40,8 +40,7 @@ object Extraction {
          case MimeType.PdfMatch(_) =>
            PdfExtract
              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
-              .map(_.map(_.value))
-              .map(ExtractResult.fromEither)
+              .map(ExtractResult.fromEitherResult)

          case PoiType(mt) =>
            PoiExtract
@@ -103,7 +102,7 @@ object Extraction {
            val cs = mt.charsetOrUtf8
            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
              data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
-                ExtractResult.success(Text(txt).value)
+                ExtractResult.success(Text(txt).value, None)
              }

          case mt =>
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -7,9 +7,15 @@ import fs2.Stream
 import docspell.common.{Language, Logger}
 import docspell.extract.internal.Text
 import docspell.extract.ocr.{OcrConfig, TextExtract}
+import docspell.extract.pdfbox.PdfMetaData
 import docspell.extract.pdfbox.PdfboxExtract

 object PdfExtract {
+  /** Outcome of a PDF extraction: the text together with optional PDF metadata. */
+  final case class Result(txt: Text, meta: Option[PdfMetaData])
+  object Result {
+
+    /** Creates a `Result` from a `(text, metadata)` pair. */
+    def apply(t: (Text, Option[PdfMetaData])): Result = {
+      val (text, metaData) = t
+      Result(text, metaData)
+    }
+  }

  def get[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
@@ -18,39 +24,39 @@ object PdfExtract {
      stripMinLen: Int,
      ocrCfg: OcrConfig,
      logger: Logger[F]
-  ): F[Either[Throwable, Text]] = {
+  ): F[Either[Throwable, Result]] = {

    val runOcr =
      TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError

-    def chooseResult(ocrStr: Text, strippedStr: Text) =
-      if (ocrStr.length > strippedStr.length)
+    def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
+      if (ocrStr.length > strippedRes._1.length)
        logger.info(
-          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
-        ) *> ocrStr.pure[F]
+          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
+        ) *> Result(ocrStr, strippedRes._2).pure[F]
      else
        logger.info(
-          s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
-        ) *> strippedStr.pure[F]
+          s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
+        ) *> Result(strippedRes).pure[F]

    //maybe better: inspect the pdf and decide whether ocr or not
    for {
      pdfboxRes <-
-        logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
-          .get[F](in)
+        logger.debug("Trying to strip text from pdf using pdfbox.") *>
+          PdfboxExtract.getTextAndMetaData[F](in)
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
            s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
-          ) >> runOcr.attempt,
-        str =>
-          if (str.length >= stripMinLen) str.pure[F].attempt
+          ) >> runOcr.map(txt => Result(txt, None)).attempt,
+        pair =>
+          if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
          else
            logger
              .info(
-                s"Stripped text from PDF is small (${str.length}). Trying with OCR."
+                s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
              ) *>
-              runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
+              runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
      )
    } yield res
  }
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala
@@ -0,0 +1,40 @@
+package docspell.extract.pdfbox
+
+import docspell.common.Timestamp
+
+/** Metadata of a PDF document. Every field is optional. */
+final case class PdfMetaData(
+    title: Option[String],
+    author: Option[String],
+    subject: Option[String],
+    keywords: Option[String],
+    creator: Option[String],
+    creationDate: Option[Timestamp]
+) {
+
+  /** True if none of the fields is set. */
+  def isEmpty: Boolean =
+    title.isEmpty &&
+      author.isEmpty &&
+      subject.isEmpty &&
+      keywords.isEmpty &&
+      creator.isEmpty &&
+      creationDate.isEmpty
+
+  /** True if at least one field is set. */
+  def nonEmpty: Boolean =
+    !isEmpty
+
+  /** Splits the keywords string on commas and semicolons.
+    *
+    * Every entry is trimmed and blank entries are dropped, so input like
+    * `"a , b;; c"` or a leading separator never produces keywords with
+    * surrounding whitespace or empty keywords. Returns `Nil` if no keywords
+    * are set.
+    */
+  def keywordList: List[String] =
+    keywords.toList.flatMap(_.split("[,;]").toList.map(_.trim).filter(_.nonEmpty))
+
+  /** Return all data in lines, except keywords. Keywords are handled separately.
+    *
+    * The lines are title, author, subject and the creation date (rendered via
+    * `toUtcDate`), in that order, skipping absent values. The creator is not
+    * included. Returns `None` if none of these values is present.
+    */
+  def asText: Option[String] =
+    (title.toList ++ author.toList ++ subject.toList ++ creationDate.toList.map(
+      _.toUtcDate.toString
+    )) match {
+      case Nil  => None
+      case list => Some(list.mkString("\n"))
+    }
+}
+
+object PdfMetaData {
+
+  /** Metadata with no field set. */
+  val empty: PdfMetaData = PdfMetaData(None, None, None, None, None, None)
+}
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -9,6 +9,7 @@ import cats.effect.Sync
 import cats.implicits._
 import fs2.Stream

+import docspell.common.Timestamp
 import docspell.extract.internal.Text

 import org.apache.pdfbox.pdmodel.PDDocument
@@ -16,15 +17,29 @@ import org.apache.pdfbox.text.PDFTextStripper

 object PdfboxExtract {

-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
+  /** Reads text and metadata of a PDF, loading the document only once.
+    *
+    * The stream is collected into a byte array that is handed to PDFBox. The
+    * metadata part is `None` when none of its fields is set. Errors raised while
+    * loading or reading the document are returned as `Left`.
+    */
+  def getTextAndMetaData[F[_]: Sync](
+      data: Stream[F, Byte]
+  ): F[Either[Throwable, (Text, Option[PdfMetaData])]] = {
+    def readBoth(doc: PDDocument): Either[Throwable, (Text, Option[PdfMetaData])] =
+      readText(doc).flatMap { text =>
+        readMetaData(doc).map(meta => (text, Option.when(meta.nonEmpty)(meta)))
+      }
+
+    data.compile
+      .to(Array)
+      .map(bytes => Using(PDDocument.load(bytes))(readBoth).toEither.flatten)
+  }
+
+  def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile
      .to(Array)
      .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)

-  def get(is: InputStream): Either[Throwable, Text] =
+  def getText(is: InputStream): Either[Throwable, Text] =
    Using(PDDocument.load(is))(readText).toEither.flatten

-  def get(inFile: Path): Either[Throwable, Text] =
+  def getText(inFile: Path): Either[Throwable, Text] =
    Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten

  private def readText(doc: PDDocument): Either[Throwable, Text] =
@@ -34,4 +49,31 @@ object PdfboxExtract {
      stripper.setLineSeparator("\n")
      Text(Option(stripper.getText(doc)))
    }.toEither
+
+  /** Collects the stream into a byte array, loads it as a PDF and reads its
+    * metadata. Errors from loading or reading are returned as `Left`.
+    */
+  def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
+    data.compile.to(Array).map { bytes =>
+      Using(PDDocument.load(bytes))(doc => readMetaData(doc)).toEither.flatten
+    }
+
+  /** Loads a PDF from the given input stream and reads its metadata. Errors from
+    * loading or reading are returned as `Left`.
+    */
+  def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
+    Using(PDDocument.load(is))(doc => readMetaData(doc)).toEither.flatten
+
+  /** Loads a PDF from the given file and reads its metadata. Errors from loading
+    * or reading are returned as `Left`.
+    */
+  def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
+    Using(PDDocument.load(inFile.toFile))(doc => readMetaData(doc)).toEither.flatten
+
+  /** Reads the document information of an already loaded PDF.
+    *
+    * String values are trimmed; `null` and blank values become `None`. Any
+    * exception thrown while reading is returned as `Left`.
+    */
+  private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
+    def nonBlank(raw: String): Option[String] =
+      Option(raw).map(_.trim).filter(_.nonEmpty)
+
+    Try {
+      val info = doc.getDocumentInformation
+      PdfMetaData(
+        title = nonBlank(info.getTitle),
+        author = nonBlank(info.getAuthor),
+        subject = nonBlank(info.getSubject),
+        keywords = nonBlank(info.getKeywords),
+        creator = nonBlank(info.getCreator),
+        creationDate = Option(info.getCreationDate).map(cal => Timestamp(cal.toInstant))
+      )
+    }.toEither
+  }
 }
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala
@@ -0,0 +1,22 @@
+package docspell.extract.pdfbox
+
+import minitest.SimpleTestSuite
+
+/** Checks how `PdfMetaData.keywordList` splits the raw keywords string. */
+object PdfMetaDataTest extends SimpleTestSuite {
+
+  // Splits the given raw keywords string via otherwise empty metadata.
+  private def splitKeywords(raw: String): List[String] =
+    PdfMetaData.empty.copy(keywords = Some(raw)).keywordList
+
+  private val expected = List("a", "b", "c")
+
+  test("split keywords on comma") {
+    assertEquals(splitKeywords("a,b, c"), expected)
+  }
+
+  test("split keywords on semicolon") {
+    assertEquals(splitKeywords("a; b;c"), expected)
+  }
+
+  test("split keywords on comma and semicolon") {
+    assertEquals(splitKeywords("a, b; c"), expected)
+  }
+
+}
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
@@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
    textPDFs.foreach {
      case (file, txt) =>
        val url      = file.toJavaUrl.fold(sys.error, identity)
-        val str      = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
+        val str      = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
        val received = removeFormatting(str.value)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
@@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
    textPDFs.foreach {
      case (file, txt) =>
        val data     = file.readURL[IO](8192, blocker)
-        val str      = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
+        val str      = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
        val received = removeFormatting(str.value)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
@@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite {
  test("extract text from image PDFs") {
    val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)

-    val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
+    val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)

    assertEquals(str.value, "")
  }

+  // Reads text and metadata of the keywords example file and compares each field.
+  test("extract metadata from pdf") {
+    val pdf = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
+
+    val text = PdfboxExtract.getText(pdf.openStream()).fold(throw _, identity)
+    assert(text.value.startsWith("Keywords in PDF"))
+
+    val meta = PdfboxExtract.getMetaData(pdf.openStream()).fold(throw _, identity)
+    assertEquals(meta.author, Some("E.K."))
+    assertEquals(meta.title, Some("Keywords in PDF"))
+    assertEquals(meta.subject, Some("This is a subject"))
+    assertEquals(meta.keywordList, List("Test", "Keywords in PDF", "Todo"))
+    assertEquals(meta.creator, Some("Emacs 26.3 (Org mode 9.3)"))
+    assert(meta.creationDate.isDefined)
+  }
+
  private def removeFormatting(str: String): String =
    str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
 }
--- a/modules/files/src/test/resources/keywords.pdf
+++ b/modules/files/src/test/resources/keywords.pdf
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -107,7 +107,8 @@ object CreateItem {
        Vector.empty,
        Vector.empty,
        fm.map(a => a.id -> a.fileId).toMap,
-        MetaProposalList.empty
+        MetaProposalList.empty,
+        Nil
      )
    }

@@ -148,7 +149,15 @@ object CreateItem {
            .map(originFileTuple)
            .toMap
      } yield cand.headOption.map(ri =>
-        ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
+        ItemData(
+          ri,
+          rms,
+          Vector.empty,
+          Vector.empty,
+          origMap,
+          MetaProposalList.empty,
+          Nil
+        )
      )
    }

--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -22,7 +22,8 @@ case class ItemData(
    metas: Vector[RAttachmentMeta],
    dateLabels: Vector[AttachmentDates],
    originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
-    givenMeta: MetaProposalList    // given meta data not associated to a specific attachment
+    givenMeta: MetaProposalList,   // given meta data not associated to a specific attachment
+    tags: List[String]             // a list of tags (names or ids) attached to the item if they exist
 ) {

  def findMeta(attachId: Ident): Option[RAttachmentMeta] =
--- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
@@ -17,19 +17,41 @@ object SetGivenData {
        .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
        .map(_ => data)
    else
-      Task { ctx =>
-        val itemId     = data.item.id
-        val folderId   = ctx.args.meta.folderId
-        val collective = ctx.args.meta.collective
-        for {
-          _ <- ctx.logger.info("Starting setting given data")
-          _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
-          e <- ops.setFolder(itemId, folderId, collective).attempt
-          _ <- e.fold(
-            ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
-            _ => ().pure[F]
-          )
-        } yield data
-      }
+      setFolder(data, ops).flatMap(d => setTags[F](d, ops))

+  /** Sets the folder from the job arguments on the item.
+    *
+    * An error raised by `ops.setFolder` is logged as a warning and otherwise
+    * ignored. The given `data` is always returned unchanged.
+    */
+  private def setFolder[F[_]: Sync](
+      data: ItemData,
+      ops: OItem[F]
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      val log        = ctx.logger
+      val itemId     = data.item.id
+      val folderId   = ctx.args.meta.folderId
+      val collective = ctx.args.meta.collective
+
+      log.info("Starting setting given data") >>
+        log.debug(s"Set item folder: '${folderId.map(_.id)}'") >>
+        ops
+          .setFolder(itemId, folderId, collective)
+          .attempt
+          .flatMap {
+            case Left(ex) => log.warn(s"Error setting folder: ${ex.getMessage}")
+            case Right(_) => ().pure[F]
+          }
+          .map(_ => data)
+    }
+
+  /** Links the tags collected in `data.tags` to the item.
+    *
+    * An error raised by `ops.linkTags` is logged as a warning and otherwise
+    * ignored; the returned `UpdateResult` is not inspected. The given `data` is
+    * always returned unchanged.
+    */
+  private def setTags[F[_]: Sync](
+      data: ItemData,
+      ops: OItem[F]
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      val log        = ctx.logger
+      val itemId     = data.item.id
+      val collective = ctx.args.meta.collective
+
+      log.info(s"Set tags from given data: ${data.tags}") >>
+        ops
+          .linkTags(itemId, data.tags, collective)
+          .attempt
+          .flatMap {
+            case Left(ex) => log.warn(s"Error setting tags: ${ex.getMessage}")
+            case Right(_) => ().pure[F]
+          }
+          .map(_ => data)
+    }
 }
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -32,46 +32,52 @@ object TextExtraction {
          )
        )
        _ <- ctx.logger.debug("Storing extracted texts")
-        _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
+        _ <-
+          txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
        idxItem = TextData.item(
          item.item.id,
          ctx.args.meta.collective,
-          None, //folder
+          ctx.args.meta.folderId,
          item.item.name.some,
          None
        )
-        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
+        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
        dur <- start
        _   <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
-      } yield item.copy(metas = txt.map(_._1))
+      } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
    }

+  // --  helpers
+
+  case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
+
  def extractTextIfEmpty[F[_]: Sync: ContextShift](
-      ctx: Context[F, _],
+      ctx: Context[F, ProcessItemArgs],
      cfg: ExtractConfig,
      lang: Language,
      collective: Ident,
      item: ItemData
-  )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
-    def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
-      (
-        rm,
+  )(ra: RAttachment): F[Result] = {
+    def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
+      Result(
+        pair._1,
        TextData.attachment(
          item.item.id,
          ra.id,
          collective,
-          None, //folder
+          ctx.args.meta.folderId,
          lang,
          ra.name,
-          rm.content
-        )
+          pair._1.content
+        ),
+        pair._2
      )

    val rm = item.findOrCreate(ra.id)
    rm.content match {
      case Some(_) =>
        ctx.logger.info("TextExtraction skipped, since text is already available.") *>
-          makeTextData(rm).pure[F]
+          makeTextData((rm, Nil)).pure[F]
      case None =>
        extractTextToMeta[F](ctx, cfg, lang, item)(ra)
          .map(makeTextData)
@@ -83,21 +89,25 @@ object TextExtraction {
      cfg: ExtractConfig,
      lang: Language,
      item: ItemData
-  )(ra: RAttachment): F[RAttachmentMeta] =
+  )(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
    for {
      _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
      dst  <- Duration.stopTime[F]
      fids <- filesToExtract(ctx)(item, ra)
-      txt  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
+      res  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
      meta = item.changeMeta(
        ra.id,
-        rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
+        rm =>
+          rm.setContentIfEmpty(
+            res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
+          )
      )
+      tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
      est <- dst
      _ <- ctx.logger.info(
        s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
      )
-    } yield meta
+    } yield (meta, tags)

  def extractText[F[_]: Sync: ContextShift](
      ctx: Context[F, _],
@@ -123,7 +133,7 @@ object TextExtraction {
      cfg: ExtractConfig,
      ra: RAttachment,
      lang: Language
-  )(fileIds: List[Ident]): F[Option[String]] =
+  )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
    fileIds match {
      case Nil =>
        ctx.logger.error(s"Cannot extract text").map(_ => None)
@@ -133,8 +143,8 @@ object TextExtraction {

        extractText[F](ctx, extr, lang)(id)
          .flatMap({
-            case ExtractResult.Success(txt) =>
-              txt.some.pure[F]
+            case res @ ExtractResult.Success(_, _) =>
+              res.some.pure[F]

            case ExtractResult.UnsupportedFormat(mt) =>
              ctx.logger
--- a/modules/store/src/main/scala/docspell/store/impl/Column.scala
+++ b/modules/store/src/main/scala/docspell/store/impl/Column.scala
@@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
  def isIn[A: Put](values: NonEmptyList[A]): Fragment =
    isIn(values.map(a => sql"$a").toList)

+  /** Builds a condition of the form `lower(<f>) IN (<values>)`.
+    *
+    * Only the left-hand side is wrapped in `lower`; the values are passed as
+    * given, so callers have to lower-case them themselves.
+    * NOTE(review): assumes `f` is this column's SQL fragment and `commas` joins
+    * fragments with commas — confirm against their definitions.
+    */
+  def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
+    val params = values.toList.map(a => sql"$a")
+    fr"lower(" ++ f ++ fr") IN (" ++ commas(params) ++ fr")"
+  }
+
  def isIn(frag: Fragment): Fragment =
    f ++ fr"IN (" ++ frag ++ fr")"

--- a/modules/store/src/main/scala/docspell/store/records/RItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala
@@ -314,6 +314,9 @@ object RItem {
  def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
    selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option

+  /** Selects only the id column of the item matching both `itemId` and `coll`.
+    * Yields `None` if no such row exists.
+    */
+  def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
+    val condition = and(id.is(itemId), cid.is(coll))
+    selectSimple(Seq(id), table, condition).query[Ident].option
+  }
+
  def removeFolder(folderId: Ident): ConnectionIO[Int] = {
    val empty: Option[Ident] = None
    updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
--- a/modules/store/src/main/scala/docspell/store/records/RTag.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala
@@ -1,5 +1,8 @@
 package docspell.store.records

+import cats.data.NonEmptyList
+import cats.implicits._
+
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
@@ -101,6 +104,21 @@ object RTag {
    ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
  }

+  /** Finds all tags of the given collective that match one of `nameOrIds`.
+    *
+    * An entry matches a tag if it is a valid `Ident` equal to the tag id, or if
+    * its lower-cased form equals the lower-cased tag name. Entries are
+    * lower-cased with `Locale.ROOT` so the result does not depend on the JVM
+    * default locale (e.g. Turkish dotless i) when compared against SQL
+    * `lower(name)`. Returns an empty vector without querying if `nameOrIds` is
+    * empty.
+    */
+  def findAllByNameOrId(
+      nameOrIds: List[String],
+      coll: Ident
+  ): ConnectionIO[Vector[RTag]] = {
+    val byId =
+      NonEmptyList
+        .fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption))
+        .map(ids => tid.isIn(ids))
+    val byName =
+      NonEmptyList
+        .fromList(nameOrIds.map(_.toLowerCase(java.util.Locale.ROOT)))
+        .map(names => name.isLowerIn(names))
+
+    val cond = byId.toSeq ++ byName.toSeq
+
+    if (cond.isEmpty) Vector.empty.pure[ConnectionIO]
+    else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector]
+  }
+
  def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
    deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
 }
--- a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
@@ -1,5 +1,6 @@
 package docspell.store.records

+import cats.data.NonEmptyList
 import cats.implicits._

 import docspell.common._
@@ -43,4 +44,28 @@ object RTagItem {

  def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
    selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
+
+  /** Finds the existing links between `item` and any of the given tag ids.
+    * Returns an empty vector without querying if `tags` is empty.
+    */
+  def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
+    NonEmptyList
+      .fromList(tags.toList)
+      .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { ids =>
+        selectSimple(all, table, and(itemId.is(item), tagId.isIn(ids)))
+          .query[RTagItem]
+          .to[Vector]
+      }
+
+  /** Inserts one link row per given tag id for `item`, each with a fresh random
+    * id, and yields the result of the insert statement. Existing links are not
+    * checked here. Yields 0 without touching the database if `tags` is empty.
+    */
+  def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
+    tags.toList match {
+      case Nil =>
+        0.pure[ConnectionIO]
+      case ids =>
+        val newLinks = ids.traverse(tag =>
+          Ident.randomId[ConnectionIO].map(linkId => RTagItem(linkId, item, tag))
+        )
+        newLinks.flatMap { links =>
+          insertRows(
+            table,
+            all,
+            links.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}")
+          ).update.run
+        }
+    }
 }