From 209c068436a14c1fc68f9f697770001fb99be172 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Sun, 19 Jul 2020 00:28:04 +0200
Subject: [PATCH] Use keywords in pdfs to search for existing tags

During processing, keywords stored in the PDF metadata are looked up
in the tag database, and any tags that already exist are associated
with the item.

See #175
---
 .../scala/docspell/backend/ops/OItem.scala    | 25 ++++++++++
 .../docspell/extract/ExtractResult.scala      | 20 ++++----
 .../scala/docspell/extract/Extraction.scala   |  5 +-
 .../scala/docspell/extract/PdfExtract.scala   | 34 +++++++------
 .../extract/pdfbox/PdfboxExtract.scala        |  6 +--
 .../extract/pdfbox/PdfboxExtractTest.scala    |  2 +-
 .../docspell/joex/process/CreateItem.scala    | 13 ++++-
 .../docspell/joex/process/ItemData.scala      |  3 +-
 .../docspell/joex/process/SetGivenData.scala  | 50 +++++++++++++------
 .../joex/process/TextExtraction.scala         | 41 ++++++++-------
 .../scala/docspell/store/impl/Column.scala    |  3 ++
 .../scala/docspell/store/records/RItem.scala  |  3 ++
 .../scala/docspell/store/records/RTag.scala   | 18 +++++++
 .../docspell/store/records/RTagItem.scala     | 25 ++++++++++
 14 files changed, 184 insertions(+), 64 deletions(-)

diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
index d17b453b..133991ae 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala
@@ -6,6 +6,7 @@ import cats.implicits._
 
 import docspell.common._
 import docspell.ftsclient.FtsClient
+import docspell.store.UpdateResult
 import docspell.store.queries.{QAttachment, QItem}
 import docspell.store.records._
 import docspell.store.{AddResult, Store}
@@ -22,6 +23,9 @@ trait OItem[F[_]] {
   /** Create a new tag and add it to the item. */
   def addNewTag(item: Ident, tag: RTag): F[AddResult]
 
+  /** Link existing tags, given by ID or name, to the item; unknown tags are ignored. */
+  def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
+
   def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
 
   def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
@@ -90,6 +94,27 @@ object OItem {
             .attempt
             .map(AddResult.fromUpdate)
 
+        def linkTags(
+            item: Ident,
+            tags: List[String],
+            collective: Ident
+        ): F[UpdateResult] =
+          tags.distinct match {
+            case Nil => UpdateResult.success.pure[F]
+            case kws =>
+              val db =
+                (for {
+                  _     <- OptionT(RItem.checkByIdAndCollective(item, collective))
+                  given <- OptionT.liftF(RTag.findAllByNameOrId(kws, collective))
+                  exist <- OptionT.liftF(RTagItem.findAllIn(item, given.map(_.tagId)))
+                  _ <- OptionT.liftF(
+                    RTagItem.setAllTags(item, given.map(_.tagId).diff(exist.map(_.tagId)))
+                  )
+                } yield UpdateResult.success).getOrElse(UpdateResult.notFound)
+
+              store.transact(db)
+          }
+
         def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
           val db = for {
             cid <- RItem.getCollective(item)
diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
index 3a0f3a1b..d48b63c8 100644
--- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
@@ -1,39 +1,41 @@
 package docspell.extract
 
-import scala.util.Try
-
 import docspell.common.MimeType
+import docspell.extract.pdfbox.PdfMetaData
 
 sealed trait ExtractResult {
 
   def textOption: Option[String]
 
+  def pdfMeta: Option[PdfMetaData]
 }
 
 object ExtractResult {
 
   case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
     val textOption = None
+    val pdfMeta    = None
   }
   def unsupportedFormat(mt: MimeType): ExtractResult =
     UnsupportedFormat(mt)
 
   case class Failure(ex: Throwable) extends ExtractResult {
     val textOption = None
+    val pdfMeta    = None
   }
   def failure(ex: Throwable): ExtractResult =
     Failure(ex)
 
-  case class Success(text: String) extends ExtractResult {
+  case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
     val textOption = Some(text)
   }
-  def success(text: String): ExtractResult =
-    Success(text)
-
-  def fromTry(r: Try[String]): ExtractResult =
-    r.fold(Failure.apply, Success.apply)
+  def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
+    Success(text, pdfMeta)
 
   def fromEither(e: Either[Throwable, String]): ExtractResult =
-    e.fold(failure, success)
+    e.fold(failure, str => success(str, None))
+
+  def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
+    e.fold(failure, r => success(r.txt.value, r.meta))
 
 }
diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
index cc333b71..2507c119 100644
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -40,8 +40,7 @@ object Extraction {
           case MimeType.PdfMatch(_) =>
             PdfExtract
               .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
-              .map(_.map(_.value))
-              .map(ExtractResult.fromEither)
+              .map(ExtractResult.fromEitherResult)
 
           case PoiType(mt) =>
             PoiExtract
@@ -103,7 +102,7 @@ object Extraction {
             val cs = mt.charsetOrUtf8
             logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
               data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
-                ExtractResult.success(Text(txt).value)
+                ExtractResult.success(Text(txt).value, None)
               }
 
           case mt =>
diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
index 839b0261..4189c510 100644
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -7,9 +7,15 @@ import fs2.Stream
 import docspell.common.{Language, Logger}
 import docspell.extract.internal.Text
 import docspell.extract.ocr.{OcrConfig, TextExtract}
+import docspell.extract.pdfbox.PdfMetaData
 import docspell.extract.pdfbox.PdfboxExtract
 
 object PdfExtract {
+  final case class Result(txt: Text, meta: Option[PdfMetaData])
+  object Result {
+    def apply(t: (Text, Option[PdfMetaData])): Result =
+      Result(t._1, t._2)
+  }
 
   def get[F[_]: Sync: ContextShift](
       in: Stream[F, Byte],
@@ -18,39 +24,39 @@ object PdfExtract {
       stripMinLen: Int,
       ocrCfg: OcrConfig,
       logger: Logger[F]
-  ): F[Either[Throwable, Text]] = {
+  ): F[Either[Throwable, Result]] = {
 
     val runOcr =
       TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
 
-    def chooseResult(ocrStr: Text, strippedStr: Text) =
-      if (ocrStr.length > strippedStr.length)
+    def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
+      if (ocrStr.length > strippedRes._1.length)
         logger.info(
-          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
-        ) *> ocrStr.pure[F]
+          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
+        ) *> Result(ocrStr, strippedRes._2).pure[F]
       else
         logger.info(
-          s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
-        ) *> strippedStr.pure[F]
+          s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
+        ) *> Result(strippedRes).pure[F]
 
     //maybe better: inspect the pdf and decide whether ocr or not
     for {
       pdfboxRes <-
-        logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
-          .getText[F](in)
+        logger.debug("Trying to strip text from pdf using pdfbox.") *>
+          PdfboxExtract.getTextAndMetaData[F](in)
       res <- pdfboxRes.fold(
         ex =>
           logger.info(
             s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
-          ) >> runOcr.attempt,
-        str =>
-          if (str.length >= stripMinLen) str.pure[F].attempt
+          ) >> runOcr.map(txt => Result(txt, None)).attempt,
+        pair =>
+          if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
           else
             logger
               .info(
-                s"Stripped text from PDF is small (${str.length}). Trying with OCR."
+                s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
               ) *>
-              runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
+              runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
       )
     } yield res
   }
diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
index 233d7c31..def9c8ee 100644
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -9,17 +9,17 @@ import cats.effect.Sync
 import cats.implicits._
 import fs2.Stream
 
+import docspell.common.Timestamp
 import docspell.extract.internal.Text
 
 import org.apache.pdfbox.pdmodel.PDDocument
 import org.apache.pdfbox.text.PDFTextStripper
-import docspell.common.Timestamp
 
 object PdfboxExtract {
 
   def getTextAndMetaData[F[_]: Sync](
       data: Stream[F, Byte]
-  ): F[Either[Throwable, (Text, PdfMetaData)]] =
+  ): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
     data.compile
       .to(Array)
       .map(bytes =>
@@ -27,7 +27,7 @@ object PdfboxExtract {
           for {
             txt <- readText(doc)
             md  <- readMetaData(doc)
-          } yield (txt, md)
+          } yield (txt, Some(md).filter(_.nonEmpty))
         }.toEither.flatten
       )
 
diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
index b72b182a..3659cf4b 100644
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
@@ -47,7 +47,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
     val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
     val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
     assert(str.value.startsWith("Keywords in PDF"))
-    val md =  PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
+    val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
     assertEquals(md.author, Some("E.K."))
     assertEquals(md.title, Some("Keywords in PDF"))
     assertEquals(md.subject, Some("This is a subject"))
diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
index bf48f49e..08de8d83 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala
@@ -107,7 +107,8 @@ object CreateItem {
         Vector.empty,
         Vector.empty,
         fm.map(a => a.id -> a.fileId).toMap,
-        MetaProposalList.empty
+        MetaProposalList.empty,
+        Nil
       )
     }
 
@@ -148,7 +149,15 @@ object CreateItem {
             .map(originFileTuple)
             .toMap
       } yield cand.headOption.map(ri =>
-        ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
+        ItemData(
+          ri,
+          rms,
+          Vector.empty,
+          Vector.empty,
+          origMap,
+          MetaProposalList.empty,
+          Nil
+        )
       )
     }
 
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
index 46ef9f8c..d4f83fc2 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -22,7 +22,8 @@ case class ItemData(
     metas: Vector[RAttachmentMeta],
     dateLabels: Vector[AttachmentDates],
     originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
-    givenMeta: MetaProposalList    // given meta data not associated to a specific attachment
+    givenMeta: MetaProposalList,   // given meta data not associated to a specific attachment
+    tags: List[String]             // a list of tags (names or ids) attached to the item if they exist
 ) {
 
   def findMeta(attachId: Ident): Option[RAttachmentMeta] =
diff --git a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
index ba51af23..b0c279e7 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala
@@ -17,19 +17,41 @@ object SetGivenData {
         .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
         .map(_ => data)
     else
-      Task { ctx =>
-        val itemId     = data.item.id
-        val folderId   = ctx.args.meta.folderId
-        val collective = ctx.args.meta.collective
-        for {
-          _ <- ctx.logger.info("Starting setting given data")
-          _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
-          e <- ops.setFolder(itemId, folderId, collective).attempt
-          _ <- e.fold(
-            ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
-            _ => ().pure[F]
-          )
-        } yield data
-      }
+      setFolder(data, ops).flatMap(d => setTags[F](d, ops))
 
+  private def setFolder[F[_]: Sync](
+      data: ItemData,
+      ops: OItem[F]
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      val itemId     = data.item.id
+      val folderId   = ctx.args.meta.folderId
+      val collective = ctx.args.meta.collective
+      for {
+        _ <- ctx.logger.info("Starting setting given data")
+        _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
+        e <- ops.setFolder(itemId, folderId, collective).attempt
+        _ <- e.fold(
+          ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
+          _ => ().pure[F]
+        )
+      } yield data
+    }
+
+  private def setTags[F[_]: Sync](
+      data: ItemData,
+      ops: OItem[F]
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      val itemId     = data.item.id
+      val collective = ctx.args.meta.collective
+      for {
+        _ <- ctx.logger.info(s"Set tags from given data: ${data.tags}")
+        e <- ops.linkTags(itemId, data.tags, collective).attempt
+        _ <- e.fold(
+          ex => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}"),
+          _ => ().pure[F]
+        )
+      } yield data
+    }
 }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
index bc048467..9bc41683 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -32,7 +32,8 @@ object TextExtraction {
           )
         )
         _ <- ctx.logger.debug("Storing extracted texts")
-        _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
+        _ <-
+          txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
         idxItem = TextData.item(
           item.item.id,
           ctx.args.meta.collective,
@@ -40,22 +41,26 @@ object TextExtraction {
           item.item.name.some,
           None
         )
-        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
+        _   <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
         dur <- start
         _   <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
-      } yield item.copy(metas = txt.map(_._1))
+      } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
     }
 
+  // --  helpers
+
+  case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
+
   def extractTextIfEmpty[F[_]: Sync: ContextShift](
       ctx: Context[F, ProcessItemArgs],
       cfg: ExtractConfig,
       lang: Language,
       collective: Ident,
       item: ItemData
-  )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
-    def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
-      (
-        rm,
+  )(ra: RAttachment): F[Result] = {
+    def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
+      Result(
+        pair._1,
         TextData.attachment(
           item.item.id,
           ra.id,
@@ -63,15 +68,16 @@ object TextExtraction {
           ctx.args.meta.folderId,
           lang,
           ra.name,
-          rm.content
-        )
+          pair._1.content
+        ),
+        pair._2
       )
 
     val rm = item.findOrCreate(ra.id)
     rm.content match {
       case Some(_) =>
         ctx.logger.info("TextExtraction skipped, since text is already available.") *>
-          makeTextData(rm).pure[F]
+          makeTextData((rm, Nil)).pure[F]
       case None =>
         extractTextToMeta[F](ctx, cfg, lang, item)(ra)
           .map(makeTextData)
@@ -83,21 +89,22 @@ object TextExtraction {
       cfg: ExtractConfig,
       lang: Language,
       item: ItemData
-  )(ra: RAttachment): F[RAttachmentMeta] =
+  )(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
     for {
       _    <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
       dst  <- Duration.stopTime[F]
       fids <- filesToExtract(ctx)(item, ra)
-      txt  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
+      res  <- extractTextFallback(ctx, cfg, ra, lang)(fids)
       meta = item.changeMeta(
         ra.id,
-        rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
+        rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
       )
+      tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
       est <- dst
       _ <- ctx.logger.info(
         s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
       )
-    } yield meta
+    } yield (meta, tags)
 
   def extractText[F[_]: Sync: ContextShift](
       ctx: Context[F, _],
@@ -123,7 +130,7 @@ object TextExtraction {
       cfg: ExtractConfig,
       ra: RAttachment,
       lang: Language
-  )(fileIds: List[Ident]): F[Option[String]] =
+  )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
     fileIds match {
       case Nil =>
         ctx.logger.error(s"Cannot extract text").map(_ => None)
@@ -133,8 +140,8 @@ object TextExtraction {
 
         extractText[F](ctx, extr, lang)(id)
           .flatMap({
-            case ExtractResult.Success(txt) =>
-              txt.some.pure[F]
+            case res @ ExtractResult.Success(_, _) =>
+              res.some.pure[F]
 
             case ExtractResult.UnsupportedFormat(mt) =>
               ctx.logger
diff --git a/modules/store/src/main/scala/docspell/store/impl/Column.scala b/modules/store/src/main/scala/docspell/store/impl/Column.scala
index 67c1097e..134e0afb 100644
--- a/modules/store/src/main/scala/docspell/store/impl/Column.scala
+++ b/modules/store/src/main/scala/docspell/store/impl/Column.scala
@@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
   def isIn[A: Put](values: NonEmptyList[A]): Fragment =
     isIn(values.map(a => sql"$a").toList)
 
+  def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment =
+    fr"lower(" ++ f ++ fr") IN (" ++ commas(values.map(a => sql"$a").toList) ++ fr")"
+
   def isIn(frag: Fragment): Fragment =
     f ++ fr"IN (" ++ frag ++ fr")"
 
diff --git a/modules/store/src/main/scala/docspell/store/records/RItem.scala b/modules/store/src/main/scala/docspell/store/records/RItem.scala
index 97b87d84..e961e8b2 100644
--- a/modules/store/src/main/scala/docspell/store/records/RItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala
@@ -314,6 +314,9 @@ object RItem {
   def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
     selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
 
+  def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] =
+    selectSimple(Seq(id), table, and(id.is(itemId), cid.is(coll))).query[Ident].option
+
   def removeFolder(folderId: Ident): ConnectionIO[Int] = {
     val empty: Option[Ident] = None
     updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala
index 27a04bf2..71b7b1f0 100644
--- a/modules/store/src/main/scala/docspell/store/records/RTag.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala
@@ -1,5 +1,8 @@
 package docspell.store.records
 
+import cats.data.NonEmptyList
+import cats.implicits._
+
 import docspell.common._
 import docspell.store.impl.Implicits._
 import docspell.store.impl._
@@ -101,6 +104,21 @@ object RTag {
     ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
   }
 
+  def findAllByNameOrId(
+      nameOrIds: List[String],
+      coll: Ident
+  ): ConnectionIO[Vector[RTag]] = {
+    val idList =
+      NonEmptyList.fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption)).toSeq
+    val nameList = NonEmptyList.fromList(nameOrIds.map(_.toLowerCase)).toSeq
+
+    val cond = idList.flatMap(ids => Seq(tid.isIn(ids))) ++
+      nameList.flatMap(ns => Seq(name.isLowerIn(ns)))
+
+    if (cond.isEmpty) Vector.empty.pure[ConnectionIO]
+    else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector]
+  }
+
   def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
     deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
 }
diff --git a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
index 2782731d..35050225 100644
--- a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala
@@ -1,5 +1,6 @@
 package docspell.store.records
 
+import cats.data.NonEmptyList
 import cats.implicits._
 
 import docspell.common._
@@ -43,4 +44,28 @@ object RTagItem {
 
   def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
     selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
+
+  def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
+    NonEmptyList.fromList(tags.toList) match {
+      case Some(nel) =>
+        selectSimple(all, table, and(itemId.is(item), tagId.isIn(nel)))
+          .query[RTagItem]
+          .to[Vector]
+      case None =>
+        Vector.empty.pure[ConnectionIO]
+    }
+
+  def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
+    if (tags.isEmpty) 0.pure[ConnectionIO]
+    else
+      for {
+        entities <- tags.toList.traverse(tagId =>
+          Ident.randomId[ConnectionIO].map(id => RTagItem(id, item, tagId))
+        )
+        n <- insertRows(
+          table,
+          all,
+          entities.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}")
+        ).update.run
+      } yield n
 }