diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index d17b453b..133991ae 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -6,6 +6,7 @@ import cats.implicits._ import docspell.common._ import docspell.ftsclient.FtsClient +import docspell.store.UpdateResult import docspell.store.queries.{QAttachment, QItem} import docspell.store.records._ import docspell.store.{AddResult, Store} @@ -22,6 +23,9 @@ trait OItem[F[_]] { /** Create a new tag and add it to the item. */ def addNewTag(item: Ident, tag: RTag): F[AddResult] + /** Apply all tags to the given item. Tags must exist, but can be IDs or names. */ + def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult] + def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult] def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult] @@ -90,6 +94,27 @@ object OItem { .attempt .map(AddResult.fromUpdate) + def linkTags( + item: Ident, + tags: List[String], + collective: Ident + ): F[UpdateResult] = + tags.distinct match { + case Nil => UpdateResult.success.pure[F] + case kws => + val db = + (for { + _ <- OptionT(RItem.checkByIdAndCollective(item, collective)) + given <- OptionT.liftF(RTag.findAllByNameOrId(kws, collective)) + exist <- OptionT.liftF(RTagItem.findAllIn(item, given.map(_.tagId))) + _ <- OptionT.liftF( + RTagItem.setAllTags(item, given.map(_.tagId).diff(exist.map(_.tagId))) + ) + } yield UpdateResult.success).getOrElse(UpdateResult.notFound) + + store.transact(db) + } + def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = { val db = for { cid <- RItem.getCollective(item) diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala index 3a0f3a1b..ac9716b3 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala @@ -1,39 +1,47 @@ package docspell.extract -import scala.util.Try - import docspell.common.MimeType +import docspell.extract.pdfbox.PdfMetaData sealed trait ExtractResult { def textOption: Option[String] + def pdfMeta: Option[PdfMetaData] } object ExtractResult { case class UnsupportedFormat(mime: MimeType) extends ExtractResult { val textOption = None + val pdfMeta = None } def unsupportedFormat(mt: MimeType): ExtractResult = UnsupportedFormat(mt) case class Failure(ex: Throwable) extends ExtractResult { val textOption = None + val pdfMeta = None } def failure(ex: Throwable): ExtractResult = Failure(ex) - case class Success(text: String) extends ExtractResult { + case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult { val textOption = Some(text) + def appendPdfMetaToText: Success = + pdfMeta.flatMap(_.asText) match { + case Some(m) => + copy(text = text + "\n\n" + m) + case None => this + } } - def success(text: String): ExtractResult = - Success(text) - - def fromTry(r: Try[String]): ExtractResult = - r.fold(Failure.apply, Success.apply) + def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult = + Success(text, pdfMeta) def fromEither(e: Either[Throwable, String]): ExtractResult = - e.fold(failure, success) + e.fold(failure, str => success(str, None)) + + def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult = + e.fold(failure, r => success(r.txt.value, r.meta)) } diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index cc333b71..2507c119 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -40,8 +40,7 @@ object Extraction { case MimeType.PdfMatch(_) => PdfExtract .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) - .map(_.map(_.value)) - .map(ExtractResult.fromEither) + .map(ExtractResult.fromEitherResult) case PoiType(mt) => PoiExtract @@ -103,7 +102,7 @@ object Extraction { val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => - ExtractResult.success(Text(txt).value) + ExtractResult.success(Text(txt).value, None) } case mt => diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 6d2d4a7b..4189c510 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -7,9 +7,15 @@ import fs2.Stream import docspell.common.{Language, Logger} import docspell.extract.internal.Text import docspell.extract.ocr.{OcrConfig, TextExtract} +import docspell.extract.pdfbox.PdfMetaData import docspell.extract.pdfbox.PdfboxExtract object PdfExtract { + final case class Result(txt: Text, meta: Option[PdfMetaData]) + object Result { + def apply(t: (Text, Option[PdfMetaData])): Result = + Result(t._1, t._2) + } def get[F[_]: Sync: ContextShift]( in: Stream[F, Byte], @@ -18,39 +24,39 @@ object PdfExtract { stripMinLen: Int, ocrCfg: OcrConfig, logger: Logger[F] - ): F[Either[Throwable, Text]] = { + ): F[Either[Throwable, Result]] = { val runOcr = TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError - def chooseResult(ocrStr: Text, strippedStr: Text) = - if (ocrStr.length > strippedStr.length) + def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) = + if (ocrStr.length > strippedRes._1.length) logger.info( - s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" - ) *> ocrStr.pure[F] + s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})" + ) *> Result(ocrStr, strippedRes._2).pure[F] else logger.info( - s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" - ) *> strippedStr.pure[F] + s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})" + ) *> Result(strippedRes).pure[F] //maybe better: inspect the pdf and decide whether ocr or not for { pdfboxRes <- - logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract - .get[F](in) + logger.debug("Trying to strip text from pdf using pdfbox.") *> + PdfboxExtract.getTextAndMetaData[F](in) res <- pdfboxRes.fold( ex => logger.info( s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " - ) >> runOcr.attempt, - str => - if (str.length >= stripMinLen) str.pure[F].attempt + ) >> runOcr.map(txt => Result(txt, None)).attempt, + pair => + if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt else logger .info( - s"Stripped text from PDF is small (${str.length}). Trying with OCR." + s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR." ) *> - runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt + runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt ) } yield res } diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala new file mode 100644 index 00000000..4663d1c8 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala @@ -0,0 +1,40 @@ +package docspell.extract.pdfbox + +import docspell.common.Timestamp + +final case class PdfMetaData( + title: Option[String], + author: Option[String], + subject: Option[String], + keywords: Option[String], + creator: Option[String], + creationDate: Option[Timestamp] +) { + + def isEmpty: Boolean = + title.isEmpty && + author.isEmpty && + subject.isEmpty && + keywords.isEmpty && + creator.isEmpty && + creationDate.isEmpty + + def nonEmpty: Boolean = + !isEmpty + + def keywordList: List[String] = + keywords.map(kws => kws.split("[,;]\\s*").toList).getOrElse(Nil) + + /** Return all data in lines, except keywords. Keywords are handled separately. */ + def asText: Option[String] = + (title.toList ++ author.toList ++ subject.toList ++ creationDate.toList.map( + _.toUtcDate.toString + )) match { + case Nil => None + case list => Some(list.mkString("\n")) + } +} + +object PdfMetaData { + val empty = PdfMetaData(None, None, None, None, None, None) +} diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index d44e2af7..def9c8ee 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -9,6 +9,7 @@ import cats.effect.Sync import cats.implicits._ import fs2.Stream +import docspell.common.Timestamp import docspell.extract.internal.Text import org.apache.pdfbox.pdmodel.PDDocument @@ -16,15 +17,29 @@ import org.apache.pdfbox.text.PDFTextStripper object PdfboxExtract { - def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = + def getTextAndMetaData[F[_]: Sync]( + data: Stream[F, Byte] + ): F[Either[Throwable, (Text, Option[PdfMetaData])]] = + data.compile + .to(Array) + .map(bytes => + Using(PDDocument.load(bytes)) { doc => + for { + txt <- readText(doc) + md <- readMetaData(doc) + } yield (txt, Some(md).filter(_.nonEmpty)) + }.toEither.flatten + ) + + def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile .to(Array) .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) - def get(is: InputStream): Either[Throwable, Text] = + def getText(is: InputStream): Either[Throwable, Text] = Using(PDDocument.load(is))(readText).toEither.flatten - def get(inFile: Path): Either[Throwable, Text] = + def getText(inFile: Path): Either[Throwable, Text] = Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten private def readText(doc: PDDocument): Either[Throwable, Text] = @@ -34,4 +49,31 @@ object PdfboxExtract { stripper.setLineSeparator("\n") Text(Option(stripper.getText(doc))) }.toEither + + def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = + data.compile + .to(Array) + .map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten) + + def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = + Using(PDDocument.load(is))(readMetaData).toEither.flatten + + def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = + Using(PDDocument.load(inFile.toFile))(readMetaData).toEither.flatten + + private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = + Try { + def mkValue(s: String) = + Option(s).map(_.trim).filter(_.nonEmpty) + + val info = doc.getDocumentInformation + PdfMetaData( + mkValue(info.getTitle), + mkValue(info.getAuthor), + mkValue(info.getSubject), + mkValue(info.getKeywords), + mkValue(info.getCreator), + Option(info.getCreationDate).map(c => Timestamp(c.toInstant)) + ) + }.toEither } diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala new file mode 100644 index 00000000..b3cfb12d --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala @@ -0,0 +1,22 @@ +package docspell.extract.pdfbox + +import minitest.SimpleTestSuite + +object PdfMetaDataTest extends SimpleTestSuite { + + test("split keywords on comma") { + val md = PdfMetaData.empty.copy(keywords = Some("a,b, c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + + test("split keywords on semicolon") { + val md = PdfMetaData.empty.copy(keywords = Some("a; b;c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + + test("split keywords on comma and semicolon") { + val md = PdfMetaData.empty.copy(keywords = Some("a, b; c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + +} diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala index 1f436b25..3659cf4b 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite { textPDFs.foreach { case (file, txt) => val url = file.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) @@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite { textPDFs.foreach { case (file, txt) => val data = file.readURL[IO](8192, blocker) - val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) + val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity) val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) @@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite { test("extract text from image PDFs") { val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) assertEquals(str.value, "") } + test("extract metadata from pdf") { + val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) + assert(str.value.startsWith("Keywords in PDF")) + val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) + assertEquals(md.author, Some("E.K.")) + assertEquals(md.title, Some("Keywords in PDF")) + assertEquals(md.subject, Some("This is a subject")) + assertEquals(md.keywordList, List("Test", "Keywords in PDF", "Todo")) + assertEquals(md.creator, Some("Emacs 26.3 (Org mode 9.3)")) + assert(md.creationDate.isDefined) + } + private def removeFormatting(str: String): String = str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase } diff --git a/modules/files/src/test/resources/keywords.pdf b/modules/files/src/test/resources/keywords.pdf new file mode 100644 index 00000000..963fe42c Binary files /dev/null and b/modules/files/src/test/resources/keywords.pdf differ diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index bf48f49e..08de8d83 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -107,7 +107,8 @@ object CreateItem { Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap, - MetaProposalList.empty + MetaProposalList.empty, + Nil ) } @@ -148,7 +149,15 @@ object CreateItem { .map(originFileTuple) .toMap } yield cand.headOption.map(ri => - ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty) + ItemData( + ri, + rms, + Vector.empty, + Vector.empty, + origMap, + MetaProposalList.empty, + Nil + ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 46ef9f8c..d4f83fc2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -22,7 +22,8 @@ case class ItemData( metas: Vector[RAttachmentMeta], dateLabels: Vector[AttachmentDates], originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id - givenMeta: MetaProposalList // given meta data not associated to a specific attachment + givenMeta: MetaProposalList, // given meta data not associated to a specific attachment + tags: List[String] // a list of tags (names or ids) attached to the item if they exist ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala index ba51af23..b0c279e7 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala @@ -17,19 +17,41 @@ object SetGivenData { .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item")) .map(_ => data) else - Task { ctx => - val itemId = data.item.id - val folderId = ctx.args.meta.folderId - val collective = ctx.args.meta.collective - for { - _ <- ctx.logger.info("Starting setting given data") - _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'") - e <- ops.setFolder(itemId, folderId, collective).attempt - _ <- e.fold( - ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"), - _ => ().pure[F] - ) - } yield data - } + setFolder(data, ops).flatMap(d => setTags[F](d, ops)) + private def setFolder[F[_]: Sync]( + data: ItemData, + ops: OItem[F] + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + val itemId = data.item.id + val folderId = ctx.args.meta.folderId + val collective = ctx.args.meta.collective + for { + _ <- ctx.logger.info("Starting setting given data") + _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'") + e <- ops.setFolder(itemId, folderId, collective).attempt + _ <- e.fold( + ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"), + _ => ().pure[F] + ) + } yield data + } + + private def setTags[F[_]: Sync]( + data: ItemData, + ops: OItem[F] + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + val itemId = data.item.id + val collective = ctx.args.meta.collective + for { + _ <- ctx.logger.info(s"Set tags from given data: ${data.tags}") + e <- ops.linkTags(itemId, data.tags, collective).attempt + _ <- e.fold( + ex => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}"), + _ => ().pure[F] + ) + } yield data + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 384741e2..89bb1f61 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -32,46 +32,52 @@ object TextExtraction { ) ) _ <- ctx.logger.debug("Storing extracted texts") - _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) + _ <- + txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am))) idxItem = TextData.item( item.item.id, ctx.args.meta.collective, - None, //folder + ctx.args.meta.folderId, item.item.name.some, None ) - _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*) + _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*) dur <- start _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") - } yield item.copy(metas = txt.map(_._1)) + } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList) } + // -- helpers + + case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil) + def extractTextIfEmpty[F[_]: Sync: ContextShift]( - ctx: Context[F, _], + ctx: Context[F, ProcessItemArgs], cfg: ExtractConfig, lang: Language, collective: Ident, item: ItemData - )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = { - def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) = - ( - rm, + )(ra: RAttachment): F[Result] = { + def makeTextData(pair: (RAttachmentMeta, List[String])): Result = + Result( + pair._1, TextData.attachment( item.item.id, ra.id, collective, - None, //folder + ctx.args.meta.folderId, lang, ra.name, - rm.content - ) + pair._1.content + ), + pair._2 ) val rm = item.findOrCreate(ra.id) rm.content match { case Some(_) => ctx.logger.info("TextExtraction skipped, since text is already available.") *> - makeTextData(rm).pure[F] + makeTextData((rm, Nil)).pure[F] case None => extractTextToMeta[F](ctx, cfg, lang, item)(ra) .map(makeTextData) @@ -83,21 +89,25 @@ object TextExtraction { cfg: ExtractConfig, lang: Language, item: ItemData - )(ra: RAttachment): F[RAttachmentMeta] = + )(ra: RAttachment): F[(RAttachmentMeta, List[String])] = for { _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") dst <- Duration.stopTime[F] fids <- filesToExtract(ctx)(item, ra) - txt <- extractTextFallback(ctx, cfg, ra, lang)(fids) + res <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, - rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)) + rm => + rm.setContentIfEmpty( + res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty) + ) ) + tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil) est <- dst _ <- ctx.logger.info( s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" ) - } yield meta + } yield (meta, tags) def extractText[F[_]: Sync: ContextShift]( ctx: Context[F, _], @@ -123,7 +133,7 @@ object TextExtraction { cfg: ExtractConfig, ra: RAttachment, lang: Language - )(fileIds: List[Ident]): F[Option[String]] = + )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] = fileIds match { case Nil => ctx.logger.error(s"Cannot extract text").map(_ => None) @@ -133,8 +143,8 @@ object TextExtraction { extractText[F](ctx, extr, lang)(id) .flatMap({ - case ExtractResult.Success(txt) => - txt.some.pure[F] + case res @ ExtractResult.Success(_, _) => + res.some.pure[F] case ExtractResult.UnsupportedFormat(mt) => ctx.logger diff --git a/modules/store/src/main/scala/docspell/store/impl/Column.scala b/modules/store/src/main/scala/docspell/store/impl/Column.scala index 67c1097e..134e0afb 100644 --- a/modules/store/src/main/scala/docspell/store/impl/Column.scala +++ b/modules/store/src/main/scala/docspell/store/impl/Column.scala @@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") { def isIn[A: Put](values: NonEmptyList[A]): Fragment = isIn(values.map(a => sql"$a").toList) + def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = + fr"lower(" ++ f ++ fr") IN (" ++ commas(values.map(a => sql"$a").toList) ++ fr")" + def isIn(frag: Fragment): Fragment = f ++ fr"IN (" ++ frag ++ fr")" diff --git a/modules/store/src/main/scala/docspell/store/records/RItem.scala b/modules/store/src/main/scala/docspell/store/records/RItem.scala index 97b87d84..e961e8b2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RItem.scala +++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala @@ -314,6 +314,9 @@ object RItem { def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] = selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option + def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = + selectSimple(Seq(id), table, and(id.is(itemId), cid.is(coll))).query[Ident].option + def removeFolder(folderId: Ident): ConnectionIO[Int] = { val empty: Option[Ident] = None updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 27a04bf2..71b7b1f0 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -1,5 +1,8 @@ package docspell.store.records +import cats.data.NonEmptyList +import cats.implicits._ + import docspell.common._ import docspell.store.impl.Implicits._ import docspell.store.impl._ @@ -101,6 +104,21 @@ object RTag { ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector] } + def findAllByNameOrId( + nameOrIds: List[String], + coll: Ident + ): ConnectionIO[Vector[RTag]] = { + val idList = + NonEmptyList.fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption)).toSeq + val nameList = NonEmptyList.fromList(nameOrIds.map(_.toLowerCase)).toSeq + + val cond = idList.flatMap(ids => Seq(tid.isIn(ids))) ++ + nameList.flatMap(ns => Seq(name.isLowerIn(ns))) + + if (cond.isEmpty) Vector.empty.pure[ConnectionIO] + else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector] + } + def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run } diff --git a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala index 2782731d..35050225 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala @@ -1,5 +1,6 @@ package docspell.store.records +import cats.data.NonEmptyList import cats.implicits._ import docspell.common._ @@ -43,4 +44,28 @@ object RTagItem { def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] = selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector] + + def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] = + NonEmptyList.fromList(tags.toList) match { + case Some(nel) => + selectSimple(all, table, and(itemId.is(item), tagId.isIn(nel))) + .query[RTagItem] + .to[Vector] + case None => + Vector.empty.pure[ConnectionIO] + } + + def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] = + if (tags.isEmpty) 0.pure[ConnectionIO] + else + for { + entities <- tags.toList.traverse(tagId => + Ident.randomId[ConnectionIO].map(id => RTagItem(id, item, tagId)) + ) + n <- insertRows( + table, + all, + entities.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}") + ).update.run + } yield n }