diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index d17b453b..133991ae 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -6,6 +6,7 @@ import cats.implicits._ import docspell.common._ import docspell.ftsclient.FtsClient +import docspell.store.UpdateResult import docspell.store.queries.{QAttachment, QItem} import docspell.store.records._ import docspell.store.{AddResult, Store} @@ -22,6 +23,9 @@ trait OItem[F[_]] { /** Create a new tag and add it to the item. */ def addNewTag(item: Ident, tag: RTag): F[AddResult] + /** Apply all tags to the given item. Tags must exist, but can be IDs or names. */ + def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult] + def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult] def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult] @@ -90,6 +94,27 @@ object OItem { .attempt .map(AddResult.fromUpdate) + def linkTags( + item: Ident, + tags: List[String], + collective: Ident + ): F[UpdateResult] = + tags.distinct match { + case Nil => UpdateResult.success.pure[F] + case kws => + val db = + (for { + _ <- OptionT(RItem.checkByIdAndCollective(item, collective)) + given <- OptionT.liftF(RTag.findAllByNameOrId(kws, collective)) + exist <- OptionT.liftF(RTagItem.findAllIn(item, given.map(_.tagId))) + _ <- OptionT.liftF( + RTagItem.setAllTags(item, given.map(_.tagId).diff(exist.map(_.tagId))) + ) + } yield UpdateResult.success).getOrElse(UpdateResult.notFound) + + store.transact(db) + } + def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = { val db = for { cid <- RItem.getCollective(item) diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala index 3a0f3a1b..d48b63c8 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala @@ -1,39 +1,41 @@ package docspell.extract -import scala.util.Try - import docspell.common.MimeType +import docspell.extract.pdfbox.PdfMetaData sealed trait ExtractResult { def textOption: Option[String] + def pdfMeta: Option[PdfMetaData] } object ExtractResult { case class UnsupportedFormat(mime: MimeType) extends ExtractResult { val textOption = None + val pdfMeta = None } def unsupportedFormat(mt: MimeType): ExtractResult = UnsupportedFormat(mt) case class Failure(ex: Throwable) extends ExtractResult { val textOption = None + val pdfMeta = None } def failure(ex: Throwable): ExtractResult = Failure(ex) - case class Success(text: String) extends ExtractResult { + case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult { val textOption = Some(text) } - def success(text: String): ExtractResult = - Success(text) - - def fromTry(r: Try[String]): ExtractResult = - r.fold(Failure.apply, Success.apply) + def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult = + Success(text, pdfMeta) def fromEither(e: Either[Throwable, String]): ExtractResult = - e.fold(failure, success) + e.fold(failure, str => success(str, None)) + + def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult = + e.fold(failure, r => success(r.txt.value, r.meta)) } diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index cc333b71..2507c119 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -40,8 +40,7 @@ object Extraction { case MimeType.PdfMatch(_) => PdfExtract .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) - .map(_.map(_.value)) - .map(ExtractResult.fromEither) + .map(ExtractResult.fromEitherResult) case PoiType(mt) => PoiExtract @@ -103,7 +102,7 @@ object Extraction { val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => - ExtractResult.success(Text(txt).value) + ExtractResult.success(Text(txt).value, None) } case mt => diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 839b0261..4189c510 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -7,9 +7,15 @@ import fs2.Stream import docspell.common.{Language, Logger} import docspell.extract.internal.Text import docspell.extract.ocr.{OcrConfig, TextExtract} +import docspell.extract.pdfbox.PdfMetaData import docspell.extract.pdfbox.PdfboxExtract object PdfExtract { + final case class Result(txt: Text, meta: Option[PdfMetaData]) + object Result { + def apply(t: (Text, Option[PdfMetaData])): Result = + Result(t._1, t._2) + } def get[F[_]: Sync: ContextShift]( in: Stream[F, Byte], @@ -18,39 +24,39 @@ object PdfExtract { stripMinLen: Int, ocrCfg: OcrConfig, logger: Logger[F] - ): F[Either[Throwable, Text]] = { + ): F[Either[Throwable, Result]] = { val runOcr = TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError - def chooseResult(ocrStr: Text, strippedStr: Text) = - if (ocrStr.length > strippedStr.length) + def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) = + if (ocrStr.length > strippedRes._1.length) logger.info( - s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" - ) *> ocrStr.pure[F] + s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})" + ) *> Result(ocrStr, strippedRes._2).pure[F] else logger.info( - s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" - ) *> strippedStr.pure[F] + s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})" + ) *> Result(strippedRes).pure[F] //maybe better: inspect the pdf and decide whether ocr or not for { pdfboxRes <- - logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract - .getText[F](in) + logger.debug("Trying to strip text from pdf using pdfbox.") *> + PdfboxExtract.getTextAndMetaData[F](in) res <- pdfboxRes.fold( ex => logger.info( s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " - ) >> runOcr.attempt, - str => - if (str.length >= stripMinLen) str.pure[F].attempt + ) >> runOcr.map(txt => Result(txt, None)).attempt, + pair => + if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt else logger .info( - s"Stripped text from PDF is small (${str.length}). Trying with OCR." + s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR." ) *> - runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt + runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt ) } yield res } diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index 233d7c31..def9c8ee 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -9,17 +9,17 @@ import cats.effect.Sync import cats.implicits._ import fs2.Stream +import docspell.common.Timestamp import docspell.extract.internal.Text import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.text.PDFTextStripper -import docspell.common.Timestamp object PdfboxExtract { def getTextAndMetaData[F[_]: Sync]( data: Stream[F, Byte] - ): F[Either[Throwable, (Text, PdfMetaData)]] = + ): F[Either[Throwable, (Text, Option[PdfMetaData])]] = data.compile .to(Array) .map(bytes => @@ -27,7 +27,7 @@ object PdfboxExtract { for { txt <- readText(doc) md <- readMetaData(doc) - } yield (txt, md) + } yield (txt, Some(md).filter(_.nonEmpty)) }.toEither.flatten ) diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala index b72b182a..3659cf4b 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -47,7 +47,7 @@ object PdfboxExtractTest extends SimpleTestSuite { val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) assert(str.value.startsWith("Keywords in PDF")) - val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) + val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) assertEquals(md.author, Some("E.K.")) assertEquals(md.title, Some("Keywords in PDF")) assertEquals(md.subject, Some("This is a subject")) diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index bf48f49e..08de8d83 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -107,7 +107,8 @@ object CreateItem { Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap, - MetaProposalList.empty + MetaProposalList.empty, + Nil ) } @@ -148,7 +149,15 @@ object CreateItem { .map(originFileTuple) .toMap } yield cand.headOption.map(ri => - ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty) + ItemData( + ri, + rms, + Vector.empty, + Vector.empty, + origMap, + MetaProposalList.empty, + Nil + ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 46ef9f8c..d4f83fc2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -22,7 +22,8 @@ case class ItemData( metas: Vector[RAttachmentMeta], dateLabels: Vector[AttachmentDates], originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id - givenMeta: MetaProposalList // given meta data not associated to a specific attachment + givenMeta: MetaProposalList, // given meta data not associated to a specific attachment + tags: List[String] // a list of tags (names or ids) attached to the item if they exist ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala index ba51af23..b0c279e7 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SetGivenData.scala @@ -17,19 +17,41 @@ object SetGivenData { .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item")) .map(_ => data) else - Task { ctx => - val itemId = data.item.id - val folderId = ctx.args.meta.folderId - val collective = ctx.args.meta.collective - for { - _ <- ctx.logger.info("Starting setting given data") - _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'") - e <- ops.setFolder(itemId, folderId, collective).attempt - _ <- e.fold( - ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"), - _ => ().pure[F] - ) - } yield data - } + setFolder(data, ops).flatMap(d => setTags[F](d, ops)) + private def setFolder[F[_]: Sync]( + data: ItemData, + ops: OItem[F] + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + val itemId = data.item.id + val folderId = ctx.args.meta.folderId + val collective = ctx.args.meta.collective + for { + _ <- ctx.logger.info("Starting setting given data") + _ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'") + e <- ops.setFolder(itemId, folderId, collective).attempt + _ <- e.fold( + ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"), + _ => ().pure[F] + ) + } yield data + } + + private def setTags[F[_]: Sync]( + data: ItemData, + ops: OItem[F] + ): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + val itemId = data.item.id + val collective = ctx.args.meta.collective + for { + _ <- ctx.logger.info(s"Set tags from given data: ${data.tags}") + e <- ops.linkTags(itemId, data.tags, collective).attempt + _ <- e.fold( + ex => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}"), + _ => ().pure[F] + ) + } yield data + } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index bc048467..9bc41683 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -32,7 +32,8 @@ object TextExtraction { ) ) _ <- ctx.logger.debug("Storing extracted texts") - _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) + _ <- + txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am))) idxItem = TextData.item( item.item.id, ctx.args.meta.collective, @@ -40,22 +41,26 @@ object TextExtraction { item.item.name.some, None ) - _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*) + _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*) dur <- start _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") - } yield item.copy(metas = txt.map(_._1)) + } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList) } + // -- helpers + + case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil) + def extractTextIfEmpty[F[_]: Sync: ContextShift]( ctx: Context[F, ProcessItemArgs], cfg: ExtractConfig, lang: Language, collective: Ident, item: ItemData - )(ra: RAttachment): F[(RAttachmentMeta, TextData)] = { - def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) = - ( - rm, + )(ra: RAttachment): F[Result] = { + def makeTextData(pair: (RAttachmentMeta, List[String])): Result = + Result( + pair._1, TextData.attachment( item.item.id, ra.id, @@ -63,15 +68,16 @@ object TextExtraction { ctx.args.meta.folderId, lang, ra.name, - rm.content - ) + pair._1.content + ), + pair._2 ) val rm = item.findOrCreate(ra.id) rm.content match { case Some(_) => ctx.logger.info("TextExtraction skipped, since text is already available.") *> - makeTextData(rm).pure[F] + makeTextData((rm, Nil)).pure[F] case None => extractTextToMeta[F](ctx, cfg, lang, item)(ra) .map(makeTextData) @@ -83,21 +89,22 @@ object TextExtraction { cfg: ExtractConfig, lang: Language, item: ItemData - )(ra: RAttachment): F[RAttachmentMeta] = + )(ra: RAttachment): F[(RAttachmentMeta, List[String])] = for { _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") dst <- Duration.stopTime[F] fids <- filesToExtract(ctx)(item, ra) - txt <- extractTextFallback(ctx, cfg, ra, lang)(fids) + res <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, - rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)) + rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty)) ) + tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil) est <- dst _ <- ctx.logger.info( s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" ) - } yield meta + } yield (meta, tags) def extractText[F[_]: Sync: ContextShift]( ctx: Context[F, _], @@ -123,7 +130,7 @@ object TextExtraction { cfg: ExtractConfig, ra: RAttachment, lang: Language - )(fileIds: List[Ident]): F[Option[String]] = + )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] = fileIds match { case Nil => ctx.logger.error(s"Cannot extract text").map(_ => None) @@ -133,8 +140,8 @@ object TextExtraction { extractText[F](ctx, extr, lang)(id) .flatMap({ - case ExtractResult.Success(txt) => - txt.some.pure[F] + case res @ ExtractResult.Success(_, _) => + res.some.pure[F] case ExtractResult.UnsupportedFormat(mt) => ctx.logger diff --git a/modules/store/src/main/scala/docspell/store/impl/Column.scala b/modules/store/src/main/scala/docspell/store/impl/Column.scala index 67c1097e..134e0afb 100644 --- a/modules/store/src/main/scala/docspell/store/impl/Column.scala +++ b/modules/store/src/main/scala/docspell/store/impl/Column.scala @@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") { def isIn[A: Put](values: NonEmptyList[A]): Fragment = isIn(values.map(a => sql"$a").toList) + def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = + fr"lower(" ++ f ++ fr") IN (" ++ commas(values.map(a => sql"$a").toList) ++ fr")" + def isIn(frag: Fragment): Fragment = f ++ fr"IN (" ++ frag ++ fr")" diff --git a/modules/store/src/main/scala/docspell/store/records/RItem.scala b/modules/store/src/main/scala/docspell/store/records/RItem.scala index 97b87d84..e961e8b2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RItem.scala +++ b/modules/store/src/main/scala/docspell/store/records/RItem.scala @@ -314,6 +314,9 @@ object RItem { def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] = selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option + def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = + selectSimple(Seq(id), table, and(id.is(itemId), cid.is(coll))).query[Ident].option + def removeFolder(folderId: Ident): ConnectionIO[Int] = { val empty: Option[Ident] = None updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run diff --git a/modules/store/src/main/scala/docspell/store/records/RTag.scala b/modules/store/src/main/scala/docspell/store/records/RTag.scala index 27a04bf2..71b7b1f0 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTag.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTag.scala @@ -1,5 +1,8 @@ package docspell.store.records +import cats.data.NonEmptyList +import cats.implicits._ + import docspell.common._ import docspell.store.impl.Implicits._ import docspell.store.impl._ @@ -101,6 +104,21 @@ object RTag { ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector] } + def findAllByNameOrId( + nameOrIds: List[String], + coll: Ident + ): ConnectionIO[Vector[RTag]] = { + val idList = + NonEmptyList.fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption)).toSeq + val nameList = NonEmptyList.fromList(nameOrIds.map(_.toLowerCase)).toSeq + + val cond = idList.flatMap(ids => Seq(tid.isIn(ids))) ++ + nameList.flatMap(ns => Seq(name.isLowerIn(ns))) + + if (cond.isEmpty) Vector.empty.pure[ConnectionIO] + else selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector] + } + def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run } diff --git a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala index 2782731d..35050225 100644 --- a/modules/store/src/main/scala/docspell/store/records/RTagItem.scala +++ b/modules/store/src/main/scala/docspell/store/records/RTagItem.scala @@ -1,5 +1,6 @@ package docspell.store.records +import cats.data.NonEmptyList import cats.implicits._ import docspell.common._ @@ -43,4 +44,28 @@ object RTagItem { def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] = selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector] + + def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] = + NonEmptyList.fromList(tags.toList) match { + case Some(nel) => + selectSimple(all, table, and(itemId.is(item), tagId.isIn(nel))) + .query[RTagItem] + .to[Vector] + case None => + Vector.empty.pure[ConnectionIO] + } + + def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] = + if (tags.isEmpty) 0.pure[ConnectionIO] + else + for { + entities <- tags.toList.traverse(tagId => + Ident.randomId[ConnectionIO].map(id => RTagItem(id, item, tagId)) + ) + n <- insertRows( + table, + all, + entities.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}") + ).update.run + } yield n }