Merge pull request #183 from eikek/pdf-metadata

Pdf metadata
This commit is contained in:
mergify[bot] 2020-07-18 23:19:21 +00:00 committed by GitHub
commit 185a103942
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 315 additions and 69 deletions

View File

@ -6,6 +6,7 @@ import cats.implicits._
import docspell.common._ import docspell.common._
import docspell.ftsclient.FtsClient import docspell.ftsclient.FtsClient
import docspell.store.UpdateResult
import docspell.store.queries.{QAttachment, QItem} import docspell.store.queries.{QAttachment, QItem}
import docspell.store.records._ import docspell.store.records._
import docspell.store.{AddResult, Store} import docspell.store.{AddResult, Store}
@ -22,6 +23,9 @@ trait OItem[F[_]] {
/** Create a new tag and add it to the item. */ /** Create a new tag and add it to the item. */
def addNewTag(item: Ident, tag: RTag): F[AddResult] def addNewTag(item: Ident, tag: RTag): F[AddResult]
/** Apply all tags to the given item. Tags must exist, but can be IDs or names.
  *
  * Entries that match no tag of the collective are ignored, and tags that
  * are already linked to the item are not linked a second time. The result
  * is `UpdateResult.notFound` when the item cannot be found within the
  * given collective.
  */
def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult] def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult] def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
@ -90,6 +94,27 @@ object OItem {
.attempt .attempt
.map(AddResult.fromUpdate) .map(AddResult.fromUpdate)
/** Links all known tags from `tags` to `item`.
  *
  * Duplicates in `tags` are dropped first; an empty list is an immediate
  * success without touching the database. Otherwise a single transaction
  * checks that the item belongs to `collective`, resolves the entries to
  * existing tags (by name or ID) and inserts links only for those tags that
  * are not yet attached to the item. If the item check yields nothing, the
  * result is `UpdateResult.notFound`.
  */
def linkTags(
    item: Ident,
    tags: List[String],
    collective: Ident
): F[UpdateResult] = {
  val requested = tags.distinct
  if (requested.isEmpty) UpdateResult.success.pure[F]
  else {
    val linkMissing =
      for {
        _ <- OptionT(RItem.checkByIdAndCollective(item, collective))
        known <- OptionT.liftF(RTag.findAllByNameOrId(requested, collective))
        knownIds = known.map(_.tagId)
        linked <- OptionT.liftF(RTagItem.findAllIn(item, knownIds))
        // only tags without an existing link are inserted
        missing = knownIds.diff(linked.map(_.tagId))
        _ <- OptionT.liftF(RTagItem.setAllTags(item, missing))
      } yield UpdateResult.success
    store.transact(linkMissing.getOrElse(UpdateResult.notFound))
  }
}
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = { def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
val db = for { val db = for {
cid <- RItem.getCollective(item) cid <- RItem.getCollective(item)

View File

@ -1,39 +1,47 @@
package docspell.extract package docspell.extract
import scala.util.Try
import docspell.common.MimeType import docspell.common.MimeType
import docspell.extract.pdfbox.PdfMetaData
sealed trait ExtractResult { sealed trait ExtractResult {
def textOption: Option[String] def textOption: Option[String]
def pdfMeta: Option[PdfMetaData]
} }
object ExtractResult { object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult { case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None val textOption = None
val pdfMeta = None
} }
def unsupportedFormat(mt: MimeType): ExtractResult = def unsupportedFormat(mt: MimeType): ExtractResult =
UnsupportedFormat(mt) UnsupportedFormat(mt)
case class Failure(ex: Throwable) extends ExtractResult { case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None val textOption = None
val pdfMeta = None
} }
def failure(ex: Throwable): ExtractResult = def failure(ex: Throwable): ExtractResult =
Failure(ex) Failure(ex)
case class Success(text: String) extends ExtractResult { case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
val textOption = Some(text) val textOption = Some(text)
/** Returns a copy whose text has the textual form of the PDF metadata
  * appended, separated by an empty line. When there is no metadata, or it
  * has no textual form, this instance is returned unchanged.
  */
def appendPdfMetaToText: Success =
  pdfMeta
    .flatMap(_.asText)
    .map(metaText => copy(text = text + "\n\n" + metaText))
    .getOrElse(this)
} }
def success(text: String): ExtractResult = def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
Success(text) Success(text, pdfMeta)
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
def fromEither(e: Either[Throwable, String]): ExtractResult = def fromEither(e: Either[Throwable, String]): ExtractResult =
e.fold(failure, success) e.fold(failure, str => success(str, None))
/** Converts the outcome of a PDF extraction: a `Left` becomes a failure, a
  * `Right` becomes a success carrying the extracted text and the metadata.
  */
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
  e match {
    case Left(ex)      => failure(ex)
    case Right(result) => success(result.txt.value, result.meta)
  }
} }

View File

@ -40,8 +40,7 @@ object Extraction {
case MimeType.PdfMatch(_) => case MimeType.PdfMatch(_) =>
PdfExtract PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(_.map(_.value)) .map(ExtractResult.fromEitherResult)
.map(ExtractResult.fromEither)
case PoiType(mt) => case PoiType(mt) =>
PoiExtract PoiExtract
@ -103,7 +102,7 @@ object Extraction {
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(Text(txt).value) ExtractResult.success(Text(txt).value, None)
} }
case mt => case mt =>

View File

@ -7,9 +7,15 @@ import fs2.Stream
import docspell.common.{Language, Logger} import docspell.common.{Language, Logger}
import docspell.extract.internal.Text import docspell.extract.internal.Text
import docspell.extract.ocr.{OcrConfig, TextExtract} import docspell.extract.ocr.{OcrConfig, TextExtract}
import docspell.extract.pdfbox.PdfMetaData
import docspell.extract.pdfbox.PdfboxExtract import docspell.extract.pdfbox.PdfboxExtract
object PdfExtract { object PdfExtract {
/** Outcome of a PDF extraction: the chosen text plus the metadata read
  * from the document, if any.
  */
final case class Result(txt: Text, meta: Option[PdfMetaData])

object Result {

  /** Builds a [[Result]] from a `(text, metadata)` pair. */
  def apply(t: (Text, Option[PdfMetaData])): Result = {
    val (text, metaData) = t
    Result(text, metaData)
  }
}
def get[F[_]: Sync: ContextShift]( def get[F[_]: Sync: ContextShift](
in: Stream[F, Byte], in: Stream[F, Byte],
@ -18,39 +24,39 @@ object PdfExtract {
stripMinLen: Int, stripMinLen: Int,
ocrCfg: OcrConfig, ocrCfg: OcrConfig,
logger: Logger[F] logger: Logger[F]
): F[Either[Throwable, Text]] = { ): F[Either[Throwable, Result]] = {
val runOcr = val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: Text, strippedStr: Text) = def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
if (ocrStr.length > strippedStr.length) if (ocrStr.length > strippedRes._1.length)
logger.info( logger.info(
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
) *> ocrStr.pure[F] ) *> Result(ocrStr, strippedRes._2).pure[F]
else else
logger.info( logger.info(
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
) *> strippedStr.pure[F] ) *> Result(strippedRes).pure[F]
//maybe better: inspect the pdf and decide whether ocr or not //maybe better: inspect the pdf and decide whether ocr or not
for { for {
pdfboxRes <- pdfboxRes <-
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract logger.debug("Trying to strip text from pdf using pdfbox.") *>
.get[F](in) PdfboxExtract.getTextAndMetaData[F](in)
res <- pdfboxRes.fold( res <- pdfboxRes.fold(
ex => ex =>
logger.info( logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) >> runOcr.attempt, ) >> runOcr.map(txt => Result(txt, None)).attempt,
str => pair =>
if (str.length >= stripMinLen) str.pure[F].attempt if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
else else
logger logger
.info( .info(
s"Stripped text from PDF is small (${str.length}). Trying with OCR." s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
) *> ) *>
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
) )
} yield res } yield res
} }

View File

@ -0,0 +1,40 @@
package docspell.extract.pdfbox
import docspell.common.Timestamp
/** Metadata taken from the document information of a PDF file.
  *
  * Every field is optional; `keywords` holds the raw, unsplit keyword
  * string. Use [[keywordList]] to obtain the individual keywords.
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** True if none of the fields is set. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** True if at least one field is set. */
  def nonEmpty: Boolean =
    !isEmpty

  /** Splits the raw keyword string on commas and semicolons.
    *
    * Each keyword is trimmed on both sides and empty entries are dropped,
    * so `" a , b;; c,"` yields `List("a", "b", "c")`. Whitespace inside a
    * keyword is kept. Returns `Nil` when no keywords are set.
    */
  def keywordList: List[String] =
    keywords.toList
      .flatMap(_.split("[,;]").toList)
      .map(_.trim)
      .filter(_.nonEmpty)

  /** Returns title, author, subject and creation date (as UTC date), one
    * per line and in that order, skipping fields that are not set.
    * Keywords and creator are not included; keywords are handled
    * separately. Returns `None` if none of the four fields is set.
    */
  def asText: Option[String] =
    (title.toList ++ author.toList ++ subject.toList ++ creationDate.toList.map(
      _.toUtcDate.toString
    )) match {
      case Nil  => None
      case list => Some(list.mkString("\n"))
    }
}
object PdfMetaData {

  /** Metadata with no field set. */
  val empty = PdfMetaData(
    title = None,
    author = None,
    subject = None,
    keywords = None,
    creator = None,
    creationDate = None
  )
}

View File

@ -9,6 +9,7 @@ import cats.effect.Sync
import cats.implicits._ import cats.implicits._
import fs2.Stream import fs2.Stream
import docspell.common.Timestamp
import docspell.extract.internal.Text import docspell.extract.internal.Text
import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.pdmodel.PDDocument
@ -16,15 +17,29 @@ import org.apache.pdfbox.text.PDFTextStripper
object PdfboxExtract { object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte]
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
data.compile
.to(Array)
.map(bytes =>
Using(PDDocument.load(bytes)) { doc =>
for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, Some(md).filter(_.nonEmpty))
}.toEither.flatten
)
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile data.compile
.to(Array) .to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
def get(is: InputStream): Either[Throwable, Text] = def getText(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, Text] = def getText(inFile: Path): Either[Throwable, Text] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, Text] = private def readText(doc: PDDocument): Either[Throwable, Text] =
@ -34,4 +49,31 @@ object PdfboxExtract {
stripper.setLineSeparator("\n") stripper.setLineSeparator("\n")
Text(Option(stripper.getText(doc))) Text(Option(stripper.getText(doc)))
}.toEither }.toEither
/** Collects the whole stream into a byte array, loads it as a PDF and reads
  * its metadata. Errors from loading or reading end up in the `Left`.
  */
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
  data.compile
    .to(Array)
    .map { bytes =>
      Using(PDDocument.load(bytes))(doc => readMetaData(doc)).toEither.flatMap(identity)
    }

/** Loads a PDF from the given input stream and reads its metadata. */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] =
  Using(PDDocument.load(is))(doc => readMetaData(doc)).toEither.flatMap(identity)

/** Loads a PDF from the given file and reads its metadata. */
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] =
  Using(PDDocument.load(inFile.toFile))(doc => readMetaData(doc)).toEither.flatMap(identity)
/** Reads the document information of an opened PDF. String values are
  * trimmed and blank ones are treated as absent. Any exception thrown
  * while reading is returned as a `Left`.
  */
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
  // null, empty and whitespace-only values all map to None
  def nonBlank(raw: String): Option[String] =
    Option(raw).map(_.trim).filter(_.nonEmpty)

  Try {
    val info = doc.getDocumentInformation
    PdfMetaData(
      title = nonBlank(info.getTitle),
      author = nonBlank(info.getAuthor),
      subject = nonBlank(info.getSubject),
      keywords = nonBlank(info.getKeywords),
      creator = nonBlank(info.getCreator),
      creationDate = Option(info.getCreationDate).map(cal => Timestamp(cal.toInstant))
    )
  }.toEither
}
} }

View File

@ -0,0 +1,22 @@
package docspell.extract.pdfbox
import minitest.SimpleTestSuite
/** Checks how `PdfMetaData.keywordList` splits the raw keyword string. */
object PdfMetaDataTest extends SimpleTestSuite {

  // keyword list of otherwise empty metadata with the given raw keywords
  private def keywordsOf(raw: String): List[String] =
    PdfMetaData.empty.copy(keywords = Some(raw)).keywordList

  test("split keywords on comma") {
    assertEquals(keywordsOf("a,b, c"), List("a", "b", "c"))
  }

  test("split keywords on semicolon") {
    assertEquals(keywordsOf("a; b;c"), List("a", "b", "c"))
  }

  test("split keywords on comma and semicolon") {
    assertEquals(keywordsOf("a, b; c"), List("a", "b", "c"))
  }
}

View File

@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
textPDFs.foreach { textPDFs.foreach {
case (file, txt) => case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity) val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str.value) val received = removeFormatting(str.value)
val expect = removeFormatting(txt) val expect = removeFormatting(txt)
assertEquals(received, expect) assertEquals(received, expect)
@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
textPDFs.foreach { textPDFs.foreach {
case (file, txt) => case (file, txt) =>
val data = file.readURL[IO](8192, blocker) val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str.value) val received = removeFormatting(str.value)
val expect = removeFormatting(txt) val expect = removeFormatting(txt)
assertEquals(received, expect) assertEquals(received, expect)
@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite {
test("extract text from image PDFs") { test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
assertEquals(str.value, "") assertEquals(str.value, "")
} }
// Reads both the text and the document information of the `keywords_pdf`
// example file and checks every metadata field.
test("extract metadata from pdf") {
val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
// text and metadata are read from two separately opened streams of the same file
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
assert(str.value.startsWith("Keywords in PDF"))
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
assertEquals(md.author, Some("E.K."))
assertEquals(md.title, Some("Keywords in PDF"))
assertEquals(md.subject, Some("This is a subject"))
// the second expected keyword contains spaces; only the separators split
assertEquals(md.keywordList, List("Test", "Keywords in PDF", "Todo"))
assertEquals(md.creator, Some("Emacs 26.3 (Org mode 9.3)"))
// only the presence of the creation date is checked, not its value
assert(md.creationDate.isDefined)
}
private def removeFormatting(str: String): String = private def removeFormatting(str: String): String =
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
} }

Binary file not shown.

View File

@ -107,7 +107,8 @@ object CreateItem {
Vector.empty, Vector.empty,
Vector.empty, Vector.empty,
fm.map(a => a.id -> a.fileId).toMap, fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty MetaProposalList.empty,
Nil
) )
} }
@ -148,7 +149,15 @@ object CreateItem {
.map(originFileTuple) .map(originFileTuple)
.toMap .toMap
} yield cand.headOption.map(ri => } yield cand.headOption.map(ri =>
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty) ItemData(
ri,
rms,
Vector.empty,
Vector.empty,
origMap,
MetaProposalList.empty,
Nil
)
) )
} }

View File

@ -22,7 +22,8 @@ case class ItemData(
metas: Vector[RAttachmentMeta], metas: Vector[RAttachmentMeta],
dateLabels: Vector[AttachmentDates], dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList // given meta data not associated to a specific attachment givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
) { ) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] = def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -17,19 +17,41 @@ object SetGivenData {
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item")) .log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
.map(_ => data) .map(_ => data)
else else
Task { ctx => setFolder(data, ops).flatMap(d => setTags[F](d, ops))
val itemId = data.item.id
val folderId = ctx.args.meta.folderId
val collective = ctx.args.meta.collective
for {
_ <- ctx.logger.info("Starting setting given data")
_ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
e <- ops.setFolder(itemId, folderId, collective).attempt
_ <- e.fold(
ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
_ => ().pure[F]
)
} yield data
}
/** Sets the folder from the job arguments on the item.
  *
  * This is best effort: an error raised while setting the folder is logged
  * as a warning and the unchanged `data` is returned in either case.
  */
private def setFolder[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val meta     = ctx.args.meta
    val folderId = meta.folderId
    for {
      _       <- ctx.logger.info("Starting setting given data")
      _       <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
      outcome <- ops.setFolder(data.item.id, folderId, meta.collective).attempt
      _ <- outcome match {
        case Left(ex) => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
    } yield data
  }
/** Links the tags collected in `data.tags` to the item.
  *
  * This is best effort: an error raised while linking is logged as a
  * warning and the unchanged `data` is returned in either case.
  */
private def setTags[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val linkAttempt =
      ops.linkTags(data.item.id, data.tags, ctx.args.meta.collective).attempt
    for {
      _       <- ctx.logger.info(s"Set tags from given data: ${data.tags}")
      outcome <- linkAttempt
      _ <- outcome match {
        case Left(ex) => ctx.logger.warn(s"Error setting tags: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
    } yield data
  }
} }

View File

@ -32,46 +32,52 @@ object TextExtraction {
) )
) )
_ <- ctx.logger.debug("Storing extracted texts") _ <- ctx.logger.debug("Storing extracted texts")
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1))) _ <-
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
idxItem = TextData.item( idxItem = TextData.item(
item.item.id, item.item.id,
ctx.args.meta.collective, ctx.args.meta.collective,
None, //folder ctx.args.meta.folderId,
item.item.name.some, item.item.name.some,
None None
) )
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*) _ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
dur <- start dur <- start
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}") _ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
} yield item.copy(metas = txt.map(_._1)) } yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
} }
// -- helpers
/** Result of extracting the text of one attachment.
  *
  * @param am   the attachment meta record that gets stored via upsert
  * @param td   the text data handed to the full-text index
  * @param tags tag candidates (the PDF keyword list); defaults to none
  */
case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
def extractTextIfEmpty[F[_]: Sync: ContextShift]( def extractTextIfEmpty[F[_]: Sync: ContextShift](
ctx: Context[F, _], ctx: Context[F, ProcessItemArgs],
cfg: ExtractConfig, cfg: ExtractConfig,
lang: Language, lang: Language,
collective: Ident, collective: Ident,
item: ItemData item: ItemData
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = { )(ra: RAttachment): F[Result] = {
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) = def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
( Result(
rm, pair._1,
TextData.attachment( TextData.attachment(
item.item.id, item.item.id,
ra.id, ra.id,
collective, collective,
None, //folder ctx.args.meta.folderId,
lang, lang,
ra.name, ra.name,
rm.content pair._1.content
) ),
pair._2
) )
val rm = item.findOrCreate(ra.id) val rm = item.findOrCreate(ra.id)
rm.content match { rm.content match {
case Some(_) => case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *> ctx.logger.info("TextExtraction skipped, since text is already available.") *>
makeTextData(rm).pure[F] makeTextData((rm, Nil)).pure[F]
case None => case None =>
extractTextToMeta[F](ctx, cfg, lang, item)(ra) extractTextToMeta[F](ctx, cfg, lang, item)(ra)
.map(makeTextData) .map(makeTextData)
@ -83,21 +89,25 @@ object TextExtraction {
cfg: ExtractConfig, cfg: ExtractConfig,
lang: Language, lang: Language,
item: ItemData item: ItemData
)(ra: RAttachment): F[RAttachmentMeta] = )(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
for { for {
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
dst <- Duration.stopTime[F] dst <- Duration.stopTime[F]
fids <- filesToExtract(ctx)(item, ra) fids <- filesToExtract(ctx)(item, ra)
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids) res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta( meta = item.changeMeta(
ra.id, ra.id,
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty)) rm =>
rm.setContentIfEmpty(
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
)
) )
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
est <- dst est <- dst
_ <- ctx.logger.info( _ <- ctx.logger.info(
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
) )
} yield meta } yield (meta, tags)
def extractText[F[_]: Sync: ContextShift]( def extractText[F[_]: Sync: ContextShift](
ctx: Context[F, _], ctx: Context[F, _],
@ -123,7 +133,7 @@ object TextExtraction {
cfg: ExtractConfig, cfg: ExtractConfig,
ra: RAttachment, ra: RAttachment,
lang: Language lang: Language
)(fileIds: List[Ident]): F[Option[String]] = )(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
fileIds match { fileIds match {
case Nil => case Nil =>
ctx.logger.error(s"Cannot extract text").map(_ => None) ctx.logger.error(s"Cannot extract text").map(_ => None)
@ -133,8 +143,8 @@ object TextExtraction {
extractText[F](ctx, extr, lang)(id) extractText[F](ctx, extr, lang)(id)
.flatMap({ .flatMap({
case ExtractResult.Success(txt) => case res @ ExtractResult.Success(_, _) =>
txt.some.pure[F] res.some.pure[F]
case ExtractResult.UnsupportedFormat(mt) => case ExtractResult.UnsupportedFormat(mt) =>
ctx.logger ctx.logger

View File

@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
def isIn[A: Put](values: NonEmptyList[A]): Fragment = def isIn[A: Put](values: NonEmptyList[A]): Fragment =
isIn(values.map(a => sql"$a").toList) isIn(values.map(a => sql"$a").toList)
/** Creates `lower(column) IN (v1, v2, ...)`.
  *
  * Only the column is lower-cased here. The values are inserted as given,
  * so callers must pass values that are already lower case.
  */
def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
  val placeholders = commas(values.map(a => sql"$a").toList)
  fr"lower(" ++ f ++ fr") IN (" ++ placeholders ++ fr")"
}
def isIn(frag: Fragment): Fragment = def isIn(frag: Fragment): Fragment =
f ++ fr"IN (" ++ frag ++ fr")" f ++ fr"IN (" ++ frag ++ fr")"

View File

@ -314,6 +314,9 @@ object RItem {
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] = def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
/** Selects only the id of the item with the given id and collective.
  * Yields `None` if no such item exists.
  */
def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
  val itemOfCollective = and(id.is(itemId), cid.is(coll))
  selectSimple(Seq(id), table, itemOfCollective).query[Ident].option
}
def removeFolder(folderId: Ident): ConnectionIO[Int] = { def removeFolder(folderId: Ident): ConnectionIO[Int] = {
val empty: Option[Ident] = None val empty: Option[Ident] = None
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run

View File

@ -1,5 +1,8 @@
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._
import docspell.common._ import docspell.common._
import docspell.store.impl.Implicits._ import docspell.store.impl.Implicits._
import docspell.store.impl._ import docspell.store.impl._
@ -101,6 +104,21 @@ object RTag {
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector] ) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
} }
/** Finds all tags of a collective whose id or name matches one of the
  * given strings.
  *
  * Every string that parses as an `Ident` is tried as a tag id. All
  * strings are additionally lower-cased and compared against the
  * lower-cased tag name. An empty input yields an empty result without
  * running a query.
  */
def findAllByNameOrId(
    nameOrIds: List[String],
    coll: Ident
): ConnectionIO[Vector[RTag]] = {
  val byId =
    NonEmptyList
      .fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption))
      .map(ids => tid.isIn(ids))
  val byName =
    NonEmptyList
      .fromList(nameOrIds.map(_.toLowerCase))
      .map(names => name.isLowerIn(names))

  val conditions = byId.toSeq ++ byName.toSeq
  if (conditions.nonEmpty)
    selectSimple(all, table, and(cid.is(coll), or(conditions))).query[RTag].to[Vector]
  else Vector.empty.pure[ConnectionIO]
}
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] = def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
} }

View File

@ -1,5 +1,6 @@
package docspell.store.records package docspell.store.records
import cats.data.NonEmptyList
import cats.implicits._ import cats.implicits._
import docspell.common._ import docspell.common._
@ -43,4 +44,28 @@ object RTagItem {
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] = def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector] selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
/** Finds the existing links between the item and any of the given tags.
  * An empty tag list yields an empty result without running a query.
  */
def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
  NonEmptyList
    .fromList(tags.toList)
    .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { ids =>
      selectSimple(all, table, and(itemId.is(item), tagId.isIn(ids)))
        .query[RTagItem]
        .to[Vector]
    }
/** Inserts one link per given tag for the item, each with a fresh random
  * id, and yields the update count of the insert. No check for existing
  * links is made here. An empty tag list yields 0 without running a
  * statement.
  */
def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
  tags.toList match {
    case Nil =>
      0.pure[ConnectionIO]
    case ids =>
      ids
        .traverse(tag =>
          Ident.randomId[ConnectionIO].map(linkId => RTagItem(linkId, item, tag))
        )
        .flatMap { links =>
          val rows = links.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}")
          insertRows(table, all, rows).update.run
        }
  }
} }