mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
commit
185a103942
@ -6,6 +6,7 @@ import cats.implicits._
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.ftsclient.FtsClient
|
import docspell.ftsclient.FtsClient
|
||||||
|
import docspell.store.UpdateResult
|
||||||
import docspell.store.queries.{QAttachment, QItem}
|
import docspell.store.queries.{QAttachment, QItem}
|
||||||
import docspell.store.records._
|
import docspell.store.records._
|
||||||
import docspell.store.{AddResult, Store}
|
import docspell.store.{AddResult, Store}
|
||||||
@ -22,6 +23,9 @@ trait OItem[F[_]] {
|
|||||||
/** Create a new tag and add it to the item. */
|
/** Create a new tag and add it to the item. */
|
||||||
def addNewTag(item: Ident, tag: RTag): F[AddResult]
|
def addNewTag(item: Ident, tag: RTag): F[AddResult]
|
||||||
|
|
||||||
|
/** Apply all tags to the given item. Tags must exist, but can be IDs or names. */
|
||||||
|
def linkTags(item: Ident, tags: List[String], collective: Ident): F[UpdateResult]
|
||||||
|
|
||||||
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
|
def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult]
|
||||||
|
|
||||||
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
|
def setFolder(item: Ident, folder: Option[Ident], collective: Ident): F[AddResult]
|
||||||
@ -90,6 +94,27 @@ object OItem {
|
|||||||
.attempt
|
.attempt
|
||||||
.map(AddResult.fromUpdate)
|
.map(AddResult.fromUpdate)
|
||||||
|
|
||||||
|
/** Links the given tags to an item of the collective.
  *
  * Duplicates in `tags` are ignored. The entries are resolved through
  * `RTag.findAllByNameOrId`; only resolved tags that are not yet linked to the
  * item are inserted. All database work runs in a single `store.transact`.
  *
  * @return `UpdateResult.success` when `tags` is empty or the links were
  *         written, `UpdateResult.notFound` when the item cannot be found
  *         in the collective
  */
def linkTags(
    item: Ident,
    tags: List[String],
    collective: Ident
): F[UpdateResult] = {
  val wanted = tags.distinct
  if (wanted.isEmpty) UpdateResult.success.pure[F]
  else {
    val linkMissing =
      for {
        _        <- OptionT(RItem.checkByIdAndCollective(item, collective))
        resolved <- OptionT.liftF(RTag.findAllByNameOrId(wanted, collective))
        linked   <- OptionT.liftF(RTagItem.findAllIn(item, resolved.map(_.tagId)))
        _ <- OptionT.liftF(
          // insert only those links that do not exist yet
          RTagItem.setAllTags(item, resolved.map(_.tagId).diff(linked.map(_.tagId)))
        )
      } yield UpdateResult.success

    store.transact(linkMissing.getOrElse(UpdateResult.notFound))
  }
}
|
||||||
|
|
||||||
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
|
def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = {
|
||||||
val db = for {
|
val db = for {
|
||||||
cid <- RItem.getCollective(item)
|
cid <- RItem.getCollective(item)
|
||||||
|
@ -1,39 +1,47 @@
|
|||||||
package docspell.extract
|
package docspell.extract
|
||||||
|
|
||||||
import scala.util.Try
|
|
||||||
|
|
||||||
import docspell.common.MimeType
|
import docspell.common.MimeType
|
||||||
|
import docspell.extract.pdfbox.PdfMetaData
|
||||||
|
|
||||||
sealed trait ExtractResult {
|
sealed trait ExtractResult {
|
||||||
|
|
||||||
def textOption: Option[String]
|
def textOption: Option[String]
|
||||||
|
|
||||||
|
def pdfMeta: Option[PdfMetaData]
|
||||||
}
|
}
|
||||||
|
|
||||||
object ExtractResult {
|
object ExtractResult {
|
||||||
|
|
||||||
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
|
val pdfMeta = None
|
||||||
}
|
}
|
||||||
def unsupportedFormat(mt: MimeType): ExtractResult =
|
def unsupportedFormat(mt: MimeType): ExtractResult =
|
||||||
UnsupportedFormat(mt)
|
UnsupportedFormat(mt)
|
||||||
|
|
||||||
case class Failure(ex: Throwable) extends ExtractResult {
|
case class Failure(ex: Throwable) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
|
val pdfMeta = None
|
||||||
}
|
}
|
||||||
def failure(ex: Throwable): ExtractResult =
|
def failure(ex: Throwable): ExtractResult =
|
||||||
Failure(ex)
|
Failure(ex)
|
||||||
|
|
||||||
case class Success(text: String) extends ExtractResult {
|
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
|
||||||
val textOption = Some(text)
|
val textOption = Some(text)
|
||||||
|
def appendPdfMetaToText: Success =
|
||||||
|
pdfMeta.flatMap(_.asText) match {
|
||||||
|
case Some(m) =>
|
||||||
|
copy(text = text + "\n\n" + m)
|
||||||
|
case None => this
|
||||||
|
}
|
||||||
}
|
}
|
||||||
def success(text: String): ExtractResult =
|
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
|
||||||
Success(text)
|
Success(text, pdfMeta)
|
||||||
|
|
||||||
def fromTry(r: Try[String]): ExtractResult =
|
|
||||||
r.fold(Failure.apply, Success.apply)
|
|
||||||
|
|
||||||
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
||||||
e.fold(failure, success)
|
e.fold(failure, str => success(str, None))
|
||||||
|
|
||||||
|
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
|
||||||
|
e.fold(failure, r => success(r.txt.value, r.meta))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -40,8 +40,7 @@ object Extraction {
|
|||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
PdfExtract
|
PdfExtract
|
||||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||||
.map(_.map(_.value))
|
.map(ExtractResult.fromEitherResult)
|
||||||
.map(ExtractResult.fromEither)
|
|
||||||
|
|
||||||
case PoiType(mt) =>
|
case PoiType(mt) =>
|
||||||
PoiExtract
|
PoiExtract
|
||||||
@ -103,7 +102,7 @@ object Extraction {
|
|||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||||
ExtractResult.success(Text(txt).value)
|
ExtractResult.success(Text(txt).value, None)
|
||||||
}
|
}
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
|
@ -7,9 +7,15 @@ import fs2.Stream
|
|||||||
import docspell.common.{Language, Logger}
|
import docspell.common.{Language, Logger}
|
||||||
import docspell.extract.internal.Text
|
import docspell.extract.internal.Text
|
||||||
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||||
|
import docspell.extract.pdfbox.PdfMetaData
|
||||||
import docspell.extract.pdfbox.PdfboxExtract
|
import docspell.extract.pdfbox.PdfboxExtract
|
||||||
|
|
||||||
object PdfExtract {
|
object PdfExtract {
|
||||||
|
/** Text obtained from a PDF together with its optional meta data. */
final case class Result(txt: Text, meta: Option[PdfMetaData])

object Result {

  /** Builds a [[Result]] from a `(text, meta data)` pair. */
  def apply(t: (Text, Option[PdfMetaData])): Result = {
    val (text, metaData) = t
    Result(text, metaData)
  }
}
|
||||||
|
|
||||||
def get[F[_]: Sync: ContextShift](
|
def get[F[_]: Sync: ContextShift](
|
||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
@ -18,39 +24,39 @@ object PdfExtract {
|
|||||||
stripMinLen: Int,
|
stripMinLen: Int,
|
||||||
ocrCfg: OcrConfig,
|
ocrCfg: OcrConfig,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): F[Either[Throwable, Text]] = {
|
): F[Either[Throwable, Result]] = {
|
||||||
|
|
||||||
val runOcr =
|
val runOcr =
|
||||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||||
|
|
||||||
def chooseResult(ocrStr: Text, strippedStr: Text) =
|
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
|
||||||
if (ocrStr.length > strippedStr.length)
|
if (ocrStr.length > strippedRes._1.length)
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
|
||||||
) *> ocrStr.pure[F]
|
) *> Result(ocrStr, strippedRes._2).pure[F]
|
||||||
else
|
else
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
|
s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
|
||||||
) *> strippedStr.pure[F]
|
) *> Result(strippedRes).pure[F]
|
||||||
|
|
||||||
//maybe better: inspect the pdf and decide whether ocr or not
|
//maybe better: inspect the pdf and decide whether ocr or not
|
||||||
for {
|
for {
|
||||||
pdfboxRes <-
|
pdfboxRes <-
|
||||||
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
||||||
.get[F](in)
|
PdfboxExtract.getTextAndMetaData[F](in)
|
||||||
res <- pdfboxRes.fold(
|
res <- pdfboxRes.fold(
|
||||||
ex =>
|
ex =>
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||||
) >> runOcr.attempt,
|
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
||||||
str =>
|
pair =>
|
||||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
|
||||||
else
|
else
|
||||||
logger
|
logger
|
||||||
.info(
|
.info(
|
||||||
s"Stripped text from PDF is small (${str.length}). Trying with OCR."
|
s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
|
||||||
) *>
|
) *>
|
||||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
|
||||||
)
|
)
|
||||||
} yield res
|
} yield res
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,40 @@
|
|||||||
|
package docspell.extract.pdfbox
|
||||||
|
|
||||||
|
import docspell.common.Timestamp
|
||||||
|
|
||||||
|
/** Document information of a PDF file.
  *
  * All entries are optional; `None` means the entry is not available.
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** True if not a single entry is set. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** True if at least one entry is set. */
  def nonEmpty: Boolean =
    !isEmpty

  /** Splits the raw `keywords` string at commas and semicolons.
    *
    * Every entry is trimmed and blank entries are dropped, so input such as
    * `" a , b;;c "` results in `List("a", "b", "c")`. Without this, padded
    * or empty strings would be handed on as keywords.
    */
  def keywordList: List[String] =
    keywords.toList
      .flatMap(_.split("[,;]").toList)
      .map(_.trim)
      .filter(_.nonEmpty)

  /** Returns title, author, subject and creation date (as UTC date), one per
    * line and in this order. Keywords and creator are not included; keywords
    * are handled separately. Returns `None` if none of these entries is set.
    */
  def asText: Option[String] = {
    val lines =
      title.toList ++ author.toList ++ subject.toList ++
        creationDate.toList.map(_.toUtcDate.toString)

    if (lines.isEmpty) None
    else Some(lines.mkString("\n"))
  }
}

object PdfMetaData {

  /** Meta data without any entry set. */
  val empty: PdfMetaData = PdfMetaData(None, None, None, None, None, None)
}
|
@ -9,6 +9,7 @@ import cats.effect.Sync
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
|
import docspell.common.Timestamp
|
||||||
import docspell.extract.internal.Text
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument
|
import org.apache.pdfbox.pdmodel.PDDocument
|
||||||
@ -16,15 +17,29 @@ import org.apache.pdfbox.text.PDFTextStripper
|
|||||||
|
|
||||||
object PdfboxExtract {
|
object PdfboxExtract {
|
||||||
|
|
||||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
/** Collects the stream into a byte array, loads it as a PDF document and
  * reads both its text and its meta data from the same document instance.
  *
  * The meta data is `None` when no entry is set. Errors while loading or
  * reading are returned as `Left`.
  */
def getTextAndMetaData[F[_]: Sync](
    data: Stream[F, Byte]
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
  data.compile
    .to(Array)
    .map { bytes =>
      Using(PDDocument.load(bytes)) { doc =>
        readText(doc).flatMap { txt =>
          readMetaData(doc).map(md => (txt, Some(md).filter(_.nonEmpty)))
        }
      }.toEither.flatten
    }
|
||||||
|
|
||||||
|
def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile
|
data.compile
|
||||||
.to(Array)
|
.to(Array)
|
||||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||||
|
|
||||||
def get(is: InputStream): Either[Throwable, Text] =
|
def getText(is: InputStream): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||||
|
|
||||||
def get(inFile: Path): Either[Throwable, Text] =
|
def getText(inFile: Path): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
|
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
|
||||||
|
|
||||||
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||||
@ -34,4 +49,31 @@ object PdfboxExtract {
|
|||||||
stripper.setLineSeparator("\n")
|
stripper.setLineSeparator("\n")
|
||||||
Text(Option(stripper.getText(doc)))
|
Text(Option(stripper.getText(doc)))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
|
/** Collects the stream into a byte array, loads it as a PDF document and
  * reads its meta data. Errors are returned as `Left`.
  */
def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] =
  data.compile
    .to(Array)
    .map { bytes =>
      val loaded = Using(PDDocument.load(bytes))(readMetaData)
      loaded.toEither.flatten
    }
|
||||||
|
|
||||||
|
/** Loads a PDF document from the given stream and reads its meta data.
  *
  * NOTE(review): `Using` manages the loaded `PDDocument` only; whether the
  * input stream itself gets closed is up to PDFBox -- confirm.
  */
def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = {
  val loaded = Using(PDDocument.load(is))(readMetaData)
  loaded.toEither.flatten
}
|
||||||
|
|
||||||
|
/** Loads a PDF document from the given file and reads its meta data.
  * Errors while loading or reading are returned as `Left`.
  */
def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = {
  val loaded = Using(PDDocument.load(inFile.toFile))(readMetaData)
  loaded.toEither.flatten
}
|
||||||
|
|
||||||
|
/** Reads the document information of an already loaded PDF.
  *
  * String entries are trimmed; `null` and blank values become `None`.
  * Any exception thrown while reading is returned as `Left`.
  */
private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = {
  // null-safe, trimmed and non-empty
  def clean(raw: String): Option[String] =
    Option(raw).map(_.trim).filter(_.nonEmpty)

  Try {
    val info = doc.getDocumentInformation
    PdfMetaData(
      title = clean(info.getTitle),
      author = clean(info.getAuthor),
      subject = clean(info.getSubject),
      keywords = clean(info.getKeywords),
      creator = clean(info.getCreator),
      creationDate = Option(info.getCreationDate).map(cal => Timestamp(cal.toInstant))
    )
  }.toEither
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,22 @@
|
|||||||
|
package docspell.extract.pdfbox
|
||||||
|
|
||||||
|
import minitest.SimpleTestSuite
|
||||||
|
|
||||||
|
/** Checks how `PdfMetaData.keywordList` splits the raw keywords string. */
object PdfMetaDataTest extends SimpleTestSuite {

  /** Keyword list of otherwise empty meta data carrying the given raw string. */
  private def keywordsOf(raw: String): List[String] =
    PdfMetaData.empty.copy(keywords = Some(raw)).keywordList

  test("split keywords on comma") {
    assertEquals(keywordsOf("a,b, c"), List("a", "b", "c"))
  }

  test("split keywords on semicolon") {
    assertEquals(keywordsOf("a; b;c"), List("a", "b", "c"))
  }

  test("split keywords on comma and semicolon") {
    assertEquals(keywordsOf("a, b; c"), List("a", "b", "c"))
  }

}
|
@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
textPDFs.foreach {
|
textPDFs.foreach {
|
||||||
case (file, txt) =>
|
case (file, txt) =>
|
||||||
val url = file.toJavaUrl.fold(sys.error, identity)
|
val url = file.toJavaUrl.fold(sys.error, identity)
|
||||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
||||||
val received = removeFormatting(str.value)
|
val received = removeFormatting(str.value)
|
||||||
val expect = removeFormatting(txt)
|
val expect = removeFormatting(txt)
|
||||||
assertEquals(received, expect)
|
assertEquals(received, expect)
|
||||||
@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
textPDFs.foreach {
|
textPDFs.foreach {
|
||||||
case (file, txt) =>
|
case (file, txt) =>
|
||||||
val data = file.readURL[IO](8192, blocker)
|
val data = file.readURL[IO](8192, blocker)
|
||||||
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
|
val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity)
|
||||||
val received = removeFormatting(str.value)
|
val received = removeFormatting(str.value)
|
||||||
val expect = removeFormatting(txt)
|
val expect = removeFormatting(txt)
|
||||||
assertEquals(received, expect)
|
assertEquals(received, expect)
|
||||||
@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
test("extract text from image PDFs") {
|
test("extract text from image PDFs") {
|
||||||
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
|
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
|
||||||
|
|
||||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
||||||
|
|
||||||
assertEquals(str.value, "")
|
assertEquals(str.value, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads text and meta data of the `keywords_pdf` example file and compares
// every meta data entry with the expected value.
test("extract metadata from pdf") {
  val pdfUrl = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)

  val text = PdfboxExtract.getText(pdfUrl.openStream()).fold(throw _, identity)
  assert(text.value.startsWith("Keywords in PDF"))

  // a fresh stream is opened for the second read
  val meta = PdfboxExtract.getMetaData(pdfUrl.openStream()).fold(throw _, identity)
  assertEquals(meta.author, Some("E.K."))
  assertEquals(meta.title, Some("Keywords in PDF"))
  assertEquals(meta.subject, Some("This is a subject"))
  assertEquals(meta.keywordList, List("Test", "Keywords in PDF", "Todo"))
  assertEquals(meta.creator, Some("Emacs 26.3 (Org mode 9.3)"))
  assert(meta.creationDate.isDefined)
}
|
||||||
|
|
||||||
private def removeFormatting(str: String): String =
|
private def removeFormatting(str: String): String =
|
||||||
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
|
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
|
||||||
}
|
}
|
||||||
|
BIN
modules/files/src/test/resources/keywords.pdf
Normal file
BIN
modules/files/src/test/resources/keywords.pdf
Normal file
Binary file not shown.
@ -107,7 +107,8 @@ object CreateItem {
|
|||||||
Vector.empty,
|
Vector.empty,
|
||||||
Vector.empty,
|
Vector.empty,
|
||||||
fm.map(a => a.id -> a.fileId).toMap,
|
fm.map(a => a.id -> a.fileId).toMap,
|
||||||
MetaProposalList.empty
|
MetaProposalList.empty,
|
||||||
|
Nil
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,7 +149,15 @@ object CreateItem {
|
|||||||
.map(originFileTuple)
|
.map(originFileTuple)
|
||||||
.toMap
|
.toMap
|
||||||
} yield cand.headOption.map(ri =>
|
} yield cand.headOption.map(ri =>
|
||||||
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
|
ItemData(
|
||||||
|
ri,
|
||||||
|
rms,
|
||||||
|
Vector.empty,
|
||||||
|
Vector.empty,
|
||||||
|
origMap,
|
||||||
|
MetaProposalList.empty,
|
||||||
|
Nil
|
||||||
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,8 @@ case class ItemData(
|
|||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
dateLabels: Vector[AttachmentDates],
|
dateLabels: Vector[AttachmentDates],
|
||||||
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||||
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
|
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
||||||
|
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||||
|
@ -17,19 +17,41 @@ object SetGivenData {
|
|||||||
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
|
.log[F, ProcessItemArgs](_.debug(s"Not setting data on existing item"))
|
||||||
.map(_ => data)
|
.map(_ => data)
|
||||||
else
|
else
|
||||||
Task { ctx =>
|
setFolder(data, ops).flatMap(d => setTags[F](d, ops))
|
||||||
val itemId = data.item.id
|
|
||||||
val folderId = ctx.args.meta.folderId
|
|
||||||
val collective = ctx.args.meta.collective
|
|
||||||
for {
|
|
||||||
_ <- ctx.logger.info("Starting setting given data")
|
|
||||||
_ <- ctx.logger.debug(s"Set item folder: '${folderId.map(_.id)}'")
|
|
||||||
e <- ops.setFolder(itemId, folderId, collective).attempt
|
|
||||||
_ <- e.fold(
|
|
||||||
ex => ctx.logger.warn(s"Error setting folder: ${ex.getMessage}"),
|
|
||||||
_ => ().pure[F]
|
|
||||||
)
|
|
||||||
} yield data
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/** Sets the folder from the job arguments on the item.
  *
  * This is best effort: an error raised by `ops.setFolder` is logged as a
  * warning and does not fail the task. The item data is returned unchanged.
  */
private def setFolder[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val log    = ctx.logger
    val meta   = ctx.args.meta
    val folder = meta.folderId

    log
      .info("Starting setting given data")
      .flatMap(_ => log.debug(s"Set item folder: '${folder.map(_.id)}'"))
      .flatMap(_ => ops.setFolder(data.item.id, folder, meta.collective).attempt)
      .flatMap {
        case Left(ex) => log.warn(s"Error setting folder: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
      .map(_ => data)
  }
|
||||||
|
|
||||||
|
/** Links the tags collected in `data.tags` to the item.
  *
  * This is best effort: an error raised by `ops.linkTags` is logged as a
  * warning and does not fail the task. The value returned by `linkTags` is
  * not inspected. The item data is returned unchanged.
  */
private def setTags[F[_]: Sync](
    data: ItemData,
    ops: OItem[F]
): Task[F, ProcessItemArgs, ItemData] =
  Task { ctx =>
    val log = ctx.logger

    log
      .info(s"Set tags from given data: ${data.tags}")
      .flatMap(_ =>
        ops.linkTags(data.item.id, data.tags, ctx.args.meta.collective).attempt
      )
      .flatMap {
        case Left(ex) => log.warn(s"Error setting tags: ${ex.getMessage}")
        case Right(_) => ().pure[F]
      }
      .map(_ => data)
  }
|
||||||
}
|
}
|
||||||
|
@ -32,46 +32,52 @@ object TextExtraction {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
_ <- ctx.logger.debug("Storing extracted texts")
|
_ <- ctx.logger.debug("Storing extracted texts")
|
||||||
_ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm._1)))
|
_ <-
|
||||||
|
txt.toList.traverse(res => ctx.store.transact(RAttachmentMeta.upsert(res.am)))
|
||||||
idxItem = TextData.item(
|
idxItem = TextData.item(
|
||||||
item.item.id,
|
item.item.id,
|
||||||
ctx.args.meta.collective,
|
ctx.args.meta.collective,
|
||||||
None, //folder
|
ctx.args.meta.folderId,
|
||||||
item.item.name.some,
|
item.item.name.some,
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_._2)).toSeq: _*)
|
_ <- fts.indexData(ctx.logger, (idxItem +: txt.map(_.td)).toSeq: _*)
|
||||||
dur <- start
|
dur <- start
|
||||||
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
_ <- ctx.logger.info(s"Text extraction finished in ${dur.formatExact}")
|
||||||
} yield item.copy(metas = txt.map(_._1))
|
} yield item.copy(metas = txt.map(_.am), tags = txt.flatMap(_.tags).distinct.toList)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- helpers
|
||||||
|
|
||||||
|
/** Per-attachment outcome of text extraction: the attachment meta record
  * (`am`), the text data for the full-text index (`td`) and a list of tag
  * names or ids (`tags`, empty by default).
  */
case class Result(am: RAttachmentMeta, td: TextData, tags: List[String] = Nil)
|
||||||
|
|
||||||
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
def extractTextIfEmpty[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, ProcessItemArgs],
|
||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
collective: Ident,
|
collective: Ident,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[(RAttachmentMeta, TextData)] = {
|
)(ra: RAttachment): F[Result] = {
|
||||||
def makeTextData(rm: RAttachmentMeta): (RAttachmentMeta, TextData) =
|
def makeTextData(pair: (RAttachmentMeta, List[String])): Result =
|
||||||
(
|
Result(
|
||||||
rm,
|
pair._1,
|
||||||
TextData.attachment(
|
TextData.attachment(
|
||||||
item.item.id,
|
item.item.id,
|
||||||
ra.id,
|
ra.id,
|
||||||
collective,
|
collective,
|
||||||
None, //folder
|
ctx.args.meta.folderId,
|
||||||
lang,
|
lang,
|
||||||
ra.name,
|
ra.name,
|
||||||
rm.content
|
pair._1.content
|
||||||
)
|
),
|
||||||
|
pair._2
|
||||||
)
|
)
|
||||||
|
|
||||||
val rm = item.findOrCreate(ra.id)
|
val rm = item.findOrCreate(ra.id)
|
||||||
rm.content match {
|
rm.content match {
|
||||||
case Some(_) =>
|
case Some(_) =>
|
||||||
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
|
||||||
makeTextData(rm).pure[F]
|
makeTextData((rm, Nil)).pure[F]
|
||||||
case None =>
|
case None =>
|
||||||
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
extractTextToMeta[F](ctx, cfg, lang, item)(ra)
|
||||||
.map(makeTextData)
|
.map(makeTextData)
|
||||||
@ -83,21 +89,25 @@ object TextExtraction {
|
|||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
lang: Language,
|
lang: Language,
|
||||||
item: ItemData
|
item: ItemData
|
||||||
)(ra: RAttachment): F[RAttachmentMeta] =
|
)(ra: RAttachment): F[(RAttachmentMeta, List[String])] =
|
||||||
for {
|
for {
|
||||||
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
_ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}")
|
||||||
dst <- Duration.stopTime[F]
|
dst <- Duration.stopTime[F]
|
||||||
fids <- filesToExtract(ctx)(item, ra)
|
fids <- filesToExtract(ctx)(item, ra)
|
||||||
txt <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||||
meta = item.changeMeta(
|
meta = item.changeMeta(
|
||||||
ra.id,
|
ra.id,
|
||||||
rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))
|
rm =>
|
||||||
|
rm.setContentIfEmpty(
|
||||||
|
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
||||||
est <- dst
|
est <- dst
|
||||||
_ <- ctx.logger.info(
|
_ <- ctx.logger.info(
|
||||||
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}"
|
||||||
)
|
)
|
||||||
} yield meta
|
} yield (meta, tags)
|
||||||
|
|
||||||
def extractText[F[_]: Sync: ContextShift](
|
def extractText[F[_]: Sync: ContextShift](
|
||||||
ctx: Context[F, _],
|
ctx: Context[F, _],
|
||||||
@ -123,7 +133,7 @@ object TextExtraction {
|
|||||||
cfg: ExtractConfig,
|
cfg: ExtractConfig,
|
||||||
ra: RAttachment,
|
ra: RAttachment,
|
||||||
lang: Language
|
lang: Language
|
||||||
)(fileIds: List[Ident]): F[Option[String]] =
|
)(fileIds: List[Ident]): F[Option[ExtractResult.Success]] =
|
||||||
fileIds match {
|
fileIds match {
|
||||||
case Nil =>
|
case Nil =>
|
||||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||||
@ -133,8 +143,8 @@ object TextExtraction {
|
|||||||
|
|
||||||
extractText[F](ctx, extr, lang)(id)
|
extractText[F](ctx, extr, lang)(id)
|
||||||
.flatMap({
|
.flatMap({
|
||||||
case ExtractResult.Success(txt) =>
|
case res @ ExtractResult.Success(_, _) =>
|
||||||
txt.some.pure[F]
|
res.some.pure[F]
|
||||||
|
|
||||||
case ExtractResult.UnsupportedFormat(mt) =>
|
case ExtractResult.UnsupportedFormat(mt) =>
|
||||||
ctx.logger
|
ctx.logger
|
||||||
|
@ -53,6 +53,9 @@ case class Column(name: String, ns: String = "", alias: String = "") {
|
|||||||
def isIn[A: Put](values: NonEmptyList[A]): Fragment =
|
def isIn[A: Put](values: NonEmptyList[A]): Fragment =
|
||||||
isIn(values.map(a => sql"$a").toList)
|
isIn(values.map(a => sql"$a").toList)
|
||||||
|
|
||||||
|
/** Creates the condition `lower(<column>) IN (v1, v2, ...)`.
  *
  * Only the column side is wrapped in `lower`; the values are passed as
  * given, so callers should supply them in lower case.
  */
def isLowerIn[A: Put](values: NonEmptyList[A]): Fragment = {
  val params = values.map(a => sql"$a").toList
  fr"lower(" ++ f ++ fr") IN (" ++ commas(params) ++ fr")"
}
|
||||||
|
|
||||||
def isIn(frag: Fragment): Fragment =
|
def isIn(frag: Fragment): Fragment =
|
||||||
f ++ fr"IN (" ++ frag ++ fr")"
|
f ++ fr"IN (" ++ frag ++ fr")"
|
||||||
|
|
||||||
|
@ -314,6 +314,9 @@ object RItem {
|
|||||||
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
|
def findByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[RItem]] =
|
||||||
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
|
selectSimple(all, table, and(id.is(itemId), cid.is(coll))).query[RItem].option
|
||||||
|
|
||||||
|
/** Selects only the id of the item with the given id and collective.
  * Yields `None` if there is no such row.
  */
def checkByIdAndCollective(itemId: Ident, coll: Ident): ConnectionIO[Option[Ident]] = {
  val matchesItem = and(id.is(itemId), cid.is(coll))
  selectSimple(Seq(id), table, matchesItem).query[Ident].option
}
|
||||||
|
|
||||||
def removeFolder(folderId: Ident): ConnectionIO[Int] = {
|
def removeFolder(folderId: Ident): ConnectionIO[Int] = {
|
||||||
val empty: Option[Ident] = None
|
val empty: Option[Ident] = None
|
||||||
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
|
updateRow(table, folder.is(folderId), folder.setTo(empty)).update.run
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.store.impl.Implicits._
|
import docspell.store.impl.Implicits._
|
||||||
import docspell.store.impl._
|
import docspell.store.impl._
|
||||||
@ -101,6 +104,21 @@ object RTag {
|
|||||||
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
|
) ++ orderBy(name.prefix("t").asc)).query[RTag].to[Vector]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Finds all tags of a collective whose id or lower-cased name matches one
  * of the given strings.
  *
  * Every string that parses as an `Ident` is used for the id condition; every
  * string (lower-cased) is used for the name condition. The two conditions
  * are combined with `or`. An empty input yields an empty result without
  * running a query.
  */
def findAllByNameOrId(
    nameOrIds: List[String],
    coll: Ident
): ConnectionIO[Vector[RTag]] = {
  val idCond =
    NonEmptyList
      .fromList(nameOrIds.flatMap(s => Ident.fromString(s).toOption))
      .map(ids => tid.isIn(ids))
  val nameCond =
    NonEmptyList
      .fromList(nameOrIds.map(_.toLowerCase))
      .map(ns => name.isLowerIn(ns))

  val cond = idCond.toSeq ++ nameCond.toSeq

  if (cond.nonEmpty)
    selectSimple(all, table, and(cid.is(coll), or(cond))).query[RTag].to[Vector]
  else
    Vector.empty[RTag].pure[ConnectionIO]
}
|
||||||
|
|
||||||
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
def delete(tagId: Ident, coll: Ident): ConnectionIO[Int] =
|
||||||
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
|
deleteFrom(table, and(tid.is(tagId), cid.is(coll))).update.run
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package docspell.store.records
|
package docspell.store.records
|
||||||
|
|
||||||
|
import cats.data.NonEmptyList
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
@ -43,4 +44,28 @@ object RTagItem {
|
|||||||
|
|
||||||
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
|
def findByItem(item: Ident): ConnectionIO[Vector[RTagItem]] =
|
||||||
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
|
selectSimple(all, table, itemId.is(item)).query[RTagItem].to[Vector]
|
||||||
|
|
||||||
|
/** Finds the existing links between the item and any of the given tags.
  * An empty tag list yields an empty result without running a query.
  */
def findAllIn(item: Ident, tags: Seq[Ident]): ConnectionIO[Vector[RTagItem]] =
  NonEmptyList
    .fromList(tags.toList)
    .fold(Vector.empty[RTagItem].pure[ConnectionIO]) { tagIds =>
      selectSimple(all, table, and(itemId.is(item), tagId.isIn(tagIds)))
        .query[RTagItem]
        .to[Vector]
    }
|
||||||
|
|
||||||
|
/** Inserts one link row per given tag for the item and returns the number of
  * inserted rows; an empty tag list yields `0` without touching the database.
  *
  * Each row gets a fresh random id. Existing links are neither checked nor
  * removed here.
  */
def setAllTags(item: Ident, tags: Seq[Ident]): ConnectionIO[Int] =
  tags.toList match {
    case Nil =>
      0.pure[ConnectionIO]

    case tagIds =>
      tagIds
        .traverse(tag => Ident.randomId[ConnectionIO].map(id => RTagItem(id, item, tag)))
        .flatMap { entities =>
          insertRows(
            table,
            all,
            entities.map(v => fr"${v.tagItemId},${v.itemId},${v.tagId}")
          ).update.run
        }
  }
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user