mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-04 06:05:59 +00:00
Add pdf meta data to extracted text to add it to full-text index
This commit is contained in:
parent
209c068436
commit
cec4948710
@ -28,6 +28,12 @@ object ExtractResult {
|
|||||||
|
|
||||||
/** Result of a successful extraction: the extracted text together with the
  * optional PDF metadata.
  */
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
  val textOption = Some(text)

  /** Returns a copy whose text has the metadata lines (as produced by
    * `PdfMetaData.asText`) appended, separated from the extracted text by an
    * empty line. When there is no metadata, or it renders to nothing, this
    * instance is returned unchanged.
    */
  def appendPdfMetaToText: Success =
    pdfMeta
      .flatMap(_.asText)
      .map(metaText => copy(text = text + "\n\n" + metaText))
      .getOrElse(this)
}
|
||||||
/** Creates a [[Success]] from the given text and optional PDF metadata, typed
  * as the general `ExtractResult`.
  */
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
  Success(text = text, pdfMeta = pdfMeta)
|
||||||
|
@ -24,6 +24,15 @@ final case class PdfMetaData(
|
|||||||
|
|
||||||
/** Splits the `keywords` value on commas and semicolons into single keywords.
  *
  * Each entry is trimmed and blank entries are dropped, so values like `""`,
  * `"a,,b"` or `" a; b "` never produce empty or whitespace-padded keywords.
  * Returns `Nil` when no keywords are set.
  */
def keywordList: List[String] =
  keywords
    .map(kws => kws.split("[,;]\\s*").toList.map(_.trim).filter(_.nonEmpty))
    .getOrElse(Nil)
|
||||||
|
|
||||||
|
/** Renders title, author, subject and creation date (converted with
  * `toUtcDate`), in that order, one value per line. Keywords are not included;
  * they are handled separately. Yields `None` when none of these values is
  * present.
  */
def asText: Option[String] = {
  val created = creationDate.toList.map(_.toUtcDate.toString)
  val lines   = title.toList ++ author.toList ++ subject.toList ++ created
  if (lines.isEmpty) None else Some(lines.mkString("\n"))
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object PdfMetaData {
|
object PdfMetaData {
|
||||||
|
@ -97,7 +97,10 @@ object TextExtraction {
|
|||||||
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||||
meta = item.changeMeta(
|
meta = item.changeMeta(
|
||||||
ra.id,
|
ra.id,
|
||||||
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
|
rm =>
|
||||||
|
rm.setContentIfEmpty(
|
||||||
|
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
||||||
est <- dst
|
est <- dst
|
||||||
|
Loading…
x
Reference in New Issue
Block a user