mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 13:32:51 +00:00
Add pdf meta data to extracted text to add it to full-text index
This commit is contained in:
parent
209c068436
commit
cec4948710
@ -28,6 +28,12 @@ object ExtractResult {
|
||||
|
||||
/** Result variant carrying the extracted `text` and optional PDF metadata. */
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
  val textOption = Some(text)

  /** Returns a copy whose text has the metadata's textual form (`asText`) appended
    * after a blank line. When `pdfMeta` is absent, or its `asText` yields nothing,
    * this instance is returned unchanged.
    */
  def appendPdfMetaToText: Success =
    pdfMeta
      .flatMap(_.asText)
      .fold[Success](this)(metaText => copy(text = s"$text\n\n$metaText"))
}
|
||||
/** Builds a [[Success]] from the given text and optional PDF metadata, typed as the
  * general [[ExtractResult]].
  */
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
  Success(text = text, pdfMeta = pdfMeta)
|
||||
|
@ -24,6 +24,15 @@ final case class PdfMetaData(
|
||||
|
||||
/** Splits the `keywords` value on `,` or `;` into individual keywords.
  *
  * Each entry is trimmed and blank entries are dropped, so an empty keyword string,
  * doubled separators (`"a,,b"`) or whitespace around a separator (`"a , b"`) never
  * produce empty or padded keywords.
  *
  * @return the keywords in their original order; `Nil` when `keywords` is absent or
  *         contains no non-blank entry
  */
def keywordList: List[String] =
  keywords.toList
    .flatMap(_.split("[,;]").toList)
    .map(_.trim)
    .filter(_.nonEmpty)
|
||||
|
||||
/** Renders title, author, subject and creation date (as its UTC date), in that
  * order, one value per line. Keywords are not included; they are handled
  * separately.
  *
  * @return `None` when none of these fields is set, otherwise the present values
  *         joined by newlines
  */
def asText: Option[String] = {
  val lines =
    List(title, author, subject, creationDate.map(_.toUtcDate.toString)).flatten

  if (lines.isEmpty) None
  else Some(lines.mkString("\n"))
}
|
||||
}
|
||||
|
||||
object PdfMetaData {
|
||||
|
@ -97,7 +97,10 @@ object TextExtraction {
|
||||
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
|
||||
meta = item.changeMeta(
|
||||
ra.id,
|
||||
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
|
||||
rm =>
|
||||
rm.setContentIfEmpty(
|
||||
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
|
||||
)
|
||||
)
|
||||
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
|
||||
est <- dst
|
||||
|
Loading…
x
Reference in New Issue
Block a user