Append PDF metadata to the extracted text so that it is included in the full-text index

This commit is contained in:
Eike Kettner 2020-07-19 01:07:49 +02:00
parent 209c068436
commit cec4948710
3 changed files with 19 additions and 1 deletions

View File

@ -28,6 +28,12 @@ object ExtractResult {
/** Extraction outcome carrying the extracted text together with the
  * optional PDF metadata that was read alongside it.
  */
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
  val textOption = Some(text)

  /** Returns a copy of this result whose text has the textual form of the
    * PDF metadata (as produced by `asText`) appended, separated from the
    * existing text by a blank line. When there is no metadata, or `asText`
    * yields nothing, this instance is returned unchanged.
    */
  def appendPdfMetaToText: Success =
    pdfMeta
      .flatMap(_.asText)
      .map(metaText => copy(text = s"$text\n\n$metaText"))
      .getOrElse(this)
}
/** Creates a [[Success]] from the given text and optional PDF metadata,
  * typed as the general `ExtractResult`.
  */
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
  Success(text = text, pdfMeta = pdfMeta)

View File

@ -24,6 +24,15 @@ final case class PdfMetaData(
/** Splits the raw `keywords` value into individual keywords.
  *
  * Entries are separated by a comma or a semicolon. Each entry is trimmed
  * and blank entries are dropped, so inputs such as `""`, `"a,,b"` or
  * `"a ; b"` never produce empty or whitespace-padded keywords.
  *
  * @return the keywords in their original order, or `Nil` if `keywords` is
  *         not set or contains no non-blank entry
  */
def keywordList: List[String] =
  keywords
    .map { kws =>
      // `split` returns a single empty string for empty input and keeps
      // empty entries between adjacent separators; clean both cases up here.
      kws.split("[,;]\\s*").toList.map(_.trim).filter(_.nonEmpty)
    }
    .getOrElse(Nil)
/** Renders the metadata as text, one value per line, in the order title,
  * author, subject and creation date (the string form of its `toUtcDate`).
  * Keywords are not included; they are handled separately.
  *
  * @return the joined lines, or `None` if none of these fields is set
  */
def asText: Option[String] = {
  val lines =
    List(title, author, subject, creationDate.map(_.toUtcDate.toString)).flatten

  if (lines.isEmpty) None
  else Some(lines.mkString("\n"))
}
}
object PdfMetaData {

View File

@ -97,7 +97,10 @@ object TextExtraction {
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta(
ra.id,
rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty))
rm =>
rm.setContentIfEmpty(
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)
)
)
tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil)
est <- dst