diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala index d48b63c8..ac9716b3 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala @@ -28,6 +28,12 @@ object ExtractResult { case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult { val textOption = Some(text) + /** Returns a copy of this result whose `text` has the meta data text (`pdfMeta.flatMap(_.asText)`) appended after an empty line. Returns this instance unchanged if `pdfMeta` is absent or yields no text. */ def appendPdfMetaToText: Success = + pdfMeta.flatMap(_.asText) match { + case Some(m) => + copy(text = text + "\n\n" + m) + case None => this + } } def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult = Success(text, pdfMeta) diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala index 7cff3b6c..4663d1c8 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala @@ -24,6 +24,15 @@ final case class PdfMetaData( def keywordList: List[String] = keywords.map(kws => kws.split("[,;]\\s*").toList).getOrElse(Nil) + + /** Returns title, author, subject and creation date (rendered with `toUtcDate.toString`), in this order, one value per line. Returns `None` if none of these values is present. Keywords are not included, because they are handled separately (see `keywordList`). 
*/ + def asText: Option[String] = + (title.toList ++ author.toList ++ subject.toList ++ creationDate.toList.map( + _.toUtcDate.toString + )) match { + case Nil => None + case list => Some(list.mkString("\n")) + } } object PdfMetaData { diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 9bc41683..89bb1f61 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -97,7 +97,10 @@ object TextExtraction { res <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, - rm => rm.setContentIfEmpty(res.map(_.text.trim).filter(_.nonEmpty)) + rm => + rm.setContentIfEmpty( + res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty) + ) ) tags = res.flatMap(_.pdfMeta).map(_.keywordList).getOrElse(Nil) est <- dst