diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 6d2d4a7b..839b0261 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -37,7 +37,7 @@ object PdfExtract { for { pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract - .get[F](in) + .getText[F](in) res <- pdfboxRes.fold( ex => logger.info( diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala new file mode 100644 index 00000000..7cff3b6c --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfMetaData.scala @@ -0,0 +1,31 @@ +package docspell.extract.pdfbox + +import docspell.common.Timestamp + +final case class PdfMetaData( + title: Option[String], + author: Option[String], + subject: Option[String], + keywords: Option[String], + creator: Option[String], + creationDate: Option[Timestamp] +) { + + def isEmpty: Boolean = + title.isEmpty && + author.isEmpty && + subject.isEmpty && + keywords.isEmpty && + creator.isEmpty && + creationDate.isEmpty + + def nonEmpty: Boolean = + !isEmpty + + def keywordList: List[String] = + keywords.map(kws => kws.split("[,;]\\s*").toList).getOrElse(Nil) +} + +object PdfMetaData { + val empty = PdfMetaData(None, None, None, None, None, None) +} diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index d44e2af7..233d7c31 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -13,18 +13,33 @@ import docspell.extract.internal.Text import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.text.PDFTextStripper +import docspell.common.Timestamp object PdfboxExtract { - def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = + def getTextAndMetaData[F[_]: Sync]( + data: Stream[F, Byte] + ): F[Either[Throwable, (Text, PdfMetaData)]] = + data.compile + .to(Array) + .map(bytes => + Using(PDDocument.load(bytes)) { doc => + for { + txt <- readText(doc) + md <- readMetaData(doc) + } yield (txt, md) + }.toEither.flatten + ) + + def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile .to(Array) .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) - def get(is: InputStream): Either[Throwable, Text] = + def getText(is: InputStream): Either[Throwable, Text] = Using(PDDocument.load(is))(readText).toEither.flatten - def get(inFile: Path): Either[Throwable, Text] = + def getText(inFile: Path): Either[Throwable, Text] = Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten private def readText(doc: PDDocument): Either[Throwable, Text] = @@ -34,4 +49,31 @@ object PdfboxExtract { stripper.setLineSeparator("\n") Text(Option(stripper.getText(doc))) }.toEither + + def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = + data.compile + .to(Array) + .map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten) + + def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = + Using(PDDocument.load(is))(readMetaData).toEither.flatten + + def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = + Using(PDDocument.load(inFile.toFile))(readMetaData).toEither.flatten + + private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = + Try { + def mkValue(s: String) = + Option(s).map(_.trim).filter(_.nonEmpty) + + val info = doc.getDocumentInformation + PdfMetaData( + mkValue(info.getTitle), + mkValue(info.getAuthor), + mkValue(info.getSubject), + mkValue(info.getKeywords), + mkValue(info.getCreator), + Option(info.getCreationDate).map(c => Timestamp(c.toInstant)) + ) + }.toEither } diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala new file mode 100644 index 00000000..b3cfb12d --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfMetaDataTest.scala @@ -0,0 +1,22 @@ +package docspell.extract.pdfbox + +import minitest.SimpleTestSuite + +object PdfMetaDataTest extends SimpleTestSuite { + + test("split keywords on comma") { + val md = PdfMetaData.empty.copy(keywords = Some("a,b, c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + + test("split keywords on semicolon") { + val md = PdfMetaData.empty.copy(keywords = Some("a; b;c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + + test("split keywords on comma and semicolon") { + val md = PdfMetaData.empty.copy(keywords = Some("a, b; c")) + assertEquals(md.keywordList, List("a", "b", "c")) + } + +} diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala index 1f436b25..b72b182a 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite { textPDFs.foreach { case (file, txt) => val url = file.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) @@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite { textPDFs.foreach { case (file, txt) => val data = file.readURL[IO](8192, blocker) - val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) + val str = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity) val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) @@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite { test("extract text from image PDFs") { val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) - val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) assertEquals(str.value, "") } + test("extract metadata from pdf") { + val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) + val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) + assert(str.value.startsWith("Keywords in PDF")) + val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) + assertEquals(md.author, Some("E.K.")) + assertEquals(md.title, Some("Keywords in PDF")) + assertEquals(md.subject, Some("This is a subject")) + assertEquals(md.keywordList, List("Test", "Keywords in PDF", "Todo")) + assertEquals(md.creator, Some("Emacs 26.3 (Org mode 9.3)")) + assert(md.creationDate.isDefined) + } + private def removeFormatting(str: String): String = str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase } diff --git a/modules/files/src/test/resources/keywords.pdf b/modules/files/src/test/resources/keywords.pdf new file mode 100644 index 00000000..963fe42c Binary files /dev/null and b/modules/files/src/test/resources/keywords.pdf differ