mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-30 21:40:12 +00:00 
			
		
		
		
	Extract meta data from pdfs using pdfbox
This commit is contained in:
		| @@ -37,7 +37,7 @@ object PdfExtract { | ||||
|     for { | ||||
|       pdfboxRes <- | ||||
|         logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract | ||||
|           .get[F](in) | ||||
|           .getText[F](in) | ||||
|       res <- pdfboxRes.fold( | ||||
|         ex => | ||||
|           logger.info( | ||||
|   | ||||
| @@ -0,0 +1,31 @@ | ||||
| package docspell.extract.pdfbox | ||||
|  | ||||
| import docspell.common.Timestamp | ||||
|  | ||||
/** Metadata fields of a PDF document.
  *
  * Every field is optional; a field is `None` when no value is available
  * for it.
  *
  * @param title        the document title
  * @param author       the document author
  * @param subject      the document subject
  * @param keywords     the raw, unsplit keyword string; see [[keywordList]]
  * @param creator      the creator entry of the document
  * @param creationDate the creation time of the document
  */
final case class PdfMetaData(
    title: Option[String],
    author: Option[String],
    subject: Option[String],
    keywords: Option[String],
    creator: Option[String],
    creationDate: Option[Timestamp]
) {

  /** Returns `true` only if none of the fields carries a value. */
  def isEmpty: Boolean =
    title.isEmpty &&
      author.isEmpty &&
      subject.isEmpty &&
      keywords.isEmpty &&
      creator.isEmpty &&
      creationDate.isEmpty

  /** Returns `true` if at least one field carries a value. */
  def nonEmpty: Boolean =
    !isEmpty

  /** Splits the raw `keywords` string on commas and semicolons.
    *
    * Each entry is trimmed and blank entries are dropped, so whitespace
    * around a separator (`"a ; b"`), repeated separators (`"a,,b"`) and a
    * leading or trailing separator never produce padded or empty keywords.
    *
    * @return the individual keywords in their original order, or `Nil` if
    *         `keywords` is `None` or contains no non-blank entry
    */
  def keywordList: List[String] =
    keywords.toList
      // Split on the separator only; surrounding whitespace is removed by
      // the trim below, which also covers whitespace *before* a separator.
      .flatMap(_.split("[,;]").toList)
      .map(_.trim)
      .filter(_.nonEmpty)
}
|  | ||||
object PdfMetaData {

  /** A metadata value in which every field is `None`. */
  val empty: PdfMetaData =
    PdfMetaData(
      title = None,
      author = None,
      subject = None,
      keywords = None,
      creator = None,
      creationDate = None
    )
}
| @@ -13,18 +13,33 @@ import docspell.extract.internal.Text | ||||
|  | ||||
| import org.apache.pdfbox.pdmodel.PDDocument | ||||
| import org.apache.pdfbox.text.PDFTextStripper | ||||
| import docspell.common.Timestamp | ||||
|  | ||||
| object PdfboxExtract { | ||||
|  | ||||
|   def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = | ||||
|   def getTextAndMetaData[F[_]: Sync]( | ||||
|       data: Stream[F, Byte] | ||||
|   ): F[Either[Throwable, (Text, PdfMetaData)]] = | ||||
|     data.compile | ||||
|       .to(Array) | ||||
|       .map(bytes => | ||||
|         Using(PDDocument.load(bytes)) { doc => | ||||
|           for { | ||||
|             txt <- readText(doc) | ||||
|             md  <- readMetaData(doc) | ||||
|           } yield (txt, md) | ||||
|         }.toEither.flatten | ||||
|       ) | ||||
|  | ||||
|   def getText[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = | ||||
|     data.compile | ||||
|       .to(Array) | ||||
|       .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) | ||||
|  | ||||
|   def get(is: InputStream): Either[Throwable, Text] = | ||||
|   def getText(is: InputStream): Either[Throwable, Text] = | ||||
|     Using(PDDocument.load(is))(readText).toEither.flatten | ||||
|  | ||||
|   def get(inFile: Path): Either[Throwable, Text] = | ||||
|   def getText(inFile: Path): Either[Throwable, Text] = | ||||
|     Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten | ||||
|  | ||||
|   private def readText(doc: PDDocument): Either[Throwable, Text] = | ||||
| @@ -34,4 +49,31 @@ object PdfboxExtract { | ||||
|       stripper.setLineSeparator("\n") | ||||
|       Text(Option(stripper.getText(doc))) | ||||
|     }.toEither | ||||
|  | ||||
|   def getMetaData[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, PdfMetaData]] = | ||||
|     data.compile | ||||
|       .to(Array) | ||||
|       .map(bytes => Using(PDDocument.load(bytes))(readMetaData).toEither.flatten) | ||||
|  | ||||
|   def getMetaData(is: InputStream): Either[Throwable, PdfMetaData] = | ||||
|     Using(PDDocument.load(is))(readMetaData).toEither.flatten | ||||
|  | ||||
|   def getMetaData(inFile: Path): Either[Throwable, PdfMetaData] = | ||||
|     Using(PDDocument.load(inFile.toFile))(readMetaData).toEither.flatten | ||||
|  | ||||
|   private def readMetaData(doc: PDDocument): Either[Throwable, PdfMetaData] = | ||||
|     Try { | ||||
|       def mkValue(s: String) = | ||||
|         Option(s).map(_.trim).filter(_.nonEmpty) | ||||
|  | ||||
|       val info = doc.getDocumentInformation | ||||
|       PdfMetaData( | ||||
|         mkValue(info.getTitle), | ||||
|         mkValue(info.getAuthor), | ||||
|         mkValue(info.getSubject), | ||||
|         mkValue(info.getKeywords), | ||||
|         mkValue(info.getCreator), | ||||
|         Option(info.getCreationDate).map(c => Timestamp(c.toInstant)) | ||||
|       ) | ||||
|     }.toEither | ||||
| } | ||||
|   | ||||
| @@ -0,0 +1,22 @@ | ||||
| package docspell.extract.pdfbox | ||||
|  | ||||
| import minitest.SimpleTestSuite | ||||
|  | ||||
/** Checks how `PdfMetaData.keywordList` splits the raw keyword string. */
object PdfMetaDataTest extends SimpleTestSuite {

  // All cases below are expected to yield the same three keywords.
  private val expected: List[String] = List("a", "b", "c")

  // Builds metadata carrying only the given raw keyword string and returns
  // its split keyword list.
  private def splitKeywords(raw: String): List[String] =
    PdfMetaData.empty.copy(keywords = Some(raw)).keywordList

  test("split keywords on comma") {
    assertEquals(splitKeywords("a,b, c"), expected)
  }

  test("split keywords on semicolon") {
    assertEquals(splitKeywords("a; b;c"), expected)
  }

  test("split keywords on comma and semicolon") {
    assertEquals(splitKeywords("a, b; c"), expected)
  }

}
| @@ -17,7 +17,7 @@ object PdfboxExtractTest extends SimpleTestSuite { | ||||
|     textPDFs.foreach { | ||||
|       case (file, txt) => | ||||
|         val url      = file.toJavaUrl.fold(sys.error, identity) | ||||
|         val str      = PdfboxExtract.get(url.openStream()).fold(throw _, identity) | ||||
|         val str      = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) | ||||
|         val received = removeFormatting(str.value) | ||||
|         val expect   = removeFormatting(txt) | ||||
|         assertEquals(received, expect) | ||||
| @@ -28,7 +28,7 @@ object PdfboxExtractTest extends SimpleTestSuite { | ||||
|     textPDFs.foreach { | ||||
|       case (file, txt) => | ||||
|         val data     = file.readURL[IO](8192, blocker) | ||||
|         val str      = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) | ||||
|         val str      = PdfboxExtract.getText(data).unsafeRunSync().fold(throw _, identity) | ||||
|         val received = removeFormatting(str.value) | ||||
|         val expect   = removeFormatting(txt) | ||||
|         assertEquals(received, expect) | ||||
| @@ -38,11 +38,24 @@ object PdfboxExtractTest extends SimpleTestSuite { | ||||
|   test("extract text from image PDFs") { | ||||
|     val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) | ||||
|  | ||||
|     val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) | ||||
|     val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) | ||||
|  | ||||
|     assertEquals(str.value, "") | ||||
|   } | ||||
|  | ||||
|   test("extract metadata from pdf") { | ||||
|     val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity) | ||||
|     val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity) | ||||
|     assert(str.value.startsWith("Keywords in PDF")) | ||||
|     val md =  PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity) | ||||
|     assertEquals(md.author, Some("E.K.")) | ||||
|     assertEquals(md.title, Some("Keywords in PDF")) | ||||
|     assertEquals(md.subject, Some("This is a subject")) | ||||
|     assertEquals(md.keywordList, List("Test", "Keywords in PDF", "Todo")) | ||||
|     assertEquals(md.creator, Some("Emacs 26.3 (Org mode 9.3)")) | ||||
|     assert(md.creationDate.isDefined) | ||||
|   } | ||||
|  | ||||
|   private def removeFormatting(str: String): String = | ||||
|     str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase | ||||
| } | ||||
|   | ||||
							
								
								
									
										
											BIN
										
									
								
								modules/files/src/test/resources/keywords.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								modules/files/src/test/resources/keywords.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
		Reference in New Issue
	
	Block a user