mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 17:50:11 +00:00 
			
		
		
		
	| @@ -48,15 +48,21 @@ object PdfExtract { | ||||
|     // maybe better: inspect the pdf and decide whether ocr or not | ||||
|     for { | ||||
|       pdfboxRes <- | ||||
|         logger.debug("Trying to strip text from pdf using pdfbox.") *> | ||||
|           PdfboxExtract.getTextAndMetaData[F](in) | ||||
|         if (stripMinLen > 0) | ||||
|           logger.debug("Trying to strip text from pdf using pdfbox.") *> | ||||
|             PdfboxExtract.getTextAndMetaData[F](in) | ||||
|         else | ||||
|           logger | ||||
|             .debug(s"Not stripping text from pdf, min-length=$stripMinLen") | ||||
|             .as(Right(Text("") -> None)) | ||||
|       res <- pdfboxRes.fold( | ||||
|         ex => | ||||
|           logger.info( | ||||
|             s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " | ||||
|           ) >> runOcr.map(txt => Result(txt, None)).attempt, | ||||
|         pair => | ||||
|           if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt | ||||
|           if (pair._1.length >= stripMinLen && stripMinLen > 0) | ||||
|             Result(pair).pure[F].attempt | ||||
|           else | ||||
|             logger | ||||
|               .info( | ||||
|   | ||||
| @@ -298,6 +298,9 @@ Docpell Update Check | ||||
|     # and images. If the returned text is shorter than the value | ||||
|     # below, OCR is run afterwards. Then both extracted texts are | ||||
|     # compared and the longer will be used. | ||||
|     # | ||||
|     # If you set this to 0 (or a negative value), the text embedded in | ||||
|     # a PDF is ignored: OCR is always run and its result is used. | ||||
|     pdf { | ||||
|       min-text-len = 500 | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user