mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Use keywords in pdfs to search for existing tags
During processing, keywords stored in the PDF metadata are looked up in the tag database, and any matching existing tags are associated with the item. See #175
This commit is contained in:
@ -1,39 +1,41 @@
|
||||
package docspell.extract
|
||||
|
||||
import scala.util.Try
|
||||
|
||||
import docspell.common.MimeType
|
||||
import docspell.extract.pdfbox.PdfMetaData
|
||||
|
||||
sealed trait ExtractResult {
|
||||
|
||||
def textOption: Option[String]
|
||||
|
||||
def pdfMeta: Option[PdfMetaData]
|
||||
}
|
||||
|
||||
object ExtractResult {
|
||||
|
||||
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
||||
val textOption = None
|
||||
val pdfMeta = None
|
||||
}
|
||||
def unsupportedFormat(mt: MimeType): ExtractResult =
|
||||
UnsupportedFormat(mt)
|
||||
|
||||
case class Failure(ex: Throwable) extends ExtractResult {
|
||||
val textOption = None
|
||||
val pdfMeta = None
|
||||
}
|
||||
def failure(ex: Throwable): ExtractResult =
|
||||
Failure(ex)
|
||||
|
||||
case class Success(text: String) extends ExtractResult {
|
||||
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
|
||||
val textOption = Some(text)
|
||||
}
|
||||
def success(text: String): ExtractResult =
|
||||
Success(text)
|
||||
|
||||
def fromTry(r: Try[String]): ExtractResult =
|
||||
r.fold(Failure.apply, Success.apply)
|
||||
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
|
||||
Success(text, pdfMeta)
|
||||
|
||||
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
||||
e.fold(failure, success)
|
||||
e.fold(failure, str => success(str, None))
|
||||
|
||||
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
|
||||
e.fold(failure, r => success(r.txt.value, r.meta))
|
||||
|
||||
}
|
||||
|
@ -40,8 +40,7 @@ object Extraction {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(_.map(_.value))
|
||||
.map(ExtractResult.fromEither)
|
||||
.map(ExtractResult.fromEitherResult)
|
||||
|
||||
case PoiType(mt) =>
|
||||
PoiExtract
|
||||
@ -103,7 +102,7 @@ object Extraction {
|
||||
val cs = mt.charsetOrUtf8
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||
ExtractResult.success(Text(txt).value)
|
||||
ExtractResult.success(Text(txt).value, None)
|
||||
}
|
||||
|
||||
case mt =>
|
||||
|
@ -7,9 +7,15 @@ import fs2.Stream
|
||||
import docspell.common.{Language, Logger}
|
||||
import docspell.extract.internal.Text
|
||||
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||
import docspell.extract.pdfbox.PdfMetaData
|
||||
import docspell.extract.pdfbox.PdfboxExtract
|
||||
|
||||
object PdfExtract {
|
||||
final case class Result(txt: Text, meta: Option[PdfMetaData])
|
||||
object Result {
|
||||
def apply(t: (Text, Option[PdfMetaData])): Result =
|
||||
Result(t._1, t._2)
|
||||
}
|
||||
|
||||
def get[F[_]: Sync: ContextShift](
|
||||
in: Stream[F, Byte],
|
||||
@ -18,39 +24,39 @@ object PdfExtract {
|
||||
stripMinLen: Int,
|
||||
ocrCfg: OcrConfig,
|
||||
logger: Logger[F]
|
||||
): F[Either[Throwable, Text]] = {
|
||||
): F[Either[Throwable, Result]] = {
|
||||
|
||||
val runOcr =
|
||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||
|
||||
def chooseResult(ocrStr: Text, strippedStr: Text) =
|
||||
if (ocrStr.length > strippedStr.length)
|
||||
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
|
||||
if (ocrStr.length > strippedRes._1.length)
|
||||
logger.info(
|
||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
||||
) *> ocrStr.pure[F]
|
||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
|
||||
) *> Result(ocrStr, strippedRes._2).pure[F]
|
||||
else
|
||||
logger.info(
|
||||
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
|
||||
) *> strippedStr.pure[F]
|
||||
s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
|
||||
) *> Result(strippedRes).pure[F]
|
||||
|
||||
//maybe better: inspect the pdf and decide whether ocr or not
|
||||
for {
|
||||
pdfboxRes <-
|
||||
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
||||
.getText[F](in)
|
||||
logger.debug("Trying to strip text from pdf using pdfbox.") *>
|
||||
PdfboxExtract.getTextAndMetaData[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
|
||||
) >> runOcr.attempt,
|
||||
str =>
|
||||
if (str.length >= stripMinLen) str.pure[F].attempt
|
||||
) >> runOcr.map(txt => Result(txt, None)).attempt,
|
||||
pair =>
|
||||
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
|
||||
else
|
||||
logger
|
||||
.info(
|
||||
s"Stripped text from PDF is small (${str.length}). Trying with OCR."
|
||||
s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
|
||||
) *>
|
||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
|
||||
runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
|
||||
)
|
||||
} yield res
|
||||
}
|
||||
|
@ -9,17 +9,17 @@ import cats.effect.Sync
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common.Timestamp
|
||||
import docspell.extract.internal.Text
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument
|
||||
import org.apache.pdfbox.text.PDFTextStripper
|
||||
import docspell.common.Timestamp
|
||||
|
||||
object PdfboxExtract {
|
||||
|
||||
def getTextAndMetaData[F[_]: Sync](
|
||||
data: Stream[F, Byte]
|
||||
): F[Either[Throwable, (Text, PdfMetaData)]] =
|
||||
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes =>
|
||||
@ -27,7 +27,7 @@ object PdfboxExtract {
|
||||
for {
|
||||
txt <- readText(doc)
|
||||
md <- readMetaData(doc)
|
||||
} yield (txt, md)
|
||||
} yield (txt, Some(md).filter(_.nonEmpty))
|
||||
}.toEither.flatten
|
||||
)
|
||||
|
||||
|
@ -47,7 +47,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
|
||||
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
|
||||
assert(str.value.startsWith("Keywords in PDF"))
|
||||
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
|
||||
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
|
||||
assertEquals(md.author, Some("E.K."))
|
||||
assertEquals(md.title, Some("Keywords in PDF"))
|
||||
assertEquals(md.subject, Some("This is a subject"))
|
||||
|
Reference in New Issue
Block a user