Use keywords in PDFs to search for existing tags

During processing, keywords stored in the PDF metadata are looked up
in the tag database, and any matching existing tags are associated
with the item.

See #175
This commit is contained in:
Eike Kettner
2020-07-19 00:28:04 +02:00
parent da68405f9b
commit 209c068436
14 changed files with 184 additions and 64 deletions

View File

@ -1,39 +1,41 @@
package docspell.extract
import scala.util.Try
import docspell.common.MimeType
import docspell.extract.pdfbox.PdfMetaData
/** Outcome of a text-extraction attempt. */
sealed trait ExtractResult {
/** The extracted text; `None` for the unsupported-format and failure cases. */
def textOption: Option[String]
/** PDF metadata accompanying the result, if any. */
def pdfMeta: Option[PdfMetaData]
}
object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None
val pdfMeta = None
}
def unsupportedFormat(mt: MimeType): ExtractResult =
UnsupportedFormat(mt)
case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None
val pdfMeta = None
}
def failure(ex: Throwable): ExtractResult =
Failure(ex)
case class Success(text: String) extends ExtractResult {
case class Success(text: String, pdfMeta: Option[PdfMetaData]) extends ExtractResult {
val textOption = Some(text)
}
def success(text: String): ExtractResult =
Success(text)
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
def success(text: String, pdfMeta: Option[PdfMetaData]): ExtractResult =
Success(text, pdfMeta)
def fromEither(e: Either[Throwable, String]): ExtractResult =
e.fold(failure, success)
e.fold(failure, str => success(str, None))
def fromEitherResult(e: Either[Throwable, PdfExtract.Result]): ExtractResult =
e.fold(failure, r => success(r.txt.value, r.meta))
}

View File

@ -40,8 +40,7 @@ object Extraction {
case MimeType.PdfMatch(_) =>
PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
.map(ExtractResult.fromEitherResult)
case PoiType(mt) =>
PoiExtract
@ -103,7 +102,7 @@ object Extraction {
val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(Text(txt).value)
ExtractResult.success(Text(txt).value, None)
}
case mt =>

View File

@ -7,9 +7,15 @@ import fs2.Stream
import docspell.common.{Language, Logger}
import docspell.extract.internal.Text
import docspell.extract.ocr.{OcrConfig, TextExtract}
import docspell.extract.pdfbox.PdfMetaData
import docspell.extract.pdfbox.PdfboxExtract
object PdfExtract {
/** Outcome of a PDF extraction: the text together with optional PDF metadata. */
final case class Result(txt: Text, meta: Option[PdfMetaData])
object Result {
/** Builds a `Result` from a `(text, metadata)` pair. */
def apply(t: (Text, Option[PdfMetaData])): Result =
Result(t._1, t._2)
}
def get[F[_]: Sync: ContextShift](
in: Stream[F, Byte],
@ -18,39 +24,39 @@ object PdfExtract {
stripMinLen: Int,
ocrCfg: OcrConfig,
logger: Logger[F]
): F[Either[Throwable, Text]] = {
): F[Either[Throwable, Result]] = {
val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: Text, strippedStr: Text) =
if (ocrStr.length > strippedStr.length)
def chooseResult(ocrStr: Text, strippedRes: (Text, Option[PdfMetaData])) =
if (ocrStr.length > strippedRes._1.length)
logger.info(
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
) *> ocrStr.pure[F]
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedRes._1.length})"
) *> Result(ocrStr, strippedRes._2).pure[F]
else
logger.info(
s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})"
) *> strippedStr.pure[F]
s"Using stripped text (not OCR), as it is longer (${strippedRes._1.length} > ${ocrStr.length})"
) *> Result(strippedRes).pure[F]
//maybe better: inspect the pdf and decide whether ocr or not
for {
pdfboxRes <-
logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
.getText[F](in)
logger.debug("Trying to strip text from pdf using pdfbox.") *>
PdfboxExtract.getTextAndMetaData[F](in)
res <- pdfboxRes.fold(
ex =>
logger.info(
s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
) >> runOcr.attempt,
str =>
if (str.length >= stripMinLen) str.pure[F].attempt
) >> runOcr.map(txt => Result(txt, None)).attempt,
pair =>
if (pair._1.length >= stripMinLen) Result(pair).pure[F].attempt
else
logger
.info(
s"Stripped text from PDF is small (${str.length}). Trying with OCR."
s"Stripped text from PDF is small (${pair._1.length}). Trying with OCR."
) *>
runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
runOcr.flatMap(ocrStr => chooseResult(ocrStr, pair)).attempt
)
} yield res
}

View File

@ -9,17 +9,17 @@ import cats.effect.Sync
import cats.implicits._
import fs2.Stream
import docspell.common.Timestamp
import docspell.extract.internal.Text
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import docspell.common.Timestamp
object PdfboxExtract {
def getTextAndMetaData[F[_]: Sync](
data: Stream[F, Byte]
): F[Either[Throwable, (Text, PdfMetaData)]] =
): F[Either[Throwable, (Text, Option[PdfMetaData])]] =
data.compile
.to(Array)
.map(bytes =>
@ -27,7 +27,7 @@ object PdfboxExtract {
for {
txt <- readText(doc)
md <- readMetaData(doc)
} yield (txt, md)
} yield (txt, Some(md).filter(_.nonEmpty))
}.toEither.flatten
)

View File

@ -47,7 +47,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
val url = ExampleFiles.keywords_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.getText(url.openStream()).fold(throw _, identity)
assert(str.value.startsWith("Keywords in PDF"))
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
val md = PdfboxExtract.getMetaData(url.openStream()).fold(throw _, identity)
assertEquals(md.author, Some("E.K."))
assertEquals(md.title, Some("Keywords in PDF"))
assertEquals(md.subject, Some("This is a subject"))