Post-process all extracted text

Removes NUL (0) bytes and leading/trailing whitespace
This commit is contained in:
Eike Kettner 2020-05-25 13:41:38 +02:00
parent 4e22361985
commit 2e88207ff1
10 changed files with 76 additions and 38 deletions

View File

@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract}
import docspell.extract.odf.{OdfExtract, OdfType} import docspell.extract.odf.{OdfExtract, OdfType}
import docspell.extract.poi.{PoiExtract, PoiType} import docspell.extract.poi.{PoiExtract, PoiType}
import docspell.extract.rtf.RtfExtract import docspell.extract.rtf.RtfExtract
import docspell.extract.internal.Text
import fs2.Stream import fs2.Stream
import docspell.files.TikaMimetype import docspell.files.TikaMimetype
import docspell.files.ImageSize import docspell.files.ImageSize
@ -38,23 +39,30 @@ object Extraction {
case MimeType.PdfMatch(_) => case MimeType.PdfMatch(_) =>
PdfExtract PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(_.map(_.value))
.map(ExtractResult.fromEither) .map(ExtractResult.fromEither)
case PoiType(mt) => case PoiType(mt) =>
PoiExtract.get(data, mt).map(ExtractResult.fromEither) PoiExtract
.get(data, mt)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case RtfExtract.rtfType => case RtfExtract.rtfType =>
RtfExtract.get(data).map(ExtractResult.fromEither) RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither)
case OdfType(_) => case OdfType(_) =>
OdfExtract.get(data).map(ExtractResult.fromEither) OdfExtract
.get(data)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case OcrType(mt) => case OcrType(mt) =>
val doExtract = TextExtract val doExtract = TextExtract
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.compile .compile
.lastOrError .lastOrError
.map(_.trim) .map(_.value)
.attempt .attempt
.map(ExtractResult.fromEither) .map(ExtractResult.fromEither)
@ -85,13 +93,16 @@ object Extraction {
.info( .info(
s"File detected as ${OdfType.container}. Try to read as OpenDocument file." s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
) *> ) *>
OdfExtract.get(data).map(ExtractResult.fromEither) OdfExtract
.get(data)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case MimeType.NonHtmlText(mt) => case MimeType.NonHtmlText(mt) =>
val cs = mt.charsetOrUtf8 val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim) ExtractResult.success(Text(txt).value)
} }
case mt => case mt =>

View File

@ -6,6 +6,7 @@ import fs2.Stream
import docspell.common.{Language, Logger} import docspell.common.{Language, Logger}
import docspell.extract.ocr.{OcrConfig, TextExtract} import docspell.extract.ocr.{OcrConfig, TextExtract}
import docspell.extract.pdfbox.PdfboxExtract import docspell.extract.pdfbox.PdfboxExtract
import docspell.extract.internal.Text
object PdfExtract { object PdfExtract {
@ -16,12 +17,12 @@ object PdfExtract {
stripMinLen: Int, stripMinLen: Int,
ocrCfg: OcrConfig, ocrCfg: OcrConfig,
logger: Logger[F] logger: Logger[F]
): F[Either[Throwable, String]] = { ): F[Either[Throwable, Text]] = {
val runOcr = val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: String, strippedStr: String) = def chooseResult(ocrStr: Text, strippedStr: Text) =
if (ocrStr.length > strippedStr.length) if (ocrStr.length > strippedStr.length)
logger.info( logger.info(
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"

View File

@ -0,0 +1,20 @@
package docspell.extract.internal
/** Wrapper around text extracted from a document.
  *
  * Keeps the unmodified extractor output in `raw` and exposes a cleaned form
  * through [[value]] and [[length]]. Cleaning strips leading and trailing
  * whitespace and removes every NUL (U+0000) character.
  *
  * Note that case-class equality is still based on `raw`, not on the cleaned
  * text.
  *
  * @param raw the text exactly as returned by an extractor; a `null` reference
  *            is treated like an empty string
  */
final case class Text(raw: String) {

  // Computed on first access only. `Option(raw)` guards against a `null`
  // coming from a Java API: because this value is lazy, a failure here would
  // otherwise surface at the first `value`/`length` call, i.e. outside of any
  // `Try` that wrapped the construction of this instance.
  //
  // `trim` drops all leading/trailing characters up to and including the
  // space character, which covers NUL as well, so the subsequent removal of
  // NUL characters can only affect the interior and never exposes new
  // whitespace at either end.
  private lazy val textValue: String =
    Option(raw).fold("")(_.trim.replace("\u0000", ""))

  /** Number of characters in the cleaned text. */
  def length: Int =
    textValue.length

  /** The cleaned text: trimmed and without NUL characters. */
  def value: String =
    textValue
}

object Text {

  /** Creates a [[Text]] from an optional string; `None` yields empty text.
    *
    * @param ot the possibly missing extractor output
    */
  def apply(ot: Option[String]): Text =
    Text(ot.getOrElse(""))
}

View File

@ -3,6 +3,7 @@ package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync} import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._ import docspell.common._
import docspell.files._ import docspell.files._
import docspell.extract.internal.Text
import fs2.Stream import fs2.Stream
object TextExtract { object TextExtract {
@ -13,7 +14,7 @@ object TextExtract {
logger: Logger[F], logger: Logger[F],
lang: String, lang: String,
config: OcrConfig config: OcrConfig
): Stream[F, String] = ): Stream[F, Text] =
extractOCR(in, blocker, logger, lang, config) extractOCR(in, blocker, logger, lang, config)
def extractOCR[F[_]: Sync: ContextShift]( def extractOCR[F[_]: Sync: ContextShift](
@ -22,7 +23,7 @@ object TextExtract {
logger: Logger[F], logger: Logger[F],
lang: String, lang: String,
config: OcrConfig config: OcrConfig
): Stream[F, String] = ): Stream[F, Text] =
Stream Stream
.eval(TikaMimetype.detect(in, MimeTypeHint.none)) .eval(TikaMimetype.detect(in, MimeTypeHint.none))
.flatMap({ .flatMap({
@ -35,6 +36,7 @@ object TextExtract {
case mt => case mt =>
raiseError(s"File `$mt` not supported") raiseError(s"File `$mt` not supported")
}) })
.map(Text.apply)
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] = private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
Stream.raiseError[F](new Exception(msg)) Stream.raiseError[F](new Exception(msg))

View File

@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler import org.apache.tika.sax.BodyContentHandler
import scala.util.Try import scala.util.Try
import docspell.extract.internal.Text
object OdfExtract { object OdfExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
def get(is: InputStream) = def get(is: InputStream) =
@ -24,7 +25,7 @@ object OdfExtract {
val meta = new Metadata() val meta = new Metadata()
val ooparser = new OpenDocumentParser() val ooparser = new OpenDocumentParser()
ooparser.parse(is, handler, meta, pctx) ooparser.parse(is, handler, meta, pctx)
handler.toString.trim Text(Option(handler.toString))
}.toEither }.toEither
} }

View File

@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper
import scala.util.{Try, Using} import scala.util.{Try, Using}
import fs2.Stream import fs2.Stream
import docspell.extract.internal.Text
object PdfboxExtract { object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile data.compile
.to(Array) .to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
def get(is: InputStream): Either[Throwable, String] = def get(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, String] = def get(inFile: Path): Either[Throwable, Text] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, String] = private def readText(doc: PDDocument): Either[Throwable, Text] =
Try { Try {
val stripper = new PDFTextStripper() val stripper = new PDFTextStripper()
stripper.setAddMoreFormatting(true) stripper.setAddMoreFormatting(true)
stripper.setLineSeparator("\n") stripper.setLineSeparator("\n")
stripper.getText(doc).trim // trim here already Text(Option(stripper.getText(doc)))
}.toEither }.toEither
} }

View File

@ -17,19 +17,20 @@ import fs2.Stream
import scala.util.Try import scala.util.Try
import docspell.common._ import docspell.common._
import docspell.files.TikaMimetype import docspell.files.TikaMimetype
import docspell.extract.internal.Text
object PoiExtract { object PoiExtract {
def get[F[_]: Sync]( def get[F[_]: Sync](
data: Stream[F, Byte], data: Stream[F, Byte],
hint: MimeTypeHint hint: MimeTypeHint
): F[Either[Throwable, String]] = ): F[Either[Throwable, Text]] =
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt)) TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
def get[F[_]: Sync]( def get[F[_]: Sync](
data: Stream[F, Byte], data: Stream[F, Byte],
mime: MimeType mime: MimeType
): F[Either[Throwable, String]] = ): F[Either[Throwable, Text]] =
mime match { mime match {
case PoiType.doc => case PoiType.doc =>
getDoc(data) getDoc(data)
@ -55,40 +56,40 @@ object PoiExtract {
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}"))) Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
} }
def getDocx(is: InputStream): Either[Throwable, String] = def getDocx(is: InputStream): Either[Throwable, Text] =
Try { Try {
val xt = new XWPFWordExtractor(new XWPFDocument(is)) val xt = new XWPFWordExtractor(new XWPFDocument(is))
Option(xt.getText).map(_.trim).getOrElse("") Text(Option(xt.getText))
}.toEither }.toEither
def getDoc(is: InputStream): Either[Throwable, String] = def getDoc(is: InputStream): Either[Throwable, Text] =
Try { Try {
val xt = new WordExtractor(is) val xt = new WordExtractor(is)
Option(xt.getText).map(_.trim).getOrElse("") Text(Option(xt.getText))
}.toEither }.toEither
def getXlsx(is: InputStream): Either[Throwable, String] = def getXlsx(is: InputStream): Either[Throwable, Text] =
Try { Try {
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is)) val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
Option(xt.getText).map(_.trim).getOrElse("") Text(Option(xt.getText))
}.toEither }.toEither
def getXls(is: InputStream): Either[Throwable, String] = def getXls(is: InputStream): Either[Throwable, Text] =
Try { Try {
val xt = new ExcelExtractor(new HSSFWorkbook(is)) val xt = new ExcelExtractor(new HSSFWorkbook(is))
Option(xt.getText).map(_.trim).getOrElse("") Text(Option(xt.getText))
}.toEither }.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
} }

View File

@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream}
import cats.implicits._ import cats.implicits._
import cats.effect.Sync import cats.effect.Sync
import docspell.common.MimeType import docspell.common.MimeType
import docspell.extract.internal.Text
import fs2.Stream import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit import javax.swing.text.rtf.RTFEditorKit
@ -14,14 +15,14 @@ object RtfExtract {
val rtfType = MimeType.application("rtf") val rtfType = MimeType.application("rtf")
def get(is: InputStream): Either[Throwable, String] = def get(is: InputStream): Either[Throwable, Text] =
Try { Try {
val kit = new RTFEditorKit() val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument() val doc = kit.createDefaultDocument()
kit.read(is, doc, 0) kit.read(is, doc, 0)
doc.getText(0, doc.getLength).trim Text(doc.getText(0, doc.getLength))
}.toEither }.toEither
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
} }

View File

@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite {
.lastOrError .lastOrError
.unsafeRunSync() .unsafeRunSync()
assertEquals(extract.trim, expect.trim) assertEquals(extract.value, expect)
} }
} }

View File

@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
case (file, txt) => case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity) val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str) val received = removeFormatting(str.value)
val expect = removeFormatting(txt) val expect = removeFormatting(txt)
assertEquals(received, expect) assertEquals(received, expect)
} }
@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
case (file, txt) => case (file, txt) =>
val data = file.readURL[IO](8192, blocker) val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str) val received = removeFormatting(str.value)
val expect = removeFormatting(txt) val expect = removeFormatting(txt)
assertEquals(received, expect) assertEquals(received, expect)
} }
@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
assertEquals(str, "") assertEquals(str.value, "")
} }
private def removeFormatting(str: String): String = private def removeFormatting(str: String): String =