Post-process all extracted text

Removes NUL (U+0000) characters and leading/trailing whitespace
This commit is contained in:
Eike Kettner 2020-05-25 13:41:38 +02:00
parent 4e22361985
commit 2e88207ff1
10 changed files with 76 additions and 38 deletions

View File

@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract}
import docspell.extract.odf.{OdfExtract, OdfType}
import docspell.extract.poi.{PoiExtract, PoiType}
import docspell.extract.rtf.RtfExtract
import docspell.extract.internal.Text
import fs2.Stream
import docspell.files.TikaMimetype
import docspell.files.ImageSize
@ -38,23 +39,30 @@ object Extraction {
case MimeType.PdfMatch(_) =>
PdfExtract
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case PoiType(mt) =>
PoiExtract.get(data, mt).map(ExtractResult.fromEither)
PoiExtract
.get(data, mt)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case RtfExtract.rtfType =>
RtfExtract.get(data).map(ExtractResult.fromEither)
RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither)
case OdfType(_) =>
OdfExtract.get(data).map(ExtractResult.fromEither)
OdfExtract
.get(data)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case OcrType(mt) =>
val doExtract = TextExtract
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.compile
.lastOrError
.map(_.trim)
.map(_.value)
.attempt
.map(ExtractResult.fromEither)
@ -85,13 +93,16 @@ object Extraction {
.info(
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
) *>
OdfExtract.get(data).map(ExtractResult.fromEither)
OdfExtract
.get(data)
.map(_.map(_.value))
.map(ExtractResult.fromEither)
case MimeType.NonHtmlText(mt) =>
val cs = mt.charsetOrUtf8
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim)
ExtractResult.success(Text(txt).value)
}
case mt =>

View File

@ -6,6 +6,7 @@ import fs2.Stream
import docspell.common.{Language, Logger}
import docspell.extract.ocr.{OcrConfig, TextExtract}
import docspell.extract.pdfbox.PdfboxExtract
import docspell.extract.internal.Text
object PdfExtract {
@ -16,12 +17,12 @@ object PdfExtract {
stripMinLen: Int,
ocrCfg: OcrConfig,
logger: Logger[F]
): F[Either[Throwable, String]] = {
): F[Either[Throwable, Text]] = {
val runOcr =
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
def chooseResult(ocrStr: String, strippedStr: String) =
def chooseResult(ocrStr: Text, strippedStr: Text) =
if (ocrStr.length > strippedStr.length)
logger.info(
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"

View File

@ -0,0 +1,20 @@
package docspell.extract.internal
/** Text obtained from an extractor, cleaned up on demand.
  *
  * The untouched extractor output is kept in `raw`; `value` and `length`
  * operate on a cleaned copy with leading/trailing whitespace trimmed and
  * all NUL (U+0000) characters removed.
  *
  * Being a case class, equality and hashing are based on `raw`, not on the
  * cleaned text: two instances can have the same `value` and still be
  * unequal.
  *
  * @param raw the text exactly as produced by the extractor; a `null`
  *            reference is treated like the empty string
  */
case class Text(raw: String) {

  // Computed once, on first access. A null `raw` is mapped to "" here
  // because this lazy evaluation happens after construction, i.e. outside
  // any `Try` that wrapped the constructor call, where a
  // NullPointerException would no longer be captured.
  private lazy val textValue =
    Option(raw).fold("")(_.trim.replace("\u0000", ""))

  /** Number of characters in the cleaned text. */
  def length: Int =
    textValue.length

  /** The cleaned text: trimmed and without NUL characters. */
  def value: String =
    textValue
}

object Text {

  /** Creates a [[Text]] from an optional string.
    *
    * @param ot the extracted text, if any
    * @return a `Text` wrapping the contained string, or the empty string
    *         for `None`
    */
  def apply(ot: Option[String]): Text =
    Text(ot.getOrElse(""))
}

View File

@ -3,6 +3,7 @@ package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._
import docspell.files._
import docspell.extract.internal.Text
import fs2.Stream
object TextExtract {
@ -13,7 +14,7 @@ object TextExtract {
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] =
): Stream[F, Text] =
extractOCR(in, blocker, logger, lang, config)
def extractOCR[F[_]: Sync: ContextShift](
@ -22,7 +23,7 @@ object TextExtract {
logger: Logger[F],
lang: String,
config: OcrConfig
): Stream[F, String] =
): Stream[F, Text] =
Stream
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
.flatMap({
@ -35,6 +36,7 @@ object TextExtract {
case mt =>
raiseError(s"File `$mt` not supported")
})
.map(Text.apply)
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
Stream.raiseError[F](new Exception(msg))

View File

@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler
import scala.util.Try
import docspell.extract.internal.Text
object OdfExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
def get(is: InputStream) =
@ -24,7 +25,7 @@ object OdfExtract {
val meta = new Metadata()
val ooparser = new OpenDocumentParser()
ooparser.parse(is, handler, meta, pctx)
handler.toString.trim
Text(Option(handler.toString))
}.toEither
}

View File

@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper
import scala.util.{Try, Using}
import fs2.Stream
import docspell.extract.internal.Text
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
def get(is: InputStream): Either[Throwable, String] =
def get(is: InputStream): Either[Throwable, Text] =
Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, String] =
def get(inFile: Path): Either[Throwable, Text] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, String] =
private def readText(doc: PDDocument): Either[Throwable, Text] =
Try {
val stripper = new PDFTextStripper()
stripper.setAddMoreFormatting(true)
stripper.setLineSeparator("\n")
stripper.getText(doc).trim // trim here already
Text(Option(stripper.getText(doc)))
}.toEither
}

View File

@ -17,19 +17,20 @@ import fs2.Stream
import scala.util.Try
import docspell.common._
import docspell.files.TikaMimetype
import docspell.extract.internal.Text
object PoiExtract {
def get[F[_]: Sync](
data: Stream[F, Byte],
hint: MimeTypeHint
): F[Either[Throwable, String]] =
): F[Either[Throwable, Text]] =
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
def get[F[_]: Sync](
data: Stream[F, Byte],
mime: MimeType
): F[Either[Throwable, String]] =
): F[Either[Throwable, Text]] =
mime match {
case PoiType.doc =>
getDoc(data)
@ -55,40 +56,40 @@ object PoiExtract {
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
}
def getDocx(is: InputStream): Either[Throwable, String] =
def getDocx(is: InputStream): Either[Throwable, Text] =
Try {
val xt = new XWPFWordExtractor(new XWPFDocument(is))
Option(xt.getText).map(_.trim).getOrElse("")
Text(Option(xt.getText))
}.toEither
def getDoc(is: InputStream): Either[Throwable, String] =
def getDoc(is: InputStream): Either[Throwable, Text] =
Try {
val xt = new WordExtractor(is)
Option(xt.getText).map(_.trim).getOrElse("")
Text(Option(xt.getText))
}.toEither
def getXlsx(is: InputStream): Either[Throwable, String] =
def getXlsx(is: InputStream): Either[Throwable, Text] =
Try {
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
Option(xt.getText).map(_.trim).getOrElse("")
Text(Option(xt.getText))
}.toEither
def getXls(is: InputStream): Either[Throwable, String] =
def getXls(is: InputStream): Either[Throwable, Text] =
Try {
val xt = new ExcelExtractor(new HSSFWorkbook(is))
Option(xt.getText).map(_.trim).getOrElse("")
Text(Option(xt.getText))
}.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
}

View File

@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream}
import cats.implicits._
import cats.effect.Sync
import docspell.common.MimeType
import docspell.extract.internal.Text
import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit
@ -14,14 +15,14 @@ object RtfExtract {
val rtfType = MimeType.application("rtf")
def get(is: InputStream): Either[Throwable, String] =
def get(is: InputStream): Either[Throwable, Text] =
Try {
val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument()
kit.read(is, doc, 0)
doc.getText(0, doc.getLength).trim
Text(doc.getText(0, doc.getLength))
}.toEither
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
}

View File

@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite {
.lastOrError
.unsafeRunSync()
assertEquals(extract.trim, expect.trim)
assertEquals(extract.value, expect)
}
}

View File

@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str)
val received = removeFormatting(str.value)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
case (file, txt) =>
val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str)
val received = removeFormatting(str.value)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
assertEquals(str, "")
assertEquals(str.value, "")
}
private def removeFormatting(str: String): String =