mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Post process all extracted text
Removes NUL (zero) bytes and leading/trailing whitespace
This commit is contained in:
parent
4e22361985
commit
2e88207ff1
@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract}
|
||||
import docspell.extract.odf.{OdfExtract, OdfType}
|
||||
import docspell.extract.poi.{PoiExtract, PoiType}
|
||||
import docspell.extract.rtf.RtfExtract
|
||||
import docspell.extract.internal.Text
|
||||
import fs2.Stream
|
||||
import docspell.files.TikaMimetype
|
||||
import docspell.files.ImageSize
|
||||
@ -38,23 +39,30 @@ object Extraction {
|
||||
case MimeType.PdfMatch(_) =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(_.map(_.value))
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
case PoiType(mt) =>
|
||||
PoiExtract.get(data, mt).map(ExtractResult.fromEither)
|
||||
PoiExtract
|
||||
.get(data, mt)
|
||||
.map(_.map(_.value))
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
case RtfExtract.rtfType =>
|
||||
RtfExtract.get(data).map(ExtractResult.fromEither)
|
||||
RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither)
|
||||
|
||||
case OdfType(_) =>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
OdfExtract
|
||||
.get(data)
|
||||
.map(_.map(_.value))
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
case OcrType(mt) =>
|
||||
val doExtract = TextExtract
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(_.trim)
|
||||
.map(_.value)
|
||||
.attempt
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
@ -85,13 +93,16 @@ object Extraction {
|
||||
.info(
|
||||
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
|
||||
) *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
OdfExtract
|
||||
.get(data)
|
||||
.map(_.map(_.value))
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
case MimeType.NonHtmlText(mt) =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
ExtractResult.success(Text(txt).value)
|
||||
}
|
||||
|
||||
case mt =>
|
||||
|
@ -6,6 +6,7 @@ import fs2.Stream
|
||||
import docspell.common.{Language, Logger}
|
||||
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||
import docspell.extract.pdfbox.PdfboxExtract
|
||||
import docspell.extract.internal.Text
|
||||
|
||||
object PdfExtract {
|
||||
|
||||
@ -16,12 +17,12 @@ object PdfExtract {
|
||||
stripMinLen: Int,
|
||||
ocrCfg: OcrConfig,
|
||||
logger: Logger[F]
|
||||
): F[Either[Throwable, String]] = {
|
||||
): F[Either[Throwable, Text]] = {
|
||||
|
||||
val runOcr =
|
||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||
|
||||
def chooseResult(ocrStr: String, strippedStr: String) =
|
||||
def chooseResult(ocrStr: Text, strippedStr: Text) =
|
||||
if (ocrStr.length > strippedStr.length)
|
||||
logger.info(
|
||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
||||
|
@ -0,0 +1,20 @@
|
||||
package docspell.extract.internal
|
||||
|
||||
/** Wrapper around a piece of extracted text.
  *
  * The string passed in is kept as `raw`; the accessors below expose a
  * cleaned-up form of it: trimmed at both ends and with all NUL characters
  * (U+0000) removed.
  *
  * @param raw the text exactly as it was produced by an extractor
  */
case class Text(raw: String) {
|
||||
|
||||
// Cleaned form of `raw`. Being a lazy val, it is computed on first access
// and then reused by both `length` and `value`.
private lazy val textValue =
|
||||
raw.trim.replace("\u0000", "")
|
||||
|
||||
/** Length of the cleaned text (not of `raw`). */
def length: Int =
|
||||
textValue.length
|
||||
|
||||
/** The cleaned text: `raw` trimmed and without NUL characters. */
def value: String =
|
||||
textValue
|
||||
}
|
||||
|
||||
/** Companion providing an additional constructor for optional input. */
object Text {
|
||||
|
||||
/** Creates a `Text` from an optional string; `None` results in a `Text`
  * wrapping the empty string.
  */
def apply(ot: Option[String]): Text =
|
||||
Text(ot.getOrElse(""))
|
||||
|
||||
}
|
@ -3,6 +3,7 @@ package docspell.extract.ocr
|
||||
import cats.effect.{Blocker, ContextShift, Sync}
|
||||
import docspell.common._
|
||||
import docspell.files._
|
||||
import docspell.extract.internal.Text
|
||||
import fs2.Stream
|
||||
|
||||
object TextExtract {
|
||||
@ -13,7 +14,7 @@ object TextExtract {
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
): Stream[F, Text] =
|
||||
extractOCR(in, blocker, logger, lang, config)
|
||||
|
||||
def extractOCR[F[_]: Sync: ContextShift](
|
||||
@ -22,7 +23,7 @@ object TextExtract {
|
||||
logger: Logger[F],
|
||||
lang: String,
|
||||
config: OcrConfig
|
||||
): Stream[F, String] =
|
||||
): Stream[F, Text] =
|
||||
Stream
|
||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||
.flatMap({
|
||||
@ -35,6 +36,7 @@ object TextExtract {
|
||||
case mt =>
|
||||
raiseError(s"File `$mt` not supported")
|
||||
})
|
||||
.map(Text.apply)
|
||||
|
||||
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
|
||||
Stream.raiseError[F](new Exception(msg))
|
||||
|
@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser
|
||||
import org.apache.tika.sax.BodyContentHandler
|
||||
|
||||
import scala.util.Try
|
||||
import docspell.extract.internal.Text
|
||||
|
||||
object OdfExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||
|
||||
def get(is: InputStream) =
|
||||
@ -24,7 +25,7 @@ object OdfExtract {
|
||||
val meta = new Metadata()
|
||||
val ooparser = new OpenDocumentParser()
|
||||
ooparser.parse(is, handler, meta, pctx)
|
||||
handler.toString.trim
|
||||
Text(Option(handler.toString))
|
||||
}.toEither
|
||||
|
||||
}
|
||||
|
@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper
|
||||
|
||||
import scala.util.{Try, Using}
|
||||
import fs2.Stream
|
||||
import docspell.extract.internal.Text
|
||||
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
|
||||
def get(is: InputStream): Either[Throwable, String] =
|
||||
def get(is: InputStream): Either[Throwable, Text] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
|
||||
def get(inFile: Path): Either[Throwable, String] =
|
||||
def get(inFile: Path): Either[Throwable, Text] =
|
||||
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
|
||||
|
||||
private def readText(doc: PDDocument): Either[Throwable, String] =
|
||||
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||
Try {
|
||||
val stripper = new PDFTextStripper()
|
||||
stripper.setAddMoreFormatting(true)
|
||||
stripper.setLineSeparator("\n")
|
||||
stripper.getText(doc).trim // trim here already
|
||||
Text(Option(stripper.getText(doc)))
|
||||
}.toEither
|
||||
}
|
||||
|
@ -17,19 +17,20 @@ import fs2.Stream
|
||||
import scala.util.Try
|
||||
import docspell.common._
|
||||
import docspell.files.TikaMimetype
|
||||
import docspell.extract.internal.Text
|
||||
|
||||
object PoiExtract {
|
||||
|
||||
def get[F[_]: Sync](
|
||||
data: Stream[F, Byte],
|
||||
hint: MimeTypeHint
|
||||
): F[Either[Throwable, String]] =
|
||||
): F[Either[Throwable, Text]] =
|
||||
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
|
||||
|
||||
def get[F[_]: Sync](
|
||||
data: Stream[F, Byte],
|
||||
mime: MimeType
|
||||
): F[Either[Throwable, String]] =
|
||||
): F[Either[Throwable, Text]] =
|
||||
mime match {
|
||||
case PoiType.doc =>
|
||||
getDoc(data)
|
||||
@ -55,40 +56,40 @@ object PoiExtract {
|
||||
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
|
||||
}
|
||||
|
||||
def getDocx(is: InputStream): Either[Throwable, String] =
|
||||
def getDocx(is: InputStream): Either[Throwable, Text] =
|
||||
Try {
|
||||
val xt = new XWPFWordExtractor(new XWPFDocument(is))
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
Text(Option(xt.getText))
|
||||
}.toEither
|
||||
|
||||
def getDoc(is: InputStream): Either[Throwable, String] =
|
||||
def getDoc(is: InputStream): Either[Throwable, Text] =
|
||||
Try {
|
||||
val xt = new WordExtractor(is)
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
Text(Option(xt.getText))
|
||||
}.toEither
|
||||
|
||||
def getXlsx(is: InputStream): Either[Throwable, String] =
|
||||
def getXlsx(is: InputStream): Either[Throwable, Text] =
|
||||
Try {
|
||||
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
Text(Option(xt.getText))
|
||||
}.toEither
|
||||
|
||||
def getXls(is: InputStream): Either[Throwable, String] =
|
||||
def getXls(is: InputStream): Either[Throwable, Text] =
|
||||
Try {
|
||||
val xt = new ExcelExtractor(new HSSFWorkbook(is))
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
Text(Option(xt.getText))
|
||||
}.toEither
|
||||
|
||||
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
|
||||
|
||||
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
|
||||
|
||||
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
|
||||
|
||||
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
|
||||
|
||||
}
|
||||
|
@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream}
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common.MimeType
|
||||
import docspell.extract.internal.Text
|
||||
import fs2.Stream
|
||||
import javax.swing.text.rtf.RTFEditorKit
|
||||
|
||||
@ -14,14 +15,14 @@ object RtfExtract {
|
||||
|
||||
val rtfType = MimeType.application("rtf")
|
||||
|
||||
def get(is: InputStream): Either[Throwable, String] =
|
||||
def get(is: InputStream): Either[Throwable, Text] =
|
||||
Try {
|
||||
val kit = new RTFEditorKit()
|
||||
val doc = kit.createDefaultDocument()
|
||||
kit.read(is, doc, 0)
|
||||
doc.getText(0, doc.getLength).trim
|
||||
Text(doc.getText(0, doc.getLength))
|
||||
}.toEither
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||
}
|
||||
|
@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite {
|
||||
.lastOrError
|
||||
.unsafeRunSync()
|
||||
|
||||
assertEquals(extract.trim, expect.trim)
|
||||
assertEquals(extract.value, expect)
|
||||
}
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
case (file, txt) =>
|
||||
val url = file.toJavaUrl.fold(sys.error, identity)
|
||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||
val received = removeFormatting(str)
|
||||
val received = removeFormatting(str.value)
|
||||
val expect = removeFormatting(txt)
|
||||
assertEquals(received, expect)
|
||||
}
|
||||
@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
case (file, txt) =>
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
|
||||
val received = removeFormatting(str)
|
||||
val received = removeFormatting(str.value)
|
||||
val expect = removeFormatting(txt)
|
||||
assertEquals(received, expect)
|
||||
}
|
||||
@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
||||
|
||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||
|
||||
assertEquals(str, "")
|
||||
assertEquals(str.value, "")
|
||||
}
|
||||
|
||||
private def removeFormatting(str: String): String =
|
||||
|
Loading…
x
Reference in New Issue
Block a user