mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 15:15:58 +00:00
Post process all extracted text
Removes NUL (zero) bytes and leading/trailing whitespace
This commit is contained in:
parent
4e22361985
commit
2e88207ff1
@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract}
|
|||||||
import docspell.extract.odf.{OdfExtract, OdfType}
|
import docspell.extract.odf.{OdfExtract, OdfType}
|
||||||
import docspell.extract.poi.{PoiExtract, PoiType}
|
import docspell.extract.poi.{PoiExtract, PoiType}
|
||||||
import docspell.extract.rtf.RtfExtract
|
import docspell.extract.rtf.RtfExtract
|
||||||
|
import docspell.extract.internal.Text
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import docspell.files.TikaMimetype
|
import docspell.files.TikaMimetype
|
||||||
import docspell.files.ImageSize
|
import docspell.files.ImageSize
|
||||||
@ -38,23 +39,30 @@ object Extraction {
|
|||||||
case MimeType.PdfMatch(_) =>
|
case MimeType.PdfMatch(_) =>
|
||||||
PdfExtract
|
PdfExtract
|
||||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||||
|
.map(_.map(_.value))
|
||||||
.map(ExtractResult.fromEither)
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case PoiType(mt) =>
|
case PoiType(mt) =>
|
||||||
PoiExtract.get(data, mt).map(ExtractResult.fromEither)
|
PoiExtract
|
||||||
|
.get(data, mt)
|
||||||
|
.map(_.map(_.value))
|
||||||
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case RtfExtract.rtfType =>
|
case RtfExtract.rtfType =>
|
||||||
RtfExtract.get(data).map(ExtractResult.fromEither)
|
RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case OdfType(_) =>
|
case OdfType(_) =>
|
||||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
OdfExtract
|
||||||
|
.get(data)
|
||||||
|
.map(_.map(_.value))
|
||||||
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case OcrType(mt) =>
|
case OcrType(mt) =>
|
||||||
val doExtract = TextExtract
|
val doExtract = TextExtract
|
||||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.map(_.trim)
|
.map(_.value)
|
||||||
.attempt
|
.attempt
|
||||||
.map(ExtractResult.fromEither)
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
@ -85,13 +93,16 @@ object Extraction {
|
|||||||
.info(
|
.info(
|
||||||
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
|
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
|
||||||
) *>
|
) *>
|
||||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
OdfExtract
|
||||||
|
.get(data)
|
||||||
|
.map(_.map(_.value))
|
||||||
|
.map(ExtractResult.fromEither)
|
||||||
|
|
||||||
case MimeType.NonHtmlText(mt) =>
|
case MimeType.NonHtmlText(mt) =>
|
||||||
val cs = mt.charsetOrUtf8
|
val cs = mt.charsetOrUtf8
|
||||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||||
ExtractResult.success(txt.getOrElse("").trim)
|
ExtractResult.success(Text(txt).value)
|
||||||
}
|
}
|
||||||
|
|
||||||
case mt =>
|
case mt =>
|
||||||
|
@ -6,6 +6,7 @@ import fs2.Stream
|
|||||||
import docspell.common.{Language, Logger}
|
import docspell.common.{Language, Logger}
|
||||||
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||||
import docspell.extract.pdfbox.PdfboxExtract
|
import docspell.extract.pdfbox.PdfboxExtract
|
||||||
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
object PdfExtract {
|
object PdfExtract {
|
||||||
|
|
||||||
@ -16,12 +17,12 @@ object PdfExtract {
|
|||||||
stripMinLen: Int,
|
stripMinLen: Int,
|
||||||
ocrCfg: OcrConfig,
|
ocrCfg: OcrConfig,
|
||||||
logger: Logger[F]
|
logger: Logger[F]
|
||||||
): F[Either[Throwable, String]] = {
|
): F[Either[Throwable, Text]] = {
|
||||||
|
|
||||||
val runOcr =
|
val runOcr =
|
||||||
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError
|
||||||
|
|
||||||
def chooseResult(ocrStr: String, strippedStr: String) =
|
def chooseResult(ocrStr: Text, strippedStr: Text) =
|
||||||
if (ocrStr.length > strippedStr.length)
|
if (ocrStr.length > strippedStr.length)
|
||||||
logger.info(
|
logger.info(
|
||||||
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
|
||||||
|
@ -0,0 +1,20 @@
|
|||||||
|
package docspell.extract.internal
|
||||||
|
|
||||||
|
/** Text produced by one of the extractors, post-processed before use.
  *
  * The post-processed form is `raw` with leading and trailing whitespace
  * trimmed and every NUL character (U+0000) removed. It is what both
  * [[value]] and [[length]] report.
  *
  * Equality and hashing are the generated case-class ones, so they are
  * based on `raw`, not on the post-processed text.
  *
  * @param raw the text exactly as returned by the extractor; a `null`
  *            reference is treated like the empty string
  */
case class Text(raw: String) {

  // Evaluated on first access only. Without the `Option` guard a `null`
  // raw string would throw a NullPointerException at that later point,
  // i.e. outside of any `Try` that merely wrapped the construction of
  // this object.
  private lazy val textValue: String =
    Option(raw).fold("")(_.trim.replace("\u0000", ""))

  /** Number of characters of the post-processed text. */
  def length: Int =
    textValue.length

  /** The post-processed text: trimmed and without NUL characters. */
  def value: String =
    textValue
}

object Text {

  /** Creates a [[Text]] from an optional string; `None` yields empty text.
    *
    * @param ot the extracted text, if any
    */
  def apply(ot: Option[String]): Text =
    Text(ot.getOrElse(""))

}
|
@ -3,6 +3,7 @@ package docspell.extract.ocr
|
|||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{Blocker, ContextShift, Sync}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.files._
|
import docspell.files._
|
||||||
|
import docspell.extract.internal.Text
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
object TextExtract {
|
object TextExtract {
|
||||||
@ -13,7 +14,7 @@ object TextExtract {
|
|||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, Text] =
|
||||||
extractOCR(in, blocker, logger, lang, config)
|
extractOCR(in, blocker, logger, lang, config)
|
||||||
|
|
||||||
def extractOCR[F[_]: Sync: ContextShift](
|
def extractOCR[F[_]: Sync: ContextShift](
|
||||||
@ -22,7 +23,7 @@ object TextExtract {
|
|||||||
logger: Logger[F],
|
logger: Logger[F],
|
||||||
lang: String,
|
lang: String,
|
||||||
config: OcrConfig
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, Text] =
|
||||||
Stream
|
Stream
|
||||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||||
.flatMap({
|
.flatMap({
|
||||||
@ -35,6 +36,7 @@ object TextExtract {
|
|||||||
case mt =>
|
case mt =>
|
||||||
raiseError(s"File `$mt` not supported")
|
raiseError(s"File `$mt` not supported")
|
||||||
})
|
})
|
||||||
|
.map(Text.apply)
|
||||||
|
|
||||||
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
|
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
|
||||||
Stream.raiseError[F](new Exception(msg))
|
Stream.raiseError[F](new Exception(msg))
|
||||||
|
@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser
|
|||||||
import org.apache.tika.sax.BodyContentHandler
|
import org.apache.tika.sax.BodyContentHandler
|
||||||
|
|
||||||
import scala.util.Try
|
import scala.util.Try
|
||||||
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
object OdfExtract {
|
object OdfExtract {
|
||||||
|
|
||||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||||
|
|
||||||
def get(is: InputStream) =
|
def get(is: InputStream) =
|
||||||
@ -24,7 +25,7 @@ object OdfExtract {
|
|||||||
val meta = new Metadata()
|
val meta = new Metadata()
|
||||||
val ooparser = new OpenDocumentParser()
|
val ooparser = new OpenDocumentParser()
|
||||||
ooparser.parse(is, handler, meta, pctx)
|
ooparser.parse(is, handler, meta, pctx)
|
||||||
handler.toString.trim
|
Text(Option(handler.toString))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper
|
|||||||
|
|
||||||
import scala.util.{Try, Using}
|
import scala.util.{Try, Using}
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
object PdfboxExtract {
|
object PdfboxExtract {
|
||||||
|
|
||||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile
|
data.compile
|
||||||
.to(Array)
|
.to(Array)
|
||||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||||
|
|
||||||
def get(is: InputStream): Either[Throwable, String] =
|
def get(is: InputStream): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||||
|
|
||||||
def get(inFile: Path): Either[Throwable, String] =
|
def get(inFile: Path): Either[Throwable, Text] =
|
||||||
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
|
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
|
||||||
|
|
||||||
private def readText(doc: PDDocument): Either[Throwable, String] =
|
private def readText(doc: PDDocument): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val stripper = new PDFTextStripper()
|
val stripper = new PDFTextStripper()
|
||||||
stripper.setAddMoreFormatting(true)
|
stripper.setAddMoreFormatting(true)
|
||||||
stripper.setLineSeparator("\n")
|
stripper.setLineSeparator("\n")
|
||||||
stripper.getText(doc).trim // trim here already
|
Text(Option(stripper.getText(doc)))
|
||||||
}.toEither
|
}.toEither
|
||||||
}
|
}
|
||||||
|
@ -17,19 +17,20 @@ import fs2.Stream
|
|||||||
import scala.util.Try
|
import scala.util.Try
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.files.TikaMimetype
|
import docspell.files.TikaMimetype
|
||||||
|
import docspell.extract.internal.Text
|
||||||
|
|
||||||
object PoiExtract {
|
object PoiExtract {
|
||||||
|
|
||||||
def get[F[_]: Sync](
|
def get[F[_]: Sync](
|
||||||
data: Stream[F, Byte],
|
data: Stream[F, Byte],
|
||||||
hint: MimeTypeHint
|
hint: MimeTypeHint
|
||||||
): F[Either[Throwable, String]] =
|
): F[Either[Throwable, Text]] =
|
||||||
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
|
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
|
||||||
|
|
||||||
def get[F[_]: Sync](
|
def get[F[_]: Sync](
|
||||||
data: Stream[F, Byte],
|
data: Stream[F, Byte],
|
||||||
mime: MimeType
|
mime: MimeType
|
||||||
): F[Either[Throwable, String]] =
|
): F[Either[Throwable, Text]] =
|
||||||
mime match {
|
mime match {
|
||||||
case PoiType.doc =>
|
case PoiType.doc =>
|
||||||
getDoc(data)
|
getDoc(data)
|
||||||
@ -55,40 +56,40 @@ object PoiExtract {
|
|||||||
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
|
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
|
||||||
}
|
}
|
||||||
|
|
||||||
def getDocx(is: InputStream): Either[Throwable, String] =
|
def getDocx(is: InputStream): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val xt = new XWPFWordExtractor(new XWPFDocument(is))
|
val xt = new XWPFWordExtractor(new XWPFDocument(is))
|
||||||
Option(xt.getText).map(_.trim).getOrElse("")
|
Text(Option(xt.getText))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def getDoc(is: InputStream): Either[Throwable, String] =
|
def getDoc(is: InputStream): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val xt = new WordExtractor(is)
|
val xt = new WordExtractor(is)
|
||||||
Option(xt.getText).map(_.trim).getOrElse("")
|
Text(Option(xt.getText))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def getXlsx(is: InputStream): Either[Throwable, String] =
|
def getXlsx(is: InputStream): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
|
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
|
||||||
Option(xt.getText).map(_.trim).getOrElse("")
|
Text(Option(xt.getText))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def getXls(is: InputStream): Either[Throwable, String] =
|
def getXls(is: InputStream): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val xt = new ExcelExtractor(new HSSFWorkbook(is))
|
val xt = new ExcelExtractor(new HSSFWorkbook(is))
|
||||||
Option(xt.getText).map(_.trim).getOrElse("")
|
Text(Option(xt.getText))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
|
||||||
|
|
||||||
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
|
||||||
|
|
||||||
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
|
||||||
|
|
||||||
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream}
|
|||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
import docspell.common.MimeType
|
import docspell.common.MimeType
|
||||||
|
import docspell.extract.internal.Text
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import javax.swing.text.rtf.RTFEditorKit
|
import javax.swing.text.rtf.RTFEditorKit
|
||||||
|
|
||||||
@ -14,14 +15,14 @@ object RtfExtract {
|
|||||||
|
|
||||||
val rtfType = MimeType.application("rtf")
|
val rtfType = MimeType.application("rtf")
|
||||||
|
|
||||||
def get(is: InputStream): Either[Throwable, String] =
|
def get(is: InputStream): Either[Throwable, Text] =
|
||||||
Try {
|
Try {
|
||||||
val kit = new RTFEditorKit()
|
val kit = new RTFEditorKit()
|
||||||
val doc = kit.createDefaultDocument()
|
val doc = kit.createDefaultDocument()
|
||||||
kit.read(is, doc, 0)
|
kit.read(is, doc, 0)
|
||||||
doc.getText(0, doc.getLength).trim
|
Text(doc.getText(0, doc.getLength))
|
||||||
}.toEither
|
}.toEither
|
||||||
|
|
||||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
|
||||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite {
|
|||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
|
|
||||||
assertEquals(extract.trim, expect.trim)
|
assertEquals(extract.value, expect)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
case (file, txt) =>
|
case (file, txt) =>
|
||||||
val url = file.toJavaUrl.fold(sys.error, identity)
|
val url = file.toJavaUrl.fold(sys.error, identity)
|
||||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||||
val received = removeFormatting(str)
|
val received = removeFormatting(str.value)
|
||||||
val expect = removeFormatting(txt)
|
val expect = removeFormatting(txt)
|
||||||
assertEquals(received, expect)
|
assertEquals(received, expect)
|
||||||
}
|
}
|
||||||
@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
case (file, txt) =>
|
case (file, txt) =>
|
||||||
val data = file.readURL[IO](8192, blocker)
|
val data = file.readURL[IO](8192, blocker)
|
||||||
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
|
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
|
||||||
val received = removeFormatting(str)
|
val received = removeFormatting(str.value)
|
||||||
val expect = removeFormatting(txt)
|
val expect = removeFormatting(txt)
|
||||||
assertEquals(received, expect)
|
assertEquals(received, expect)
|
||||||
}
|
}
|
||||||
@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
|
|||||||
|
|
||||||
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
|
||||||
|
|
||||||
assertEquals(str, "")
|
assertEquals(str.value, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
private def removeFormatting(str: String): String =
|
private def removeFormatting(str: String): String =
|
||||||
|
Loading…
x
Reference in New Issue
Block a user