Post process all extracted text

Removes NUL bytes (0x00) and leading/trailing whitespace
2025-09-15 21:46:53 +00:00 · 2020-05-25 13:41:38 +02:00
parent 4e22361985
commit 2e88207ff1
10 changed files with 76 additions and 38 deletions
--- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract}
 import docspell.extract.odf.{OdfExtract, OdfType}
 import docspell.extract.poi.{PoiExtract, PoiType}
 import docspell.extract.rtf.RtfExtract
+import docspell.extract.internal.Text
 import fs2.Stream
 import docspell.files.TikaMimetype
 import docspell.files.ImageSize
@@ -38,23 +39,30 @@ object Extraction {
          case MimeType.PdfMatch(_) =>
            PdfExtract
              .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
+              .map(_.map(_.value))
              .map(ExtractResult.fromEither)

          case PoiType(mt) =>
-            PoiExtract.get(data, mt).map(ExtractResult.fromEither)
+            PoiExtract
+              .get(data, mt)
+              .map(_.map(_.value))
+              .map(ExtractResult.fromEither)

          case RtfExtract.rtfType =>
-            RtfExtract.get(data).map(ExtractResult.fromEither)
+            RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither)

          case OdfType(_) =>
-            OdfExtract.get(data).map(ExtractResult.fromEither)
+            OdfExtract
+              .get(data)
+              .map(_.map(_.value))
+              .map(ExtractResult.fromEither)

          case OcrType(mt) =>
            val doExtract = TextExtract
              .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
              .compile
              .lastOrError
-              .map(_.trim)
+              .map(_.value)
              .attempt
              .map(ExtractResult.fromEither)

@@ -85,13 +93,16 @@ object Extraction {
              .info(
                s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
              ) *>
-              OdfExtract.get(data).map(ExtractResult.fromEither)
+              OdfExtract
+                .get(data)
+                .map(_.map(_.value))
+                .map(ExtractResult.fromEither)

          case MimeType.NonHtmlText(mt) =>
            val cs = mt.charsetOrUtf8
            logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
              data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
-                ExtractResult.success(txt.getOrElse("").trim)
+                ExtractResult.success(Text(txt).value)
              }

          case mt =>
--- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala
@@ -6,6 +6,7 @@ import fs2.Stream
 import docspell.common.{Language, Logger}
 import docspell.extract.ocr.{OcrConfig, TextExtract}
 import docspell.extract.pdfbox.PdfboxExtract
+import docspell.extract.internal.Text

 object PdfExtract {

@@ -16,12 +17,12 @@ object PdfExtract {
      stripMinLen: Int,
      ocrCfg: OcrConfig,
      logger: Logger[F]
-  ): F[Either[Throwable, String]] = {
+  ): F[Either[Throwable, Text]] = {

    val runOcr =
      TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError

-    def chooseResult(ocrStr: String, strippedStr: String) =
+    def chooseResult(ocrStr: Text, strippedStr: Text) =
      if (ocrStr.length > strippedStr.length)
        logger.info(
          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
--- a/modules/extract/src/main/scala/docspell/extract/internal/Text.scala
+++ b/modules/extract/src/main/scala/docspell/extract/internal/Text.scala
@@ -0,0 +1,20 @@
+package docspell.extract.internal
+/** Extracted text: `raw` is the unprocessed input, `value` its cleaned form. */
+case class Text(raw: String) {
+  // Cleaned text: `raw` after `String.trim`, then with every NUL character (U+0000) removed; lazy, so evaluated at most once.
+  private lazy val textValue =
+    raw.trim.replace("\u0000", "")
+  /** Length of the cleaned text, not of `raw`. */
+  def length: Int =
+    textValue.length
+  /** The cleaned text (trimmed, NUL characters removed). */
+  def value: String =
+    textValue
+}
+
+object Text {
+  /** Builds a `Text` from an optional string; `None` becomes the empty string. */
+  def apply(ot: Option[String]): Text =
+    Text(ot.getOrElse(""))
+
+}
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@@ -3,6 +3,7 @@ package docspell.extract.ocr
 import cats.effect.{Blocker, ContextShift, Sync}
 import docspell.common._
 import docspell.files._
+import docspell.extract.internal.Text
 import fs2.Stream

 object TextExtract {
@@ -13,7 +14,7 @@ object TextExtract {
      logger: Logger[F],
      lang: String,
      config: OcrConfig
-  ): Stream[F, String] =
+  ): Stream[F, Text] =
    extractOCR(in, blocker, logger, lang, config)

  def extractOCR[F[_]: Sync: ContextShift](
@@ -22,7 +23,7 @@ object TextExtract {
      logger: Logger[F],
      lang: String,
      config: OcrConfig
-  ): Stream[F, String] =
+  ): Stream[F, Text] =
    Stream
      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap({
@@ -35,6 +36,7 @@ object TextExtract {
        case mt =>
          raiseError(s"File `$mt` not supported")
      })
+      .map(Text.apply)

  private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
    Stream.raiseError[F](new Exception(msg))
--- a/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala
@@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser
 import org.apache.tika.sax.BodyContentHandler

 import scala.util.Try
+import docspell.extract.internal.Text

 object OdfExtract {

-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)

  def get(is: InputStream) =
@@ -24,7 +25,7 @@ object OdfExtract {
      val meta     = new Metadata()
      val ooparser = new OpenDocumentParser()
      ooparser.parse(is, handler, meta, pctx)
-      handler.toString.trim
+      Text(Option(handler.toString))
    }.toEither

 }
--- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper

 import scala.util.{Try, Using}
 import fs2.Stream
+import docspell.extract.internal.Text

 object PdfboxExtract {

-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile
      .to(Array)
      .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)

-  def get(is: InputStream): Either[Throwable, String] =
+  def get(is: InputStream): Either[Throwable, Text] =
    Using(PDDocument.load(is))(readText).toEither.flatten

-  def get(inFile: Path): Either[Throwable, String] =
+  def get(inFile: Path): Either[Throwable, Text] =
    Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten

-  private def readText(doc: PDDocument): Either[Throwable, String] =
+  private def readText(doc: PDDocument): Either[Throwable, Text] =
    Try {
      val stripper = new PDFTextStripper()
      stripper.setAddMoreFormatting(true)
      stripper.setLineSeparator("\n")
-      stripper.getText(doc).trim // trim here already
+      Text(Option(stripper.getText(doc)))
    }.toEither
 }
--- a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
@@ -17,19 +17,20 @@ import fs2.Stream
 import scala.util.Try
 import docspell.common._
 import docspell.files.TikaMimetype
+import docspell.extract.internal.Text

 object PoiExtract {

  def get[F[_]: Sync](
      data: Stream[F, Byte],
      hint: MimeTypeHint
-  ): F[Either[Throwable, String]] =
+  ): F[Either[Throwable, Text]] =
    TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))

  def get[F[_]: Sync](
      data: Stream[F, Byte],
      mime: MimeType
-  ): F[Either[Throwable, String]] =
+  ): F[Either[Throwable, Text]] =
    mime match {
      case PoiType.doc =>
        getDoc(data)
@@ -55,40 +56,40 @@ object PoiExtract {
        Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
    }

-  def getDocx(is: InputStream): Either[Throwable, String] =
+  def getDocx(is: InputStream): Either[Throwable, Text] =
    Try {
      val xt = new XWPFWordExtractor(new XWPFDocument(is))
-      Option(xt.getText).map(_.trim).getOrElse("")
+      Text(Option(xt.getText))
    }.toEither

-  def getDoc(is: InputStream): Either[Throwable, String] =
+  def getDoc(is: InputStream): Either[Throwable, Text] =
    Try {
      val xt = new WordExtractor(is)
-      Option(xt.getText).map(_.trim).getOrElse("")
+      Text(Option(xt.getText))
    }.toEither

-  def getXlsx(is: InputStream): Either[Throwable, String] =
+  def getXlsx(is: InputStream): Either[Throwable, Text] =
    Try {
      val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
-      Option(xt.getText).map(_.trim).getOrElse("")
+      Text(Option(xt.getText))
    }.toEither

-  def getXls(is: InputStream): Either[Throwable, String] =
+  def getXls(is: InputStream): Either[Throwable, Text] =
    Try {
      val xt = new ExcelExtractor(new HSSFWorkbook(is))
-      Option(xt.getText).map(_.trim).getOrElse("")
+      Text(Option(xt.getText))
    }.toEither

-  def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)

-  def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)

-  def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)

-  def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)

 }
--- a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala
@@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream}
 import cats.implicits._
 import cats.effect.Sync
 import docspell.common.MimeType
+import docspell.extract.internal.Text
 import fs2.Stream
 import javax.swing.text.rtf.RTFEditorKit

@@ -14,14 +15,14 @@ object RtfExtract {

  val rtfType = MimeType.application("rtf")

-  def get(is: InputStream): Either[Throwable, String] =
+  def get(is: InputStream): Either[Throwable, Text] =
    Try {
      val kit = new RTFEditorKit()
      val doc = kit.createDefaultDocument()
      kit.read(is, doc, 0)
-      doc.getText(0, doc.getLength).trim
+      Text(doc.getText(0, doc.getLength))
    }.toEither

-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
 }
--- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
+++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
@@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite {
      .lastOrError
      .unsafeRunSync()

-    assertEquals(extract.trim, expect.trim)
+    assertEquals(extract.value, expect)
  }
 }
--- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
@@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
      case (file, txt) =>
        val url      = file.toJavaUrl.fold(sys.error, identity)
        val str      = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
-        val received = removeFormatting(str)
+        val received = removeFormatting(str.value)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
    }
@@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite {
      case (file, txt) =>
        val data     = file.readURL[IO](8192, blocker)
        val str      = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
-        val received = removeFormatting(str)
+        val received = removeFormatting(str.value)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
    }
@@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite {

    val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)

-    assertEquals(str, "")
+    assertEquals(str.value, "")
  }

  private def removeFormatting(str: String): String =