diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala index d5604499..54d41a09 100644 --- a/modules/extract/src/main/scala/docspell/extract/Extraction.scala +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -7,6 +7,7 @@ import docspell.extract.ocr.{OcrType, TextExtract} import docspell.extract.odf.{OdfExtract, OdfType} import docspell.extract.poi.{PoiExtract, PoiType} import docspell.extract.rtf.RtfExtract +import docspell.extract.internal.Text import fs2.Stream import docspell.files.TikaMimetype import docspell.files.ImageSize @@ -38,23 +39,30 @@ object Extraction { case MimeType.PdfMatch(_) => PdfExtract .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) + .map(_.map(_.value)) .map(ExtractResult.fromEither) case PoiType(mt) => - PoiExtract.get(data, mt).map(ExtractResult.fromEither) + PoiExtract + .get(data, mt) + .map(_.map(_.value)) + .map(ExtractResult.fromEither) case RtfExtract.rtfType => - RtfExtract.get(data).map(ExtractResult.fromEither) + RtfExtract.get(data).map(_.map(_.value)).map(ExtractResult.fromEither) case OdfType(_) => - OdfExtract.get(data).map(ExtractResult.fromEither) + OdfExtract + .get(data) + .map(_.map(_.value)) + .map(ExtractResult.fromEither) case OcrType(mt) => val doExtract = TextExtract .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) .compile .lastOrError - .map(_.trim) + .map(_.value) .attempt .map(ExtractResult.fromEither) @@ -85,13 +93,16 @@ object Extraction { .info( s"File detected as ${OdfType.container}. Try to read as OpenDocument file." ) *> - OdfExtract.get(data).map(ExtractResult.fromEither) + OdfExtract + .get(data) + .map(_.map(_.value)) + .map(ExtractResult.fromEither) case MimeType.NonHtmlText(mt) => val cs = mt.charsetOrUtf8 logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *> data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt => - ExtractResult.success(txt.getOrElse("").trim) + ExtractResult.success(Text(txt).value) } case mt => diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala index 5eaad5f1..bcdd0226 100644 --- a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -6,6 +6,7 @@ import fs2.Stream import docspell.common.{Language, Logger} import docspell.extract.ocr.{OcrConfig, TextExtract} import docspell.extract.pdfbox.PdfboxExtract +import docspell.extract.internal.Text object PdfExtract { @@ -16,12 +17,12 @@ object PdfExtract { stripMinLen: Int, ocrCfg: OcrConfig, logger: Logger[F] - ): F[Either[Throwable, String]] = { + ): F[Either[Throwable, Text]] = { val runOcr = TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError - def chooseResult(ocrStr: String, strippedStr: String) = + def chooseResult(ocrStr: Text, strippedStr: Text) = if (ocrStr.length > strippedStr.length) logger.info( s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" diff --git a/modules/extract/src/main/scala/docspell/extract/internal/Text.scala b/modules/extract/src/main/scala/docspell/extract/internal/Text.scala new file mode 100644 index 00000000..a42cb89e --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/internal/Text.scala @@ -0,0 +1,20 @@ +package docspell.extract.internal + +case class Text(raw: String) { + + private lazy val textValue = + raw.trim.replace("\u0000", "") + + def length: Int = + textValue.length + + def value: String = + textValue +} + +object Text { + + def apply(ot: Option[String]): Text = + Text(ot.getOrElse("")) + +} diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index 7246bb7c..4abc00e1 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -3,6 +3,7 @@ package docspell.extract.ocr import cats.effect.{Blocker, ContextShift, Sync} import docspell.common._ import docspell.files._ +import docspell.extract.internal.Text import fs2.Stream object TextExtract { @@ -13,7 +14,7 @@ object TextExtract { logger: Logger[F], lang: String, config: OcrConfig - ): Stream[F, String] = + ): Stream[F, Text] = extractOCR(in, blocker, logger, lang, config) def extractOCR[F[_]: Sync: ContextShift]( @@ -22,7 +23,7 @@ object TextExtract { logger: Logger[F], lang: String, config: OcrConfig - ): Stream[F, String] = + ): Stream[F, Text] = Stream .eval(TikaMimetype.detect(in, MimeTypeHint.none)) .flatMap({ @@ -35,6 +36,7 @@ object TextExtract { case mt => raiseError(s"File `$mt` not supported") }) + .map(Text.apply) private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] = Stream.raiseError[F](new Exception(msg)) diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala index 08b8e2fd..232be427 100644 --- a/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala @@ -11,10 +11,11 @@ import org.apache.tika.parser.odf.OpenDocumentParser import org.apache.tika.sax.BodyContentHandler import scala.util.Try +import docspell.extract.internal.Text object OdfExtract { - def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) def get(is: InputStream) = @@ -24,7 +25,7 @@ object OdfExtract { val meta = new Metadata() val ooparser = new OpenDocumentParser() ooparser.parse(is, handler, meta, pctx) - handler.toString.trim + Text(Option(handler.toString)) }.toEither } diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala index ac0f2f45..502db289 100644 --- a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -10,25 +10,26 @@ import org.apache.pdfbox.text.PDFTextStripper import scala.util.{Try, Using} import fs2.Stream +import docspell.extract.internal.Text object PdfboxExtract { - def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile .to(Array) .map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten) - def get(is: InputStream): Either[Throwable, String] = + def get(is: InputStream): Either[Throwable, Text] = Using(PDDocument.load(is))(readText).toEither.flatten - def get(inFile: Path): Either[Throwable, String] = + def get(inFile: Path): Either[Throwable, Text] = Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten - private def readText(doc: PDDocument): Either[Throwable, String] = + private def readText(doc: PDDocument): Either[Throwable, Text] = Try { val stripper = new PDFTextStripper() stripper.setAddMoreFormatting(true) stripper.setLineSeparator("\n") - stripper.getText(doc).trim // trim here already + Text(Option(stripper.getText(doc))) }.toEither } diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala index 10c0c439..daae9aae 100644 --- a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala @@ -17,19 +17,20 @@ import fs2.Stream import scala.util.Try import docspell.common._ import docspell.files.TikaMimetype +import docspell.extract.internal.Text object PoiExtract { def get[F[_]: Sync]( data: Stream[F, Byte], hint: MimeTypeHint - ): F[Either[Throwable, String]] = + ): F[Either[Throwable, Text]] = TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt)) def get[F[_]: Sync]( data: Stream[F, Byte], mime: MimeType - ): F[Either[Throwable, String]] = + ): F[Either[Throwable, Text]] = mime match { case PoiType.doc => getDoc(data) @@ -55,40 +56,40 @@ object PoiExtract { Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}"))) } - def getDocx(is: InputStream): Either[Throwable, String] = + def getDocx(is: InputStream): Either[Throwable, Text] = Try { val xt = new XWPFWordExtractor(new XWPFDocument(is)) - Option(xt.getText).map(_.trim).getOrElse("") + Text(Option(xt.getText)) }.toEither - def getDoc(is: InputStream): Either[Throwable, String] = + def getDoc(is: InputStream): Either[Throwable, Text] = Try { val xt = new WordExtractor(is) - Option(xt.getText).map(_.trim).getOrElse("") + Text(Option(xt.getText)) }.toEither - def getXlsx(is: InputStream): Either[Throwable, String] = + def getXlsx(is: InputStream): Either[Throwable, Text] = Try { val xt = new XSSFExcelExtractor(new XSSFWorkbook(is)) - Option(xt.getText).map(_.trim).getOrElse("") + Text(Option(xt.getText)) }.toEither - def getXls(is: InputStream): Either[Throwable, String] = + def getXls(is: InputStream): Either[Throwable, Text] = Try { val xt = new ExcelExtractor(new HSSFWorkbook(is)) - Option(xt.getText).map(_.trim).getOrElse("") + Text(Option(xt.getText)) }.toEither - def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx) - def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc) - def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx) - def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls) } diff --git a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala index c4a37fec..6850ed33 100644 --- a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala @@ -5,6 +5,7 @@ import java.io.{ByteArrayInputStream, InputStream} import cats.implicits._ import cats.effect.Sync import docspell.common.MimeType +import docspell.extract.internal.Text import fs2.Stream import javax.swing.text.rtf.RTFEditorKit @@ -14,14 +15,14 @@ object RtfExtract { val rtfType = MimeType.application("rtf") - def get(is: InputStream): Either[Throwable, String] = + def get(is: InputStream): Either[Throwable, Text] = Try { val kit = new RTFEditorKit() val doc = kit.createDefaultDocument() kit.read(is, doc, 0) - doc.getText(0, doc.getLength).trim + Text(doc.getText(0, doc.getLength)) }.toEither - def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, Text]] = data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) } diff --git a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index ac2998a8..4693fd6b 100644 --- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -29,6 +29,6 @@ object TextExtractionSuite extends SimpleTestSuite { .lastOrError .unsafeRunSync() - assertEquals(extract.trim, expect.trim) + assertEquals(extract.value, expect) } } diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala index 4d06be76..1f436b25 100644 --- a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -18,7 +18,7 @@ object PdfboxExtractTest extends SimpleTestSuite { case (file, txt) => val url = file.toJavaUrl.fold(sys.error, identity) val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) - val received = removeFormatting(str) + val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) } @@ -29,7 +29,7 @@ object PdfboxExtractTest extends SimpleTestSuite { case (file, txt) => val data = file.readURL[IO](8192, blocker) val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) - val received = removeFormatting(str) + val received = removeFormatting(str.value) val expect = removeFormatting(txt) assertEquals(received, expect) } @@ -40,7 +40,7 @@ object PdfboxExtractTest extends SimpleTestSuite { val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) - assertEquals(str, "") + assertEquals(str.value, "") } private def removeFormatting(str: String): String =