sbt scalafmtAll

This commit is contained in:
Eike Kettner
2020-02-25 20:55:00 +01:00
parent 4dbf75dd8f
commit 2f87065b2e
86 changed files with 582 additions and 525 deletions

View File

@ -29,7 +29,7 @@ object Extraction {
data: Stream[F, Byte],
dataType: DataType,
lang: Language
): F[ExtractResult] = {
): F[ExtractResult] =
TikaMimetype.resolve(dataType, data).flatMap {
case MimeType.pdf =>
PdfExtract
@ -50,39 +50,46 @@ object Extraction {
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
.compile
.lastOrError
.map(_.trim)
.attempt
.map(ExtractResult.fromEither)
ImageSize.get(data).flatMap {
case Some(dim) =>
if (dim.product > cfg.ocr.maxImageSize) {
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
ExtractResult.failure(new Exception(
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
).pure[F]
logger.info(
s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize})."
) *>
ExtractResult
.failure(
new Exception(
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize})."
)
)
.pure[F]
} else {
doExtract
}
case None =>
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
doExtract
doExtract
}
case OdfType.container =>
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
logger
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
OdfExtract.get(data).map(ExtractResult.fromEither)
case mt@MimeType("text", sub) if !sub.contains("html") =>
case mt @ MimeType("text", sub) if !sub.contains("html") =>
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim)
}
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
ExtractResult.success(txt.getOrElse("").trim)
}
case mt =>
ExtractResult.unsupportedFormat(mt).pure[F]
}
}
}
}

View File

@ -1,3 +1,3 @@
package docspell.extract
/** Configuration values for handling PDF files in the extract module.
  *
  * @param minTextLen a text-length threshold; presumably the minimum length of
  *                   text stripped from a PDF that is accepted before another
  *                   strategy is tried -- TODO confirm against `PdfExtract`
  */
case class PdfConfig(minTextLen: Int)

View File

@ -33,7 +33,8 @@ object PdfExtract {
//maybe better: inspect the pdf and decide whether ocr or not
for {
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
.get[F](in)
res <- pdfboxRes.fold(
ex =>
logger.info(

View File

@ -5,13 +5,12 @@ import java.nio.file.{Path, Paths}
import docspell.common._
/** Settings for the OCR stage.
  *
  * Each parameter is declared exactly once; the class needs no body.
  *
  * @param maxImageSize limit for the size of an image handed to OCR; presumably
  *                     compared against the pixel count (width * height) --
  *                     TODO confirm against the caller
  * @param ghostscript  settings of type `OcrConfig.Ghostscript`
  * @param pageRange    settings of type `OcrConfig.PageRange`
  * @param unpaper      settings of type `OcrConfig.Unpaper`
  * @param tesseract    settings of type `OcrConfig.Tesseract`
  */
case class OcrConfig(
    maxImageSize: Int,
    ghostscript: OcrConfig.Ghostscript,
    pageRange: OcrConfig.PageRange,
    unpaper: OcrConfig.Unpaper,
    tesseract: OcrConfig.Tesseract
)
object OcrConfig {

View File

@ -5,9 +5,9 @@ import docspell.common.MimeType
object OcrType {
val jpeg = MimeType.jpeg
val png = MimeType.png
val png = MimeType.png
val tiff = MimeType.tiff
val pdf = MimeType.pdf
val pdf = MimeType.pdf
val all = Set(jpeg, png, tiff, pdf)

View File

@ -17,14 +17,14 @@ object OdfExtract {
/** Collects the complete byte stream into an array, wraps it in a
  * `ByteArrayInputStream` and delegates to the `InputStream` overload of `get`.
  */
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
  data.compile
    .to(Array)
    .map(bytes => get(new ByteArrayInputStream(bytes)))
/** Runs an `OpenDocumentParser` over the given stream and returns the text
  * collected by the `BodyContentHandler`, trimmed.
  *
  * Non-fatal exceptions raised while parsing are captured by `Try` and
  * returned as a `Left`. This method does not itself close `is`.
  *
  * @param is the stream to parse
  * @return the trimmed text on success, the captured exception otherwise
  */
def get(is: InputStream): Either[Throwable, String] =
  Try {
    val handler  = new BodyContentHandler()
    val pctx     = new ParseContext()
    val meta     = new Metadata()
    val ooparser = new OpenDocumentParser()
    ooparser.parse(is, handler, meta, pctx)
    handler.toString.trim
  }.toEither
}

View File

@ -4,8 +4,8 @@ import docspell.common.MimeType
object OdfType {
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
val odt = MimeType.application("vnd.oasis.opendocument.text")
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

View File

@ -14,9 +14,7 @@ import fs2.Stream
object PdfboxExtract {
/** Collects the complete byte stream into an array, loads it as a
  * `PDDocument` and passes the document to `readText`.
  *
  * `Using` releases the document after `readText` returns. Non-fatal
  * exceptions from loading or reading end up in the `Left` of the result,
  * as does a `Left` returned by `readText` itself.
  */
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
  data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
/** Loads a `PDDocument` from the given stream and passes it to `readText`.
  *
  * The document is managed by `Using`, so it is released once `readText`
  * has returned; failures are merged into the `Left` side of the result.
  */
def get(is: InputStream): Either[Throwable, String] = {
  val attempt = Using(PDDocument.load(is))(doc => readText(doc))
  attempt.toEither.flatten
}

View File

@ -52,25 +52,25 @@ object PoiExtract {
/** Extracts the text of a document opened as `XWPFDocument` from the stream.
  *
  * The extractor's text is read once: a `null` text becomes the empty
  * string, any other text is trimmed. Non-fatal exceptions are captured by
  * `Try` and returned as a `Left`.
  */
def getDocx(is: InputStream): Either[Throwable, String] =
  Try {
    val xt = new XWPFWordExtractor(new XWPFDocument(is))
    // `Option(...)` guards against a null text, which a direct `.trim`
    // would turn into a NullPointerException.
    Option(xt.getText).map(_.trim).getOrElse("")
  }.toEither
/** Extracts the text of a document read by `WordExtractor` from the stream.
  *
  * The extractor's text is read once: a `null` text becomes the empty
  * string, any other text is trimmed. Non-fatal exceptions are captured by
  * `Try` and returned as a `Left`.
  */
def getDoc(is: InputStream): Either[Throwable, String] =
  Try {
    val xt = new WordExtractor(is)
    // `Option(...)` guards against a null text, which a direct `.trim`
    // would turn into a NullPointerException.
    Option(xt.getText).map(_.trim).getOrElse("")
  }.toEither
/** Extracts the text of a workbook opened as `XSSFWorkbook` from the stream.
  *
  * The extractor's text is read once: a `null` text becomes the empty
  * string, any other text is trimmed. Non-fatal exceptions are captured by
  * `Try` and returned as a `Left`.
  */
def getXlsx(is: InputStream): Either[Throwable, String] =
  Try {
    val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
    // `Option(...)` guards against a null text, which a direct `.trim`
    // would turn into a NullPointerException.
    Option(xt.getText).map(_.trim).getOrElse("")
  }.toEither
/** Extracts the text of a workbook opened as `HSSFWorkbook` from the stream.
  *
  * The extractor's text is read once: a `null` text becomes the empty
  * string, any other text is trimmed. Non-fatal exceptions are captured by
  * `Try` and returned as a `Left`.
  */
def getXls(is: InputStream): Either[Throwable, String] =
  Try {
    val xt = new ExcelExtractor(new HSSFWorkbook(is))
    // `Option(...)` guards against a null text, which a direct `.trim`
    // would turn into a NullPointerException.
    Option(xt.getText).map(_.trim).getOrElse("")
  }.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =

View File

@ -5,11 +5,11 @@ import docspell.common.MimeType
object PoiType {
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)

View File

@ -5,7 +5,7 @@ import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object OdfExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val files = List(
@ -14,14 +14,15 @@ object OdfExtractTest extends SimpleTestSuite {
)
test("test extract from odt") {
files.foreach { case (file, len) =>
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str1 = OdfExtract.get(is).fold(throw _, identity)
assertEquals(str1.length, len)
files.foreach {
case (file, len) =>
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str1 = OdfExtract.get(is).fold(throw _, identity)
assertEquals(str1.length, len)
val data = file.readURL[IO](8192, blocker)
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
assertEquals(str2, str1)
val data = file.readURL[IO](8192, blocker)
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
assertEquals(str2, str1)
}
}

View File

@ -7,8 +7,8 @@ object RtfExtractTest extends SimpleTestSuite {
// Reads the sample RTF through a plain java.io stream and checks the length
// of the extracted text.
test("extract text from rtf using java input-stream") {
  val file = ExampleFiles.examples_sample_rtf
  // A Left from `toJavaUrl` aborts the test via `sys.error`.
  val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
  // An extraction failure is rethrown so the test fails with the original cause.
  val str = RtfExtract.get(is).fold(throw _, identity)
  assertEquals(str.length, 7342)
}
}