mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
sbt scalafmtAll
This commit is contained in:
@ -29,7 +29,7 @@ object Extraction {
|
||||
data: Stream[F, Byte],
|
||||
dataType: DataType,
|
||||
lang: Language
|
||||
): F[ExtractResult] = {
|
||||
): F[ExtractResult] =
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.pdf =>
|
||||
PdfExtract
|
||||
@ -50,39 +50,46 @@ object Extraction {
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(_.trim)
|
||||
.attempt
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
ImageSize.get(data).flatMap {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.ocr.maxImageSize) {
|
||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
|
||||
ExtractResult.failure(new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
|
||||
).pure[F]
|
||||
logger.info(
|
||||
s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize})."
|
||||
) *>
|
||||
ExtractResult
|
||||
.failure(
|
||||
new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize})."
|
||||
)
|
||||
)
|
||||
.pure[F]
|
||||
} else {
|
||||
doExtract
|
||||
}
|
||||
case None =>
|
||||
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
|
||||
doExtract
|
||||
doExtract
|
||||
}
|
||||
|
||||
case OdfType.container =>
|
||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
logger
|
||||
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt@MimeType("text", sub) if !sub.contains("html") =>
|
||||
case mt @ MimeType("text", sub) if !sub.contains("html") =>
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
}
|
||||
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
}
|
||||
|
||||
case mt =>
|
||||
ExtractResult.unsupportedFormat(mt).pure[F]
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,3 +1,3 @@
|
||||
package docspell.extract
|
||||
|
||||
case class PdfConfig (minTextLen: Int)
|
||||
case class PdfConfig(minTextLen: Int)
|
||||
|
@ -33,7 +33,8 @@ object PdfExtract {
|
||||
|
||||
//maybe better: inspect the pdf and decide whether ocr or not
|
||||
for {
|
||||
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
|
||||
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
||||
.get[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
|
@ -5,13 +5,12 @@ import java.nio.file.{Path, Paths}
|
||||
import docspell.common._
|
||||
|
||||
case class OcrConfig(
|
||||
maxImageSize: Int,
|
||||
ghostscript: OcrConfig.Ghostscript,
|
||||
maxImageSize: Int,
|
||||
ghostscript: OcrConfig.Ghostscript,
|
||||
pageRange: OcrConfig.PageRange,
|
||||
unpaper: OcrConfig.Unpaper,
|
||||
tesseract: OcrConfig.Tesseract
|
||||
) {
|
||||
}
|
||||
) {}
|
||||
|
||||
object OcrConfig {
|
||||
|
||||
|
@ -5,9 +5,9 @@ import docspell.common.MimeType
|
||||
object OcrType {
|
||||
|
||||
val jpeg = MimeType.jpeg
|
||||
val png = MimeType.png
|
||||
val png = MimeType.png
|
||||
val tiff = MimeType.tiff
|
||||
val pdf = MimeType.pdf
|
||||
val pdf = MimeType.pdf
|
||||
|
||||
val all = Set(jpeg, png, tiff, pdf)
|
||||
|
||||
|
@ -17,14 +17,14 @@ object OdfExtract {
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||
|
||||
|
||||
def get(is: InputStream) = Try {
|
||||
val handler = new BodyContentHandler()
|
||||
val pctx = new ParseContext()
|
||||
val meta = new Metadata()
|
||||
val ooparser = new OpenDocumentParser()
|
||||
ooparser.parse(is, handler, meta, pctx)
|
||||
handler.toString.trim
|
||||
}.toEither
|
||||
def get(is: InputStream) =
|
||||
Try {
|
||||
val handler = new BodyContentHandler()
|
||||
val pctx = new ParseContext()
|
||||
val meta = new Metadata()
|
||||
val ooparser = new OpenDocumentParser()
|
||||
ooparser.parse(is, handler, meta, pctx)
|
||||
handler.toString.trim
|
||||
}.toEither
|
||||
|
||||
}
|
||||
|
@ -4,8 +4,8 @@ import docspell.common.MimeType
|
||||
|
||||
object OdfType {
|
||||
|
||||
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||
val odt = MimeType.application("vnd.oasis.opendocument.text")
|
||||
val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet")
|
||||
val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
|
||||
val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")
|
||||
|
||||
|
@ -14,9 +14,7 @@ import fs2.Stream
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
data.compile.to(Array).map { bytes =>
|
||||
Using(PDDocument.load(bytes))(readText).toEither.flatten
|
||||
}
|
||||
data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
|
||||
def get(is: InputStream): Either[Throwable, String] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
|
@ -52,25 +52,25 @@ object PoiExtract {
|
||||
def getDocx(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new XWPFWordExtractor(new XWPFDocument(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getDoc(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new WordExtractor(is)
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getXlsx(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getXls(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new ExcelExtractor(new HSSFWorkbook(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
|
@ -5,11 +5,11 @@ import docspell.common.MimeType
|
||||
object PoiType {
|
||||
|
||||
val msoffice = MimeType.application("x-tika-msoffice")
|
||||
val ooxml = MimeType.application("x-tika-ooxml")
|
||||
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val ooxml = MimeType.application("x-tika-ooxml")
|
||||
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
|
||||
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
||||
|
||||
|
@ -5,7 +5,7 @@ import docspell.files.{ExampleFiles, TestFiles}
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object OdfExtractTest extends SimpleTestSuite {
|
||||
val blocker = TestFiles.blocker
|
||||
val blocker = TestFiles.blocker
|
||||
implicit val CS = TestFiles.CS
|
||||
|
||||
val files = List(
|
||||
@ -14,14 +14,15 @@ object OdfExtractTest extends SimpleTestSuite {
|
||||
)
|
||||
|
||||
test("test extract from odt") {
|
||||
files.foreach { case (file, len) =>
|
||||
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
|
||||
val str1 = OdfExtract.get(is).fold(throw _, identity)
|
||||
assertEquals(str1.length, len)
|
||||
files.foreach {
|
||||
case (file, len) =>
|
||||
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
|
||||
val str1 = OdfExtract.get(is).fold(throw _, identity)
|
||||
assertEquals(str1.length, len)
|
||||
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
|
||||
assertEquals(str2, str1)
|
||||
val data = file.readURL[IO](8192, blocker)
|
||||
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
|
||||
assertEquals(str2, str1)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,8 +7,8 @@ object RtfExtractTest extends SimpleTestSuite {
|
||||
|
||||
test("extract text from rtf using java input-stream") {
|
||||
val file = ExampleFiles.examples_sample_rtf
|
||||
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
|
||||
val str = RtfExtract.get(is).fold(throw _, identity)
|
||||
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
|
||||
val str = RtfExtract.get(is).fold(throw _, identity)
|
||||
assertEquals(str.length, 7342)
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user