scalafmtAll

This commit is contained in:
Eike Kettner
2020-03-26 18:26:00 +01:00
parent 09ea724c13
commit 9656ba62f4
91 changed files with 871 additions and 295 deletions

View File

@ -13,7 +13,11 @@ import docspell.files.ImageSize
trait Extraction[F[_]] {
def extractText(data: Stream[F, Byte], dataType: DataType, lang: Language): F[ExtractResult]
def extractText(
data: Stream[F, Byte],
dataType: DataType,
lang: Language
): F[ExtractResult]
}
@ -71,13 +75,17 @@ object Extraction {
doExtract
}
case None =>
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
logger.info(
s"Cannot read image data from ${mt.asString}. Extracting anyways."
) *>
doExtract
}
case OdfType.ContainerMatch(_) =>
logger
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
.info(
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
) *>
OdfExtract.get(data).map(ExtractResult.fromEither)
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>

View File

@ -135,7 +135,9 @@ object Ocr {
.map(_ => targetFile)
.handleErrorWith { th =>
logger
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
.warn(
s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction."
)
Stream.emit(img)
}
}
@ -152,10 +154,15 @@ object Ocr {
): Stream[F, String] =
// tesseract cannot cope with absolute filenames
// so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
val cmd = config.tesseract.command
.replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
uimg =>
val cmd = config.tesseract.command
.replace(
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
)
SystemCommand
.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
.map(_.stdout)
}
/** Run tesseract on the given image file and return the extracted

View File

@ -41,11 +41,16 @@ object OcrConfig {
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
),
unpaper = Unpaper(
SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
SystemCommand
.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
),
tesseract = Tesseract(
SystemCommand
.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
.Config(
"tesseract",
Seq("{{file}}", "stdout", "-l", "{{lang}}"),
Duration.minutes(1)
)
)
)
}

View File

@ -14,7 +14,9 @@ import fs2.Stream
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
data.compile
.to(Array)
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
def get(is: InputStream): Either[Throwable, String] =
Using(PDDocument.load(is))(readText).toEither.flatten

View File

@ -20,10 +20,16 @@ import docspell.files.TikaMimetype
object PoiExtract {
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
def get[F[_]: Sync](
data: Stream[F, Byte],
hint: MimeTypeHint
): F[Either[Throwable, String]] =
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] =
def get[F[_]: Sync](
data: Stream[F, Byte],
mime: MimeType
): F[Either[Throwable, String]] =
mime match {
case PoiType.doc =>
getDoc(data)

View File

@ -6,10 +6,11 @@ object PoiType {
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val docx =
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)