mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
scalafmtAll
This commit is contained in:
@ -13,7 +13,11 @@ import docspell.files.ImageSize
|
||||
|
||||
trait Extraction[F[_]] {
|
||||
|
||||
def extractText(data: Stream[F, Byte], dataType: DataType, lang: Language): F[ExtractResult]
|
||||
def extractText(
|
||||
data: Stream[F, Byte],
|
||||
dataType: DataType,
|
||||
lang: Language
|
||||
): F[ExtractResult]
|
||||
|
||||
}
|
||||
|
||||
@ -71,13 +75,17 @@ object Extraction {
|
||||
doExtract
|
||||
}
|
||||
case None =>
|
||||
logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *>
|
||||
logger.info(
|
||||
s"Cannot read image data from ${mt.asString}. Extracting anyways."
|
||||
) *>
|
||||
doExtract
|
||||
}
|
||||
|
||||
case OdfType.ContainerMatch(_) =>
|
||||
logger
|
||||
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
.info(
|
||||
s"File detected as ${OdfType.container}. Try to read as OpenDocument file."
|
||||
) *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
|
||||
|
@ -135,7 +135,9 @@ object Ocr {
|
||||
.map(_ => targetFile)
|
||||
.handleErrorWith { th =>
|
||||
logger
|
||||
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
|
||||
.warn(
|
||||
s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction."
|
||||
)
|
||||
Stream.emit(img)
|
||||
}
|
||||
}
|
||||
@ -152,10 +154,15 @@ object Ocr {
|
||||
): Stream[F, String] =
|
||||
// tesseract cannot cope with absolute filenames
|
||||
// so use the parent as working dir
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg =>
|
||||
val cmd = config.tesseract.command
|
||||
.replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
|
||||
SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout)
|
||||
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap {
|
||||
uimg =>
|
||||
val cmd = config.tesseract.command
|
||||
.replace(
|
||||
Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))
|
||||
)
|
||||
SystemCommand
|
||||
.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent))
|
||||
.map(_.stdout)
|
||||
}
|
||||
|
||||
/** Run tesseract on the given image file and return the extracted
|
||||
|
@ -41,11 +41,16 @@ object OcrConfig {
|
||||
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
||||
),
|
||||
unpaper = Unpaper(
|
||||
SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
||||
SystemCommand
|
||||
.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
||||
),
|
||||
tesseract = Tesseract(
|
||||
SystemCommand
|
||||
.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
|
||||
.Config(
|
||||
"tesseract",
|
||||
Seq("{{file}}", "stdout", "-l", "{{lang}}"),
|
||||
Duration.minutes(1)
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
@ -14,7 +14,9 @@ import fs2.Stream
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
data.compile
|
||||
.to(Array)
|
||||
.map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
|
||||
def get(is: InputStream): Either[Throwable, String] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
|
@ -20,10 +20,16 @@ import docspell.files.TikaMimetype
|
||||
|
||||
object PoiExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
|
||||
def get[F[_]: Sync](
|
||||
data: Stream[F, Byte],
|
||||
hint: MimeTypeHint
|
||||
): F[Either[Throwable, String]] =
|
||||
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] =
|
||||
def get[F[_]: Sync](
|
||||
data: Stream[F, Byte],
|
||||
mime: MimeType
|
||||
): F[Either[Throwable, String]] =
|
||||
mime match {
|
||||
case PoiType.doc =>
|
||||
getDoc(data)
|
||||
|
@ -6,10 +6,11 @@ object PoiType {
|
||||
|
||||
val msoffice = MimeType.application("x-tika-msoffice")
|
||||
val ooxml = MimeType.application("x-tika-ooxml")
|
||||
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
val docx =
|
||||
MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
|
||||
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
val xls = MimeType.application("vnd.ms-excel")
|
||||
val doc = MimeType.application("msword")
|
||||
|
||||
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
||||
|
||||
|
Reference in New Issue
Block a user