mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
sbt scalafmtAll
This commit is contained in:
parent
4dbf75dd8f
commit
2f87065b2e
@ -12,13 +12,13 @@ object Contact {
|
||||
def annotate(text: String): Vector[NerLabel] =
|
||||
TextSplitter
|
||||
.splitToken[Nothing](text, " \t\r\n".toSet)
|
||||
.map({ token =>
|
||||
.map { token =>
|
||||
if (isEmailAddress(token.value))
|
||||
NerLabel(token.value, NerTag.Email, token.begin, token.end).some
|
||||
else if (isWebsite(token.value))
|
||||
NerLabel(token.value, NerTag.Website, token.begin, token.end).some
|
||||
else None
|
||||
})
|
||||
}
|
||||
.flatMap(_.map(Stream.emit).getOrElse(Stream.empty))
|
||||
.toVector
|
||||
|
||||
|
@ -11,7 +11,14 @@ import docspell.store.queries.{QAttachment, QItem}
|
||||
import OItem.{AttachmentData, AttachmentSourceData, ItemData, ListItem, Query}
|
||||
import bitpeace.{FileMeta, RangeDef}
|
||||
import docspell.common.{Direction, Ident, ItemState, MetaProposalList, Timestamp}
|
||||
import docspell.store.records.{RAttachment, RAttachmentMeta, RAttachmentSource, RItem, RSource, RTagItem}
|
||||
import docspell.store.records.{
|
||||
RAttachment,
|
||||
RAttachmentMeta,
|
||||
RAttachmentSource,
|
||||
RItem,
|
||||
RSource,
|
||||
RTagItem
|
||||
}
|
||||
|
||||
trait OItem[F[_]] {
|
||||
|
||||
@ -80,8 +87,11 @@ object OItem {
|
||||
val fileId = ra.fileId
|
||||
}
|
||||
|
||||
case class AttachmentSourceData[F[_]](rs: RAttachmentSource, meta: FileMeta, data: Stream[F, Byte])
|
||||
extends BinaryData[F] {
|
||||
case class AttachmentSourceData[F[_]](
|
||||
rs: RAttachmentSource,
|
||||
meta: FileMeta,
|
||||
data: Stream[F, Byte]
|
||||
) extends BinaryData[F] {
|
||||
val name = rs.name
|
||||
val fileId = rs.fileId
|
||||
}
|
||||
@ -131,7 +141,11 @@ object OItem {
|
||||
|
||||
private def makeBinaryData[A](fileId: Ident)(f: FileMeta => A): F[Option[A]] =
|
||||
store.bitpeace
|
||||
.get(fileId.id).unNoneTerminate.compile.last.map(
|
||||
.get(fileId.id)
|
||||
.unNoneTerminate
|
||||
.compile
|
||||
.last
|
||||
.map(
|
||||
_.map(m => f(m))
|
||||
)
|
||||
|
||||
|
@ -51,7 +51,8 @@ object OSignup {
|
||||
res <- if (ok) addUser(data).map(SignupResult.fromAddResult)
|
||||
else SignupResult.invalidInvitationKey.pure[F]
|
||||
_ <- if (retryInvite(res))
|
||||
logger.fdebug(s"Adding account failed ($res). Allow retry with invite.") *> store
|
||||
logger
|
||||
.fdebug(s"Adding account failed ($res). Allow retry with invite.") *> store
|
||||
.transact(
|
||||
RInvitation.insert(RInvitation(inv, now))
|
||||
)
|
||||
|
@ -26,9 +26,7 @@ object AccountId {
|
||||
invalid
|
||||
}
|
||||
|
||||
val separated = sepearatorChars.foldRight(invalid) { (c, v) =>
|
||||
v.orElse(parse0(c))
|
||||
}
|
||||
val separated = sepearatorChars.foldRight(invalid)((c, v) => v.orElse(parse0(c)))
|
||||
|
||||
separated.orElse(Ident.fromString(str).map(id => AccountId(id, id)))
|
||||
}
|
||||
|
@ -1,8 +1,6 @@
|
||||
package docspell.common
|
||||
|
||||
sealed trait DataType {
|
||||
|
||||
}
|
||||
sealed trait DataType {}
|
||||
|
||||
object DataType {
|
||||
|
||||
@ -10,7 +8,6 @@ object DataType {
|
||||
|
||||
case class Hint(hint: MimeTypeHint) extends DataType
|
||||
|
||||
|
||||
def apply(mt: MimeType): DataType =
|
||||
Exact(mt)
|
||||
|
||||
|
@ -65,11 +65,13 @@ object File {
|
||||
javaList.asScala.toList.sortBy(_.getFileName.toString)
|
||||
}
|
||||
|
||||
def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] =
|
||||
def readAll[F[_]: Sync: ContextShift](
|
||||
file: Path,
|
||||
blocker: Blocker,
|
||||
chunkSize: Int
|
||||
): Stream[F, Byte] =
|
||||
fs2.io.file.readAll(file, blocker, chunkSize)
|
||||
|
||||
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
|
||||
readAll[F](file, blocker, 8192).
|
||||
through(fs2.text.utf8Decode).
|
||||
compile.foldMonoid
|
||||
readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
|
||||
}
|
||||
|
@ -66,9 +66,7 @@ case class LenientUri(
|
||||
)
|
||||
|
||||
def readText[F[_]: Sync: ContextShift](chunkSize: Int, blocker: Blocker): F[String] =
|
||||
readURL[F](chunkSize, blocker).
|
||||
through(fs2.text.utf8Decode).
|
||||
compile.foldMonoid
|
||||
readURL[F](chunkSize, blocker).through(fs2.text.utf8Decode).compile.foldMonoid
|
||||
|
||||
def host: Option[String] =
|
||||
authority.map(a =>
|
||||
|
@ -17,7 +17,6 @@ trait Logger[F[_]] {
|
||||
|
||||
object Logger {
|
||||
|
||||
|
||||
def log4s[F[_]: Sync](log: Log4sLogger): Logger[F] = new Logger[F] {
|
||||
def trace(msg: => String): F[Unit] =
|
||||
log.ftrace(msg)
|
||||
|
@ -66,9 +66,7 @@ object MetaProposalList {
|
||||
case None => map.updated(mp.proposalType, mp)
|
||||
}
|
||||
|
||||
val merged = ml.foldLeft(init) { (map, el) =>
|
||||
el.proposals.foldLeft(map)(updateMap)
|
||||
}
|
||||
val merged = ml.foldLeft(init)((map, el) => el.proposals.foldLeft(map)(updateMap))
|
||||
|
||||
fromMap(merged)
|
||||
}
|
||||
|
@ -23,7 +23,8 @@ object SystemCommand {
|
||||
repl.foldLeft(s) {
|
||||
case (res, (k, v)) =>
|
||||
res.replace(k, v)
|
||||
})
|
||||
}
|
||||
)
|
||||
|
||||
def toCmd: List[String] =
|
||||
program :: args.toList
|
||||
@ -75,12 +76,18 @@ object SystemCommand {
|
||||
else Stream.emit(r)
|
||||
}
|
||||
|
||||
private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])(
|
||||
private def startProcess[F[_]: Sync, A](
|
||||
cmd: Config,
|
||||
wd: Option[Path],
|
||||
logger: Logger[F],
|
||||
stdin: Stream[F, Byte]
|
||||
)(
|
||||
f: Process => Stream[F, A]
|
||||
): Stream[F, A] = {
|
||||
val log = logger.debug(s"Running external command: ${cmd.cmdString}")
|
||||
val hasStdin = stdin.take(1).compile.last.map(_.isDefined)
|
||||
val proc = log *> hasStdin.flatMap(flag => Sync[F].delay {
|
||||
val proc = log *> hasStdin.flatMap(flag =>
|
||||
Sync[F].delay {
|
||||
val pb = new ProcessBuilder(cmd.toCmd.asJava)
|
||||
.redirectInput(if (flag) Redirect.PIPE else Redirect.INHERIT)
|
||||
.redirectError(Redirect.PIPE)
|
||||
@ -88,12 +95,11 @@ object SystemCommand {
|
||||
|
||||
wd.map(_.toFile).foreach(pb.directory)
|
||||
pb.start()
|
||||
})
|
||||
}
|
||||
)
|
||||
Stream
|
||||
.bracket(proc)(p =>
|
||||
logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ =>
|
||||
p.destroy()
|
||||
}
|
||||
logger.debug(s"Closing process: `${cmd.cmdString}`").map(_ => p.destroy())
|
||||
)
|
||||
.flatMap(f)
|
||||
}
|
||||
|
@ -13,7 +13,9 @@ import docspell.files.{ImageSize, TikaMimetype}
|
||||
|
||||
trait Conversion[F[_]] {
|
||||
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
|
||||
in: Stream[F, Byte]
|
||||
): F[A]
|
||||
|
||||
}
|
||||
|
||||
@ -26,7 +28,9 @@ object Conversion {
|
||||
): Resource[F, Conversion[F]] =
|
||||
Resource.pure(new Conversion[F] {
|
||||
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
|
||||
def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
|
||||
in: Stream[F, Byte]
|
||||
): F[A] =
|
||||
TikaMimetype.resolve(dataType, in).flatMap {
|
||||
case MimeType.pdf =>
|
||||
handler.run(ConversionResult.successPdf(in))
|
||||
|
@ -3,9 +3,11 @@ package docspell.convert
|
||||
import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
|
||||
import docspell.convert.flexmark.MarkdownConfig
|
||||
|
||||
case class ConvertConfig(chunkSize: Int,
|
||||
case class ConvertConfig(
|
||||
chunkSize: Int,
|
||||
maxImageSize: Int,
|
||||
markdown: MarkdownConfig,
|
||||
wkhtmlpdf: WkHtmlPdfConfig,
|
||||
tesseract: TesseractConfig,
|
||||
unoconv: UnoconvConfig)
|
||||
unoconv: UnoconvConfig
|
||||
)
|
||||
|
@ -20,7 +20,9 @@ private[extern] object ExternConv {
|
||||
logger: Logger[F],
|
||||
reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
|
||||
Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
|
||||
Stream
|
||||
.resource(File.withTempDir[F](wd, s"docspell-$name"))
|
||||
.flatMap { dir =>
|
||||
val inFile = dir.resolve("infile").toAbsolutePath.normalize
|
||||
val out = dir.resolve("out.pdf").toAbsolutePath.normalize
|
||||
val sysCfg =
|
||||
@ -40,12 +42,12 @@ private[extern] object ExternConv {
|
||||
SystemCommand
|
||||
.execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
|
||||
.evalMap(result =>
|
||||
logResult(name, result, logger).
|
||||
flatMap(_ => reader(out, result)).
|
||||
flatMap(handler.run)
|
||||
logResult(name, result, logger).flatMap(_ => reader(out, result)).flatMap(handler.run)
|
||||
)
|
||||
}
|
||||
}.compile.lastOrError
|
||||
}
|
||||
.compile
|
||||
.lastOrError
|
||||
|
||||
def readResult[F[_]: Sync: ContextShift](
|
||||
blocker: Blocker,
|
||||
@ -60,9 +62,11 @@ private[extern] object ExternConv {
|
||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||
|
||||
case false =>
|
||||
ConversionResult.failure[F](
|
||||
ConversionResult
|
||||
.failure[F](
|
||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||
).pure[F]
|
||||
)
|
||||
.pure[F]
|
||||
}
|
||||
|
||||
def readResultTesseract[F[_]: Sync: ContextShift](
|
||||
@ -75,7 +79,7 @@ private[extern] object ExternConv {
|
||||
File.existsNonEmpty[F](outPdf).flatMap {
|
||||
case true =>
|
||||
val outTxt = out.resolveSibling(s"$outPrefix.txt")
|
||||
File.exists(outTxt).flatMap(txtExists => {
|
||||
File.exists(outTxt).flatMap { txtExists =>
|
||||
val pdfData = File.readAll(out, blocker, chunkSize)
|
||||
if (result.rc == 0) {
|
||||
if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]
|
||||
@ -84,12 +88,14 @@ private[extern] object ExternConv {
|
||||
logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
|
||||
successPdf(pdfData).pure[F]
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
case false =>
|
||||
ConversionResult.failure[F](
|
||||
ConversionResult
|
||||
.failure[F](
|
||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||
).pure[F]
|
||||
)
|
||||
.pure[F]
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -21,7 +21,15 @@ object Tesseract {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("tesseract", cfg.command.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||
ExternConv.toPDF[F, A](
|
||||
"tesseract",
|
||||
cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
|
||||
cfg.workingDir,
|
||||
false,
|
||||
blocker,
|
||||
logger,
|
||||
reader
|
||||
)(in, handler)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -19,7 +19,10 @@ object Unoconv {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(in, handler)
|
||||
ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -14,12 +14,16 @@ object WkHtmlPdf {
|
||||
cfg: WkHtmlPdfConfig,
|
||||
chunkSize: Int,
|
||||
blocker: Blocker,
|
||||
logger: Logger[F],
|
||||
logger: Logger[F]
|
||||
)(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
|
||||
val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
|
||||
ExternConv.readResult[F](blocker, chunkSize, logger)
|
||||
|
||||
ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(in, handler)
|
||||
ExternConv
|
||||
.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
|
||||
in,
|
||||
handler
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -27,7 +27,6 @@ object Markdown {
|
||||
}.toEither
|
||||
}
|
||||
|
||||
|
||||
def toHtml(md: String, cfg: MarkdownConfig): String = {
|
||||
val p = createParser()
|
||||
val r = createRenderer()
|
||||
@ -36,10 +35,9 @@ object Markdown {
|
||||
}
|
||||
|
||||
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
|
||||
data.through(fs2.text.utf8Decode).compile.foldMonoid.
|
||||
map(str => toHtml(str, cfg))
|
||||
data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))
|
||||
|
||||
private def wrapHtml(body: String, cfg: MarkdownConfig): String = {
|
||||
private def wrapHtml(body: String, cfg: MarkdownConfig): String =
|
||||
s"""<!DOCTYPE html>
|
||||
|<html>
|
||||
|<head>
|
||||
@ -53,13 +51,13 @@ object Markdown {
|
||||
|</body>
|
||||
|</html>
|
||||
|""".stripMargin
|
||||
}
|
||||
|
||||
private def createParser(): Parser = {
|
||||
val opts = new MutableDataSet()
|
||||
opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
|
||||
util.Arrays.asList(TablesExtension.create(),
|
||||
StrikethroughExtension.create()));
|
||||
opts.set(
|
||||
Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
|
||||
util.Arrays.asList(TablesExtension.create(), StrikethroughExtension.create())
|
||||
);
|
||||
|
||||
Parser.builder(opts).build()
|
||||
}
|
||||
|
@ -55,5 +55,4 @@ trait FileChecks {
|
||||
def commandExists(cmd: String): Boolean =
|
||||
Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0
|
||||
|
||||
|
||||
}
|
||||
|
@ -103,5 +103,4 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ object Extraction {
|
||||
data: Stream[F, Byte],
|
||||
dataType: DataType,
|
||||
lang: Language
|
||||
): F[ExtractResult] = {
|
||||
): F[ExtractResult] =
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.pdf =>
|
||||
PdfExtract
|
||||
@ -50,16 +50,23 @@ object Extraction {
|
||||
.extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(_.trim)
|
||||
.attempt
|
||||
.map(ExtractResult.fromEither)
|
||||
|
||||
ImageSize.get(data).flatMap {
|
||||
case Some(dim) =>
|
||||
if (dim.product > cfg.ocr.maxImageSize) {
|
||||
logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
|
||||
ExtractResult.failure(new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
|
||||
).pure[F]
|
||||
logger.info(
|
||||
s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize})."
|
||||
) *>
|
||||
ExtractResult
|
||||
.failure(
|
||||
new Exception(
|
||||
s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize})."
|
||||
)
|
||||
)
|
||||
.pure[F]
|
||||
} else {
|
||||
doExtract
|
||||
}
|
||||
@ -69,7 +76,8 @@ object Extraction {
|
||||
}
|
||||
|
||||
case OdfType.container =>
|
||||
logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
logger
|
||||
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt @ MimeType("text", sub) if !sub.contains("html") =>
|
||||
@ -83,6 +91,5 @@ object Extraction {
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -33,7 +33,8 @@ object PdfExtract {
|
||||
|
||||
//maybe better: inspect the pdf and decide whether ocr or not
|
||||
for {
|
||||
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
|
||||
pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
|
||||
.get[F](in)
|
||||
res <- pdfboxRes.fold(
|
||||
ex =>
|
||||
logger.info(
|
||||
|
@ -10,8 +10,7 @@ case class OcrConfig(
|
||||
pageRange: OcrConfig.PageRange,
|
||||
unpaper: OcrConfig.Unpaper,
|
||||
tesseract: OcrConfig.Tesseract
|
||||
) {
|
||||
}
|
||||
) {}
|
||||
|
||||
object OcrConfig {
|
||||
|
||||
|
@ -17,8 +17,8 @@ object OdfExtract {
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
|
||||
|
||||
|
||||
def get(is: InputStream) = Try {
|
||||
def get(is: InputStream) =
|
||||
Try {
|
||||
val handler = new BodyContentHandler()
|
||||
val pctx = new ParseContext()
|
||||
val meta = new Metadata()
|
||||
|
@ -14,9 +14,7 @@ import fs2.Stream
|
||||
object PdfboxExtract {
|
||||
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
data.compile.to(Array).map { bytes =>
|
||||
Using(PDDocument.load(bytes))(readText).toEither.flatten
|
||||
}
|
||||
data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)
|
||||
|
||||
def get(is: InputStream): Either[Throwable, String] =
|
||||
Using(PDDocument.load(is))(readText).toEither.flatten
|
||||
|
@ -52,25 +52,25 @@ object PoiExtract {
|
||||
def getDocx(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new XWPFWordExtractor(new XWPFDocument(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getDoc(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new WordExtractor(is)
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getXlsx(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getXls(is: InputStream): Either[Throwable, String] =
|
||||
Try {
|
||||
val xt = new ExcelExtractor(new HSSFWorkbook(is))
|
||||
xt.getText.trim
|
||||
Option(xt.getText).map(_.trim).getOrElse("")
|
||||
}.toEither
|
||||
|
||||
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
|
||||
|
@ -14,7 +14,8 @@ object OdfExtractTest extends SimpleTestSuite {
|
||||
)
|
||||
|
||||
test("test extract from odt") {
|
||||
files.foreach { case (file, len) =>
|
||||
files.foreach {
|
||||
case (file, len) =>
|
||||
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
|
||||
val str1 = OdfExtract.get(is).fold(throw _, identity)
|
||||
assertEquals(str1.length, len)
|
||||
|
@ -29,12 +29,11 @@ object ImageSize {
|
||||
/** Return the image size from its header without reading
|
||||
* the whole image into memory.
|
||||
*/
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = {
|
||||
data.take(768).compile.to(Array).map(ar => {
|
||||
def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] =
|
||||
data.take(768).compile.to(Array).map { ar =>
|
||||
val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar))
|
||||
if (iis == null) sys.error("no reader given for the array")
|
||||
else getDimension(iis)
|
||||
})
|
||||
}
|
||||
|
||||
private def getDimension(in: ImageInputStream): Option[Dimension] =
|
||||
|
@ -52,8 +52,8 @@ object TikaMimetype {
|
||||
def detect[F[_]: Sync](file: Path): F[MimeType] =
|
||||
Sync[F].delay {
|
||||
val hint = MimeTypeHint.filename(file.getFileName.toString)
|
||||
Using(new BufferedInputStream(Files.newInputStream(file), 64))({ in =>
|
||||
Using(new BufferedInputStream(Files.newInputStream(file), 64)) { in =>
|
||||
convert(tika.detect(in, makeMetadata(hint)))
|
||||
}).toEither
|
||||
}.toEither
|
||||
}.rethrow
|
||||
}
|
||||
|
@ -10,5 +10,4 @@ trait ExampleFilesSupport {
|
||||
case None => sys.error(s"Resource '$resource' not found")
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -8,15 +8,14 @@ import scala.concurrent.ExecutionContext
|
||||
object Playing extends IOApp {
|
||||
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
|
||||
|
||||
|
||||
def run(args: List[String]): IO[ExitCode] = IO {
|
||||
//val ods = ExampleFiles.examples_sample_ods.readURL[IO](8192, blocker)
|
||||
//val odt = ExampleFiles.examples_sample_odt.readURL[IO](8192, blocker)
|
||||
val rtf = ExampleFiles.examples_sample_rtf.readURL[IO](8192, blocker)
|
||||
|
||||
val x = for {
|
||||
odsm1 <- TikaMimetype.detect(rtf,
|
||||
MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last))
|
||||
odsm1 <- TikaMimetype
|
||||
.detect(rtf, MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last))
|
||||
odsm2 <- TikaMimetype.detect(rtf, MimeTypeHint.none)
|
||||
} yield (odsm1, odsm2)
|
||||
println(x.unsafeRunSync())
|
||||
|
@ -68,7 +68,9 @@ object ConvertPdf {
|
||||
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
|
||||
val handler = conversionHandler[F](ctx, cfg, ra, item)
|
||||
ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
|
||||
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data)
|
||||
conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
|
||||
data
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -119,7 +121,9 @@ object ConvertPdf {
|
||||
.compile
|
||||
.lastOrError
|
||||
.map(fm => Ident.unsafe(fm.id))
|
||||
.flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId))
|
||||
.flatMap(fmId =>
|
||||
ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
|
||||
)
|
||||
.map(fmId => ra.copy(fileId = fmId, name = newName))
|
||||
}
|
||||
}
|
||||
|
@ -95,10 +95,10 @@ object FindProposal {
|
||||
labels => self.find(labels).map(f)
|
||||
|
||||
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
|
||||
flatMap({ ml0 =>
|
||||
flatMap { ml0 =>
|
||||
if (ml0.hasResultsAll) Finder.unit[F](ml0)
|
||||
else f.map(ml1 => ml0.fillEmptyFrom(ml1))
|
||||
})
|
||||
}
|
||||
|
||||
def nextWhenEmpty(f: Finder[F], mt0: MetaProposalType, mts: MetaProposalType*)(
|
||||
implicit F: FlatMap[F],
|
||||
|
@ -19,9 +19,7 @@ object ItemHandler {
|
||||
.map(_ => ())
|
||||
|
||||
def itemStateTask[F[_]: Sync, A](state: ItemState)(data: ItemData): Task[F, A, ItemData] =
|
||||
Task { ctx =>
|
||||
ctx.store.transact(RItem.updateState(data.item.id, state)).map(_ => data)
|
||||
}
|
||||
Task(ctx => ctx.store.transact(RItem.updateState(data.item.id, state)).map(_ => data))
|
||||
|
||||
def isLastRetry[F[_]: Sync, A](ctx: Context[F, A]): F[Boolean] =
|
||||
for {
|
||||
|
@ -11,9 +11,7 @@ object TestTasks {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
def success[F[_]]: Task[F, ProcessItemArgs, Unit] =
|
||||
Task { ctx =>
|
||||
ctx.logger.info(s"Running task now: ${ctx.args}")
|
||||
}
|
||||
Task(ctx => ctx.logger.info(s"Running task now: ${ctx.args}"))
|
||||
|
||||
def failing[F[_]: Sync]: Task[F, ProcessItemArgs, Unit] =
|
||||
Task { ctx =>
|
||||
|
@ -76,16 +76,15 @@ object TextExtraction {
|
||||
.getOrElse(Mimetype.`application/octet-stream`)
|
||||
|
||||
findMime
|
||||
.flatMap(mt =>
|
||||
extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
|
||||
.flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
|
||||
}
|
||||
|
||||
private def extractTextFallback[F[_]: Sync: ContextShift](
|
||||
ctx: Context[F, _],
|
||||
cfg: ExtractConfig,
|
||||
ra: RAttachment,
|
||||
lang: Language,
|
||||
)(fileIds: List[Ident]): F[Option[String]] = {
|
||||
lang: Language
|
||||
)(fileIds: List[Ident]): F[Option[String]] =
|
||||
fileIds match {
|
||||
case Nil =>
|
||||
ctx.logger.error(s"Cannot extract text").map(_ => None)
|
||||
@ -99,15 +98,18 @@ object TextExtraction {
|
||||
txt.some.pure[F]
|
||||
|
||||
case ExtractResult.UnsupportedFormat(mt) =>
|
||||
ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file.").
|
||||
flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||
ctx.logger
|
||||
.warn(
|
||||
s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file."
|
||||
)
|
||||
.flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||
|
||||
case ExtractResult.Failure(ex) =>
|
||||
ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
|
||||
flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||
ctx.logger
|
||||
.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file")
|
||||
.flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the fileIds to extract text from. First, the source file
|
||||
* is tried. If that fails, the converted file is tried.
|
||||
|
@ -128,6 +128,9 @@ Please see the `nix/module-server.nix` and `nix/module-joex.nix` files
|
||||
for the set of options. The nixos options are modelled after the
|
||||
default configuration file.
|
||||
|
||||
The module files are only applicable to the newest version of
|
||||
Docspell. If you really need an older version, check out the
|
||||
appropriate commit.
|
||||
|
||||
## NixOs Example
|
||||
|
||||
|
@ -204,7 +204,8 @@ trait Conversions {
|
||||
|
||||
val files = mp.parts
|
||||
.filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
|
||||
.map(p => OUpload.File(p.filename, p.headers.get(`Content-Type`).map(fromContentType), p.body)
|
||||
.map(p =>
|
||||
OUpload.File(p.filename, p.headers.get(`Content-Type`).map(fromContentType), p.body)
|
||||
)
|
||||
for {
|
||||
metaData <- meta
|
||||
|
@ -55,10 +55,10 @@ object AttachmentRoutes {
|
||||
inm = req.headers.get(`If-None-Match`).flatMap(_.tags)
|
||||
matches = matchETag(fileData.map(_.meta), inm)
|
||||
resp <- fileData
|
||||
.map({ data =>
|
||||
.map { data =>
|
||||
if (matches) withResponseHeaders(NotModified())(data)
|
||||
else makeByteResp(data)
|
||||
})
|
||||
}
|
||||
.getOrElse(NotFound(BasicResult(false, "Not found")))
|
||||
} yield resp
|
||||
|
||||
@ -76,10 +76,10 @@ object AttachmentRoutes {
|
||||
inm = req.headers.get(`If-None-Match`).flatMap(_.tags)
|
||||
matches = matchETag(fileData.map(_.meta), inm)
|
||||
resp <- fileData
|
||||
.map({ data =>
|
||||
.map { data =>
|
||||
if (matches) withResponseHeaders(NotModified())(data)
|
||||
else makeByteResp(data)
|
||||
})
|
||||
}
|
||||
.getOrElse(NotFound(BasicResult(false, "Not found")))
|
||||
} yield resp
|
||||
|
||||
|
@ -14,10 +14,15 @@ object QAttachment {
|
||||
|
||||
def deleteById[F[_]: Sync](store: Store[F])(attachId: Ident, coll: Ident): F[Int] =
|
||||
for {
|
||||
raFile <- store.transact(RAttachment.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId))
|
||||
rsFile <- store.transact(RAttachmentSource.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId))
|
||||
raFile <- store
|
||||
.transact(RAttachment.findByIdAndCollective(attachId, coll))
|
||||
.map(_.map(_.fileId))
|
||||
rsFile <- store
|
||||
.transact(RAttachmentSource.findByIdAndCollective(attachId, coll))
|
||||
.map(_.map(_.fileId))
|
||||
n <- store.transact(RAttachment.delete(attachId))
|
||||
f <- Stream.emits(raFile.toSeq ++ rsFile.toSeq)
|
||||
f <- Stream
|
||||
.emits(raFile.toSeq ++ rsFile.toSeq)
|
||||
.map(_.id)
|
||||
.flatMap(store.bitpeace.delete)
|
||||
.map(flag => if (flag) 1 else 0)
|
||||
@ -29,10 +34,12 @@ object QAttachment {
|
||||
for {
|
||||
s <- store.transact(RAttachmentSource.findById(ra.id))
|
||||
n <- store.transact(RAttachment.delete(ra.id))
|
||||
f <- Stream.emits(ra.fileId.id +: s.map(_.fileId.id).toSeq).
|
||||
flatMap(store.bitpeace.delete).
|
||||
map(flag => if (flag) 1 else 0).
|
||||
compile.foldMonoid
|
||||
f <- Stream
|
||||
.emits(ra.fileId.id +: s.map(_.fileId.id).toSeq)
|
||||
.flatMap(store.bitpeace.delete)
|
||||
.map(flag => if (flag) 1 else 0)
|
||||
.compile
|
||||
.foldMonoid
|
||||
} yield n + f
|
||||
|
||||
def deleteItemAttachments[F[_]: Sync](store: Store[F])(itemId: Ident, coll: Ident): F[Int] =
|
||||
|
@ -27,7 +27,6 @@ object QCollective {
|
||||
and(IC.cid.is(coll), IC.incoming.is(Direction.outgoing))
|
||||
).query[Int].unique
|
||||
|
||||
|
||||
val fileSize = sql"""
|
||||
select sum(length) from (
|
||||
with attachs as
|
||||
@ -42,7 +41,6 @@ object QCollective {
|
||||
inner join filemeta m on m.id = a.file_id where a.id in (select aid from attachs)
|
||||
) as t""".query[Option[Long]].unique
|
||||
|
||||
|
||||
val q3 = fr"SELECT" ++ commas(
|
||||
TC.name.prefix("t").f,
|
||||
fr"count(" ++ RC.itemId.prefix("r").f ++ fr")"
|
||||
|
@ -39,7 +39,8 @@ object QItem {
|
||||
val EC = REquipment.Columns.all.map(_.prefix("e"))
|
||||
val ICC = List(RItem.Columns.id, RItem.Columns.name).map(_.prefix("ref"))
|
||||
|
||||
val cq = selectSimple(IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC, RItem.table ++ fr"i", Fragment.empty) ++
|
||||
val cq =
|
||||
selectSimple(IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC, RItem.table ++ fr"i", Fragment.empty) ++
|
||||
fr"LEFT JOIN" ++ ROrganization.table ++ fr"o ON" ++ RItem.Columns.corrOrg
|
||||
.prefix("i")
|
||||
.is(ROrganization.Columns.oid.prefix("o")) ++
|
||||
@ -235,7 +236,8 @@ object QItem {
|
||||
def findByFileIds(fileMetaIds: List[Ident]): ConnectionIO[Vector[RItem]] = {
|
||||
val IC = RItem.Columns
|
||||
val AC = RAttachment.Columns
|
||||
val q = fr"SELECT DISTINCT" ++ commas(IC.all.map(_.prefix("i").f)) ++ fr"FROM" ++ RItem.table ++ fr"i" ++
|
||||
val q =
|
||||
fr"SELECT DISTINCT" ++ commas(IC.all.map(_.prefix("i").f)) ++ fr"FROM" ++ RItem.table ++ fr"i" ++
|
||||
fr"INNER JOIN" ++ RAttachment.table ++ fr"a ON" ++ AC.itemId
|
||||
.prefix("a")
|
||||
.is(IC.id.prefix("i")) ++
|
||||
|
@ -21,11 +21,11 @@ object QJob {
|
||||
Stream
|
||||
.range(0, 10)
|
||||
.evalMap(n => takeNextJob1(store)(priority, worker, retryPause, n))
|
||||
.evalTap({ x =>
|
||||
.evalTap { x =>
|
||||
if (x.isLeft)
|
||||
logger.fdebug[F]("Cannot mark job, probably due to concurrent updates. Will retry.")
|
||||
else ().pure[F]
|
||||
})
|
||||
}
|
||||
.find(_.isRight)
|
||||
.flatMap({
|
||||
case Right(job) =>
|
||||
@ -97,7 +97,8 @@ object QJob {
|
||||
val sql2 = fr"SELECT min(" ++ jgroup.f ++ fr") as g FROM" ++ RJob.table ++ fr"a" ++
|
||||
fr"WHERE" ++ stateCond
|
||||
|
||||
val union = sql"SELECT g FROM ((" ++ sql1 ++ sql") UNION ALL (" ++ sql2 ++ sql")) as t0 WHERE g is not null"
|
||||
val union =
|
||||
sql"SELECT g FROM ((" ++ sql1 ++ sql") UNION ALL (" ++ sql2 ++ sql")) as t0 WHERE g is not null"
|
||||
|
||||
union
|
||||
.query[Ident]
|
||||
|
@ -34,11 +34,11 @@ object JobQueue {
|
||||
def insert(job: RJob): F[Unit] =
|
||||
store
|
||||
.transact(RJob.insert(job))
|
||||
.flatMap({ n =>
|
||||
.flatMap { n =>
|
||||
if (n != 1)
|
||||
Effect[F].raiseError(new Exception(s"Inserting job failed. Update count: $n"))
|
||||
else ().pure[F]
|
||||
})
|
||||
}
|
||||
|
||||
def insertAll(jobs: Seq[RJob]): F[Unit] =
|
||||
jobs.toList
|
||||
|
@ -104,7 +104,8 @@ object RAttachment {
|
||||
def findByItemWithMeta(id: Ident): ConnectionIO[Vector[(RAttachment, FileMeta)]] = {
|
||||
import bitpeace.sql._
|
||||
|
||||
val q = fr"SELECT a.*,m.* FROM" ++ table ++ fr"a, filemeta m WHERE a.filemetaid = m.id AND a.itemid = $id ORDER BY a.position ASC"
|
||||
val q =
|
||||
fr"SELECT a.*,m.* FROM" ++ table ++ fr"a, filemeta m WHERE a.filemetaid = m.id AND a.itemid = $id ORDER BY a.position ASC"
|
||||
q.query[(RAttachment, FileMeta)].to[Vector]
|
||||
}
|
||||
|
||||
|
@ -38,14 +38,16 @@ object RAttachmentSource {
|
||||
def insert(v: RAttachmentSource): ConnectionIO[Int] =
|
||||
insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run
|
||||
|
||||
|
||||
def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
|
||||
selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option
|
||||
|
||||
def delete(attachId: Ident): ConnectionIO[Int] =
|
||||
deleteFrom(table, id.is(attachId)).update.run
|
||||
|
||||
def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachmentSource]] = {
|
||||
def findByIdAndCollective(
|
||||
attachId: Ident,
|
||||
collective: Ident
|
||||
): ConnectionIO[Option[RAttachmentSource]] = {
|
||||
val bId = RAttachment.Columns.id.prefix("b")
|
||||
val aId = Columns.id.prefix("a")
|
||||
val bItem = RAttachment.Columns.itemId.prefix("b")
|
||||
@ -77,8 +79,9 @@ object RAttachmentSource {
|
||||
RAttachment.table ++ fr"b ON" ++ aId.is(bId)
|
||||
val where = bItem.is(id)
|
||||
|
||||
(selectSimple(cols, from, where) ++ orderBy(bPos.asc)).
|
||||
query[(RAttachmentSource, FileMeta)].to[Vector]
|
||||
(selectSimple(cols, from, where) ++ orderBy(bPos.asc))
|
||||
.query[(RAttachmentSource, FileMeta)]
|
||||
.to[Vector]
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user