Mirror of https://github.com/TheAnachronism/docspell.git (synced 2025-06-05 22:55:58 +00:00)

Commit ea4c06870d: Merge branch 'master' into update/poi-4.1.2
@@ -23,4 +23,4 @@ before_script:
   - export TZ=Europe/Berlin

 script:
-  - sbt ++$TRAVIS_SCALA_VERSION ";project root ;make"
+  - sbt ++$TRAVIS_SCALA_VERSION ";project root ;make ;test"
@@ -12,13 +12,13 @@ object Contact {
   def annotate(text: String): Vector[NerLabel] =
     TextSplitter
       .splitToken[Nothing](text, " \t\r\n".toSet)
-      .map({ token =>
+      .map { token =>
         if (isEmailAddress(token.value))
           NerLabel(token.value, NerTag.Email, token.begin, token.end).some
         else if (isWebsite(token.value))
           NerLabel(token.value, NerTag.Website, token.begin, token.end).some
         else None
-      })
+      }
       .flatMap(_.map(Stream.emit).getOrElse(Stream.empty))
       .toVector

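Most of this commit follows the same mechanical pattern as the hunk above: a lambda wrapped in `({ ... })` is rewritten to the plain brace form that scalafmt prefers. The two forms are equivalent; a minimal sketch (the `tokens` value is a made-up example, not taken from the code above):

    // Equivalent ways to pass a block lambda in Scala; the commit rewrites
    // the first form into the second throughout the code base.
    val tokens = Vector("info@example.com", "docspell.org")
    val a = tokens.map({ t => t.length }) // old style: parentheses around the block
    val b = tokens.map { t => t.length }  // new style: braces only
    assert(a == b)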
@@ -11,7 +11,14 @@ import docspell.store.queries.{QAttachment, QItem}
 import OItem.{AttachmentData, AttachmentSourceData, ItemData, ListItem, Query}
 import bitpeace.{FileMeta, RangeDef}
 import docspell.common.{Direction, Ident, ItemState, MetaProposalList, Timestamp}
-import docspell.store.records.{RAttachment, RAttachmentMeta, RAttachmentSource, RItem, RSource, RTagItem}
+import docspell.store.records.{
+  RAttachment,
+  RAttachmentMeta,
+  RAttachmentSource,
+  RItem,
+  RSource,
+  RTagItem
+}

 trait OItem[F[_]] {

@@ -80,8 +87,11 @@ object OItem {
     val fileId = ra.fileId
   }

-  case class AttachmentSourceData[F[_]](rs: RAttachmentSource, meta: FileMeta, data: Stream[F, Byte])
-      extends BinaryData[F] {
+  case class AttachmentSourceData[F[_]](
+      rs: RAttachmentSource,
+      meta: FileMeta,
+      data: Stream[F, Byte]
+  ) extends BinaryData[F] {
     val name = rs.name
     val fileId = rs.fileId
   }
@@ -131,7 +141,11 @@ object OItem {

     private def makeBinaryData[A](fileId: Ident)(f: FileMeta => A): F[Option[A]] =
       store.bitpeace
-        .get(fileId.id).unNoneTerminate.compile.last.map(
+        .get(fileId.id)
+        .unNoneTerminate
+        .compile
+        .last
+        .map(
           _.map(m => f(m))
         )

@@ -51,7 +51,8 @@ object OSignup {
       res <- if (ok) addUser(data).map(SignupResult.fromAddResult)
             else SignupResult.invalidInvitationKey.pure[F]
       _ <- if (retryInvite(res))
-            logger.fdebug(s"Adding account failed ($res). Allow retry with invite.") *> store
+            logger
+              .fdebug(s"Adding account failed ($res). Allow retry with invite.") *> store
               .transact(
                 RInvitation.insert(RInvitation(inv, now))
               )
@@ -26,9 +26,7 @@ object AccountId {
         invalid
     }

-    val separated = sepearatorChars.foldRight(invalid) { (c, v) =>
-      v.orElse(parse0(c))
-    }
+    val separated = sepearatorChars.foldRight(invalid)((c, v) => v.orElse(parse0(c)))

     separated.orElse(Ident.fromString(str).map(id => AccountId(id, id)))
   }
@@ -1,8 +1,6 @@
 package docspell.common

-sealed trait DataType {
-
-}
+sealed trait DataType {}

 object DataType {

@@ -10,7 +8,6 @@ object DataType {

   case class Hint(hint: MimeTypeHint) extends DataType

-
   def apply(mt: MimeType): DataType =
     Exact(mt)

@@ -65,11 +65,13 @@ object File {
       javaList.asScala.toList.sortBy(_.getFileName.toString)
     }

-  def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] =
+  def readAll[F[_]: Sync: ContextShift](
+      file: Path,
+      blocker: Blocker,
+      chunkSize: Int
+  ): Stream[F, Byte] =
     fs2.io.file.readAll(file, blocker, chunkSize)

   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
-    readAll[F](file, blocker, 8192).
-      through(fs2.text.utf8Decode).
-      compile.foldMonoid
+    readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
 }
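`File.readText` here (and `LenientUri.readText` in the next hunk) collapse the same pipeline onto one line: decode a byte stream as UTF-8 and fold the decoded chunks into a single String. A minimal standalone sketch, assuming the fs2 2.x / cats-effect 2 APIs this code base uses:

    import cats.effect.IO
    import cats.implicits._
    import fs2.Stream

    // Decode a byte stream as UTF-8 and concatenate the chunks; foldMonoid
    // uses the String monoid (concatenation, with "" as identity).
    val bytes: Stream[IO, Byte] = Stream.emits("docspell".getBytes("UTF-8").toList)
    val text: IO[String] = bytes.through(fs2.text.utf8Decode).compile.foldMonoid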
@@ -66,9 +66,7 @@ case class LenientUri(
   )

   def readText[F[_]: Sync: ContextShift](chunkSize: Int, blocker: Blocker): F[String] =
-    readURL[F](chunkSize, blocker).
-      through(fs2.text.utf8Decode).
-      compile.foldMonoid
+    readURL[F](chunkSize, blocker).through(fs2.text.utf8Decode).compile.foldMonoid

   def host: Option[String] =
     authority.map(a =>
@@ -17,7 +17,6 @@ trait Logger[F[_]] {

 object Logger {

-
   def log4s[F[_]: Sync](log: Log4sLogger): Logger[F] = new Logger[F] {
     def trace(msg: => String): F[Unit] =
       log.ftrace(msg)
@@ -66,9 +66,7 @@ object MetaProposalList {
       case None => map.updated(mp.proposalType, mp)
     }

-    val merged = ml.foldLeft(init) { (map, el) =>
-      el.proposals.foldLeft(map)(updateMap)
-    }
+    val merged = ml.foldLeft(init)((map, el) => el.proposals.foldLeft(map)(updateMap))

     fromMap(merged)
   }
@@ -23,7 +23,8 @@ object SystemCommand {
       repl.foldLeft(s) {
         case (res, (k, v)) =>
           res.replace(k, v)
-      })
+      }
+    )

   def toCmd: List[String] =
     program :: args.toList
@@ -75,12 +76,18 @@ object SystemCommand {
         else Stream.emit(r)
       }

-  private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])(
+  private def startProcess[F[_]: Sync, A](
+      cmd: Config,
+      wd: Option[Path],
+      logger: Logger[F],
+      stdin: Stream[F, Byte]
+  )(
       f: Process => Stream[F, A]
   ): Stream[F, A] = {
     val log = logger.debug(s"Running external command: ${cmd.cmdString}")
     val hasStdin = stdin.take(1).compile.last.map(_.isDefined)
-    val proc = log *> hasStdin.flatMap(flag => Sync[F].delay {
+    val proc = log *> hasStdin.flatMap(flag =>
+      Sync[F].delay {
         val pb = new ProcessBuilder(cmd.toCmd.asJava)
           .redirectInput(if (flag) Redirect.PIPE else Redirect.INHERIT)
           .redirectError(Redirect.PIPE)
@@ -88,12 +95,11 @@ object SystemCommand {

       wd.map(_.toFile).foreach(pb.directory)
       pb.start()
-    })
+      }
+    )
     Stream
       .bracket(proc)(p =>
-        logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ =>
-          p.destroy()
-        }
+        logger.debug(s"Closing process: `${cmd.cmdString}`").map(_ => p.destroy())
       )
       .flatMap(f)
   }
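The reformatted block keeps the important structure visible: the process is acquired inside fs2's `Stream.bracket`, whose second argument is a finalizer that runs when the stream terminates, whether normally or by error, so the external process is always destroyed. A minimal sketch of the same pattern (`ProcessBuilder("true")` is an arbitrary placeholder command, not from this code base):

    import cats.effect.IO
    import fs2.Stream

    // Acquire a process, guarantee destroy() on termination, then use it.
    val proc: Stream[IO, Process] =
      Stream.bracket(IO(new ProcessBuilder("true").start()))(p => IO(p.destroy()))

    val exit: IO[Option[Int]] = proc.evalMap(p => IO(p.waitFor())).compile.last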
@@ -13,7 +13,9 @@ import docspell.files.{ImageSize, TikaMimetype}

 trait Conversion[F[_]] {

-  def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A]
+  def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
+      in: Stream[F, Byte]
+  ): F[A]

 }

@@ -26,7 +28,9 @@ object Conversion {
   ): Resource[F, Conversion[F]] =
     Resource.pure(new Conversion[F] {

-      def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] =
+      def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(
+          in: Stream[F, Byte]
+      ): F[A] =
         TikaMimetype.resolve(dataType, in).flatMap {
           case MimeType.pdf =>
             handler.run(ConversionResult.successPdf(in))
@@ -3,9 +3,11 @@ package docspell.convert
 import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig}
 import docspell.convert.flexmark.MarkdownConfig

-case class ConvertConfig(chunkSize: Int,
+case class ConvertConfig(
+    chunkSize: Int,
     maxImageSize: Int,
     markdown: MarkdownConfig,
     wkhtmlpdf: WkHtmlPdfConfig,
     tesseract: TesseractConfig,
-    unoconv: UnoconvConfig)
+    unoconv: UnoconvConfig
+)
@@ -20,7 +20,9 @@ private[extern] object ExternConv {
       logger: Logger[F],
       reader: (Path, SystemCommand.Result) => F[ConversionResult[F]]
   )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] =
-    Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir =>
+    Stream
+      .resource(File.withTempDir[F](wd, s"docspell-$name"))
+      .flatMap { dir =>
       val inFile = dir.resolve("infile").toAbsolutePath.normalize
       val out = dir.resolve("out.pdf").toAbsolutePath.normalize
       val sysCfg =
@@ -40,12 +42,12 @@ private[extern] object ExternConv {
         SystemCommand
           .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty)
           .evalMap(result =>
-            logResult(name, result, logger).
-              flatMap(_ => reader(out, result)).
-              flatMap(handler.run)
+            logResult(name, result, logger).flatMap(_ => reader(out, result)).flatMap(handler.run)
           )
       }
-    }.compile.lastOrError
+      }
+      .compile
+      .lastOrError

   def readResult[F[_]: Sync: ContextShift](
       blocker: Blocker,
|
|||||||
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
successPdf(File.readAll(out, blocker, chunkSize)).pure[F]
|
||||||
|
|
||||||
case false =>
|
case false =>
|
||||||
ConversionResult.failure[F](
|
ConversionResult
|
||||||
|
.failure[F](
|
||||||
new Exception(s"Command result=${result.rc}. No output file found.")
|
new Exception(s"Command result=${result.rc}. No output file found.")
|
||||||
).pure[F]
|
)
|
||||||
|
.pure[F]
|
||||||
}
|
}
|
||||||
|
|
||||||
def readResultTesseract[F[_]: Sync: ContextShift](
|
def readResultTesseract[F[_]: Sync: ContextShift](
|
||||||
@@ -75,7 +79,7 @@ private[extern] object ExternConv {
     File.existsNonEmpty[F](outPdf).flatMap {
       case true =>
         val outTxt = out.resolveSibling(s"$outPrefix.txt")
-        File.exists(outTxt).flatMap(txtExists => {
+        File.exists(outTxt).flatMap { txtExists =>
           val pdfData = File.readAll(out, blocker, chunkSize)
           if (result.rc == 0) {
             if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F]

@@ -84,12 +88,14 @@ private[extern] object ExternConv {
             logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *>
               successPdf(pdfData).pure[F]
           }
-        })
+        }

       case false =>
-        ConversionResult.failure[F](
-          new Exception(s"Command result=${result.rc}. No output file found.")
-        ).pure[F]
+        ConversionResult
+          .failure[F](
+            new Exception(s"Command result=${result.rc}. No output file found.")
+          )
+          .pure[F]
     }
   }

@@ -21,7 +21,15 @@ object Tesseract {
     val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
       ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger)

-    ExternConv.toPDF[F, A]("tesseract", cfg.command.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler)
+    ExternConv.toPDF[F, A](
+      "tesseract",
+      cfg.command.replace(Map("{{lang}}" -> lang.iso3)),
+      cfg.workingDir,
+      false,
+      blocker,
+      logger,
+      reader
+    )(in, handler)
   }

 }
@@ -19,7 +19,10 @@ object Unoconv {
     val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
       ExternConv.readResult[F](blocker, chunkSize, logger)

-    ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(in, handler)
+    ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(
+      in,
+      handler
+    )
   }

 }
@@ -14,12 +14,16 @@ object WkHtmlPdf {
       cfg: WkHtmlPdfConfig,
       chunkSize: Int,
       blocker: Blocker,
-      logger: Logger[F],
+      logger: Logger[F]
   )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = {
     val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] =
       ExternConv.readResult[F](blocker, chunkSize, logger)

-    ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(in, handler)
+    ExternConv
+      .toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(
+        in,
+        handler
+      )
   }

 }
@@ -27,7 +27,6 @@ object Markdown {
     }.toEither
   }

-
   def toHtml(md: String, cfg: MarkdownConfig): String = {
     val p = createParser()
     val r = createRenderer()
@@ -36,10 +35,9 @@ object Markdown {
   }

   def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
-    data.through(fs2.text.utf8Decode).compile.foldMonoid.
-      map(str => toHtml(str, cfg))
+    data.through(fs2.text.utf8Decode).compile.foldMonoid.map(str => toHtml(str, cfg))

-  private def wrapHtml(body: String, cfg: MarkdownConfig): String = {
+  private def wrapHtml(body: String, cfg: MarkdownConfig): String =
     s"""<!DOCTYPE html>
       |<html>
       |<head>
@@ -53,13 +51,13 @@ object Markdown {
       |</body>
       |</html>
       |""".stripMargin
-  }

   private def createParser(): Parser = {
     val opts = new MutableDataSet()
-    opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
-      util.Arrays.asList(TablesExtension.create(),
-        StrikethroughExtension.create()));
+    opts.set(
+      Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
+      util.Arrays.asList(TablesExtension.create(), StrikethroughExtension.create())
+    );

     Parser.builder(opts).build()
   }
@@ -55,5 +55,4 @@ trait FileChecks {
   def commandExists(cmd: String): Boolean =
     Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0

-
 }
@@ -103,5 +103,4 @@ object ExternConvTest extends SimpleTestSuite with FileChecks {
     }
   }

-
 }
@@ -29,7 +29,7 @@ object Extraction {
       data: Stream[F, Byte],
       dataType: DataType,
       lang: Language
-  ): F[ExtractResult] = {
+  ): F[ExtractResult] =
     TikaMimetype.resolve(dataType, data).flatMap {
       case MimeType.pdf =>
         PdfExtract
@@ -50,16 +50,23 @@ object Extraction {
           .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr)
           .compile
           .lastOrError
+          .map(_.trim)
           .attempt
           .map(ExtractResult.fromEither)

         ImageSize.get(data).flatMap {
           case Some(dim) =>
             if (dim.product > cfg.ocr.maxImageSize) {
-              logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *>
-                ExtractResult.failure(new Exception(
-                  s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).")
-                ).pure[F]
+              logger.info(
+                s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize})."
+              ) *>
+                ExtractResult
+                  .failure(
+                    new Exception(
+                      s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize})."
+                    )
+                  )
+                  .pure[F]
             } else {
               doExtract
             }
@@ -69,7 +76,8 @@ object Extraction {
         }

       case OdfType.container =>
-        logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
+        logger
+          .info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
           OdfExtract.get(data).map(ExtractResult.fromEither)

       case mt @ MimeType("text", sub) if !sub.contains("html") =>
@@ -83,6 +91,5 @@ object Extraction {

       }
     }
-  }

 }
@@ -33,7 +33,8 @@ object PdfExtract {

     //maybe better: inspect the pdf and decide whether ocr or not
     for {
-      pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in)
+      pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract
+        .get[F](in)
       res <- pdfboxRes.fold(
         ex =>
           logger.info(
|
@ -10,8 +10,7 @@ case class OcrConfig(
|
|||||||
pageRange: OcrConfig.PageRange,
|
pageRange: OcrConfig.PageRange,
|
||||||
unpaper: OcrConfig.Unpaper,
|
unpaper: OcrConfig.Unpaper,
|
||||||
tesseract: OcrConfig.Tesseract
|
tesseract: OcrConfig.Tesseract
|
||||||
) {
|
) {}
|
||||||
}
|
|
||||||
|
|
||||||
object OcrConfig {
|
object OcrConfig {
|
||||||
|
|
||||||
|
@@ -17,8 +17,8 @@ object OdfExtract {
   def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
     data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)

-  def get(is: InputStream) = Try {
+  def get(is: InputStream) =
+    Try {
       val handler = new BodyContentHandler()
       val pctx = new ParseContext()
       val meta = new Metadata()
@@ -14,9 +14,7 @@ import fs2.Stream
 object PdfboxExtract {

   def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
-    data.compile.to(Array).map { bytes =>
-      Using(PDDocument.load(bytes))(readText).toEither.flatten
-    }
+    data.compile.to(Array).map(bytes => Using(PDDocument.load(bytes))(readText).toEither.flatten)

   def get(is: InputStream): Either[Throwable, String] =
     Using(PDDocument.load(is))(readText).toEither.flatten
@@ -52,25 +52,25 @@ object PoiExtract {
   def getDocx(is: InputStream): Either[Throwable, String] =
     Try {
       val xt = new XWPFWordExtractor(new XWPFDocument(is))
-      xt.getText.trim
+      Option(xt.getText).map(_.trim).getOrElse("")
     }.toEither

   def getDoc(is: InputStream): Either[Throwable, String] =
     Try {
       val xt = new WordExtractor(is)
-      xt.getText.trim
+      Option(xt.getText).map(_.trim).getOrElse("")
     }.toEither

   def getXlsx(is: InputStream): Either[Throwable, String] =
     Try {
       val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
-      xt.getText.trim
+      Option(xt.getText).map(_.trim).getOrElse("")
     }.toEither

   def getXls(is: InputStream): Either[Throwable, String] =
     Try {
       val xt = new ExcelExtractor(new HSSFWorkbook(is))
-      xt.getText.trim
+      Option(xt.getText).map(_.trim).getOrElse("")
     }.toEither

   def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
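The four identical edits above are more than formatting: POI's `getText` may return null, and `Option(...)` in Scala turns null into `None`, so the extractors now yield an empty string instead of a failed `Either` caused by a `NullPointerException` inside the `Try`. The pattern in isolation (the `legacyGetText` helper is hypothetical, standing in for such a Java API):

    // Option(null) == None, so a null from Java becomes "" instead of an NPE.
    def legacyGetText(): String = null

    val safe: String = Option(legacyGetText()).map(_.trim).getOrElse("")
    assert(safe == "")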
@@ -14,7 +14,8 @@ object OdfExtractTest extends SimpleTestSuite {
   )

   test("test extract from odt") {
-    files.foreach { case (file, len) =>
+    files.foreach {
+      case (file, len) =>
         val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
         val str1 = OdfExtract.get(is).fold(throw _, identity)
         assertEquals(str1.length, len)
@@ -29,12 +29,11 @@ object ImageSize {
   /** Return the image size from its header without reading
     * the whole image into memory.
     */
-  def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = {
-    data.take(768).compile.to(Array).map(ar => {
+  def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] =
+    data.take(768).compile.to(Array).map { ar =>
       val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar))
       if (iis == null) sys.error("no reader given for the array")
       else getDimension(iis)
-    })
     }

   private def getDimension(in: ImageInputStream): Option[Dimension] =
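The doc comment explains the trick: `data.take(768)` pulls only the first 768 bytes of the stream, which is enough for the headers of common image formats, so the check stays cheap even for large scans. A hedged usage sketch (`ImageSize` and `Dimension` are the ones from the hunk above; the file path and chunk size are placeholders, and the APIs assumed are the cats-effect 2 / fs2 2.x ones used throughout this diff):

    import java.nio.file.Paths
    import scala.concurrent.ExecutionContext
    import cats.effect.{Blocker, ContextShift, IO}

    implicit val cs: ContextShift[IO] = IO.contextShift(ExecutionContext.global)

    // Stream the file and ask only for its dimensions; at most 768 bytes are read.
    val dim: IO[Option[Dimension]] = Blocker[IO].use { blocker =>
      ImageSize.get[IO](fs2.io.file.readAll[IO](Paths.get("scan.png"), blocker, 8192))
    }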
@@ -52,8 +52,8 @@ object TikaMimetype {
   def detect[F[_]: Sync](file: Path): F[MimeType] =
     Sync[F].delay {
       val hint = MimeTypeHint.filename(file.getFileName.toString)
-      Using(new BufferedInputStream(Files.newInputStream(file), 64))({ in =>
+      Using(new BufferedInputStream(Files.newInputStream(file), 64)) { in =>
         convert(tika.detect(in, makeMetadata(hint)))
-      }).toEither
+      }.toEither
     }.rethrow
 }
@@ -10,5 +10,4 @@ trait ExampleFilesSupport {
       case None => sys.error(s"Resource '$resource' not found")
     }

-
 }
@@ -8,15 +8,14 @@ import scala.concurrent.ExecutionContext
 object Playing extends IOApp {
   val blocker = Blocker.liftExecutionContext(ExecutionContext.global)

-
   def run(args: List[String]): IO[ExitCode] = IO {
     //val ods = ExampleFiles.examples_sample_ods.readURL[IO](8192, blocker)
     //val odt = ExampleFiles.examples_sample_odt.readURL[IO](8192, blocker)
     val rtf = ExampleFiles.examples_sample_rtf.readURL[IO](8192, blocker)

     val x = for {
-      odsm1 <- TikaMimetype.detect(rtf,
-        MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last))
+      odsm1 <- TikaMimetype
+        .detect(rtf, MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last))
       odsm2 <- TikaMimetype.detect(rtf, MimeTypeHint.none)
     } yield (odsm1, odsm2)
     println(x.unsafeRunSync())
@@ -68,7 +68,9 @@ object ConvertPdf {
         .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
       val handler = conversionHandler[F](ctx, cfg, ra, item)
       ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
-        conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data)
+        conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(
+          data
+        )
     }
   }

@@ -119,7 +121,9 @@ object ConvertPdf {
       .compile
       .lastOrError
      .map(fm => Ident.unsafe(fm.id))
-      .flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId))
+      .flatMap(fmId =>
+        ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)
+      )
       .map(fmId => ra.copy(fileId = fmId, name = newName))
   }
 }
@@ -95,10 +95,10 @@ object FindProposal {
       labels => self.find(labels).map(f)

     def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
-      flatMap({ ml0 =>
+      flatMap { ml0 =>
         if (ml0.hasResultsAll) Finder.unit[F](ml0)
         else f.map(ml1 => ml0.fillEmptyFrom(ml1))
-      })
+      }

     def nextWhenEmpty(f: Finder[F], mt0: MetaProposalType, mts: MetaProposalType*)(
         implicit F: FlatMap[F],
@@ -19,9 +19,7 @@ object ItemHandler {
       .map(_ => ())

   def itemStateTask[F[_]: Sync, A](state: ItemState)(data: ItemData): Task[F, A, ItemData] =
-    Task { ctx =>
-      ctx.store.transact(RItem.updateState(data.item.id, state)).map(_ => data)
-    }
+    Task(ctx => ctx.store.transact(RItem.updateState(data.item.id, state)).map(_ => data))

   def isLastRetry[F[_]: Sync, A](ctx: Context[F, A]): F[Boolean] =
     for {
@@ -11,9 +11,7 @@ object TestTasks {
   private[this] val logger = getLogger

   def success[F[_]]: Task[F, ProcessItemArgs, Unit] =
-    Task { ctx =>
-      ctx.logger.info(s"Running task now: ${ctx.args}")
-    }
+    Task(ctx => ctx.logger.info(s"Running task now: ${ctx.args}"))

   def failing[F[_]: Sync]: Task[F, ProcessItemArgs, Unit] =
     Task { ctx =>
@@ -76,16 +76,15 @@ object TextExtraction {
       .getOrElse(Mimetype.`application/octet-stream`)

     findMime
-      .flatMap(mt =>
-        extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
+      .flatMap(mt => extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang))
   }

   private def extractTextFallback[F[_]: Sync: ContextShift](
       ctx: Context[F, _],
       cfg: ExtractConfig,
       ra: RAttachment,
-      lang: Language,
-  )(fileIds: List[Ident]): F[Option[String]] = {
+      lang: Language
+  )(fileIds: List[Ident]): F[Option[String]] =
     fileIds match {
       case Nil =>
         ctx.logger.error(s"Cannot extract text").map(_ => None)
@@ -99,15 +98,18 @@ object TextExtraction {
           txt.some.pure[F]

         case ExtractResult.UnsupportedFormat(mt) =>
-          ctx.logger.warn(s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file.").
-            flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
+          ctx.logger
+            .warn(
+              s"Cannot extract text from file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file."
+            )
+            .flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))

         case ExtractResult.Failure(ex) =>
-          ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file").
-            flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
+          ctx.logger
+            .warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file")
+            .flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest))
       })
     }
-  }

   /** Returns the fileIds to extract text from. First, the source file
     * is tried. If that fails, the converted file is tried.
@@ -128,6 +128,9 @@ Please see the `nix/module-server.nix` and `nix/module-joex.nix` files
 for the set of options. The nixos options are modelled after the
 default configuration file.

+The modules files are only applicable to the newest version of
+Docspell. If you really need an older version, checkout the
+appropriate commit.
+
 ## NixOs Example
@@ -9,7 +9,8 @@ title: Features and Limitations
 - Multiple users per account
 - Handle multiple documents as one unit
 - OCR using [tesseract](https://github.com/tesseract-ocr/tesseract)
-- Conversion to PDF: all files are converted into a PDF file
+- Conversion to PDF: all files are converted into a PDF file, while
+  the original file is preserved
 - Text is analysed to find and attach meta data automatically
 - Manage document processing (cancel jobs, set priorities)
 - Everything available via a documented [REST Api](api)
@@ -204,7 +204,8 @@ trait Conversions {

     val files = mp.parts
       .filter(p => p.name.forall(s => !s.equalsIgnoreCase("meta")))
-      .map(p => OUpload.File(p.filename, p.headers.get(`Content-Type`).map(fromContentType), p.body)
+      .map(p =>
+        OUpload.File(p.filename, p.headers.get(`Content-Type`).map(fromContentType), p.body)
       )
     for {
       metaData <- meta
@@ -55,10 +55,10 @@ object AttachmentRoutes {
           inm = req.headers.get(`If-None-Match`).flatMap(_.tags)
           matches = matchETag(fileData.map(_.meta), inm)
           resp <- fileData
-            .map({ data =>
+            .map { data =>
               if (matches) withResponseHeaders(NotModified())(data)
               else makeByteResp(data)
-            })
+            }
            .getOrElse(NotFound(BasicResult(false, "Not found")))
         } yield resp

|
|||||||
inm = req.headers.get(`If-None-Match`).flatMap(_.tags)
|
inm = req.headers.get(`If-None-Match`).flatMap(_.tags)
|
||||||
matches = matchETag(fileData.map(_.meta), inm)
|
matches = matchETag(fileData.map(_.meta), inm)
|
||||||
resp <- fileData
|
resp <- fileData
|
||||||
.map({ data =>
|
.map { data =>
|
||||||
if (matches) withResponseHeaders(NotModified())(data)
|
if (matches) withResponseHeaders(NotModified())(data)
|
||||||
else makeByteResp(data)
|
else makeByteResp(data)
|
||||||
})
|
}
|
||||||
.getOrElse(NotFound(BasicResult(false, "Not found")))
|
.getOrElse(NotFound(BasicResult(false, "Not found")))
|
||||||
} yield resp
|
} yield resp
|
||||||
|
|
||||||
|
@@ -14,10 +14,15 @@ object QAttachment {

   def deleteById[F[_]: Sync](store: Store[F])(attachId: Ident, coll: Ident): F[Int] =
     for {
-      raFile <- store.transact(RAttachment.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId))
-      rsFile <- store.transact(RAttachmentSource.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId))
+      raFile <- store
+        .transact(RAttachment.findByIdAndCollective(attachId, coll))
+        .map(_.map(_.fileId))
+      rsFile <- store
+        .transact(RAttachmentSource.findByIdAndCollective(attachId, coll))
+        .map(_.map(_.fileId))
       n <- store.transact(RAttachment.delete(attachId))
-      f <- Stream.emits(raFile.toSeq ++ rsFile.toSeq)
+      f <- Stream
+        .emits(raFile.toSeq ++ rsFile.toSeq)
         .map(_.id)
         .flatMap(store.bitpeace.delete)
         .map(flag => if (flag) 1 else 0)

@@ -29,10 +34,12 @@ object QAttachment {
     for {
       s <- store.transact(RAttachmentSource.findById(ra.id))
       n <- store.transact(RAttachment.delete(ra.id))
-      f <- Stream.emits(ra.fileId.id +: s.map(_.fileId.id).toSeq).
-        flatMap(store.bitpeace.delete).
-        map(flag => if (flag) 1 else 0).
-        compile.foldMonoid
+      f <- Stream
+        .emits(ra.fileId.id +: s.map(_.fileId.id).toSeq)
+        .flatMap(store.bitpeace.delete)
+        .map(flag => if (flag) 1 else 0)
+        .compile
+        .foldMonoid
     } yield n + f

   def deleteItemAttachments[F[_]: Sync](store: Store[F])(itemId: Ident, coll: Ident): F[Int] =
@@ -27,7 +27,6 @@ object QCollective {
       and(IC.cid.is(coll), IC.incoming.is(Direction.outgoing))
     ).query[Int].unique

-
     val fileSize = sql"""
       select sum(length) from (
       with attachs as

@@ -42,7 +41,6 @@ object QCollective {
       inner join filemeta m on m.id = a.file_id where a.id in (select aid from attachs)
       ) as t""".query[Option[Long]].unique

-
     val q3 = fr"SELECT" ++ commas(
       TC.name.prefix("t").f,
       fr"count(" ++ RC.itemId.prefix("r").f ++ fr")"
@@ -39,7 +39,8 @@ object QItem {
     val EC = REquipment.Columns.all.map(_.prefix("e"))
     val ICC = List(RItem.Columns.id, RItem.Columns.name).map(_.prefix("ref"))

-    val cq = selectSimple(IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC, RItem.table ++ fr"i", Fragment.empty) ++
+    val cq =
+      selectSimple(IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC, RItem.table ++ fr"i", Fragment.empty) ++
       fr"LEFT JOIN" ++ ROrganization.table ++ fr"o ON" ++ RItem.Columns.corrOrg
         .prefix("i")
         .is(ROrganization.Columns.oid.prefix("o")) ++

@@ -235,7 +236,8 @@ object QItem {
   def findByFileIds(fileMetaIds: List[Ident]): ConnectionIO[Vector[RItem]] = {
     val IC = RItem.Columns
     val AC = RAttachment.Columns
-    val q = fr"SELECT DISTINCT" ++ commas(IC.all.map(_.prefix("i").f)) ++ fr"FROM" ++ RItem.table ++ fr"i" ++
+    val q =
+      fr"SELECT DISTINCT" ++ commas(IC.all.map(_.prefix("i").f)) ++ fr"FROM" ++ RItem.table ++ fr"i" ++
       fr"INNER JOIN" ++ RAttachment.table ++ fr"a ON" ++ AC.itemId
         .prefix("a")
         .is(IC.id.prefix("i")) ++
@@ -21,11 +21,11 @@ object QJob {
     Stream
       .range(0, 10)
       .evalMap(n => takeNextJob1(store)(priority, worker, retryPause, n))
-      .evalTap({ x =>
+      .evalTap { x =>
         if (x.isLeft)
           logger.fdebug[F]("Cannot mark job, probably due to concurrent updates. Will retry.")
         else ().pure[F]
-      })
+      }
      .find(_.isRight)
      .flatMap({
        case Right(job) =>

@@ -97,7 +97,8 @@ object QJob {
     val sql2 = fr"SELECT min(" ++ jgroup.f ++ fr") as g FROM" ++ RJob.table ++ fr"a" ++
       fr"WHERE" ++ stateCond

-    val union = sql"SELECT g FROM ((" ++ sql1 ++ sql") UNION ALL (" ++ sql2 ++ sql")) as t0 WHERE g is not null"
+    val union =
+      sql"SELECT g FROM ((" ++ sql1 ++ sql") UNION ALL (" ++ sql2 ++ sql")) as t0 WHERE g is not null"

     union
       .query[Ident]
@@ -34,11 +34,11 @@ object JobQueue {
     def insert(job: RJob): F[Unit] =
       store
         .transact(RJob.insert(job))
-        .flatMap({ n =>
+        .flatMap { n =>
           if (n != 1)
             Effect[F].raiseError(new Exception(s"Inserting job failed. Update count: $n"))
           else ().pure[F]
-        })
+        }

     def insertAll(jobs: Seq[RJob]): F[Unit] =
       jobs.toList
@@ -104,7 +104,8 @@ object RAttachment {
   def findByItemWithMeta(id: Ident): ConnectionIO[Vector[(RAttachment, FileMeta)]] = {
     import bitpeace.sql._

-    val q = fr"SELECT a.*,m.* FROM" ++ table ++ fr"a, filemeta m WHERE a.filemetaid = m.id AND a.itemid = $id ORDER BY a.position ASC"
+    val q =
+      fr"SELECT a.*,m.* FROM" ++ table ++ fr"a, filemeta m WHERE a.filemetaid = m.id AND a.itemid = $id ORDER BY a.position ASC"
     q.query[(RAttachment, FileMeta)].to[Vector]
   }

@@ -38,14 +38,16 @@ object RAttachmentSource {
   def insert(v: RAttachmentSource): ConnectionIO[Int] =
     insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run


   def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] =
     selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option

   def delete(attachId: Ident): ConnectionIO[Int] =
     deleteFrom(table, id.is(attachId)).update.run

-  def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachmentSource]] = {
+  def findByIdAndCollective(
+      attachId: Ident,
+      collective: Ident
+  ): ConnectionIO[Option[RAttachmentSource]] = {
     val bId = RAttachment.Columns.id.prefix("b")
     val aId = Columns.id.prefix("a")
     val bItem = RAttachment.Columns.itemId.prefix("b")

@@ -77,8 +79,9 @@ object RAttachmentSource {
       RAttachment.table ++ fr"b ON" ++ aId.is(bId)
     val where = bItem.is(id)

-    (selectSimple(cols, from, where) ++ orderBy(bPos.asc)).
-      query[(RAttachmentSource, FileMeta)].to[Vector]
+    (selectSimple(cols, from, where) ++ orderBy(bPos.asc))
+      .query[(RAttachmentSource, FileMeta)]
+      .to[Vector]
   }

 }
@@ -407,6 +407,20 @@ update key flags next msg model =
                     )
                     m4

+            ( m6, c6 ) =
+                update key
+                    flags
+                    next
+                    (ConcEquipMsg
+                        (Comp.Dropdown.SetSelection
+                            (item.concEquipment
+                                |> Maybe.map List.singleton
+                                |> Maybe.withDefault []
+                            )
+                        )
+                    )
+                    m5
+
             proposalCmd =
                 if item.state == "created" then
                     Api.getItemProposals flags item.id GetProposalResp

@@ -414,7 +428,7 @@ update key flags next msg model =
                 else
                     Cmd.none
             in
-            ( { m5
+            ( { m6
                 | item = item
                 , nameModel = item.name
                 , notesModel = item.notes

@@ -428,6 +442,7 @@ update key flags next msg model =
                 , c3
                 , c4
                 , c5
+                , c6
                 , getOptions flags
                 , proposalCmd
                 , Api.getSentMails flags item.id SentMailsResp
@@ -21,8 +21,8 @@ object Dependencies {
   val LogbackVersion = "1.2.3"
   val MariaDbVersion = "2.5.4"
   val MiniTestVersion = "2.7.0"
-  val PdfboxVersion = "2.0.18"
-  val PoiVersion = "4.1.2"
+  val PdfboxVersion = "2.0.19"
+  val PoiVersion = "4.1.1"
   val PostgresVersion = "42.2.10"
   val PureConfigVersion = "0.12.2"
   val Slf4jVersion = "1.7.30"
tools/ds.sh (24 lines changed)
@@ -15,6 +15,9 @@
 # url.2=...
 #
 # Lines starting with a `#' are ignored.
+#
+# The `-e|--exists' option allows to skip uploading and only check
+# whether a given file exists in docspell.

 # saner programming env: these switches turn some bugs into errors
 set -o errexit -o pipefail -o noclobber -o nounset
@@ -30,8 +33,8 @@ if [[ ${PIPESTATUS[0]} -ne 4 ]]; then
     exit 1
 fi

-OPTIONS=c:hsd
-LONGOPTS=config:,help,skip,delete
+OPTIONS=c:hsde
+LONGOPTS=config:,help,skip,delete,exists

 ! PARSED=$(getopt --options=$OPTIONS --longoptions=$LONGOPTS --name "$0" -- "$@")
 if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
@@ -43,7 +46,7 @@ fi
 # read getopt’s output this way to handle the quoting right:
 eval set -- "$PARSED"

-delete=n help=n config="${XDG_CONFIG_HOME:-$HOME/.config}/docspell/ds.conf"
+exists=n delete=n help=n config="${XDG_CONFIG_HOME:-$HOME/.config}/docspell/ds.conf"
 while true; do
     case "$1" in
         -h|--help)
@@ -58,6 +61,10 @@ while true; do
             delete="y"
             shift
             ;;
+        -e|--exists)
+            exists=y
+            shift
+            ;;
         --)
             shift
             break
@@ -121,9 +128,10 @@ showUsage() {
     info "  -c | --config   Provide a config file. (value: $config)"
     info "  -d | --delete   Delete the files when successfully uploaded (value: $delete)"
     info "  -h | --help     Prints this help text. (value: $help)"
+    info "  -e | --exists   Checks for the existence of a file instead of uploading (value: $exists)"
     info ""
     info "Arguments:"
-    info "  One or more PDF files to upload."
+    info "  One or more files to check for existence or upload."
     info ""
 }

@@ -153,6 +161,13 @@ done <<< $($GREP_CMD -v '^#.*' "$config")
 IFS=$'\n'
 for file in $*; do
     for url in "${urls[@]}"; do
+        if [ "$exists" = "y" ]; then
+            if checkFile "$url" "$file"; then
+                info "$url $file: true"
+            else
+                info "$url $file: false"
+            fi
+        else
         info "Uploading '$file' to '$url'"
         set +e
         upload "$file" "$url"

@@ -161,5 +176,6 @@ for file in $*; do
             info "Deleting file: $file"
             rm -f "$file"
         fi
+        fi
     done
 done