Apply scalafmt to all files

This commit is contained in:
Eike Kettner 2020-02-09 01:54:11 +01:00
parent 6a9ec42a03
commit 5c37efeaba
32 changed files with 442 additions and 362 deletions

View File

@ -143,7 +143,9 @@ object OMail {
for {
_ <- OptionT.liftF(store.transact(RItem.existsById(m.item))).filter(identity)
ras <- OptionT.liftF(
store.transact(RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective))
store.transact(
RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective)
)
)
} yield {
val addAttach = m.attach.filter(ras).map { a =>

View File

@ -19,9 +19,9 @@ object AccountId {
case n if n > 0 && input.length > 2 =>
val coll = input.substring(0, n)
val user = input.substring(n + 1)
Ident.fromString(coll).
flatMap(collId => Ident.fromString(user).
map(userId => AccountId(collId, userId)))
Ident
.fromString(coll)
.flatMap(collId => Ident.fromString(user).map(userId => AccountId(collId, userId)))
case _ =>
invalid
}

View File

@ -12,5 +12,4 @@ object BaseJsonCodecs {
implicit val decodeInstantEpoch: Decoder[Instant] =
Decoder.decodeLong.map(Instant.ofEpochMilli)
}

View File

@ -21,7 +21,6 @@ object CollectiveState {
* action. */
case object Blocked extends CollectiveState
def fromString(s: String): Either[String, CollectiveState] =
s.toLowerCase match {
case "active" => Right(Active)
@ -41,8 +40,6 @@ object CollectiveState {
case ReadOnly => "readonly"
}
implicit val collectiveStateEncoder: Encoder[CollectiveState] =
Encoder.encodeString.contramap(CollectiveState.asString)

View File

@ -34,7 +34,6 @@ object ContactKind {
def asString(s: ContactKind): String =
s.asString.toLowerCase
implicit val contactKindEncoder: Encoder[ContactKind] =
Encoder.encodeString.contramap(_.asString)

View File

@ -10,33 +10,26 @@ sealed trait JobState { self: Product =>
object JobState {
/** Waiting for being executed. */
case object Waiting extends JobState {
}
case object Waiting extends JobState {}
/** A scheduler has picked up this job and will pass it to the next
* free slot. */
case object Scheduled extends JobState {
}
case object Scheduled extends JobState {}
/** Is currently executing */
case object Running extends JobState {
}
case object Running extends JobState {}
/** Finished with failure and is being retried. */
case object Stuck extends JobState {
}
case object Stuck extends JobState {}
/** Finished finally with a failure */
case object Failed extends JobState {
}
case object Failed extends JobState {}
/** Finished by cancellation. */
case object Cancelled extends JobState {
}
case object Cancelled extends JobState {}
/** Finished with success */
case object Success extends JobState {
}
case object Success extends JobState {}
val all: Set[JobState] = Set(Waiting, Scheduled, Running, Stuck, Failed, Cancelled, Success)
val queued: Set[JobState] = Set(Waiting, Scheduled, Stuck)
@ -60,7 +53,6 @@ object JobState {
def asString(state: JobState): String =
state.name
implicit val jobStateEncoder: Encoder[JobState] =
Encoder.encodeString.contramap(_.name)

View File

@ -51,8 +51,8 @@ case class LenientUri(
def open[F[_]: Sync]: Either[String, Resource[F, HttpURLConnection]] =
toJavaUrl.map { url =>
Resource
.make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))(
conn => Sync[F].delay(conn.disconnect())
.make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))(conn =>
Sync[F].delay(conn.disconnect())
)
}
@ -61,13 +61,12 @@ case class LenientUri(
.emit(Either.catchNonFatal(new URL(asString)))
.covary[F]
.rethrow
.flatMap(
url => fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true)
.flatMap(url =>
fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true)
)
def host: Option[String] =
authority.map(
a =>
authority.map(a =>
a.indexOf(':') match {
case -1 => a
case n => a.substring(0, n)

View File

@ -11,10 +11,8 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
def isEmpty: Boolean = proposals.isEmpty
def nonEmpty: Boolean = proposals.nonEmpty
def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean = {
(mts :+ mt).map(mtp => proposals.exists(_.proposalType == mtp)).
reduce(_ && _)
}
/** Returns true only if this list holds at least one proposal for `mt`
  * and for each of the additional types given in `mts`.
  */
def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean =
  // `forall` stops at the first type without a proposal; the result is the
  // same as and-ing the per-type checks together.
  (mts :+ mt).forall(mtp => proposals.exists(_.proposalType == mtp))
def hasResultsAll: Boolean =
proposals.map(_.proposalType).toSet == MetaProposalType.all.toSet
@ -23,7 +21,7 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
proposals.foldLeft(Set.empty[MetaProposalType])(_ + _.proposalType)
def fillEmptyFrom(ml: MetaProposalList): MetaProposalList = {
val list = ml.proposals.foldLeft(proposals){ (mine, mp) =>
val list = ml.proposals.foldLeft(proposals) { (mine, mp) =>
if (hasResults(mp.proposalType)) mine
else mp :: mine
}
@ -48,18 +46,21 @@ object MetaProposalList {
fromSeq1(mt, refs.map(ref => Candidate(ref, Set(label))))
def fromSeq1(mt: MetaProposalType, refs: Seq[Candidate]): MetaProposalList =
NonEmptyList.fromList(refs.toList).
map(nl => MetaProposalList.of(MetaProposal(mt, nl))).
getOrElse(empty)
NonEmptyList
.fromList(refs.toList)
.map(nl => MetaProposalList.of(MetaProposal(mt, nl)))
.getOrElse(empty)
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = {
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList =
new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) }))
}
def flatten(ml: Seq[MetaProposalList]): MetaProposalList = {
val init: Map[MetaProposalType, MetaProposal] = Map.empty
def updateMap(map: Map[MetaProposalType, MetaProposal], mp: MetaProposal): Map[MetaProposalType, MetaProposal] =
def updateMap(
map: Map[MetaProposalType, MetaProposal],
mp: MetaProposal
): Map[MetaProposalType, MetaProposal] =
map.get(mp.proposalType) match {
case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList))
case None => map.updated(mp.proposalType, mp)

View File

@ -35,7 +35,6 @@ object NerTag {
def unsafe(str: String): NerTag =
fromString(str).fold(sys.error, identity)
implicit val jsonDecoder: Decoder[NerTag] =
Decoder.decodeString.emap(fromString)
implicit val jsonEncoder: Encoder[NerTag] =

View File

@ -24,12 +24,14 @@ object Implicits {
ConfigReader[String].emap(reason(Ident.fromString))
implicit val byteVectorReader: ConfigReader[ByteVector] =
ConfigReader[String].emap(reason(str => {
ConfigReader[String].emap(reason { str =>
if (str.startsWith("hex:")) ByteVector.fromHex(str.drop(4)).toRight("Invalid hex value.")
else if (str.startsWith("b64:")) ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.")
else if (str.startsWith("b64:"))
ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.")
else ByteVector.encodeUtf8(str).left.map(ex => s"Invalid utf8 string: ${ex.getMessage}")
}))
})
def reason[A: ClassTag](f: String => Either[String, A]): String => Either[FailureReason, A] =
in => f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str))
in =>
f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str))
}

View File

@ -2,9 +2,6 @@ package docspell.common
package object syntax {
object all extends EitherSyntax
with StreamSyntax
with StringSyntax
with LoggerSyntax
object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
}

View File

@ -16,7 +16,6 @@ object QueryParam {
implicit val queryStringDecoder: QueryParamDecoder[QueryString] =
QueryParamDecoder[String].map(s => QueryString(s.trim.toLowerCase))
// implicit val booleanDecoder: QueryParamDecoder[Boolean] =
// QueryParamDecoder.fromUnsafeCast(qp => Option(qp.value).exists(_.equalsIgnoreCase("true")))(
// "Boolean"

View File

@ -139,8 +139,7 @@ object ItemRoutes {
}
}
final implicit class OptionString(opt: Option[String]) {
implicit final class OptionString(opt: Option[String]) {
def notEmpty: Option[String] =
opt.map(_.trim).filter(_.nonEmpty)
}

View File

@ -194,8 +194,9 @@ object QItem {
IC.cid.prefix("i").is(q.collective),
IC.state.prefix("i").isOneOf(q.states),
IC.incoming.prefix("i").isOrDiscard(q.direction),
name.map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n))).
getOrElse(Fragment.empty),
name
.map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n)))
.getOrElse(Fragment.empty),
RPerson.Columns.pid.prefix("p0").isOrDiscard(q.corrPerson),
ROrganization.Columns.oid.prefix("o0").isOrDiscard(q.corrOrg),
RPerson.Columns.pid.prefix("p1").isOrDiscard(q.concPerson),

View File

@ -52,7 +52,15 @@ object RSentMail {
for {
user <- OptionT(RUser.findByAccount(accId))
sm <- OptionT.liftF(
RSentMail[ConnectionIO](user.uid, messageId, sender, connName, subject, recipients, body)
RSentMail[ConnectionIO](
user.uid,
messageId,
sender,
connName,
subject,
recipients,
body
)
)
si <- OptionT.liftF(RSentMailItem[ConnectionIO](itemId, sm.id, Some(sm.created)))
} yield (sm, si)

View File

@ -9,15 +9,17 @@ object Contact {
private[this] val protocols = Set("ftp", "http", "https")
def annotate(text: String): Vector[NerLabel] =
TextSplitter.splitToken[Nothing](text, " \t\r\n".toSet).
map({ token =>
if (isEmailAddress(token.value)) NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value)) NerLabel(token.value, NerTag.Website, token.begin, token.end).some
TextSplitter
.splitToken[Nothing](text, " \t\r\n".toSet)
.map({ token =>
if (isEmailAddress(token.value))
NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value))
NerLabel(token.value, NerTag.Website, token.begin, token.end).some
else None
}).
flatMap(_.map(Stream.emit).getOrElse(Stream.empty)).
toVector
})
.flatMap(_.map(Stream.emit).getOrElse(Stream.empty))
.toVector
def isEmailAddress(str: String): Boolean = {
val atIdx = str.indexOf('@')
@ -30,10 +32,11 @@ object Contact {
}
def isWebsite(str: String): Boolean =
LenientUri.parse(str).
toOption.
map(uri => protocols.contains(uri.scheme.head)).
getOrElse(Domain.isDomain(str))
LenientUri
.parse(str)
.toOption
.map(uri => protocols.contains(uri.scheme.head))
.getOrElse(Domain.isDomain(str))
def isDocspellOpenUpload(str: String): Boolean = {
def isUploadPath(p: LenientUri.Path): Boolean =
@ -44,8 +47,9 @@ object Contact {
Ident.fromString(segs.last).isRight &&
segs.init.takeRight(3) == List("open", "upload", "item")
}
LenientUri.parse(str).
toOption.
exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
LenientUri
.parse(str)
.toOption
.exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
}
}

View File

@ -11,7 +11,7 @@ private[text] object Tld {
/**
* Some selected TLDs.
*/
private [this] val known = List(
private[this] val known = List(
".com",
".org",
".net",

View File

@ -10,16 +10,22 @@ import scala.util.Try
object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = {
TextSplitter.splitToken(text, " \t.,\n\r/".toSet).
sliding(3).
filter(_.length == 3).
map(q => SimpleDate.fromParts(q.toList, lang).
map(sd => NerDateLabel(sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)))).
collect({ case Some(d) => d })
}
/** Finds date-like token triples in `text` and emits one label per match.
  *
  * Tokens come from `TextSplitter.splitToken` using the character set
  * `" \t.,\n\r/"`. Every window of three consecutive tokens is handed to
  * `SimpleDate.fromParts`; windows that do not yield a date are dropped.
  * The label text and its begin/end offsets cover all three tokens.
  */
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
  TextSplitter
    .splitToken(text, " \t.,\n\r/".toSet)
    .sliding(3)
    .filter(_.length == 3)
    .map { q =>
      // The label spans the whole window, so the end offset must be the end
      // of the third token, i.e. the same bound used for the substring.
      val begin = q(0).begin
      val end   = q(2).end
      SimpleDate
        .fromParts(q.toList, lang)
        .map(sd =>
          NerDateLabel(
            sd.toLocalDate,
            NerLabel(text.substring(begin, end), NerTag.Date, begin, end)
          )
        )
    }
    .collect({ case Some(d) => d })
private case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate =
@ -27,13 +33,13 @@ object DateFind {
}
private object SimpleDate {
val p0 = readYear >> readMonth >> readDay map {
val p0 = (readYear >> readMonth >> readDay).map {
case ((y, m), d) => SimpleDate(y, m, d)
}
val p1 = readDay >> readMonth >> readYear map {
val p1 = (readDay >> readMonth >> readYear).map {
case ((d, m), y) => SimpleDate(y, m, d)
}
val p2 = readMonth >> readDay >> readYear map {
val p2 = (readMonth >> readDay >> readYear).map {
case ((m, d), y) => SimpleDate(y, m, d)
}
@ -46,14 +52,14 @@ object DateFind {
p.read(parts).toOption
}
def readYear: Reader[Int] = {
Reader.readFirst(w => w.value.length match {
def readYear: Reader[Int] =
Reader.readFirst(w =>
w.value.length match {
case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
case _ => None
})
}
)
/** Reads a month from the first word and yields its 1-based number.
  *
  * The word is looked up in `months`; the position of the first entry
  * containing it, plus one, is the result.
  *
  * NOTE(review): assumes `months` is ordered with January first, so that
  * index + 1 is the month number; confirm against its definition.
  */
def readMonth: Reader[Int] =
  Reader.readFirst { w =>
    // indexWhere returns -1 when nothing matches. Index 0 is a valid hit
    // (the first entry of `months`) and must not be filtered out.
    Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1)
  }
@ -69,10 +75,12 @@ object DateFind {
Reader(read.andThen(_.map(f)))
def or(other: Reader[A]): Reader[A] =
Reader(words => read(words) match {
Reader(words =>
read(words) match {
case Result.Failure => other.read(words)
case s @ Result.Success(_, _) => s
})
}
)
}
object Reader {
@ -86,7 +94,6 @@ object DateFind {
})
}
sealed trait Result[+A] {
def toOption: Option[A]
def map[B](f: A => B): Result[B]

View File

@ -14,7 +14,7 @@ import java.net.URL
import scala.util.Using
object StanfordNerClassifier {
private [this] val logger = getLogger
private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
@ -24,13 +24,18 @@ object StanfordNerClassifier {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
}
nerClassifier.classify(text).asScala.flatMap(a => a.asScala).
collect(Function.unlift(label => {
nerClassifier
.classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag.fromString(Option(tag).getOrElse("")).toOption.
map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})).
toVector
NerTag
.fromString(Option(tag).getOrElse(""))
.toOption
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
@ -48,7 +53,9 @@ object StanfordNerClassifier {
check(lang match {
case Language.German =>
getClass.getResource("/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz")
getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
case Language.English =>
getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
})

View File

@ -5,11 +5,11 @@ import java.nio.file.{Path, Paths}
import docspell.common._
case class Config(
allowedContentTypes: Set[MimeType]
, ghostscript: Config.Ghostscript
, pageRange: Config.PageRange
, unpaper: Config.Unpaper
, tesseract: Config.Tesseract
allowedContentTypes: Set[MimeType],
ghostscript: Config.Ghostscript,
pageRange: Config.PageRange,
unpaper: Config.Unpaper,
tesseract: Config.Tesseract
) {
def isAllowed(mt: MimeType): Boolean =
@ -22,7 +22,7 @@ object Config {
case class Command(program: String, args: Seq[String], timeout: Duration) {
def mapArgs(f: String => String): Command =
Command(program, args map f, timeout)
Command(program, args.map(f), timeout)
def toCmd: List[String] =
program :: args.toList
@ -44,23 +44,23 @@ object Config {
),
pageRange = PageRange(10),
ghostscript = Ghostscript(
Command("gs", Seq("-dNOPAUSE"
, "-dBATCH"
, "-dSAFER"
, "-sDEVICE=tiffscaled8"
, "-sOutputFile={{outfile}}"
, "{{infile}}"),
Duration.seconds(30)),
Paths.get(System.getProperty("java.io.tmpdir")).
resolve("docspell-extraction")),
unpaper = Unpaper(Command("unpaper"
, Seq("{{infile}}", "{{outfile}}")
, Duration.seconds(30))),
Command(
"gs",
Seq(
"-dNOPAUSE",
"-dBATCH",
"-dSAFER",
"-sDEVICE=tiffscaled8",
"-sOutputFile={{outfile}}",
"{{infile}}"
),
Duration.seconds(30)
),
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
),
unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
tesseract = Tesseract(
Command("tesseract", Seq("{{file}}"
, "stdout"
, "-l"
, "{{lang}}"),
Duration.minutes(1)))
Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
)
)
}

View File

@ -11,71 +11,106 @@ object Ocr {
/** Extract the text of all pages in the given pdf file.
*/
def extractPdf[F[_]: Sync: ContextShift](pdf: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractPdf[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscript(pdf, config, wd, blocker).
flatMap({ tmpImg =>
runGhostscript(pdf, config, wd, blocker)
.flatMap({ tmpImg =>
runTesseractFile(tmpImg, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
})
.fold1(_ + "\n\n\n" + _)
}
/** Extract the text from the given image file
*/
def extractImage[F[_]: Sync: ContextShift](img: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractImage[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractStdin(img, blocker, lang, config)
def extractPdFFile[F[_]: Sync: ContextShift](pdf: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractPdFFile[F[_]: Sync: ContextShift](
pdf: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker).
flatMap({ tif =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
.flatMap({ tif =>
runTesseractFile(tif, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
})
.fold1(_ + "\n\n\n" + _)
}
def extractImageFile[F[_]: Sync: ContextShift](img: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractImageFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractFile(img, blocker, lang, config)
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte]
, cfg: Config
, wd: Path
, blocker: Blocker): Stream[F, Path] = {
pdf: Stream[F, Byte],
cfg: Config,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val xargs =
if (cfg.pageRange.begin > 0) s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
if (cfg.pageRange.begin > 0)
s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
else cfg.ghostscript.command.args
val cmd = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(Map(
val cmd = cfg.ghostscript.command
.copy(args = xargs)
.mapArgs(
replace(
Map(
"{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf).
evalMap({ _ =>
)
)
)
SystemCommand
.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
})
.flatMap(fs => Stream.emits(fs))
}
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path
, ghostscript: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(replace(Map(
pdf: Path,
ghostscript: Config.Command,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(
replace(
Map(
"{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
evalMap({ _ =>
)
)
)
SystemCommand
.execSuccess[F](cmd, blocker, wd = Some(wd))
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
})
.flatMap(fs => Stream.emits(fs))
}
private def pathEndsWith(ext: String): Path => Boolean =
@ -84,58 +119,65 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](img: Path
, unpaper: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-"+ img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(replace(Map(
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path,
unpaper: Config.Command,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(
replace(
Map(
"{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
map(_ => targetFile).
handleErrorWith(th => {
logger.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
)
)
)
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
th =>
logger
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
Stream.emit(img)
})
}
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
img: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
// tesseract cannot cope with absolute filenames
// so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).
flatMap(uimg => {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> uimg.getFileName.toString
, "{{lang}}" -> fixLanguage(lang))))
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
val cmd = config.tesseract.command.mapArgs(
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
)
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
})
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte]
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> "stdin"
, "{{lang}}" -> fixLanguage(lang))))
img: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] = {
val cmd = config.tesseract.command
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
}
private def replace(repl: Map[String, String]): String => String =
s => repl.foldLeft(s) { case (res, (k, v)) =>
s =>
repl.foldLeft(s) {
case (res, (k, v)) =>
res.replace(k, v)
}

View File

@ -16,17 +16,22 @@ object SystemCommand {
final case class Result(rc: Int, stdout: String, stderr: String)
def exec[F[_]: Sync: ContextShift]( cmd: Config.Command
, blocker: Blocker
, wd: Option[Path] = None
, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
startProcess(cmd, wd){ proc =>
def exec[F[_]: Sync: ContextShift](
cmd: Config.Command,
blocker: Blocker,
wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
startProcess(cmd, wd) { proc =>
Stream.eval {
for {
_ <- writeToProcess(stdin, proc, blocker)
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
else logger.fwarn(s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!")
else
logger.fwarn(
s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
)
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("")
err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("")
@ -34,39 +39,64 @@ object SystemCommand {
}
}
def execSuccess[F[_]: Sync: ContextShift](cmd: Config.Command, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
def execSuccess[F[_]: Sync: ContextShift](
cmd: Config.Command,
blocker: Blocker,
wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
exec(cmd, blocker, wd, stdin).flatMap { r =>
if (r.rc != 0) Stream.raiseError[F](new Exception(s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"))
if (r.rc != 0)
Stream.raiseError[F](
new Exception(
s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"
)
)
else Stream.emit(r)
}
private def startProcess[F[_]: Sync,A](cmd: Config.Command, wd: Option[Path])(f: Process => Stream[F,A]): Stream[F, A] = {
private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])(
f: Process => Stream[F, A]
): Stream[F, A] = {
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
val proc = log *> Sync[F].delay {
val pb = new ProcessBuilder(cmd.toCmd.asJava)
wd.map(_.toFile).foreach(pb.directory)
pb.start()
}
Stream.bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
Stream
.bracket(proc)(p =>
logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
p.destroy()
}).flatMap(f)
}
)
.flatMap(f)
}
private def inputStreamToString[F[_]: Sync: ContextShift](in: InputStream, blocker: Blocker): F[String] =
io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false).
through(text.utf8Decode).
chunks.
map(_.toVector.mkString).
fold1(_ + _).
compile.last.
map(_.getOrElse(""))
private def inputStreamToString[F[_]: Sync: ContextShift](
in: InputStream,
blocker: Blocker
): F[String] =
io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false)
.through(text.utf8Decode)
.chunks
.map(_.toVector.mkString)
.fold1(_ + _)
.compile
.last
.map(_.getOrElse(""))
private def writeToProcess[F[_]: Sync: ContextShift](data: Stream[F, Byte], proc: Process, blocker: Blocker): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).
compile.drain
private def writeToProcess[F[_]: Sync: ContextShift](
data: Stream[F, Byte],
proc: Process,
blocker: Blocker
): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] =
Sync[F].delay(proc.destroyForcibly()).attempt *> {
Sync[F].raiseError(new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})"))
Sync[F].raiseError(
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")
)
}
}

View File

@ -12,18 +12,17 @@ object TikaMimetype {
private val tika = new TikaConfig().getDetector
private def convert(mt: MediaType): MimeType =
Option(mt).map(_.toString).
map(MimeType.parse).
flatMap(_.toOption).
map(normalize).
getOrElse(MimeType.octetStream)
Option(mt)
.map(_.toString)
.map(MimeType.parse)
.flatMap(_.toOption)
.map(normalize)
.getOrElse(MimeType.octetStream)
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
hint.filename.
foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.advertised.
foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}
@ -33,13 +32,10 @@ object TikaMimetype {
case _ => in
}
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
}
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
data.take(1024).
compile.toVector.
map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
}