Apply scalafmt to all files

This commit is contained in:
Eike Kettner 2020-02-09 01:54:11 +01:00
parent 6a9ec42a03
commit 5c37efeaba
32 changed files with 442 additions and 362 deletions

View File

@ -143,7 +143,9 @@ object OMail {
for { for {
_ <- OptionT.liftF(store.transact(RItem.existsById(m.item))).filter(identity) _ <- OptionT.liftF(store.transact(RItem.existsById(m.item))).filter(identity)
ras <- OptionT.liftF( ras <- OptionT.liftF(
store.transact(RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective)) store.transact(
RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective)
)
) )
} yield { } yield {
val addAttach = m.attach.filter(ras).map { a => val addAttach = m.attach.filter(ras).map { a =>

View File

@ -19,9 +19,9 @@ object AccountId {
case n if n > 0 && input.length > 2 => case n if n > 0 && input.length > 2 =>
val coll = input.substring(0, n) val coll = input.substring(0, n)
val user = input.substring(n + 1) val user = input.substring(n + 1)
Ident.fromString(coll). Ident
flatMap(collId => Ident.fromString(user). .fromString(coll)
map(userId => AccountId(collId, userId))) .flatMap(collId => Ident.fromString(user).map(userId => AccountId(collId, userId)))
case _ => case _ =>
invalid invalid
} }

View File

@ -12,5 +12,4 @@ object BaseJsonCodecs {
implicit val decodeInstantEpoch: Decoder[Instant] = implicit val decodeInstantEpoch: Decoder[Instant] =
Decoder.decodeLong.map(Instant.ofEpochMilli) Decoder.decodeLong.map(Instant.ofEpochMilli)
} }

View File

@ -21,7 +21,6 @@ object CollectiveState {
* action. */ * action. */
case object Blocked extends CollectiveState case object Blocked extends CollectiveState
def fromString(s: String): Either[String, CollectiveState] = def fromString(s: String): Either[String, CollectiveState] =
s.toLowerCase match { s.toLowerCase match {
case "active" => Right(Active) case "active" => Right(Active)
@ -41,8 +40,6 @@ object CollectiveState {
case ReadOnly => "readonly" case ReadOnly => "readonly"
} }
implicit val collectiveStateEncoder: Encoder[CollectiveState] = implicit val collectiveStateEncoder: Encoder[CollectiveState] =
Encoder.encodeString.contramap(CollectiveState.asString) Encoder.encodeString.contramap(CollectiveState.asString)

View File

@ -34,7 +34,6 @@ object ContactKind {
def asString(s: ContactKind): String = def asString(s: ContactKind): String =
s.asString.toLowerCase s.asString.toLowerCase
implicit val contactKindEncoder: Encoder[ContactKind] = implicit val contactKindEncoder: Encoder[ContactKind] =
Encoder.encodeString.contramap(_.asString) Encoder.encodeString.contramap(_.asString)

View File

@ -10,33 +10,26 @@ sealed trait JobState { self: Product =>
object JobState { object JobState {
/** Waiting for being executed. */ /** Waiting for being executed. */
case object Waiting extends JobState { case object Waiting extends JobState {}
}
/** A scheduler has picked up this job and will pass it to the next /** A scheduler has picked up this job and will pass it to the next
* free slot. */ * free slot. */
case object Scheduled extends JobState { case object Scheduled extends JobState {}
}
/** Is currently executing */ /** Is currently executing */
case object Running extends JobState { case object Running extends JobState {}
}
/** Finished with failure and is being retried. */ /** Finished with failure and is being retried. */
case object Stuck extends JobState { case object Stuck extends JobState {}
}
/** Finished finally with a failure */ /** Finished finally with a failure */
case object Failed extends JobState { case object Failed extends JobState {}
}
/** Finished by cancellation. */ /** Finished by cancellation. */
case object Cancelled extends JobState { case object Cancelled extends JobState {}
}
/** Finished with success */ /** Finished with success */
case object Success extends JobState { case object Success extends JobState {}
}
val all: Set[JobState] = Set(Waiting, Scheduled, Running, Stuck, Failed, Cancelled, Success) val all: Set[JobState] = Set(Waiting, Scheduled, Running, Stuck, Failed, Cancelled, Success)
val queued: Set[JobState] = Set(Waiting, Scheduled, Stuck) val queued: Set[JobState] = Set(Waiting, Scheduled, Stuck)
@ -60,7 +53,6 @@ object JobState {
def asString(state: JobState): String = def asString(state: JobState): String =
state.name state.name
implicit val jobStateEncoder: Encoder[JobState] = implicit val jobStateEncoder: Encoder[JobState] =
Encoder.encodeString.contramap(_.name) Encoder.encodeString.contramap(_.name)

View File

@ -51,8 +51,8 @@ case class LenientUri(
def open[F[_]: Sync]: Either[String, Resource[F, HttpURLConnection]] = def open[F[_]: Sync]: Either[String, Resource[F, HttpURLConnection]] =
toJavaUrl.map { url => toJavaUrl.map { url =>
Resource Resource
.make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))( .make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))(conn =>
conn => Sync[F].delay(conn.disconnect()) Sync[F].delay(conn.disconnect())
) )
} }
@ -61,13 +61,12 @@ case class LenientUri(
.emit(Either.catchNonFatal(new URL(asString))) .emit(Either.catchNonFatal(new URL(asString)))
.covary[F] .covary[F]
.rethrow .rethrow
.flatMap( .flatMap(url =>
url => fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true) fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true)
) )
def host: Option[String] = def host: Option[String] =
authority.map( authority.map(a =>
a =>
a.indexOf(':') match { a.indexOf(':') match {
case -1 => a case -1 => a
case n => a.substring(0, n) case n => a.substring(0, n)

View File

@ -11,10 +11,8 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
def isEmpty: Boolean = proposals.isEmpty def isEmpty: Boolean = proposals.isEmpty
def nonEmpty: Boolean = proposals.nonEmpty def nonEmpty: Boolean = proposals.nonEmpty
def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean = { def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean =
(mts :+ mt).map(mtp => proposals.exists(_.proposalType == mtp)). (mts :+ mt).map(mtp => proposals.exists(_.proposalType == mtp)).reduce(_ && _)
reduce(_ && _)
}
def hasResultsAll: Boolean = def hasResultsAll: Boolean =
proposals.map(_.proposalType).toSet == MetaProposalType.all.toSet proposals.map(_.proposalType).toSet == MetaProposalType.all.toSet
@ -23,7 +21,7 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
proposals.foldLeft(Set.empty[MetaProposalType])(_ + _.proposalType) proposals.foldLeft(Set.empty[MetaProposalType])(_ + _.proposalType)
def fillEmptyFrom(ml: MetaProposalList): MetaProposalList = { def fillEmptyFrom(ml: MetaProposalList): MetaProposalList = {
val list = ml.proposals.foldLeft(proposals){ (mine, mp) => val list = ml.proposals.foldLeft(proposals) { (mine, mp) =>
if (hasResults(mp.proposalType)) mine if (hasResults(mp.proposalType)) mine
else mp :: mine else mp :: mine
} }
@ -48,18 +46,21 @@ object MetaProposalList {
fromSeq1(mt, refs.map(ref => Candidate(ref, Set(label)))) fromSeq1(mt, refs.map(ref => Candidate(ref, Set(label))))
def fromSeq1(mt: MetaProposalType, refs: Seq[Candidate]): MetaProposalList = def fromSeq1(mt: MetaProposalType, refs: Seq[Candidate]): MetaProposalList =
NonEmptyList.fromList(refs.toList). NonEmptyList
map(nl => MetaProposalList.of(MetaProposal(mt, nl))). .fromList(refs.toList)
getOrElse(empty) .map(nl => MetaProposalList.of(MetaProposal(mt, nl)))
.getOrElse(empty)
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = { def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList =
new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) })) new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) }))
}
def flatten(ml: Seq[MetaProposalList]): MetaProposalList = { def flatten(ml: Seq[MetaProposalList]): MetaProposalList = {
val init: Map[MetaProposalType, MetaProposal] = Map.empty val init: Map[MetaProposalType, MetaProposal] = Map.empty
def updateMap(map: Map[MetaProposalType, MetaProposal], mp: MetaProposal): Map[MetaProposalType, MetaProposal] = def updateMap(
map: Map[MetaProposalType, MetaProposal],
mp: MetaProposal
): Map[MetaProposalType, MetaProposal] =
map.get(mp.proposalType) match { map.get(mp.proposalType) match {
case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList)) case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList))
case None => map.updated(mp.proposalType, mp) case None => map.updated(mp.proposalType, mp)

View File

@ -35,7 +35,6 @@ object NerTag {
def unsafe(str: String): NerTag = def unsafe(str: String): NerTag =
fromString(str).fold(sys.error, identity) fromString(str).fold(sys.error, identity)
implicit val jsonDecoder: Decoder[NerTag] = implicit val jsonDecoder: Decoder[NerTag] =
Decoder.decodeString.emap(fromString) Decoder.decodeString.emap(fromString)
implicit val jsonEncoder: Encoder[NerTag] = implicit val jsonEncoder: Encoder[NerTag] =

View File

@ -24,12 +24,14 @@ object Implicits {
ConfigReader[String].emap(reason(Ident.fromString)) ConfigReader[String].emap(reason(Ident.fromString))
implicit val byteVectorReader: ConfigReader[ByteVector] = implicit val byteVectorReader: ConfigReader[ByteVector] =
ConfigReader[String].emap(reason(str => { ConfigReader[String].emap(reason { str =>
if (str.startsWith("hex:")) ByteVector.fromHex(str.drop(4)).toRight("Invalid hex value.") if (str.startsWith("hex:")) ByteVector.fromHex(str.drop(4)).toRight("Invalid hex value.")
else if (str.startsWith("b64:")) ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.") else if (str.startsWith("b64:"))
ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.")
else ByteVector.encodeUtf8(str).left.map(ex => s"Invalid utf8 string: ${ex.getMessage}") else ByteVector.encodeUtf8(str).left.map(ex => s"Invalid utf8 string: ${ex.getMessage}")
})) })
def reason[A: ClassTag](f: String => Either[String, A]): String => Either[FailureReason, A] = def reason[A: ClassTag](f: String => Either[String, A]): String => Either[FailureReason, A] =
in => f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str)) in =>
f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str))
} }

View File

@ -2,9 +2,6 @@ package docspell.common
package object syntax { package object syntax {
object all extends EitherSyntax object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
with StreamSyntax
with StringSyntax
with LoggerSyntax
} }

View File

@ -16,7 +16,6 @@ object QueryParam {
implicit val queryStringDecoder: QueryParamDecoder[QueryString] = implicit val queryStringDecoder: QueryParamDecoder[QueryString] =
QueryParamDecoder[String].map(s => QueryString(s.trim.toLowerCase)) QueryParamDecoder[String].map(s => QueryString(s.trim.toLowerCase))
// implicit val booleanDecoder: QueryParamDecoder[Boolean] = // implicit val booleanDecoder: QueryParamDecoder[Boolean] =
// QueryParamDecoder.fromUnsafeCast(qp => Option(qp.value).exists(_.equalsIgnoreCase("true")))( // QueryParamDecoder.fromUnsafeCast(qp => Option(qp.value).exists(_.equalsIgnoreCase("true")))(
// "Boolean" // "Boolean"

View File

@ -139,8 +139,7 @@ object ItemRoutes {
} }
} }
implicit final class OptionString(opt: Option[String]) {
final implicit class OptionString(opt: Option[String]) {
def notEmpty: Option[String] = def notEmpty: Option[String] =
opt.map(_.trim).filter(_.nonEmpty) opt.map(_.trim).filter(_.nonEmpty)
} }

View File

@ -194,8 +194,9 @@ object QItem {
IC.cid.prefix("i").is(q.collective), IC.cid.prefix("i").is(q.collective),
IC.state.prefix("i").isOneOf(q.states), IC.state.prefix("i").isOneOf(q.states),
IC.incoming.prefix("i").isOrDiscard(q.direction), IC.incoming.prefix("i").isOrDiscard(q.direction),
name.map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n))). name
getOrElse(Fragment.empty), .map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n)))
.getOrElse(Fragment.empty),
RPerson.Columns.pid.prefix("p0").isOrDiscard(q.corrPerson), RPerson.Columns.pid.prefix("p0").isOrDiscard(q.corrPerson),
ROrganization.Columns.oid.prefix("o0").isOrDiscard(q.corrOrg), ROrganization.Columns.oid.prefix("o0").isOrDiscard(q.corrOrg),
RPerson.Columns.pid.prefix("p1").isOrDiscard(q.concPerson), RPerson.Columns.pid.prefix("p1").isOrDiscard(q.concPerson),

View File

@ -52,7 +52,15 @@ object RSentMail {
for { for {
user <- OptionT(RUser.findByAccount(accId)) user <- OptionT(RUser.findByAccount(accId))
sm <- OptionT.liftF( sm <- OptionT.liftF(
RSentMail[ConnectionIO](user.uid, messageId, sender, connName, subject, recipients, body) RSentMail[ConnectionIO](
user.uid,
messageId,
sender,
connName,
subject,
recipients,
body
)
) )
si <- OptionT.liftF(RSentMailItem[ConnectionIO](itemId, sm.id, Some(sm.created))) si <- OptionT.liftF(RSentMailItem[ConnectionIO](itemId, sm.id, Some(sm.created)))
} yield (sm, si) } yield (sm, si)

View File

@ -9,15 +9,17 @@ object Contact {
private[this] val protocols = Set("ftp", "http", "https") private[this] val protocols = Set("ftp", "http", "https")
def annotate(text: String): Vector[NerLabel] = def annotate(text: String): Vector[NerLabel] =
TextSplitter.splitToken[Nothing](text, " \t\r\n".toSet). TextSplitter
map({ token => .splitToken[Nothing](text, " \t\r\n".toSet)
if (isEmailAddress(token.value)) NerLabel(token.value, NerTag.Email, token.begin, token.end).some .map({ token =>
else if (isWebsite(token.value)) NerLabel(token.value, NerTag.Website, token.begin, token.end).some if (isEmailAddress(token.value))
NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value))
NerLabel(token.value, NerTag.Website, token.begin, token.end).some
else None else None
}). })
flatMap(_.map(Stream.emit).getOrElse(Stream.empty)). .flatMap(_.map(Stream.emit).getOrElse(Stream.empty))
toVector .toVector
def isEmailAddress(str: String): Boolean = { def isEmailAddress(str: String): Boolean = {
val atIdx = str.indexOf('@') val atIdx = str.indexOf('@')
@ -30,10 +32,11 @@ object Contact {
} }
def isWebsite(str: String): Boolean = def isWebsite(str: String): Boolean =
LenientUri.parse(str). LenientUri
toOption. .parse(str)
map(uri => protocols.contains(uri.scheme.head)). .toOption
getOrElse(Domain.isDomain(str)) .map(uri => protocols.contains(uri.scheme.head))
.getOrElse(Domain.isDomain(str))
def isDocspellOpenUpload(str: String): Boolean = { def isDocspellOpenUpload(str: String): Boolean = {
def isUploadPath(p: LenientUri.Path): Boolean = def isUploadPath(p: LenientUri.Path): Boolean =
@ -44,8 +47,9 @@ object Contact {
Ident.fromString(segs.last).isRight && Ident.fromString(segs.last).isRight &&
segs.init.takeRight(3) == List("open", "upload", "item") segs.init.takeRight(3) == List("open", "upload", "item")
} }
LenientUri.parse(str). LenientUri
toOption. .parse(str)
exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path)) .toOption
.exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
} }
} }

View File

@ -11,7 +11,7 @@ private[text] object Tld {
/** /**
* Some selected TLDs. * Some selected TLDs.
*/ */
private [this] val known = List( private[this] val known = List(
".com", ".com",
".org", ".org",
".net", ".net",

View File

@ -10,16 +10,22 @@ import scala.util.Try
object DateFind { object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
TextSplitter.splitToken(text, " \t.,\n\r/".toSet). TextSplitter
sliding(3). .splitToken(text, " \t.,\n\r/".toSet)
filter(_.length == 3). .sliding(3)
map(q => SimpleDate.fromParts(q.toList, lang). .filter(_.length == 3)
map(sd => NerDateLabel(sd.toLocalDate, .map(q =>
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)))). SimpleDate
collect({ case Some(d) => d }) .fromParts(q.toList, lang)
} .map(sd =>
NerDateLabel(
sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
)
)
)
.collect({ case Some(d) => d })
private case class SimpleDate(year: Int, month: Int, day: Int) { private case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate = def toLocalDate: LocalDate =
@ -27,13 +33,13 @@ object DateFind {
} }
private object SimpleDate { private object SimpleDate {
val p0 = readYear >> readMonth >> readDay map { val p0 = (readYear >> readMonth >> readDay).map {
case ((y, m), d) => SimpleDate(y, m, d) case ((y, m), d) => SimpleDate(y, m, d)
} }
val p1 = readDay >> readMonth >> readYear map { val p1 = (readDay >> readMonth >> readYear).map {
case ((d, m), y) => SimpleDate(y, m, d) case ((d, m), y) => SimpleDate(y, m, d)
} }
val p2 = readMonth >> readDay >> readYear map { val p2 = (readMonth >> readDay >> readYear).map {
case ((m, d), y) => SimpleDate(y, m, d) case ((m, d), y) => SimpleDate(y, m, d)
} }
@ -46,14 +52,14 @@ object DateFind {
p.read(parts).toOption p.read(parts).toOption
} }
def readYear: Reader[Int] =
def readYear: Reader[Int] = { Reader.readFirst(w =>
Reader.readFirst(w => w.value.length match { w.value.length match {
case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
case _ => None case _ => None
})
} }
)
def readMonth: Reader[Int] = def readMonth: Reader[Int] =
Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ > 0).map(_ + 1)) Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ > 0).map(_ + 1))
@ -69,10 +75,12 @@ object DateFind {
Reader(read.andThen(_.map(f))) Reader(read.andThen(_.map(f)))
def or(other: Reader[A]): Reader[A] = def or(other: Reader[A]): Reader[A] =
Reader(words => read(words) match { Reader(words =>
read(words) match {
case Result.Failure => other.read(words) case Result.Failure => other.read(words)
case s @ Result.Success(_, _) => s case s @ Result.Success(_, _) => s
}) }
)
} }
object Reader { object Reader {
@ -86,7 +94,6 @@ object DateFind {
}) })
} }
sealed trait Result[+A] { sealed trait Result[+A] {
def toOption: Option[A] def toOption: Option[A]
def map[B](f: A => B): Result[B] def map[B](f: A => B): Result[B]

View File

@ -14,7 +14,7 @@ import java.net.URL
import scala.util.Using import scala.util.Using
object StanfordNerClassifier { object StanfordNerClassifier {
private [this] val logger = getLogger private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German) lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English) lazy val englishNerClassifier = makeClassifier(Language.English)
@ -24,13 +24,18 @@ object StanfordNerClassifier {
case Language.English => englishNerClassifier case Language.English => englishNerClassifier
case Language.German => germanNerClassifier case Language.German => germanNerClassifier
} }
nerClassifier.classify(text).asScala.flatMap(a => a.asScala). nerClassifier
collect(Function.unlift(label => { .classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag.fromString(Option(tag).getOrElse("")).toOption. NerTag
map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) .fromString(Option(tag).getOrElse(""))
})). .toOption
toVector .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
} }
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
@ -48,7 +53,9 @@ object StanfordNerClassifier {
check(lang match { check(lang match {
case Language.German => case Language.German =>
getClass.getResource("/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz") getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
case Language.English => case Language.English =>
getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz") getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
}) })

View File

@ -5,11 +5,11 @@ import java.nio.file.{Path, Paths}
import docspell.common._ import docspell.common._
case class Config( case class Config(
allowedContentTypes: Set[MimeType] allowedContentTypes: Set[MimeType],
, ghostscript: Config.Ghostscript ghostscript: Config.Ghostscript,
, pageRange: Config.PageRange pageRange: Config.PageRange,
, unpaper: Config.Unpaper unpaper: Config.Unpaper,
, tesseract: Config.Tesseract tesseract: Config.Tesseract
) { ) {
def isAllowed(mt: MimeType): Boolean = def isAllowed(mt: MimeType): Boolean =
@ -22,7 +22,7 @@ object Config {
case class Command(program: String, args: Seq[String], timeout: Duration) { case class Command(program: String, args: Seq[String], timeout: Duration) {
def mapArgs(f: String => String): Command = def mapArgs(f: String => String): Command =
Command(program, args map f, timeout) Command(program, args.map(f), timeout)
def toCmd: List[String] = def toCmd: List[String] =
program :: args.toList program :: args.toList
@ -44,23 +44,23 @@ object Config {
), ),
pageRange = PageRange(10), pageRange = PageRange(10),
ghostscript = Ghostscript( ghostscript = Ghostscript(
Command("gs", Seq("-dNOPAUSE" Command(
, "-dBATCH" "gs",
, "-dSAFER" Seq(
, "-sDEVICE=tiffscaled8" "-dNOPAUSE",
, "-sOutputFile={{outfile}}" "-dBATCH",
, "{{infile}}"), "-dSAFER",
Duration.seconds(30)), "-sDEVICE=tiffscaled8",
Paths.get(System.getProperty("java.io.tmpdir")). "-sOutputFile={{outfile}}",
resolve("docspell-extraction")), "{{infile}}"
unpaper = Unpaper(Command("unpaper" ),
, Seq("{{infile}}", "{{outfile}}") Duration.seconds(30)
, Duration.seconds(30))), ),
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
),
unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
tesseract = Tesseract( tesseract = Tesseract(
Command("tesseract", Seq("{{file}}" Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
, "stdout" )
, "-l"
, "{{lang}}"),
Duration.minutes(1)))
) )
} }

View File

@ -11,71 +11,106 @@ object Ocr {
/** Extract the text of all pages in the given pdf file. /** Extract the text of all pages in the given pdf file.
*/ */
def extractPdf[F[_]: Sync: ContextShift](pdf: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] = def extractPdf[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscript(pdf, config, wd, blocker). runGhostscript(pdf, config, wd, blocker)
flatMap({ tmpImg => .flatMap({ tmpImg =>
runTesseractFile(tmpImg, blocker, lang, config) runTesseractFile(tmpImg, blocker, lang, config)
}). })
fold1(_ + "\n\n\n" + _) .fold1(_ + "\n\n\n" + _)
} }
/** Extract the text from the given image file /** Extract the text from the given image file
*/ */
def extractImage[F[_]: Sync: ContextShift](img: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] = def extractImage[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractStdin(img, blocker, lang, config) runTesseractStdin(img, blocker, lang, config)
def extractPdFFile[F[_]: Sync: ContextShift](
def extractPdFFile[F[_]: Sync: ContextShift](pdf: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] = pdf: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker). runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
flatMap({ tif => .flatMap({ tif =>
runTesseractFile(tif, blocker, lang, config) runTesseractFile(tif, blocker, lang, config)
}). })
fold1(_ + "\n\n\n" + _) .fold1(_ + "\n\n\n" + _)
} }
def extractImageFile[F[_]: Sync: ContextShift](img: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] = def extractImageFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractFile(img, blocker, lang, config) runTesseractFile(img, blocker, lang, config)
/** Run ghostscript to extract all pdf pages into tiff files. The /** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned. * files are stored to a temporary location on disk and returned.
*/ */
private[text] def runGhostscript[F[_]: Sync: ContextShift]( private[text] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte] pdf: Stream[F, Byte],
, cfg: Config cfg: Config,
, wd: Path wd: Path,
, blocker: Blocker): Stream[F, Path] = { blocker: Blocker
): Stream[F, Path] = {
val xargs = val xargs =
if (cfg.pageRange.begin > 0) s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args if (cfg.pageRange.begin > 0)
s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
else cfg.ghostscript.command.args else cfg.ghostscript.command.args
val cmd = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(Map( val cmd = cfg.ghostscript.command
.copy(args = xargs)
.mapArgs(
replace(
Map(
"{{infile}}" -> "-", "{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif" "{{outfile}}" -> "%d.tif"
))) )
SystemCommand.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf). )
evalMap({ _ => )
SystemCommand
.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd) File.listFiles(pathEndsWith(".tif"), wd)
}). })
flatMap(fs => Stream.emits(fs)) .flatMap(fs => Stream.emits(fs))
} }
/** Run ghostscript to extract all pdf pages into tiff files. The /** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned. * files are stored to a temporary location on disk and returned.
*/ */
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path pdf: Path,
, ghostscript: Config.Command ghostscript: Config.Command,
, wd: Path, blocker: Blocker): Stream[F, Path] = { wd: Path,
val cmd = ghostscript.mapArgs(replace(Map( blocker: Blocker
): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(
replace(
Map(
"{{infile}}" -> pdf.toAbsolutePath.toString, "{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif" "{{outfile}}" -> "%d.tif"
))) )
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)). )
evalMap({ _ => )
SystemCommand
.execSuccess[F](cmd, blocker, wd = Some(wd))
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd) File.listFiles(pathEndsWith(".tif"), wd)
}). })
flatMap(fs => Stream.emits(fs)) .flatMap(fs => Stream.emits(fs))
} }
private def pathEndsWith(ext: String): Path => Boolean = private def pathEndsWith(ext: String): Path => Boolean =
@ -84,58 +119,65 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The /** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned. * files are stored to a temporary location on disk and returned.
*/ */
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](img: Path private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
, unpaper: Config.Command img: Path,
, wd: Path, blocker: Blocker): Stream[F, Path] = { unpaper: Config.Command,
val targetFile = img.resolveSibling("u-"+ img.getFileName.toString).toAbsolutePath wd: Path,
val cmd = unpaper.mapArgs(replace(Map( blocker: Blocker
): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(
replace(
Map(
"{{infile}}" -> img.toAbsolutePath.toString, "{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString "{{outfile}}" -> targetFile.toString
))) )
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)). )
map(_ => targetFile). )
handleErrorWith(th => { SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
logger.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") th =>
logger
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
Stream.emit(img) Stream.emit(img)
}) }
} }
/** Run tesseract on the given image file and return the extracted /** Run tesseract on the given image file and return the extracted
* text. * text.
*/ */
private[text] def runTesseractFile[F[_]: Sync: ContextShift]( private[text] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path img: Path,
, blocker: Blocker blocker: Blocker,
, lang: String lang: String,
, config: Config): Stream[F, String] = { config: Config
): Stream[F, String] =
// tesseract cannot cope with absolute filenames // tesseract cannot cope with absolute filenames
// so use the parent as working dir // so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker). runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
flatMap(uimg => { val cmd = config.tesseract.command.mapArgs(
val cmd = config.tesseract.command.mapArgs(replace(Map( replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
"{{file}}" -> uimg.getFileName.toString )
, "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
})
} }
/** Run tesseract on the given image file and return the extracted /** Run tesseract on the given image file and return the extracted
* text. * text.
*/ */
private[text] def runTesseractStdin[F[_]: Sync: ContextShift]( private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte] img: Stream[F, Byte],
, blocker: Blocker blocker: Blocker,
, lang: String lang: String,
, config: Config): Stream[F, String] = { config: Config
val cmd = config.tesseract.command.mapArgs(replace(Map( ): Stream[F, String] = {
"{{file}}" -> "stdin" val cmd = config.tesseract.command
, "{{lang}}" -> fixLanguage(lang)))) .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout) SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
} }
private def replace(repl: Map[String, String]): String => String = private def replace(repl: Map[String, String]): String => String =
s => repl.foldLeft(s) { case (res, (k, v)) => s =>
repl.foldLeft(s) {
case (res, (k, v)) =>
res.replace(k, v) res.replace(k, v)
} }

View File

@ -16,17 +16,22 @@ object SystemCommand {
final case class Result(rc: Int, stdout: String, stderr: String) final case class Result(rc: Int, stdout: String, stderr: String)
def exec[F[_]: Sync: ContextShift]( cmd: Config.Command def exec[F[_]: Sync: ContextShift](
, blocker: Blocker cmd: Config.Command,
, wd: Option[Path] = None blocker: Blocker,
, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] = wd: Option[Path] = None,
startProcess(cmd, wd){ proc => stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
startProcess(cmd, wd) { proc =>
Stream.eval { Stream.eval {
for { for {
_ <- writeToProcess(stdin, proc, blocker) _ <- writeToProcess(stdin, proc, blocker)
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
else logger.fwarn(s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!") else
logger.fwarn(
s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
)
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("") out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("")
err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("") err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("")
@ -34,39 +39,64 @@ object SystemCommand {
} }
} }
def execSuccess[F[_]: Sync: ContextShift](cmd: Config.Command, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] = def execSuccess[F[_]: Sync: ContextShift](
cmd: Config.Command,
blocker: Blocker,
wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
exec(cmd, blocker, wd, stdin).flatMap { r => exec(cmd, blocker, wd, stdin).flatMap { r =>
if (r.rc != 0) Stream.raiseError[F](new Exception(s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}")) if (r.rc != 0)
Stream.raiseError[F](
new Exception(
s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"
)
)
else Stream.emit(r) else Stream.emit(r)
} }
private def startProcess[F[_]: Sync,A](cmd: Config.Command, wd: Option[Path])(f: Process => Stream[F,A]): Stream[F, A] = { private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])(
f: Process => Stream[F, A]
): Stream[F, A] = {
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
val proc = log *> Sync[F].delay { val proc = log *> Sync[F].delay {
val pb = new ProcessBuilder(cmd.toCmd.asJava) val pb = new ProcessBuilder(cmd.toCmd.asJava)
wd.map(_.toFile).foreach(pb.directory) wd.map(_.toFile).foreach(pb.directory)
pb.start() pb.start()
} }
Stream.bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => Stream
.bracket(proc)(p =>
logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
p.destroy() p.destroy()
}).flatMap(f) }
)
.flatMap(f)
} }
private def inputStreamToString[F[_]: Sync: ContextShift](in: InputStream, blocker: Blocker): F[String] = private def inputStreamToString[F[_]: Sync: ContextShift](
io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false). in: InputStream,
through(text.utf8Decode). blocker: Blocker
chunks. ): F[String] =
map(_.toVector.mkString). io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false)
fold1(_ + _). .through(text.utf8Decode)
compile.last. .chunks
map(_.getOrElse("")) .map(_.toVector.mkString)
.fold1(_ + _)
.compile
.last
.map(_.getOrElse(""))
private def writeToProcess[F[_]: Sync: ContextShift](data: Stream[F, Byte], proc: Process, blocker: Blocker): F[Unit] = private def writeToProcess[F[_]: Sync: ContextShift](
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)). data: Stream[F, Byte],
compile.drain proc: Process,
blocker: Blocker
): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] = private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] =
Sync[F].delay(proc.destroyForcibly()).attempt *> { Sync[F].delay(proc.destroyForcibly()).attempt *> {
Sync[F].raiseError(new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")) Sync[F].raiseError(
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")
)
} }
} }

View File

@ -12,18 +12,17 @@ object TikaMimetype {
private val tika = new TikaConfig().getDetector private val tika = new TikaConfig().getDetector
private def convert(mt: MediaType): MimeType = private def convert(mt: MediaType): MimeType =
Option(mt).map(_.toString). Option(mt)
map(MimeType.parse). .map(_.toString)
flatMap(_.toOption). .map(MimeType.parse)
map(normalize). .flatMap(_.toOption)
getOrElse(MimeType.octetStream) .map(normalize)
.getOrElse(MimeType.octetStream)
private def makeMetadata(hint: MimeTypeHint): Metadata = { private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata val md = new Metadata
hint.filename. hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _)) hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
hint.advertised.
foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md md
} }
@ -33,13 +32,10 @@ object TikaMimetype {
case _ => in case _ => in
} }
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = { private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
}
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] = def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
data.take(1024). data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
compile.toVector.
map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
} }