From 5c37efeaba43ad565dc92e364b2187027d09d8db Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sun, 9 Feb 2020 01:54:11 +0100 Subject: [PATCH] Apply scalafmt to all files --- .../scala/docspell/backend/BackendApp.scala | 4 +- .../scala/docspell/backend/ops/OItem.scala | 2 +- .../scala/docspell/backend/ops/OMail.scala | 30 +-- .../scala/docspell/common/AccountId.scala | 6 +- .../docspell/common/BaseJsonCodecs.scala | 1 - .../docspell/common/CollectiveState.scala | 19 +- .../scala/docspell/common/ContactKind.scala | 23 +-- .../main/scala/docspell/common/Duration.scala | 2 +- .../main/scala/docspell/common/JobState.scala | 38 ++-- .../scala/docspell/common/LenientUri.scala | 19 +- .../docspell/common/MetaProposalList.scala | 27 +-- .../docspell/common/MetaProposalType.scala | 18 +- .../main/scala/docspell/common/NerTag.scala | 27 ++- .../common/pureconfig/Implicits.scala | 10 +- .../docspell/common/syntax/package.scala | 5 +- .../restserver/http4s/QueryParam.scala | 1 - .../restserver/routes/ItemRoutes.scala | 3 +- .../restserver/routes/MailSendRoutes.scala | 12 +- .../routes/MailSettingsRoutes.scala | 40 ++-- .../restserver/routes/SentMailRoutes.scala | 4 +- .../restserver/webapp/TemplateRoutes.scala | 2 +- .../scala/docspell/store/queries/QItem.scala | 5 +- .../docspell/store/records/RFileMeta.scala | 10 +- .../docspell/store/records/RSentMail.scala | 12 +- .../scala/docspell/text/contact/Contact.scala | 38 ++-- .../scala/docspell/text/contact/Tld.scala | 2 +- .../scala/docspell/text/date/DateFind.scala | 67 +++--- .../text/nlp/StanfordNerClassifier.scala | 27 ++- .../main/scala/docspell/text/ocr/Config.scala | 46 ++--- .../main/scala/docspell/text/ocr/Ocr.scala | 190 +++++++++++------- .../docspell/text/ocr/SystemCommand.scala | 90 ++++++--- .../docspell/text/ocr/TikaMimetype.scala | 24 +-- 32 files changed, 442 insertions(+), 362 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala index 59982761..18e7ef37 100644 --- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala +++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala @@ -32,8 +32,8 @@ object BackendApp { def create[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, store: Store[F], - httpClientEc: ExecutionContext, - blocker: Blocker + httpClientEc: ExecutionContext, + blocker: Blocker ): Resource[F, BackendApp[F]] = for { queue <- JobQueue(store) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index bcd9c8bc..aec36e4f 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -176,7 +176,7 @@ object OItem { def findByFileSource(checksum: String, sourceId: Ident): F[Vector[RItem]] = store.transact((for { - coll <- OptionT(RSource.findCollective(sourceId)) + coll <- OptionT(RSource.findCollective(sourceId)) items <- OptionT.liftF(QItem.findByChecksum(checksum, coll)) } yield items).getOrElse(Vector.empty)) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala b/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala index ee7b775f..432ce947 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OMail.scala @@ -113,10 +113,10 @@ object OMail { def createSettings(accId: AccountId, s: SmtpSettings): F[AddResult] = (for { - ru <- OptionT(store.transact(s.toRecord(accId).value)) + ru <- OptionT(store.transact(s.toRecord(accId).value)) ins = RUserEmail.insert(ru) exists = RUserEmail.exists(ru.uid, ru.name) - res <- OptionT.liftF(store.add(ins, exists)) + res <- OptionT.liftF(store.add(ins, exists)) } yield res).getOrElse(AddResult.Failure(new Exception("User not found"))) def updateSettings(accId: AccountId, name: Ident, data: SmtpSettings): F[Int] = { @@ -143,8 +143,10 @@ object OMail { for { _ <- OptionT.liftF(store.transact(RItem.existsById(m.item))).filter(identity) ras <- OptionT.liftF( - store.transact(RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective)) - ) + store.transact( + RAttachment.findByItemAndCollectiveWithMeta(m.item, accId.collective) + ) + ) } yield { val addAttach = m.attach.filter(ras).map { a => Attach[F](Stream.emit(a._2).through(store.bitpeace.fetchData2(RangeDef.all))) @@ -169,15 +171,15 @@ object OMail { def storeMail(msgId: String, cfg: RUserEmail): F[Either[SendResult, Ident]] = { val save = for { data <- RSentMail.forItem( - m.item, - accId, - msgId, - cfg.mailFrom, - name, - m.subject, - m.recipients, - m.body - ) + m.item, + accId, + msgId, + cfg.mailFrom, + name, + m.subject, + m.recipients, + m.body + ) _ <- OptionT.liftF(RSentMail.insert(data._1)) _ <- OptionT.liftF(RSentMailItem.insert(data._2)) } yield data._1.id @@ -195,7 +197,7 @@ object OMail { mail <- createMail(mailCfg) mid <- OptionT.liftF(sendMail(mailCfg.toMailConfig, mail)) res <- mid.traverse(id => OptionT.liftF(storeMail(id, mailCfg))) - conv = res.fold(identity, _.fold(identity, id => SendResult.Success(id))) + conv = res.fold(identity, _.fold(identity, id => SendResult.Success(id))) } yield conv).getOrElse(SendResult.NotFound) } diff --git a/modules/common/src/main/scala/docspell/common/AccountId.scala b/modules/common/src/main/scala/docspell/common/AccountId.scala index 1618bf5e..d8aa2db7 100644 --- a/modules/common/src/main/scala/docspell/common/AccountId.scala +++ b/modules/common/src/main/scala/docspell/common/AccountId.scala @@ -19,9 +19,9 @@ object AccountId { case n if n > 0 && input.length > 2 => val coll = input.substring(0, n) val user = input.substring(n + 1) - Ident.fromString(coll). - flatMap(collId => Ident.fromString(user). - map(userId => AccountId(collId, userId))) + Ident + .fromString(coll) + .flatMap(collId => Ident.fromString(user).map(userId => AccountId(collId, userId))) case _ => invalid } diff --git a/modules/common/src/main/scala/docspell/common/BaseJsonCodecs.scala b/modules/common/src/main/scala/docspell/common/BaseJsonCodecs.scala index 4967e661..65a41e16 100644 --- a/modules/common/src/main/scala/docspell/common/BaseJsonCodecs.scala +++ b/modules/common/src/main/scala/docspell/common/BaseJsonCodecs.scala @@ -12,5 +12,4 @@ object BaseJsonCodecs { implicit val decodeInstantEpoch: Decoder[Instant] = Decoder.decodeLong.map(Instant.ofEpochMilli) - } diff --git a/modules/common/src/main/scala/docspell/common/CollectiveState.scala b/modules/common/src/main/scala/docspell/common/CollectiveState.scala index 00e7fed1..35dfa40e 100644 --- a/modules/common/src/main/scala/docspell/common/CollectiveState.scala +++ b/modules/common/src/main/scala/docspell/common/CollectiveState.scala @@ -21,32 +21,29 @@ object CollectiveState { * action. */ case object Blocked extends CollectiveState - def fromString(s: String): Either[String, CollectiveState] = s.toLowerCase match { - case "active" => Right(Active) + case "active" => Right(Active) case "readonly" => Right(ReadOnly) - case "closed" => Right(Closed) - case "blocked" => Right(Blocked) - case _ => Left(s"Unknown state: $s") + case "closed" => Right(Closed) + case "blocked" => Right(Blocked) + case _ => Left(s"Unknown state: $s") } def unsafe(str: String): CollectiveState = fromString(str).fold(sys.error, identity) def asString(state: CollectiveState): String = state match { - case Active => "active" - case Blocked => "blocked" - case Closed => "closed" + case Active => "active" + case Blocked => "blocked" + case Closed => "closed" case ReadOnly => "readonly" } - - implicit val collectiveStateEncoder: Encoder[CollectiveState] = Encoder.encodeString.contramap(CollectiveState.asString) implicit val collectiveStateDecoder: Decoder[CollectiveState] = Decoder.decodeString.emap(CollectiveState.fromString) -} \ No newline at end of file +} diff --git a/modules/common/src/main/scala/docspell/common/ContactKind.scala b/modules/common/src/main/scala/docspell/common/ContactKind.scala index 54ed6958..e8a97ce7 100644 --- a/modules/common/src/main/scala/docspell/common/ContactKind.scala +++ b/modules/common/src/main/scala/docspell/common/ContactKind.scala @@ -10,22 +10,22 @@ sealed trait ContactKind { self: Product => object ContactKind { val all = List() - case object Phone extends ContactKind - case object Mobile extends ContactKind - case object Fax extends ContactKind - case object Email extends ContactKind + case object Phone extends ContactKind + case object Mobile extends ContactKind + case object Fax extends ContactKind + case object Email extends ContactKind case object Docspell extends ContactKind - case object Website extends ContactKind + case object Website extends ContactKind def fromString(s: String): Either[String, ContactKind] = s.toLowerCase match { - case "phone" => Right(Phone) - case "mobile" => Right(Mobile) - case "fax" => Right(Fax) - case "email" => Right(Email) + case "phone" => Right(Phone) + case "mobile" => Right(Mobile) + case "fax" => Right(Fax) + case "email" => Right(Email) case "docspell" => Right(Docspell) - case "website" => Right(Website) - case _ => Left(s"Not a state value: $s") + case "website" => Right(Website) + case _ => Left(s"Not a state value: $s") } def unsafe(str: String): ContactKind = @@ -34,7 +34,6 @@ object ContactKind { def asString(s: ContactKind): String = s.asString.toLowerCase - implicit val contactKindEncoder: Encoder[ContactKind] = Encoder.encodeString.contramap(_.asString) diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index 16b0c3e8..2e4efb85 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -49,6 +49,6 @@ object Duration { def stopTime[F[_]: Sync]: F[F[Duration]] = for { now <- Timestamp.current[F] - end = Timestamp.current[F] + end = Timestamp.current[F] } yield end.map(e => Duration.millis(e.toMillis - now.toMillis)) } diff --git a/modules/common/src/main/scala/docspell/common/JobState.scala b/modules/common/src/main/scala/docspell/common/JobState.scala index e274546a..395134bc 100644 --- a/modules/common/src/main/scala/docspell/common/JobState.scala +++ b/modules/common/src/main/scala/docspell/common/JobState.scala @@ -10,48 +10,41 @@ sealed trait JobState { self: Product => object JobState { /** Waiting for being executed. */ - case object Waiting extends JobState { - } + case object Waiting extends JobState {} /** A scheduler has picked up this job and will pass it to the next * free slot. */ - case object Scheduled extends JobState { - } + case object Scheduled extends JobState {} /** Is currently executing */ - case object Running extends JobState { - } + case object Running extends JobState {} /** Finished with failure and is being retried. */ - case object Stuck extends JobState { - } + case object Stuck extends JobState {} /** Finished finally with a failure */ - case object Failed extends JobState { - } + case object Failed extends JobState {} /** Finished by cancellation. */ - case object Cancelled extends JobState { - } + case object Cancelled extends JobState {} /** Finished with success */ - case object Success extends JobState { - } + case object Success extends JobState {} - val all: Set[JobState] = Set(Waiting, Scheduled, Running, Stuck, Failed, Cancelled, Success) + val all: Set[JobState] = Set(Waiting, Scheduled, Running, Stuck, Failed, Cancelled, Success) val queued: Set[JobState] = Set(Waiting, Scheduled, Stuck) - val done: Set[JobState] = Set(Failed, Cancelled, Success) + val done: Set[JobState] = Set(Failed, Cancelled, Success) def parse(str: String): Either[String, JobState] = str.toLowerCase match { - case "waiting" => Right(Waiting) + case "waiting" => Right(Waiting) case "scheduled" => Right(Scheduled) - case "running" => Right(Running) - case "stuck" => Right(Stuck) - case "failed" => Right(Failed) + case "running" => Right(Running) + case "stuck" => Right(Stuck) + case "failed" => Right(Failed) case "cancelled" => Right(Cancelled) - case "success" => Right(Success) - case _ => Left(s"Not a job state: $str") + case "success" => Right(Success) + case _ => Left(s"Not a job state: $str") } def unsafe(str: String): JobState = @@ -60,7 +53,6 @@ object JobState { def asString(state: JobState): String = state.name - implicit val jobStateEncoder: Encoder[JobState] = Encoder.encodeString.contramap(_.name) diff --git a/modules/common/src/main/scala/docspell/common/LenientUri.scala b/modules/common/src/main/scala/docspell/common/LenientUri.scala index 7a249f58..03632818 100644 --- a/modules/common/src/main/scala/docspell/common/LenientUri.scala +++ b/modules/common/src/main/scala/docspell/common/LenientUri.scala @@ -51,8 +51,8 @@ case class LenientUri( def open[F[_]: Sync]: Either[String, Resource[F, HttpURLConnection]] = toJavaUrl.map { url => Resource - .make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))( - conn => Sync[F].delay(conn.disconnect()) + .make(Sync[F].delay(url.openConnection().asInstanceOf[HttpURLConnection]))(conn => + Sync[F].delay(conn.disconnect()) ) } @@ -61,17 +61,16 @@ case class LenientUri( .emit(Either.catchNonFatal(new URL(asString))) .covary[F] .rethrow - .flatMap( - url => fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true) + .flatMap(url => + fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true) ) def host: Option[String] = - authority.map( - a => - a.indexOf(':') match { - case -1 => a - case n => a.substring(0, n) - } + authority.map(a => + a.indexOf(':') match { + case -1 => a + case n => a.substring(0, n) + } ) def asString: String = { diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index c2343aa0..fb4598ac 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -8,13 +8,11 @@ import io.circe.generic.semiauto._ case class MetaProposalList private (proposals: List[MetaProposal]) { - def isEmpty: Boolean = proposals.isEmpty + def isEmpty: Boolean = proposals.isEmpty def nonEmpty: Boolean = proposals.nonEmpty - def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean = { - (mts :+ mt).map(mtp => proposals.exists(_.proposalType == mtp)). - reduce(_ && _) - } + def hasResults(mt: MetaProposalType, mts: MetaProposalType*): Boolean = + (mts :+ mt).map(mtp => proposals.exists(_.proposalType == mtp)).reduce(_ && _) def hasResultsAll: Boolean = proposals.map(_.proposalType).toSet == MetaProposalType.all.toSet @@ -23,7 +21,7 @@ case class MetaProposalList private (proposals: List[MetaProposal]) { proposals.foldLeft(Set.empty[MetaProposalType])(_ + _.proposalType) def fillEmptyFrom(ml: MetaProposalList): MetaProposalList = { - val list = ml.proposals.foldLeft(proposals){ (mine, mp) => + val list = ml.proposals.foldLeft(proposals) { (mine, mp) => if (hasResults(mp.proposalType)) mine else mp :: mine } @@ -48,21 +46,24 @@ object MetaProposalList { fromSeq1(mt, refs.map(ref => Candidate(ref, Set(label)))) def fromSeq1(mt: MetaProposalType, refs: Seq[Candidate]): MetaProposalList = - NonEmptyList.fromList(refs.toList). - map(nl => MetaProposalList.of(MetaProposal(mt, nl))). - getOrElse(empty) + NonEmptyList + .fromList(refs.toList) + .map(nl => MetaProposalList.of(MetaProposal(mt, nl))) + .getOrElse(empty) - def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = { + def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) })) - } def flatten(ml: Seq[MetaProposalList]): MetaProposalList = { val init: Map[MetaProposalType, MetaProposal] = Map.empty - def updateMap(map: Map[MetaProposalType, MetaProposal], mp: MetaProposal): Map[MetaProposalType, MetaProposal] = + def updateMap( + map: Map[MetaProposalType, MetaProposal], + mp: MetaProposal + ): Map[MetaProposalType, MetaProposal] = map.get(mp.proposalType) match { case Some(mp0) => map.updated(mp.proposalType, mp0.addIdRef(mp.values.toList)) - case None => map.updated(mp.proposalType, mp) + case None => map.updated(mp.proposalType, mp) } val merged = ml.foldLeft(init) { (map, el) => diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalType.scala b/modules/common/src/main/scala/docspell/common/MetaProposalType.scala index 93dfb168..89504896 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalType.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalType.scala @@ -10,25 +10,25 @@ sealed trait MetaProposalType { self: Product => object MetaProposalType { - case object CorrOrg extends MetaProposalType + case object CorrOrg extends MetaProposalType case object CorrPerson extends MetaProposalType case object ConcPerson extends MetaProposalType - case object ConcEquip extends MetaProposalType - case object DocDate extends MetaProposalType - case object DueDate extends MetaProposalType + case object ConcEquip extends MetaProposalType + case object DocDate extends MetaProposalType + case object DueDate extends MetaProposalType val all: List[MetaProposalType] = List(CorrOrg, CorrPerson, ConcPerson, ConcEquip) def fromString(str: String): Either[String, MetaProposalType] = str.toLowerCase match { - case "corrorg" => Right(CorrOrg) + case "corrorg" => Right(CorrOrg) case "corrperson" => Right(CorrPerson) case "concperson" => Right(ConcPerson) - case "concequip" => Right(ConcEquip) - case "docdate" => Right(DocDate) - case "duedate" => Right(DueDate) - case _ => Left(s"Invalid item-proposal-type: $str") + case "concequip" => Right(ConcEquip) + case "docdate" => Right(DocDate) + case "duedate" => Right(DueDate) + case _ => Left(s"Invalid item-proposal-type: $str") } def unsafe(str: String): MetaProposalType = diff --git a/modules/common/src/main/scala/docspell/common/NerTag.scala b/modules/common/src/main/scala/docspell/common/NerTag.scala index 39413ccc..e2ad2ca0 100644 --- a/modules/common/src/main/scala/docspell/common/NerTag.scala +++ b/modules/common/src/main/scala/docspell/common/NerTag.scala @@ -11,31 +11,30 @@ sealed trait NerTag { self: Product => object NerTag { case object Organization extends NerTag - case object Person extends NerTag - case object Location extends NerTag - case object Misc extends NerTag - case object Email extends NerTag - case object Website extends NerTag - case object Date extends NerTag + case object Person extends NerTag + case object Location extends NerTag + case object Misc extends NerTag + case object Email extends NerTag + case object Website extends NerTag + case object Date extends NerTag val all: List[NerTag] = List(Organization, Person, Location) def fromString(str: String): Either[String, NerTag] = str.toLowerCase match { case "organization" => Right(Organization) - case "person" => Right(Person) - case "location" => Right(Location) - case "misc" => Right(Misc) - case "email" => Right(Email) - case "website" => Right(Website) - case "date" => Right(Date) - case _ => Left(s"Invalid ner tag: $str") + case "person" => Right(Person) + case "location" => Right(Location) + case "misc" => Right(Misc) + case "email" => Right(Email) + case "website" => Right(Website) + case "date" => Right(Date) + case _ => Left(s"Invalid ner tag: $str") } def unsafe(str: String): NerTag = fromString(str).fold(sys.error, identity) - implicit val jsonDecoder: Decoder[NerTag] = Decoder.decodeString.emap(fromString) implicit val jsonEncoder: Encoder[NerTag] = diff --git a/modules/common/src/main/scala/docspell/common/pureconfig/Implicits.scala b/modules/common/src/main/scala/docspell/common/pureconfig/Implicits.scala index b2c1452d..7fc880ab 100644 --- a/modules/common/src/main/scala/docspell/common/pureconfig/Implicits.scala +++ b/modules/common/src/main/scala/docspell/common/pureconfig/Implicits.scala @@ -24,12 +24,14 @@ object Implicits { ConfigReader[String].emap(reason(Ident.fromString)) implicit val byteVectorReader: ConfigReader[ByteVector] = - ConfigReader[String].emap(reason(str => { + ConfigReader[String].emap(reason { str => if (str.startsWith("hex:")) ByteVector.fromHex(str.drop(4)).toRight("Invalid hex value.") - else if (str.startsWith("b64:")) ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.") + else if (str.startsWith("b64:")) + ByteVector.fromBase64(str.drop(4)).toRight("Invalid Base64 string.") else ByteVector.encodeUtf8(str).left.map(ex => s"Invalid utf8 string: ${ex.getMessage}") - })) + }) def reason[A: ClassTag](f: String => Either[String, A]): String => Either[FailureReason, A] = - in => f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str)) + in => + f(in).left.map(str => CannotConvert(in, implicitly[ClassTag[A]].runtimeClass.toString, str)) } diff --git a/modules/common/src/main/scala/docspell/common/syntax/package.scala b/modules/common/src/main/scala/docspell/common/syntax/package.scala index af61799d..77e17039 100644 --- a/modules/common/src/main/scala/docspell/common/syntax/package.scala +++ b/modules/common/src/main/scala/docspell/common/syntax/package.scala @@ -2,9 +2,6 @@ package docspell.common package object syntax { - object all extends EitherSyntax - with StreamSyntax - with StringSyntax - with LoggerSyntax + object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax } diff --git a/modules/restserver/src/main/scala/docspell/restserver/http4s/QueryParam.scala b/modules/restserver/src/main/scala/docspell/restserver/http4s/QueryParam.scala index 4e8198ea..33d506bf 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/http4s/QueryParam.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/http4s/QueryParam.scala @@ -16,7 +16,6 @@ object QueryParam { implicit val queryStringDecoder: QueryParamDecoder[QueryString] = QueryParamDecoder[String].map(s => QueryString(s.trim.toLowerCase)) - // implicit val booleanDecoder: QueryParamDecoder[Boolean] = // QueryParamDecoder.fromUnsafeCast(qp => Option(qp.value).exists(_.equalsIgnoreCase("true")))( // "Boolean" diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala index 7aee0ea2..3b41aa0a 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/ItemRoutes.scala @@ -139,8 +139,7 @@ object ItemRoutes { } } - - final implicit class OptionString(opt: Option[String]) { + implicit final class OptionString(opt: Option[String]) { def notEmpty: Option[String] = opt.map(_.trim).filter(_.nonEmpty) } diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/MailSendRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/MailSendRoutes.scala index 3d7a08e3..bc0d2a94 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/MailSendRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/MailSendRoutes.scala @@ -24,13 +24,13 @@ object MailSendRoutes { HttpRoutes.of { case req @ POST -> Root / Ident(name) / Ident(id) => for { - in <- req.as[SimpleMail] + in <- req.as[SimpleMail] mail = convertIn(id, in) - res <- mail.traverse(m => backend.mail.sendMail(user.account, name, m)) + res <- mail.traverse(m => backend.mail.sendMail(user.account, name, m)) resp <- res.fold( - err => Ok(BasicResult(false, s"Invalid mail data: $err")), - res => Ok(convertOut(res)) - ) + err => Ok(BasicResult(false, s"Invalid mail data: $err")), + res => Ok(convertOut(res)) + ) } yield resp } } @@ -39,7 +39,7 @@ object MailSendRoutes { for { rec <- s.recipients.traverse(EmilUtil.readMailAddress) fileIds <- s.attachmentIds.traverse(Ident.fromString) - sel = if (s.addAllAttachments) AttachSelection.All else AttachSelection.Selected(fileIds) + sel = if (s.addAllAttachments) AttachSelection.All else AttachSelection.Selected(fileIds) } yield ItemMail(item, s.subject, rec, s.body, sel) def convertOut(res: SendResult): BasicResult = diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/MailSettingsRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/MailSettingsRoutes.scala index 3586e9f0..29ba7822 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/MailSettingsRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/MailSettingsRoutes.scala @@ -29,7 +29,7 @@ object MailSettingsRoutes { case GET -> Root :? QueryParam.QueryOpt(q) => for { list <- backend.mail.getSettings(user.account, q.map(_.q)) - res = list.map(convert) + res = list.map(convert) resp <- Ok(EmailSettingsList(res.toList)) } yield resp @@ -45,13 +45,13 @@ object MailSettingsRoutes { ru = makeSettings(in) up <- OptionT.liftF(ru.traverse(r => backend.mail.createSettings(user.account, r))) resp <- OptionT.liftF( - Ok( - up.fold( - err => BasicResult(false, err), - ar => Conversions.basicResult(ar, "Mail settings stored.") - ) - ) - ) + Ok( + up.fold( + err => BasicResult(false, err), + ar => Conversions.basicResult(ar, "Mail settings stored.") + ) + ) + ) } yield resp).getOrElseF(NotFound()) case req @ PUT -> Root / Ident(name) => @@ -60,24 +60,24 @@ object MailSettingsRoutes { ru = makeSettings(in) up <- OptionT.liftF(ru.traverse(r => backend.mail.updateSettings(user.account, name, r))) resp <- OptionT.liftF( - Ok( - up.fold( - err => BasicResult(false, err), - n => - if (n > 0) BasicResult(true, "Mail settings stored.") - else BasicResult(false, "Mail settings could not be saved") - ) - ) - ) + Ok( + up.fold( + err => BasicResult(false, err), + n => + if (n > 0) BasicResult(true, "Mail settings stored.") + else BasicResult(false, "Mail settings could not be saved") + ) + ) + ) } yield resp).getOrElseF(NotFound()) case DELETE -> Root / Ident(name) => for { n <- backend.mail.deleteSettings(user.account, name) resp <- Ok( - if (n > 0) BasicResult(true, "Mail settings removed") - else BasicResult(false, "Mail settings could not be removed") - ) + if (n > 0) BasicResult(true, "Mail settings removed") + else BasicResult(false, "Mail settings could not be removed") + ) } yield resp } diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/SentMailRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/SentMailRoutes.scala index 01f22c45..593b3895 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/SentMailRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/SentMailRoutes.scala @@ -23,7 +23,7 @@ object SentMailRoutes { HttpRoutes.of { case GET -> Root / "item" / Ident(id) => for { - all <- backend.mail.getSentMailsForItem(user.account, id) + all <- backend.mail.getSentMailsForItem(user.account, id) resp <- Ok(SentMails(all.map(convert).toList)) } yield resp @@ -35,7 +35,7 @@ object SentMailRoutes { case DELETE -> Root / "mail" / Ident(mailId) => for { - n <- backend.mail.deleteSentMail(user.account, mailId) + n <- backend.mail.deleteSentMail(user.account, mailId) resp <- Ok(BasicResult(n > 0, s"Mails deleted: $n")) } yield resp } diff --git a/modules/restserver/src/main/scala/docspell/restserver/webapp/TemplateRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/webapp/TemplateRoutes.scala index e731496f..2e7df381 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/webapp/TemplateRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/webapp/TemplateRoutes.scala @@ -37,7 +37,7 @@ object TemplateRoutes { new InnerRoutes[F] { def doc = HttpRoutes.of[F] { - case GET -> Root => + case GET -> Root => for { templ <- docTemplate resp <- Ok(DocData().render(templ), `Content-Type`(`text/html`)) diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 5d0b7727..1927c01d 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -194,8 +194,9 @@ object QItem { IC.cid.prefix("i").is(q.collective), IC.state.prefix("i").isOneOf(q.states), IC.incoming.prefix("i").isOrDiscard(q.direction), - name.map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n))). - getOrElse(Fragment.empty), + name + .map(n => or(IC.name.prefix("i").lowerLike(n), IC.notes.prefix("i").lowerLike(n))) + .getOrElse(Fragment.empty), RPerson.Columns.pid.prefix("p0").isOrDiscard(q.corrPerson), ROrganization.Columns.oid.prefix("o0").isOrDiscard(q.corrOrg), RPerson.Columns.pid.prefix("p1").isOrDiscard(q.concPerson), diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala index e6f206e5..daa81029 100644 --- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala @@ -8,12 +8,12 @@ object RFileMeta { val table = fr"filemeta" object Columns { - val id = Column("id") + val id = Column("id") val timestamp = Column("timestamp") - val mimetype = Column("mimetype") - val length = Column("length") - val checksum = Column("checksum") - val chunks = Column("chunks") + val mimetype = Column("mimetype") + val length = Column("length") + val checksum = Column("checksum") + val chunks = Column("chunks") val chunksize = Column("chunksize") val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize) diff --git a/modules/store/src/main/scala/docspell/store/records/RSentMail.scala b/modules/store/src/main/scala/docspell/store/records/RSentMail.scala index a0679b20..cb50e18d 100644 --- a/modules/store/src/main/scala/docspell/store/records/RSentMail.scala +++ b/modules/store/src/main/scala/docspell/store/records/RSentMail.scala @@ -52,8 +52,16 @@ object RSentMail { for { user <- OptionT(RUser.findByAccount(accId)) sm <- OptionT.liftF( - RSentMail[ConnectionIO](user.uid, messageId, sender, connName, subject, recipients, body) - ) + RSentMail[ConnectionIO]( + user.uid, + messageId, + sender, + connName, + subject, + recipients, + body + ) + ) si <- OptionT.liftF(RSentMailItem[ConnectionIO](itemId, sm.id, Some(sm.created))) } yield (sm, si) diff --git a/modules/text/src/main/scala/docspell/text/contact/Contact.scala b/modules/text/src/main/scala/docspell/text/contact/Contact.scala index 8ad7829d..f1e5b480 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Contact.scala +++ b/modules/text/src/main/scala/docspell/text/contact/Contact.scala @@ -9,43 +9,47 @@ object Contact { private[this] val protocols = Set("ftp", "http", "https") def annotate(text: String): Vector[NerLabel] = - TextSplitter.splitToken[Nothing](text, " \t\r\n".toSet). - map({ token => - if (isEmailAddress(token.value)) NerLabel(token.value, NerTag.Email, token.begin, token.end).some - else if (isWebsite(token.value)) NerLabel(token.value, NerTag.Website, token.begin, token.end).some + TextSplitter + .splitToken[Nothing](text, " \t\r\n".toSet) + .map({ token => + if (isEmailAddress(token.value)) + NerLabel(token.value, NerTag.Email, token.begin, token.end).some + else if (isWebsite(token.value)) + NerLabel(token.value, NerTag.Website, token.begin, token.end).some else None - }). - flatMap(_.map(Stream.emit).getOrElse(Stream.empty)). - toVector - + }) + .flatMap(_.map(Stream.emit).getOrElse(Stream.empty)) + .toVector def isEmailAddress(str: String): Boolean = { val atIdx = str.indexOf('@') if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false else { val name = str.substring(0, atIdx) - val dom = str.substring(atIdx + 1) + val dom = str.substring(atIdx + 1) Domain.isDomain(dom) && name.forall(c => !c.isWhitespace) } } def isWebsite(str: String): Boolean = - LenientUri.parse(str). - toOption. - map(uri => protocols.contains(uri.scheme.head)). - getOrElse(Domain.isDomain(str)) + LenientUri + .parse(str) + .toOption + .map(uri => protocols.contains(uri.scheme.head)) + .getOrElse(Domain.isDomain(str)) def isDocspellOpenUpload(str: String): Boolean = { def isUploadPath(p: LenientUri.Path): Boolean = p match { - case LenientUri.RootPath => false + case LenientUri.RootPath => false case LenientUri.EmptyPath => false case LenientUri.NonEmptyPath(segs) => Ident.fromString(segs.last).isRight && segs.init.takeRight(3) == List("open", "upload", "item") } - LenientUri.parse(str). - toOption. - exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path)) + LenientUri + .parse(str) + .toOption + .exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path)) } } diff --git a/modules/text/src/main/scala/docspell/text/contact/Tld.scala b/modules/text/src/main/scala/docspell/text/contact/Tld.scala index f8caa6b3..af7cae07 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Tld.scala +++ b/modules/text/src/main/scala/docspell/text/contact/Tld.scala @@ -11,7 +11,7 @@ private[text] object Tld { /** * Some selected TLDs. */ - private [this] val known = List( + private[this] val known = List( ".com", ".org", ".net", diff --git a/modules/text/src/main/scala/docspell/text/date/DateFind.scala b/modules/text/src/main/scala/docspell/text/date/DateFind.scala index ff011e7f..79f956ec 100644 --- a/modules/text/src/main/scala/docspell/text/date/DateFind.scala +++ b/modules/text/src/main/scala/docspell/text/date/DateFind.scala @@ -10,16 +10,22 @@ import scala.util.Try object DateFind { - def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = { - TextSplitter.splitToken(text, " \t.,\n\r/".toSet). - sliding(3). - filter(_.length == 3). - map(q => SimpleDate.fromParts(q.toList, lang). - map(sd => NerDateLabel(sd.toLocalDate, - NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)))). - collect({ case Some(d) => d }) - } - + def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = + TextSplitter + .splitToken(text, " \t.,\n\r/".toSet) + .sliding(3) + .filter(_.length == 3) + .map(q => + SimpleDate + .fromParts(q.toList, lang) + .map(sd => + NerDateLabel( + sd.toLocalDate, + NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end) + ) + ) + ) + .collect({ case Some(d) => d }) private case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = @@ -27,13 +33,13 @@ object DateFind { } private object SimpleDate { - val p0 = readYear >> readMonth >> readDay map { + val p0 = (readYear >> readMonth >> readDay).map { case ((y, m), d) => SimpleDate(y, m, d) } - val p1 = readDay >> readMonth >> readYear map { + val p1 = (readDay >> readMonth >> readYear).map { case ((d, m), y) => SimpleDate(y, m, d) } - val p2 = readMonth >> readDay >> readYear map { + val p2 = (readMonth >> readDay >> readYear).map { case ((m, d), y) => SimpleDate(y, m, d) } @@ -46,14 +52,14 @@ object DateFind { p.read(parts).toOption } - - def readYear: Reader[Int] = { - Reader.readFirst(w => w.value.length match { - case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption - case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption - case _ => None - }) - } + def readYear: Reader[Int] = + Reader.readFirst(w => + w.value.length match { + case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption + case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption + case _ => None + } + ) def readMonth: Reader[Int] = Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ > 0).map(_ + 1)) @@ -69,10 +75,12 @@ object DateFind { Reader(read.andThen(_.map(f))) def or(other: Reader[A]): Reader[A] = - Reader(words => read(words) match { - case Result.Failure => other.read(words) - case s @ Result.Success(_, _) => s - }) + Reader(words => + read(words) match { + case Result.Failure => other.read(words) + case s @ Result.Success(_, _) => s + } + ) } object Reader { @@ -81,12 +89,11 @@ object DateFind { def readFirst[A](f: Word => Option[A]): Reader[A] = Reader({ - case Nil => Result.Failure + case Nil => Result.Failure case a :: as => f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure) }) } - sealed trait Result[+A] { def toOption: Option[A] def map[B](f: A => B): Result[B] @@ -95,14 +102,14 @@ object DateFind { object Result { final case class Success[A](value: A, rest: List[Word]) extends Result[A] { - val toOption = Some(value) + val toOption = Some(value) def map[B](f: A => B): Result[B] = Success(f(value), rest) def next[B](r: Reader[B]): Result[(A, B)] = r.read(rest).map(b => (value, b)) } final case object Failure extends Result[Nothing] { - val toOption = None - def map[B](f: Nothing => B): Result[B] = this + val toOption = None + def map[B](f: Nothing => B): Result[B] = this def next[B](r: Reader[B]): Result[(Nothing, B)] = this } } diff --git a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala b/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala index 5d825541..084d9dc4 100644 --- a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala +++ b/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala @@ -14,23 +14,28 @@ import java.net.URL import scala.util.Using object StanfordNerClassifier { - private [this] val logger = getLogger + private[this] val logger = getLogger - lazy val germanNerClassifier = makeClassifier(Language.German) + lazy val germanNerClassifier = makeClassifier(Language.German) lazy val englishNerClassifier = makeClassifier(Language.English) def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = { val nerClassifier = lang match { case Language.English => englishNerClassifier - case Language.German => germanNerClassifier + case Language.German => germanNerClassifier } - nerClassifier.classify(text).asScala.flatMap(a => a.asScala). - collect(Function.unlift(label => { + nerClassifier + .classify(text) + .asScala + .flatMap(a => a.asScala) + .collect(Function.unlift { label => val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation]) - NerTag.fromString(Option(tag).getOrElse("")).toOption. - map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) - })). - toVector + NerTag + .fromString(Option(tag).getOrElse("")) + .toOption + .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition())) + }) + .toVector } private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = { @@ -48,7 +53,9 @@ object StanfordNerClassifier { check(lang match { case Language.German => - getClass.getResource("/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz") + getClass.getResource( + "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz" + ) case Language.English => getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz") }) diff --git a/modules/text/src/main/scala/docspell/text/ocr/Config.scala b/modules/text/src/main/scala/docspell/text/ocr/Config.scala index 42fe7706..f2f8e5d1 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Config.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/Config.scala @@ -5,11 +5,11 @@ import java.nio.file.{Path, Paths} import docspell.common._ case class Config( - allowedContentTypes: Set[MimeType] - , ghostscript: Config.Ghostscript - , pageRange: Config.PageRange - , unpaper: Config.Unpaper - , tesseract: Config.Tesseract + allowedContentTypes: Set[MimeType], + ghostscript: Config.Ghostscript, + pageRange: Config.PageRange, + unpaper: Config.Unpaper, + tesseract: Config.Tesseract ) { def isAllowed(mt: MimeType): Boolean = @@ -22,7 +22,7 @@ object Config { case class Command(program: String, args: Seq[String], timeout: Duration) { def mapArgs(f: String => String): Command = - Command(program, args map f, timeout) + Command(program, args.map(f), timeout) def toCmd: List[String] = program :: args.toList @@ -44,23 +44,23 @@ object Config { ), pageRange = PageRange(10), ghostscript = Ghostscript( - Command("gs", Seq("-dNOPAUSE" - , "-dBATCH" - , "-dSAFER" - , "-sDEVICE=tiffscaled8" - , "-sOutputFile={{outfile}}" - , "{{infile}}"), - Duration.seconds(30)), - Paths.get(System.getProperty("java.io.tmpdir")). - resolve("docspell-extraction")), - unpaper = Unpaper(Command("unpaper" - , Seq("{{infile}}", "{{outfile}}") - , Duration.seconds(30))), + Command( + "gs", + Seq( + "-dNOPAUSE", + "-dBATCH", + "-dSAFER", + "-sDEVICE=tiffscaled8", + "-sOutputFile={{outfile}}", + "{{infile}}" + ), + Duration.seconds(30) + ), + Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") + ), + unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), tesseract = Tesseract( - Command("tesseract", Seq("{{file}}" - , "stdout" - , "-l" - , "{{lang}}"), - Duration.minutes(1))) + Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) + ) ) } diff --git a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala b/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala index 1cc402c3..99f558d3 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala @@ -11,71 +11,106 @@ object Ocr { /** Extract the text of all pages in the given pdf file. */ - def extractPdf[F[_]: Sync: ContextShift](pdf: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] = + def extractPdf[F[_]: Sync: ContextShift]( + pdf: Stream[F, Byte], + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => - runGhostscript(pdf, config, wd, blocker). - flatMap({ tmpImg => + runGhostscript(pdf, config, wd, blocker) + .flatMap({ tmpImg => runTesseractFile(tmpImg, blocker, lang, config) - }). - fold1(_ + "\n\n\n" + _) + }) + .fold1(_ + "\n\n\n" + _) } /** Extract the text from the given image file */ - def extractImage[F[_]: Sync: ContextShift](img: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] = + def extractImage[F[_]: Sync: ContextShift]( + img: Stream[F, Byte], + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = runTesseractStdin(img, blocker, lang, config) - - def extractPdFFile[F[_]: Sync: ContextShift](pdf: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] = + def extractPdFFile[F[_]: Sync: ContextShift]( + pdf: Path, + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => - runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker). - flatMap({ tif => + runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) + .flatMap({ tif => runTesseractFile(tif, blocker, lang, config) - }). - fold1(_ + "\n\n\n" + _) + }) + .fold1(_ + "\n\n\n" + _) } - def extractImageFile[F[_]: Sync: ContextShift](img: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] = + def extractImageFile[F[_]: Sync: ContextShift]( + img: Path, + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = runTesseractFile(img, blocker, lang, config) /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. */ private[text] def runGhostscript[F[_]: Sync: ContextShift]( - pdf: Stream[F, Byte] - , cfg: Config - , wd: Path - , blocker: Blocker): Stream[F, Path] = { + pdf: Stream[F, Byte], + cfg: Config, + wd: Path, + blocker: Blocker + ): Stream[F, Path] = { val xargs = - if (cfg.pageRange.begin > 0) s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args + if (cfg.pageRange.begin > 0) + s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args else cfg.ghostscript.command.args - val cmd = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(Map( - "{{infile}}" -> "-", - "{{outfile}}" -> "%d.tif" - ))) - SystemCommand.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf). - evalMap({ _ => + val cmd = cfg.ghostscript.command + .copy(args = xargs) + .mapArgs( + replace( + Map( + "{{infile}}" -> "-", + "{{outfile}}" -> "%d.tif" + ) + ) + ) + SystemCommand + .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf) + .evalMap({ _ => File.listFiles(pathEndsWith(".tif"), wd) - }). - flatMap(fs => Stream.emits(fs)) + }) + .flatMap(fs => Stream.emits(fs)) } /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. */ private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( - pdf: Path - , ghostscript: Config.Command - , wd: Path, blocker: Blocker): Stream[F, Path] = { - val cmd = ghostscript.mapArgs(replace(Map( - "{{infile}}" -> pdf.toAbsolutePath.toString, - "{{outfile}}" -> "%d.tif" - ))) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)). - evalMap({ _ => + pdf: Path, + ghostscript: Config.Command, + wd: Path, + blocker: Blocker + ): Stream[F, Path] = { + val cmd = ghostscript.mapArgs( + replace( + Map( + "{{infile}}" -> pdf.toAbsolutePath.toString, + "{{outfile}}" -> "%d.tif" + ) + ) + ) + SystemCommand + .execSuccess[F](cmd, blocker, wd = Some(wd)) + .evalMap({ _ => File.listFiles(pathEndsWith(".tif"), wd) - }). - flatMap(fs => Stream.emits(fs)) + }) + .flatMap(fs => Stream.emits(fs)) } private def pathEndsWith(ext: String): Path => Boolean = @@ -84,65 +119,72 @@ object Ocr { /** Run unpaper to optimize the image for ocr. The * files are stored to a temporary location on disk and returned. */ - private[text] def runUnpaperFile[F[_]: Sync: ContextShift](img: Path - , unpaper: Config.Command - , wd: Path, blocker: Blocker): Stream[F, Path] = { - val targetFile = img.resolveSibling("u-"+ img.getFileName.toString).toAbsolutePath - val cmd = unpaper.mapArgs(replace(Map( - "{{infile}}" -> img.toAbsolutePath.toString, - "{{outfile}}" -> targetFile.toString - ))) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)). - map(_ => targetFile). - handleErrorWith(th => { - logger.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") + private[text] def runUnpaperFile[F[_]: Sync: ContextShift]( + img: Path, + unpaper: Config.Command, + wd: Path, + blocker: Blocker + ): Stream[F, Path] = { + val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath + val cmd = unpaper.mapArgs( + replace( + Map( + "{{infile}}" -> img.toAbsolutePath.toString, + "{{outfile}}" -> targetFile.toString + ) + ) + ) + SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith { + th => + logger + .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") Stream.emit(img) - }) + } } /** Run tesseract on the given image file and return the extracted * text. */ private[text] def runTesseractFile[F[_]: Sync: ContextShift]( - img: Path - , blocker: Blocker - , lang: String - , config: Config): Stream[F, String] = { + img: Path, + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = // tesseract cannot cope with absolute filenames // so use the parent as working dir - runUnpaperFile(img, config.unpaper.command, img.getParent, blocker). - flatMap(uimg => { - val cmd = config.tesseract.command.mapArgs(replace(Map( - "{{file}}" -> uimg.getFileName.toString - , "{{lang}}" -> fixLanguage(lang)))) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) - }) - } - + runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg => + val cmd = config.tesseract.command.mapArgs( + replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) + ) + SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) + } /** Run tesseract on the given image file and return the extracted * text. */ private[text] def runTesseractStdin[F[_]: Sync: ContextShift]( - img: Stream[F, Byte] - , blocker: Blocker - , lang: String - , config: Config): Stream[F, String] = { - val cmd = config.tesseract.command.mapArgs(replace(Map( - "{{file}}" -> "stdin" - , "{{lang}}" -> fixLanguage(lang)))) + img: Stream[F, Byte], + blocker: Blocker, + lang: String, + config: Config + ): Stream[F, String] = { + val cmd = config.tesseract.command + .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout) } private def replace(repl: Map[String, String]): String => String = - s => repl.foldLeft(s) { case (res, (k, v)) => - res.replace(k, v) - } + s => + repl.foldLeft(s) { + case (res, (k, v)) => + res.replace(k, v) + } private def fixLanguage(lang: String): String = lang match { case "de" => "deu" case "en" => "eng" - case l => l + case l => l } } diff --git a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala b/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala index 630941e8..f433c967 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala @@ -16,57 +16,87 @@ object SystemCommand { final case class Result(rc: Int, stdout: String, stderr: String) - def exec[F[_]: Sync: ContextShift]( cmd: Config.Command - , blocker: Blocker - , wd: Option[Path] = None - , stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] = - startProcess(cmd, wd){ proc => + def exec[F[_]: Sync: ContextShift]( + cmd: Config.Command, + blocker: Blocker, + wd: Option[Path] = None, + stdin: Stream[F, Byte] = Stream.empty + ): Stream[F, Result] = + startProcess(cmd, wd) { proc => Stream.eval { for { - _ <- writeToProcess(stdin, proc, blocker) - term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) - _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") - else logger.fwarn(s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!") - _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) - out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("") - err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("") + _ <- writeToProcess(stdin, proc, blocker) + term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) + _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") + else + logger.fwarn( + s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!" + ) + _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) + out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("") + err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("") } yield Result(proc.exitValue, out, err) } } - def execSuccess[F[_]: Sync: ContextShift](cmd: Config.Command, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] = + def execSuccess[F[_]: Sync: ContextShift]( + cmd: Config.Command, + blocker: Blocker, + wd: Option[Path] = None, + stdin: Stream[F, Byte] = Stream.empty + ): Stream[F, Result] = exec(cmd, blocker, wd, stdin).flatMap { r => - if (r.rc != 0) Stream.raiseError[F](new Exception(s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}")) + if (r.rc != 0) + Stream.raiseError[F]( + new Exception( + s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}" + ) + ) else Stream.emit(r) } - private def startProcess[F[_]: Sync,A](cmd: Config.Command, wd: Option[Path])(f: Process => Stream[F,A]): Stream[F, A] = { + private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])( + f: Process => Stream[F, A] + ): Stream[F, A] = { val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") val proc = log *> Sync[F].delay { val pb = new ProcessBuilder(cmd.toCmd.asJava) wd.map(_.toFile).foreach(pb.directory) pb.start() } - Stream.bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => - p.destroy() - }).flatMap(f) + Stream + .bracket(proc)(p => + logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => + p.destroy() + } + ) + .flatMap(f) } - private def inputStreamToString[F[_]: Sync: ContextShift](in: InputStream, blocker: Blocker): F[String] = - io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false). - through(text.utf8Decode). - chunks. - map(_.toVector.mkString). - fold1(_ + _). - compile.last. - map(_.getOrElse("")) + private def inputStreamToString[F[_]: Sync: ContextShift]( + in: InputStream, + blocker: Blocker + ): F[String] = + io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false) + .through(text.utf8Decode) + .chunks + .map(_.toVector.mkString) + .fold1(_ + _) + .compile + .last + .map(_.getOrElse("")) - private def writeToProcess[F[_]: Sync: ContextShift](data: Stream[F, Byte], proc: Process, blocker: Blocker): F[Unit] = - data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)). - compile.drain + private def writeToProcess[F[_]: Sync: ContextShift]( + data: Stream[F, Byte], + proc: Process, + blocker: Blocker + ): F[Unit] = + data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] = Sync[F].delay(proc.destroyForcibly()).attempt *> { - Sync[F].raiseError(new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")) + Sync[F].raiseError( + new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})") + ) } } diff --git a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala b/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala index faa987ed..5c90c728 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala @@ -12,18 +12,17 @@ object TikaMimetype { private val tika = new TikaConfig().getDetector private def convert(mt: MediaType): MimeType = - Option(mt).map(_.toString). - map(MimeType.parse). - flatMap(_.toOption). - map(normalize). - getOrElse(MimeType.octetStream) + Option(mt) + .map(_.toString) + .map(MimeType.parse) + .flatMap(_.toOption) + .map(normalize) + .getOrElse(MimeType.octetStream) private def makeMetadata(hint: MimeTypeHint): Metadata = { val md = new Metadata - hint.filename. - foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _)) - hint.advertised. - foreach(md.set(HttpHeaders.CONTENT_TYPE, _)) + hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _)) + hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _)) md } @@ -33,13 +32,10 @@ object TikaMimetype { case _ => in } - private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = { + private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) - } def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] = - data.take(1024). - compile.toVector. - map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none)) + data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none)) }