Apply scalafmt to all files

This commit is contained in:
Eike Kettner
2020-02-09 01:54:11 +01:00
parent 6a9ec42a03
commit 5c37efeaba
32 changed files with 442 additions and 362 deletions

View File

@ -9,43 +9,47 @@ object Contact {
private[this] val protocols = Set("ftp", "http", "https")
def annotate(text: String): Vector[NerLabel] =
TextSplitter.splitToken[Nothing](text, " \t\r\n".toSet).
map({ token =>
if (isEmailAddress(token.value)) NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value)) NerLabel(token.value, NerTag.Website, token.begin, token.end).some
TextSplitter
.splitToken[Nothing](text, " \t\r\n".toSet)
.map({ token =>
if (isEmailAddress(token.value))
NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value))
NerLabel(token.value, NerTag.Website, token.begin, token.end).some
else None
}).
flatMap(_.map(Stream.emit).getOrElse(Stream.empty)).
toVector
})
.flatMap(_.map(Stream.emit).getOrElse(Stream.empty))
.toVector
def isEmailAddress(str: String): Boolean = {
val atIdx = str.indexOf('@')
if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
else {
val name = str.substring(0, atIdx)
val dom = str.substring(atIdx + 1)
val dom = str.substring(atIdx + 1)
Domain.isDomain(dom) && name.forall(c => !c.isWhitespace)
}
}
def isWebsite(str: String): Boolean =
LenientUri.parse(str).
toOption.
map(uri => protocols.contains(uri.scheme.head)).
getOrElse(Domain.isDomain(str))
LenientUri
.parse(str)
.toOption
.map(uri => protocols.contains(uri.scheme.head))
.getOrElse(Domain.isDomain(str))
def isDocspellOpenUpload(str: String): Boolean = {
def isUploadPath(p: LenientUri.Path): Boolean =
p match {
case LenientUri.RootPath => false
case LenientUri.RootPath => false
case LenientUri.EmptyPath => false
case LenientUri.NonEmptyPath(segs) =>
Ident.fromString(segs.last).isRight &&
segs.init.takeRight(3) == List("open", "upload", "item")
}
LenientUri.parse(str).
toOption.
exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
LenientUri
.parse(str)
.toOption
.exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
}
}

View File

@ -11,7 +11,7 @@ private[text] object Tld {
/**
* Some selected TLDs.
*/
private [this] val known = List(
private[this] val known = List(
".com",
".org",
".net",

View File

@ -10,16 +10,22 @@ import scala.util.Try
object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = {
TextSplitter.splitToken(text, " \t.,\n\r/".toSet).
sliding(3).
filter(_.length == 3).
map(q => SimpleDate.fromParts(q.toList, lang).
map(sd => NerDateLabel(sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)))).
collect({ case Some(d) => d })
}
/** Finds dates in `text` by splitting it into tokens and trying to read
  * every window of three consecutive tokens as a date.
  *
  * Each hit is returned as a `NerDateLabel` holding the parsed date and a
  * `NerLabel` tagged `NerTag.Date`. The label text and the two offsets given
  * to the label both cover the span from the start of the first token to the
  * end of the third token of the window.
  *
  * @param text the text to search
  * @param lang passed on to `SimpleDate.fromParts`
  */
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
  TextSplitter
    .splitToken(text, " \t.,\n\r/".toSet)
    .sliding(3)
    .filter(_.length == 3)
    .map { q =>
      val begin = q(0).begin
      // The date spans all three tokens, so it ends where the third token
      // ends. The end offset was previously taken from the second token,
      // which did not agree with the label text.
      val end = q(2).end
      SimpleDate
        .fromParts(q.toList, lang)
        .map(sd =>
          NerDateLabel(
            sd.toLocalDate,
            NerLabel(text.substring(begin, end), NerTag.Date, begin, end)
          )
        )
    }
    .collect({ case Some(d) => d })
private case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate =
@ -27,13 +33,13 @@ object DateFind {
}
private object SimpleDate {
val p0 = readYear >> readMonth >> readDay map {
val p0 = (readYear >> readMonth >> readDay).map {
case ((y, m), d) => SimpleDate(y, m, d)
}
val p1 = readDay >> readMonth >> readYear map {
val p1 = (readDay >> readMonth >> readYear).map {
case ((d, m), y) => SimpleDate(y, m, d)
}
val p2 = readMonth >> readDay >> readYear map {
val p2 = (readMonth >> readDay >> readYear).map {
case ((m, d), y) => SimpleDate(y, m, d)
}
@ -46,14 +52,14 @@ object DateFind {
p.read(parts).toOption
}
def readYear: Reader[Int] = {
Reader.readFirst(w => w.value.length match {
case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
case _ => None
})
}
/** Reads the first word as a year. Two-character words must parse to a
  * non-negative number, four-character words to a number above 1000; words
  * of any other length are rejected.
  */
def readYear: Reader[Int] =
  Reader.readFirst { word =>
    val raw = word.value
    // Evaluated only in the branches that need it; a failed parse gives None.
    def parsed: Option[Int] = Try(raw.toInt).toOption
    raw.length match {
      case 2 => parsed.filter(_ >= 0)
      case 4 => parsed.filter(_ > 1000)
      case _ => None
    }
  }
/** Reads the first word as a month: looks for the first entry of `months`
  * that contains the word and returns its 1-based position.
  */
def readMonth: Reader[Int] =
  Reader.readFirst { w =>
    // `indexWhere` yields -1 when nothing matches, so index 0 is a valid hit
    // (the first entry); only a negative index means "not a month".
    // NOTE(review): assumes `months` has the first month at index 0 - confirm.
    Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1)
  }
@ -69,10 +75,12 @@ object DateFind {
Reader(read.andThen(_.map(f)))
def or(other: Reader[A]): Reader[A] =
Reader(words => read(words) match {
case Result.Failure => other.read(words)
case s @ Result.Success(_, _) => s
})
Reader(words =>
read(words) match {
case Result.Failure => other.read(words)
case s @ Result.Success(_, _) => s
}
)
}
object Reader {
@ -81,12 +89,11 @@ object DateFind {
def readFirst[A](f: Word => Option[A]): Reader[A] =
Reader({
case Nil => Result.Failure
case Nil => Result.Failure
case a :: as => f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
})
}
sealed trait Result[+A] {
def toOption: Option[A]
def map[B](f: A => B): Result[B]
@ -95,14 +102,14 @@ object DateFind {
object Result {
final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
val toOption = Some(value)
val toOption = Some(value)
def map[B](f: A => B): Result[B] = Success(f(value), rest)
def next[B](r: Reader[B]): Result[(A, B)] =
r.read(rest).map(b => (value, b))
}
final case object Failure extends Result[Nothing] {
val toOption = None
def map[B](f: Nothing => B): Result[B] = this
val toOption = None
def map[B](f: Nothing => B): Result[B] = this
def next[B](r: Reader[B]): Result[(Nothing, B)] = this
}
}

View File

@ -14,23 +14,28 @@ import java.net.URL
import scala.util.Using
object StanfordNerClassifier {
private [this] val logger = getLogger
private[this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
val nerClassifier = lang match {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
case Language.German => germanNerClassifier
}
nerClassifier.classify(text).asScala.flatMap(a => a.asScala).
collect(Function.unlift(label => {
nerClassifier
.classify(text)
.asScala
.flatMap(a => a.asScala)
.collect(Function.unlift { label =>
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag.fromString(Option(tag).getOrElse("")).toOption.
map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})).
toVector
NerTag
.fromString(Option(tag).getOrElse(""))
.toOption
.map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})
.toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
@ -48,7 +53,9 @@ object StanfordNerClassifier {
check(lang match {
case Language.German =>
getClass.getResource("/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz")
getClass.getResource(
"/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
)
case Language.English =>
getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
})

View File

@ -5,11 +5,11 @@ import java.nio.file.{Path, Paths}
import docspell.common._
case class Config(
allowedContentTypes: Set[MimeType]
, ghostscript: Config.Ghostscript
, pageRange: Config.PageRange
, unpaper: Config.Unpaper
, tesseract: Config.Tesseract
allowedContentTypes: Set[MimeType],
ghostscript: Config.Ghostscript,
pageRange: Config.PageRange,
unpaper: Config.Unpaper,
tesseract: Config.Tesseract
) {
def isAllowed(mt: MimeType): Boolean =
@ -22,7 +22,7 @@ object Config {
case class Command(program: String, args: Seq[String], timeout: Duration) {
def mapArgs(f: String => String): Command =
Command(program, args map f, timeout)
Command(program, args.map(f), timeout)
def toCmd: List[String] =
program :: args.toList
@ -44,23 +44,23 @@ object Config {
),
pageRange = PageRange(10),
ghostscript = Ghostscript(
Command("gs", Seq("-dNOPAUSE"
, "-dBATCH"
, "-dSAFER"
, "-sDEVICE=tiffscaled8"
, "-sOutputFile={{outfile}}"
, "{{infile}}"),
Duration.seconds(30)),
Paths.get(System.getProperty("java.io.tmpdir")).
resolve("docspell-extraction")),
unpaper = Unpaper(Command("unpaper"
, Seq("{{infile}}", "{{outfile}}")
, Duration.seconds(30))),
Command(
"gs",
Seq(
"-dNOPAUSE",
"-dBATCH",
"-dSAFER",
"-sDEVICE=tiffscaled8",
"-sOutputFile={{outfile}}",
"{{infile}}"
),
Duration.seconds(30)
),
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
),
unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
tesseract = Tesseract(
Command("tesseract", Seq("{{file}}"
, "stdout"
, "-l"
, "{{lang}}"),
Duration.minutes(1)))
Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
)
)
}

View File

@ -11,71 +11,106 @@ object Ocr {
/** Extract the text of all pages in the given pdf file.
*/
def extractPdf[F[_]: Sync: ContextShift](pdf: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractPdf[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscript(pdf, config, wd, blocker).
flatMap({ tmpImg =>
runGhostscript(pdf, config, wd, blocker)
.flatMap({ tmpImg =>
runTesseractFile(tmpImg, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
})
.fold1(_ + "\n\n\n" + _)
}
/** Extract the text from the given image file
*/
def extractImage[F[_]: Sync: ContextShift](img: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
/** Extracts the text from the image bytes in `img` by passing all
 * arguments unchanged to `runTesseractStdin`.
 */
def extractImage[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractStdin(img, blocker, lang, config)
def extractPdFFile[F[_]: Sync: ContextShift](pdf: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
def extractPdFFile[F[_]: Sync: ContextShift](
pdf: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker).
flatMap({ tif =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
.flatMap({ tif =>
runTesseractFile(tif, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
})
.fold1(_ + "\n\n\n" + _)
}
def extractImageFile[F[_]: Sync: ContextShift](img: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
/** Extracts the text from the image file at `img` by passing all
 * arguments unchanged to `runTesseractFile`.
 */
def extractImageFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
runTesseractFile(img, blocker, lang, config)
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte]
, cfg: Config
, wd: Path
, blocker: Blocker): Stream[F, Path] = {
pdf: Stream[F, Byte],
cfg: Config,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val xargs =
if (cfg.pageRange.begin > 0) s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
if (cfg.pageRange.begin > 0)
s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
else cfg.ghostscript.command.args
val cmd = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(Map(
"{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf).
evalMap({ _ =>
val cmd = cfg.ghostscript.command
.copy(args = xargs)
.mapArgs(
replace(
Map(
"{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif"
)
)
)
SystemCommand
.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf)
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
})
.flatMap(fs => Stream.emits(fs))
}
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path
, ghostscript: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(replace(Map(
"{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
evalMap({ _ =>
pdf: Path,
ghostscript: Config.Command,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(
replace(
Map(
"{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif"
)
)
)
SystemCommand
.execSuccess[F](cmd, blocker, wd = Some(wd))
.evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
})
.flatMap(fs => Stream.emits(fs))
}
private def pathEndsWith(ext: String): Path => Boolean =
@ -84,65 +119,72 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](img: Path
, unpaper: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-"+ img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(replace(Map(
"{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
map(_ => targetFile).
handleErrorWith(th => {
logger.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path,
unpaper: Config.Command,
wd: Path,
blocker: Blocker
): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(
replace(
Map(
"{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString
)
)
)
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith {
th =>
logger
.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
Stream.emit(img)
})
}
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
img: Path,
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] =
// tesseract cannot cope with absolute filenames
// so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).
flatMap(uimg => {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> uimg.getFileName.toString
, "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
})
}
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg =>
val cmd = config.tesseract.command.mapArgs(
replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang)))
)
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte]
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> "stdin"
, "{{lang}}" -> fixLanguage(lang))))
img: Stream[F, Byte],
blocker: Blocker,
lang: String,
config: Config
): Stream[F, String] = {
val cmd = config.tesseract.command
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
}
private def replace(repl: Map[String, String]): String => String =
s => repl.foldLeft(s) { case (res, (k, v)) =>
res.replace(k, v)
}
s =>
repl.foldLeft(s) {
case (res, (k, v)) =>
res.replace(k, v)
}
private def fixLanguage(lang: String): String =
lang match {
case "de" => "deu"
case "en" => "eng"
case l => l
case l => l
}
}

View File

@ -16,57 +16,87 @@ object SystemCommand {
final case class Result(rc: Int, stdout: String, stderr: String)
def exec[F[_]: Sync: ContextShift]( cmd: Config.Command
, blocker: Blocker
, wd: Option[Path] = None
, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
startProcess(cmd, wd){ proc =>
def exec[F[_]: Sync: ContextShift](
cmd: Config.Command,
blocker: Blocker,
wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
startProcess(cmd, wd) { proc =>
Stream.eval {
for {
_ <- writeToProcess(stdin, proc, blocker)
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
else logger.fwarn(s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!")
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("")
err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("")
_ <- writeToProcess(stdin, proc, blocker)
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
else
logger.fwarn(
s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!"
)
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("")
err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("")
} yield Result(proc.exitValue, out, err)
}
}
def execSuccess[F[_]: Sync: ContextShift](cmd: Config.Command, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
def execSuccess[F[_]: Sync: ContextShift](
cmd: Config.Command,
blocker: Blocker,
wd: Option[Path] = None,
stdin: Stream[F, Byte] = Stream.empty
): Stream[F, Result] =
exec(cmd, blocker, wd, stdin).flatMap { r =>
if (r.rc != 0) Stream.raiseError[F](new Exception(s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"))
if (r.rc != 0)
Stream.raiseError[F](
new Exception(
s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"
)
)
else Stream.emit(r)
}
private def startProcess[F[_]: Sync,A](cmd: Config.Command, wd: Option[Path])(f: Process => Stream[F,A]): Stream[F, A] = {
private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])(
f: Process => Stream[F, A]
): Stream[F, A] = {
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
val proc = log *> Sync[F].delay {
val pb = new ProcessBuilder(cmd.toCmd.asJava)
wd.map(_.toFile).foreach(pb.directory)
pb.start()
}
Stream.bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
p.destroy()
}).flatMap(f)
Stream
.bracket(proc)(p =>
logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
p.destroy()
}
)
.flatMap(f)
}
private def inputStreamToString[F[_]: Sync: ContextShift](in: InputStream, blocker: Blocker): F[String] =
io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false).
through(text.utf8Decode).
chunks.
map(_.toVector.mkString).
fold1(_ + _).
compile.last.
map(_.getOrElse(""))
/** Reads `in` until it is exhausted, decodes the bytes as UTF-8 and returns
  * everything as one string (the empty string if nothing is read).
  *
  * The stream is read with a chunk size of 16 * 1024 bytes using `blocker`
  * and is not closed here (`closeAfterUse = false`).
  */
private def inputStreamToString[F[_]: Sync: ContextShift](
    in: InputStream,
    blocker: Blocker
): F[String] =
  io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false)
    .through(text.utf8Decode)
    // `compile.string` collects all decoded pieces in one pass instead of
    // re-copying the accumulated text for every chunk.
    .compile
    .string
private def writeToProcess[F[_]: Sync: ContextShift](data: Stream[F, Byte], proc: Process, blocker: Blocker): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).
compile.drain
/** Streams `data` into the output stream obtained from `proc` (the
  * process' standard input) and completes once the stream has been drained.
  */
private def writeToProcess[F[_]: Sync: ContextShift](
    data: Stream[F, Byte],
    proc: Process,
    blocker: Blocker
): F[Unit] = {
  // Looked up lazily, inside F, just like the rest of the write.
  val procStdin = Sync[F].delay(proc.getOutputStream)
  val sink      = io.writeOutputStream(procStdin, blocker)
  data.through(sink).compile.drain
}
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] =
Sync[F].delay(proc.destroyForcibly()).attempt *> {
Sync[F].raiseError(new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})"))
Sync[F].raiseError(
new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})")
)
}
}

View File

@ -12,18 +12,17 @@ object TikaMimetype {
private val tika = new TikaConfig().getDetector
private def convert(mt: MediaType): MimeType =
Option(mt).map(_.toString).
map(MimeType.parse).
flatMap(_.toOption).
map(normalize).
getOrElse(MimeType.octetStream)
Option(mt)
.map(_.toString)
.map(MimeType.parse)
.flatMap(_.toOption)
.map(normalize)
.getOrElse(MimeType.octetStream)
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
hint.filename.
foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.advertised.
foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}
@ -33,13 +32,10 @@ object TikaMimetype {
case _ => in
}
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
}
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
data.take(1024).
compile.toVector.
map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
}