mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-05 22:55:58 +00:00
Early draft for text extraction
This commit is contained in:
parent
1a5546fe99
commit
3d615181e0
12
modules/common/src/main/scala/docspell/common/Logger.scala
Normal file
12
modules/common/src/main/scala/docspell/common/Logger.scala
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
/** Minimal logging interface whose operations are effects in `F`.
  *
  * Every message is passed by name, so it is only evaluated if the
  * implementation actually uses it.
  */
trait Logger[F[_]] {

  /** Logs `msg` at trace level. */
  def trace(msg: => String): F[Unit]

  /** Logs `msg` at debug level. */
  def debug(msg: => String): F[Unit]

  /** Logs `msg` at info level. */
  def info(msg: => String): F[Unit]

  /** Logs `msg` at warn level. */
  def warn(msg: => String): F[Unit]

  /** Logs `msg` at error level together with the throwable `ex`. */
  def error(ex: Throwable)(msg: => String): F[Unit]

  /** Logs `msg` at error level. */
  def error(msg: => String): F[Unit]

}
|
@ -48,6 +48,7 @@ object MimeType {
|
|||||||
|
|
||||||
val octetStream = application("octet-stream")
|
val octetStream = application("octet-stream")
|
||||||
val pdf = application("pdf")
|
val pdf = application("pdf")
|
||||||
|
val zip = application("zip")
|
||||||
val png = image("png")
|
val png = image("png")
|
||||||
val jpeg = image("jpeg")
|
val jpeg = image("jpeg")
|
||||||
val tiff = image("tiff")
|
val tiff = image("tiff")
|
||||||
|
@ -4,4 +4,13 @@ case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
|
|||||||
|
|
||||||
object MimeTypeHint {
|
object MimeTypeHint {
|
||||||
val none = MimeTypeHint(None, None)
|
val none = MimeTypeHint(None, None)
|
||||||
|
|
||||||
|
def filename(name: String): MimeTypeHint =
|
||||||
|
MimeTypeHint(Some(name), None)
|
||||||
|
|
||||||
|
def advertised(mimeType: MimeType): MimeTypeHint =
|
||||||
|
advertised(mimeType.asString)
|
||||||
|
|
||||||
|
def advertised(mimeType: String): MimeTypeHint =
|
||||||
|
MimeTypeHint(None, Some(mimeType))
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
package docspell.extract
|
||||||
|
|
||||||
|
import docspell.common.{MimeType, MimeTypeHint}
|
||||||
|
|
||||||
|
/** Describes what is known about the type of some data: either its
  * exact mime type or only a hint. The variants are defined in the
  * companion object.
  */
sealed trait DataType
|
||||||
|
|
||||||
|
object DataType {

  /** The mime type is known. */
  case class Exact(mime: MimeType) extends DataType

  /** Only a hint about the mime type is available. */
  case class Hint(hint: MimeTypeHint) extends DataType

  /** Wraps a known mime type into an [[Exact]] data type. */
  def apply(mt: MimeType): DataType = Exact(mt)

  /** Creates a [[Hint]] data type from a file name. */
  def filename(name: String): DataType = {
    val byName = MimeTypeHint.filename(name)
    Hint(byName)
  }
}
|
@ -0,0 +1,5 @@
|
|||||||
|
package docspell.extract
|
||||||
|
|
||||||
|
import docspell.extract.ocr.OcrConfig
|
||||||
|
|
||||||
|
/** Configuration for text extraction. At present it only carries the
  * OCR settings.
  *
  * @param ocr the OCR configuration
  */
case class ExtractConfig(ocr: OcrConfig)
|
@ -15,15 +15,25 @@ object ExtractResult {
|
|||||||
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
}
|
}
|
||||||
|
def unsupportedFormat(mt: MimeType): ExtractResult =
|
||||||
|
UnsupportedFormat(mt)
|
||||||
|
|
||||||
case class Failure(ex: Throwable) extends ExtractResult {
|
case class Failure(ex: Throwable) extends ExtractResult {
|
||||||
val textOption = None
|
val textOption = None
|
||||||
}
|
}
|
||||||
|
def failure(ex: Throwable): ExtractResult =
|
||||||
|
Failure(ex)
|
||||||
|
|
||||||
case class Success(text: String) extends ExtractResult {
|
case class Success(text: String) extends ExtractResult {
|
||||||
val textOption = Some(text)
|
val textOption = Some(text)
|
||||||
}
|
}
|
||||||
|
def success(text: String): ExtractResult =
|
||||||
|
Success(text)
|
||||||
|
|
||||||
def fromTry(r: Try[String]): ExtractResult =
|
def fromTry(r: Try[String]): ExtractResult =
|
||||||
r.fold(Failure.apply, Success.apply)
|
r.fold(Failure.apply, Success.apply)
|
||||||
|
|
||||||
|
def fromEither(e: Either[Throwable, String]): ExtractResult =
|
||||||
|
e.fold(failure, success)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,70 @@
|
|||||||
|
package docspell.extract
|
||||||
|
|
||||||
|
import cats.effect._
|
||||||
|
import cats.implicits._
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.extract.ocr.{OcrType, TextExtract}
|
||||||
|
import docspell.extract.odf.{OdfExtract, OdfType}
|
||||||
|
import docspell.extract.poi.{PoiExtract, PoiType}
|
||||||
|
import docspell.extract.rtf.RtfExtract
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.files.TikaMimetype
|
||||||
|
|
||||||
|
/** Extracts text from binary data. */
trait Extraction[F[_]] {

  /** Tries to extract the text contained in `data`.
    *
    * @param data     the raw bytes of the file
    * @param dataType the exact mime type of `data`, or a hint to it
    * @param lang     the language of the content
    * @return the outcome of the extraction as an [[ExtractResult]]
    */
  def extractText(
      data: Stream[F, Byte],
      dataType: DataType,
      lang: Language
  ): F[ExtractResult]

}
|
||||||
|
|
||||||
|
object Extraction {

  /** Creates an [[Extraction]] that selects an extractor by mime type.
    *
    * If the data type is only a hint, the mime type is first detected
    * via `TikaMimetype.detect`. Mime types matched by none of the
    * cases yield `ExtractResult.unsupportedFormat`.
    *
    * @param blocker passed on to the PDF and OCR extraction
    * @param logger  used for informational messages during extraction
    * @param cfg     extraction configuration; its `ocr` part is used
    */
  def create[F[_]: Sync: ContextShift](
      blocker: Blocker,
      logger: Logger[F],
      cfg: ExtractConfig
  ): Extraction[F] =
    new Extraction[F] {
      def extractText(
          data: Stream[F, Byte],
          dataType: DataType,
          lang: Language
      ): F[ExtractResult] = {
        // Passed to PdfExtract.get as its `stripMinLen` argument.
        val pdfStripMinLen = 5

        def readOdf: F[ExtractResult] =
          OdfExtract.get(data).map(ExtractResult.fromEither)

        def runOcr: F[ExtractResult] =
          TextExtract
            .extractOCR(data, blocker, lang.iso3, cfg.ocr)
            .compile
            .lastOrError
            .attempt
            .map(ExtractResult.fromEither)

        // The order of the cases matters: PDF is also contained in
        // OcrType and must be matched first.
        def dispatch(detected: MimeType): F[ExtractResult] =
          detected match {
            case MimeType.pdf =>
              PdfExtract
                .get(data, blocker, lang, pdfStripMinLen, cfg.ocr, logger)
                .map(ExtractResult.fromEither)

            case PoiType(mt) =>
              PoiExtract.get(data, mt).map(ExtractResult.fromEither)

            case RtfExtract.rtfType =>
              RtfExtract.get(data).map(ExtractResult.fromEither)

            case OdfType(_) =>
              readOdf

            case OcrType(_) =>
              runOcr

            case OdfType.container =>
              logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
                readOdf

            case mt =>
              ExtractResult.unsupportedFormat(mt).pure[F]
          }

        val resolved = dataType match {
          case DataType.Hint(hint) => TikaMimetype.detect(data, hint)
          case DataType.Exact(mt)  => mt.pure[F]
        }

        resolved.flatMap(dispatch)
      }
    }

}
|
@ -0,0 +1,51 @@
|
|||||||
|
package docspell.extract
|
||||||
|
|
||||||
|
import cats.implicits._
|
||||||
|
import cats.effect._
|
||||||
|
import fs2.Stream
|
||||||
|
import docspell.common.{Language, Logger}
|
||||||
|
import docspell.extract.ocr.{OcrConfig, TextExtract}
|
||||||
|
import docspell.extract.pdfbox.PdfboxExtract
|
||||||
|
|
||||||
|
object PdfExtract {

  /** Extracts text from a PDF, falling back to OCR if necessary.
    *
    * The text is first stripped from the PDF via `PdfboxExtract`. If
    * that fails, OCR is run instead. If it succeeds but the text is
    * shorter than `stripMinLen`, OCR is run as well and the longer of
    * the two texts is returned (the stripped text wins a tie).
    *
    * @param in          the PDF data
    * @param blocker     passed on to the OCR extraction
    * @param lang        language of the document; its `iso3` code is given to OCR
    * @param stripMinLen minimum length of the stripped text to be accepted without OCR
    * @param ocrCfg      the OCR configuration
    * @param logger      used to report which strategy was chosen
    * @return the extracted text, or the error that occurred
    */
  def get[F[_]: Sync: ContextShift](
      in: Stream[F, Byte],
      blocker: Blocker,
      lang: Language,
      stripMinLen: Int,
      ocrCfg: OcrConfig,
      logger: Logger[F]
  ): F[Either[Throwable, String]] = {

    val runOcr =
      TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError

    def chooseResult(ocrStr: String, strippedStr: String) =
      if (ocrStr.length > strippedStr.length)
        logger.info(
          s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})"
        ) *> ocrStr.pure[F]
      else
        // This branch is also taken when both texts have the same
        // length, so the message must not claim a strict ">".
        logger.info(
          s"Using stripped text (not OCR), as it is not shorter (${strippedStr.length} >= ${ocrStr.length})"
        ) *> strippedStr.pure[F]

    //maybe better: inspect the pdf and decide whether ocr or not
    for {
      pdfboxRes <- PdfboxExtract.get[F](in)
      res <- pdfboxRes.fold(
        ex =>
          logger.info(
            s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. "
          ) *> runOcr.attempt,
        str =>
          if (str.length >= stripMinLen) str.pure[F].attempt
          else
            logger
              .info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *>
              runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt
      )
    } yield res
  }
}
|
@ -16,7 +16,7 @@ object Ocr {
|
|||||||
pdf: Stream[F, Byte],
|
pdf: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
||||||
runGhostscript(pdf, config, wd, blocker)
|
runGhostscript(pdf, config, wd, blocker)
|
||||||
@ -32,7 +32,7 @@ object Ocr {
|
|||||||
img: Stream[F, Byte],
|
img: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
runTesseractStdin(img, blocker, lang, config)
|
runTesseractStdin(img, blocker, lang, config)
|
||||||
|
|
||||||
@ -40,7 +40,7 @@ object Ocr {
|
|||||||
pdf: Path,
|
pdf: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
|
||||||
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker)
|
||||||
@ -54,7 +54,7 @@ object Ocr {
|
|||||||
img: Path,
|
img: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
runTesseractFile(img, blocker, lang, config)
|
runTesseractFile(img, blocker, lang, config)
|
||||||
|
|
||||||
@ -63,7 +63,7 @@ object Ocr {
|
|||||||
*/
|
*/
|
||||||
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
||||||
pdf: Stream[F, Byte],
|
pdf: Stream[F, Byte],
|
||||||
cfg: Config,
|
cfg: OcrConfig,
|
||||||
wd: Path,
|
wd: Path,
|
||||||
blocker: Blocker
|
blocker: Blocker
|
||||||
): Stream[F, Path] = {
|
): Stream[F, Path] = {
|
||||||
@ -150,7 +150,7 @@ object Ocr {
|
|||||||
img: Path,
|
img: Path,
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
// tesseract cannot cope with absolute filenames
|
// tesseract cannot cope with absolute filenames
|
||||||
// so use the parent as working dir
|
// so use the parent as working dir
|
||||||
@ -168,7 +168,7 @@ object Ocr {
|
|||||||
img: Stream[F, Byte],
|
img: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] = {
|
): Stream[F, String] = {
|
||||||
val cmd = config.tesseract.command
|
val cmd = config.tesseract.command
|
||||||
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
.mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))))
|
||||||
|
@ -4,26 +4,29 @@ import java.nio.file.{Path, Paths}
|
|||||||
|
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
|
||||||
case class Config(
|
case class OcrConfig(
|
||||||
allowedContentTypes: Set[MimeType],
|
allowedContentTypes: Set[MimeType],
|
||||||
ghostscript: Config.Ghostscript,
|
ghostscript: OcrConfig.Ghostscript,
|
||||||
pageRange: Config.PageRange,
|
pageRange: OcrConfig.PageRange,
|
||||||
unpaper: Config.Unpaper,
|
unpaper: OcrConfig.Unpaper,
|
||||||
tesseract: Config.Tesseract
|
tesseract: OcrConfig.Tesseract
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def isAllowed(mt: MimeType): Boolean =
|
def isAllowed(mt: MimeType): Boolean =
|
||||||
allowedContentTypes contains mt
|
allowedContentTypes contains mt
|
||||||
}
|
}
|
||||||
|
|
||||||
object Config {
|
object OcrConfig {
|
||||||
|
|
||||||
case class PageRange(begin: Int)
|
case class PageRange(begin: Int)
|
||||||
|
|
||||||
case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
|
case class Ghostscript(command: SystemCommand.Config, workingDir: Path)
|
||||||
|
|
||||||
case class Tesseract(command: SystemCommand.Config)
|
case class Tesseract(command: SystemCommand.Config)
|
||||||
|
|
||||||
case class Unpaper(command: SystemCommand.Config)
|
case class Unpaper(command: SystemCommand.Config)
|
||||||
|
|
||||||
val default = Config(
|
val default = OcrConfig(
|
||||||
allowedContentTypes = Set(
|
allowedContentTypes = Set(
|
||||||
MimeType.pdf,
|
MimeType.pdf,
|
||||||
MimeType.png,
|
MimeType.png,
|
||||||
@ -46,9 +49,12 @@ object Config {
|
|||||||
),
|
),
|
||||||
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction")
|
||||||
),
|
),
|
||||||
unpaper = Unpaper(SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))),
|
unpaper = Unpaper(
|
||||||
|
SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))
|
||||||
|
),
|
||||||
tesseract = Tesseract(
|
tesseract = Tesseract(
|
||||||
SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
|
SystemCommand
|
||||||
|
.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1))
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
@ -0,0 +1,16 @@
|
|||||||
|
package docspell.extract.ocr
|
||||||
|
|
||||||
|
import docspell.common.MimeType
|
||||||
|
|
||||||
|
/** The mime types listed here as OCR types, plus an extractor to
  * match on them.
  */
object OcrType {

  val jpeg = MimeType.jpeg
  val png  = MimeType.png
  val tiff = MimeType.tiff
  val pdf  = MimeType.pdf

  /** All mime types matched by [[unapply]]. */
  val all = Set(jpeg, png, tiff, pdf)

  /** Matches `mt` only if it is contained in [[all]]. */
  def unapply(mt: MimeType): Option[MimeType] =
    if (all.contains(mt)) Some(mt) else None
}
|
@ -11,7 +11,7 @@ object TextExtract {
|
|||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
extractOCR(in, blocker, lang, config)
|
extractOCR(in, blocker, lang, config)
|
||||||
|
|
||||||
@ -19,7 +19,7 @@ object TextExtract {
|
|||||||
in: Stream[F, Byte],
|
in: Stream[F, Byte],
|
||||||
blocker: Blocker,
|
blocker: Blocker,
|
||||||
lang: String,
|
lang: String,
|
||||||
config: Config
|
config: OcrConfig
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
Stream
|
Stream
|
||||||
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||||
|
@ -0,0 +1,18 @@
|
|||||||
|
package docspell.extract.odf
|
||||||
|
|
||||||
|
import docspell.common.MimeType
|
||||||
|
|
||||||
|
/** Mime types of OpenDocument files, plus an extractor to match on
  * them.
  */
object OdfType {

  // `MimeType.application` is called with the subtype only elsewhere
  // in this code base (e.g. `application("pdf")`), so the subtype must
  // not repeat the "application/" prefix.
  val odt      = MimeType.application("vnd.oasis.opendocument.text")
  val ods      = MimeType.application("vnd.oasis.opendocument.spreadsheet")
  val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text")
  val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet")

  /** The generic zip type; not part of [[all]]. */
  val container = MimeType.zip

  /** All mime types matched by [[unapply]]. */
  val all = Set(odt, ods, odtAlias, odsAlias)

  /** Matches `mt` only if it is contained in [[all]]. */
  def unapply(mt: MimeType): Option[MimeType] =
    Some(mt).filter(all.contains)
}
|
@ -21,22 +21,25 @@ import docspell.files.TikaMimetype
|
|||||||
object PoiExtract {
|
object PoiExtract {
|
||||||
|
|
||||||
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
|
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
|
||||||
TikaMimetype.detect(data, hint).flatMap {
|
TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt))
|
||||||
case PoiTypes.doc =>
|
|
||||||
|
def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] =
|
||||||
|
mime match {
|
||||||
|
case PoiType.doc =>
|
||||||
getDoc(data)
|
getDoc(data)
|
||||||
case PoiTypes.xls =>
|
case PoiType.xls =>
|
||||||
getXls(data)
|
getXls(data)
|
||||||
case PoiTypes.xlsx =>
|
case PoiType.xlsx =>
|
||||||
getXlsx(data)
|
getXlsx(data)
|
||||||
case PoiTypes.docx =>
|
case PoiType.docx =>
|
||||||
getDocx(data)
|
getDocx(data)
|
||||||
case PoiTypes.msoffice =>
|
case PoiType.msoffice =>
|
||||||
EitherT(getDoc[F](data))
|
EitherT(getDoc[F](data))
|
||||||
.recoverWith({
|
.recoverWith({
|
||||||
case _ => EitherT(getXls[F](data))
|
case _ => EitherT(getXls[F](data))
|
||||||
})
|
})
|
||||||
.value
|
.value
|
||||||
case PoiTypes.ooxml =>
|
case PoiType.ooxml =>
|
||||||
EitherT(getDocx[F](data))
|
EitherT(getDocx[F](data))
|
||||||
.recoverWith({
|
.recoverWith({
|
||||||
case _ => EitherT(getXlsx[F](data))
|
case _ => EitherT(getXlsx[F](data))
|
||||||
|
@ -2,7 +2,7 @@ package docspell.extract.poi
|
|||||||
|
|
||||||
import docspell.common.MimeType
|
import docspell.common.MimeType
|
||||||
|
|
||||||
object PoiTypes {
|
object PoiType {
|
||||||
|
|
||||||
val msoffice = MimeType.application("x-tika-msoffice")
|
val msoffice = MimeType.application("x-tika-msoffice")
|
||||||
val ooxml = MimeType.application("x-tika-ooxml")
|
val ooxml = MimeType.application("x-tika-ooxml")
|
||||||
@ -13,4 +13,7 @@ object PoiTypes {
|
|||||||
|
|
||||||
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
||||||
|
|
||||||
|
def unapply(arg: MimeType): Option[MimeType] =
|
||||||
|
Some(arg).filter(all.contains)
|
||||||
|
|
||||||
}
|
}
|
@ -4,6 +4,7 @@ import java.io.{ByteArrayInputStream, InputStream}
|
|||||||
|
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
|
import docspell.common.MimeType
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import javax.swing.text.rtf.RTFEditorKit
|
import javax.swing.text.rtf.RTFEditorKit
|
||||||
|
|
||||||
@ -11,6 +12,8 @@ import scala.util.Try
|
|||||||
|
|
||||||
object RtfExtract {
|
object RtfExtract {
|
||||||
|
|
||||||
|
val rtfType = MimeType.application("rtf")
|
||||||
|
|
||||||
def get(is: InputStream): Either[Throwable, String] =
|
def get(is: InputStream): Either[Throwable, String] =
|
||||||
Try {
|
Try {
|
||||||
val kit = new RTFEditorKit()
|
val kit = new RTFEditorKit()
|
||||||
|
@ -10,7 +10,7 @@ object TextExtractionSuite extends SimpleTestSuite {
|
|||||||
test("extract english pdf") {
|
test("extract english pdf") {
|
||||||
ignore()
|
ignore()
|
||||||
val text = TextExtract
|
val text = TextExtract
|
||||||
.extract[IO](letterSourceEN, blocker, "eng", Config.default)
|
.extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
@ -21,7 +21,7 @@ object TextExtractionSuite extends SimpleTestSuite {
|
|||||||
ignore()
|
ignore()
|
||||||
val expect = TestFiles.letterDEText
|
val expect = TestFiles.letterDEText
|
||||||
val extract = TextExtract
|
val extract = TextExtract
|
||||||
.extract[IO](letterSourceDE, blocker, "deu", Config.default)
|
.extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default)
|
||||||
.compile
|
.compile
|
||||||
.lastOrError
|
.lastOrError
|
||||||
.unsafeRunSync()
|
.unsafeRunSync()
|
||||||
|
25
modules/files/src/test/scala/docspell/files/Playing.scala
Normal file
25
modules/files/src/test/scala/docspell/files/Playing.scala
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
package docspell.files
|
||||||
|
|
||||||
|
import cats.effect.{Blocker, ExitCode, IO, IOApp}
|
||||||
|
import docspell.common.MimeTypeHint
|
||||||
|
|
||||||
|
import scala.concurrent.ExecutionContext
|
||||||
|
|
||||||
|
/** Small manual test program: detects the mime type of the RTF example
  * file twice, once with a file name hint and once without, and prints
  * both results.
  */
object Playing extends IOApp {
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)

  /** Runs both detections and prints the resulting pair.
    *
    * Everything is composed into one `IO`; nothing is run with
    * `unsafeRunSync` inside the program.
    */
  def run(args: List[String]): IO[ExitCode] = {
    //val ods = ExampleFiles.examples_sample_ods.readURL[IO](8192, blocker)
    //val odt = ExampleFiles.examples_sample_odt.readURL[IO](8192, blocker)
    val rtf = ExampleFiles.examples_sample_rtf.readURL[IO](8192, blocker)

    for {
      byName <- TikaMimetype.detect(
        rtf,
        MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last)
      )
      byContent <- TikaMimetype.detect(rtf, MimeTypeHint.none)
      _         <- IO(println((byName, byContent)))
    } yield ExitCode.Success
  }
}
|
@ -3,7 +3,7 @@ package docspell.joex
|
|||||||
import docspell.common.{Ident, LenientUri}
|
import docspell.common.{Ident, LenientUri}
|
||||||
import docspell.joex.scheduler.SchedulerConfig
|
import docspell.joex.scheduler.SchedulerConfig
|
||||||
import docspell.store.JdbcConfig
|
import docspell.store.JdbcConfig
|
||||||
import docspell.extract.ocr.{Config => OcrConfig}
|
import docspell.extract.ocr.{OcrConfig => OcrConfig}
|
||||||
import docspell.convert.ConvertConfig
|
import docspell.convert.ConvertConfig
|
||||||
|
|
||||||
case class Config(
|
case class Config(
|
||||||
|
@ -7,7 +7,7 @@ import docspell.common._
|
|||||||
import docspell.joex.scheduler.{Context, Task}
|
import docspell.joex.scheduler.{Context, Task}
|
||||||
import docspell.store.Store
|
import docspell.store.Store
|
||||||
import docspell.store.records.{RAttachment, RAttachmentMeta}
|
import docspell.store.records.{RAttachment, RAttachmentMeta}
|
||||||
import docspell.extract.ocr.{TextExtract, Config => OcrConfig}
|
import docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig}
|
||||||
|
|
||||||
object TextExtraction {
|
object TextExtraction {
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ package docspell.joex.scheduler
|
|||||||
import cats.Functor
|
import cats.Functor
|
||||||
import cats.effect.{Blocker, Concurrent}
|
import cats.effect.{Blocker, Concurrent}
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import docspell.common.Ident
|
import docspell.common._
|
||||||
import docspell.store.Store
|
import docspell.store.Store
|
||||||
import docspell.store.records.RJob
|
import docspell.store.records.RJob
|
||||||
import docspell.common.syntax.all._
|
import docspell.common.syntax.all._
|
||||||
|
@ -5,17 +5,6 @@ import cats.effect.{Concurrent, Sync}
|
|||||||
import docspell.common._
|
import docspell.common._
|
||||||
import fs2.concurrent.Queue
|
import fs2.concurrent.Queue
|
||||||
|
|
||||||
trait Logger[F[_]] {
|
|
||||||
|
|
||||||
def trace(msg: => String): F[Unit]
|
|
||||||
def debug(msg: => String): F[Unit]
|
|
||||||
def info(msg: => String): F[Unit]
|
|
||||||
def warn(msg: => String): F[Unit]
|
|
||||||
def error(ex: Throwable)(msg: => String): F[Unit]
|
|
||||||
def error(msg: => String): F[Unit]
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
object Logger {
|
object Logger {
|
||||||
|
|
||||||
def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] =
|
def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] =
|
||||||
|
Loading…
x
Reference in New Issue
Block a user