diff --git a/modules/common/src/main/scala/docspell/common/Logger.scala b/modules/common/src/main/scala/docspell/common/Logger.scala new file mode 100644 index 00000000..cb155ca1 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/Logger.scala @@ -0,0 +1,12 @@ +package docspell.common + +trait Logger[F[_]] { + + def trace(msg: => String): F[Unit] + def debug(msg: => String): F[Unit] + def info(msg: => String): F[Unit] + def warn(msg: => String): F[Unit] + def error(ex: Throwable)(msg: => String): F[Unit] + def error(msg: => String): F[Unit] + +} diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index e6aa6079..e3018580 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -48,6 +48,7 @@ object MimeType { val octetStream = application("octet-stream") val pdf = application("pdf") + val zip = application("zip") val png = image("png") val jpeg = image("jpeg") val tiff = image("tiff") diff --git a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala index f802b803..7b98a92a 100644 --- a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala +++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala @@ -4,4 +4,13 @@ case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} object MimeTypeHint { val none = MimeTypeHint(None, None) + + def filename(name: String): MimeTypeHint = + MimeTypeHint(Some(name), None) + + def advertised(mimeType: MimeType): MimeTypeHint = + advertised(mimeType.asString) + + def advertised(mimeType: String): MimeTypeHint = + MimeTypeHint(None, Some(mimeType)) } diff --git a/modules/extract/src/main/scala/docspell/extract/DataType.scala b/modules/extract/src/main/scala/docspell/extract/DataType.scala new file mode 100644 index 00000000..7d4c28d6 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/DataType.scala @@ -0,0 +1,21 @@ +package docspell.extract + +import docspell.common.{MimeType, MimeTypeHint} + +sealed trait DataType { + +} + +object DataType { + + case class Exact(mime: MimeType) extends DataType + + case class Hint(hint: MimeTypeHint) extends DataType + + + def apply(mt: MimeType): DataType = + Exact(mt) + + def filename(name: String): DataType = + Hint(MimeTypeHint.filename(name)) +} diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala new file mode 100644 index 00000000..76b65537 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala @@ -0,0 +1,5 @@ +package docspell.extract + +import docspell.extract.ocr.OcrConfig + +case class ExtractConfig(ocr: OcrConfig) diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala index 6c05d56a..ee948c53 100644 --- a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala +++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala @@ -15,15 +15,25 @@ object ExtractResult { case class UnsupportedFormat(mime: MimeType) extends ExtractResult { val textOption = None } + def unsupportedFormat(mt: MimeType): ExtractResult = + UnsupportedFormat(mt) + case class Failure(ex: Throwable) extends ExtractResult { val textOption = None } + def failure(ex: Throwable): ExtractResult = + Failure(ex) + case class Success(text: String) extends ExtractResult { val textOption = Some(text) } + def success(text: String): ExtractResult = + Success(text) def fromTry(r: Try[String]): ExtractResult = r.fold(Failure.apply, Success.apply) + def fromEither(e: Either[Throwable, String]): ExtractResult = + e.fold(failure, success) } diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala new file mode 100644 index 00000000..ebc44591 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala @@ -0,0 +1,70 @@ +package docspell.extract + +import cats.effect._ +import cats.implicits._ +import docspell.common._ +import docspell.extract.ocr.{OcrType, TextExtract} +import docspell.extract.odf.{OdfExtract, OdfType} +import docspell.extract.poi.{PoiExtract, PoiType} +import docspell.extract.rtf.RtfExtract +import fs2.Stream +import docspell.files.TikaMimetype + +trait Extraction[F[_]] { + + def extractText(data: Stream[F, Byte], dataType: DataType, lang: Language): F[ExtractResult] + +} + +object Extraction { + + def create[F[_]: Sync: ContextShift]( + blocker: Blocker, + logger: Logger[F], + cfg: ExtractConfig + ): Extraction[F] = + new Extraction[F] { + def extractText( + data: Stream[F, Byte], + dataType: DataType, + lang: Language + ): F[ExtractResult] = { + val mime = dataType match { + case DataType.Exact(mt) => mt.pure[F] + case DataType.Hint(hint) => TikaMimetype.detect(data, hint) + } + mime.flatMap { + case MimeType.pdf => + PdfExtract + .get(data, blocker, lang, 5, cfg.ocr, logger) + .map(ExtractResult.fromEither) + + case PoiType(mt) => + PoiExtract.get(data, mt).map(ExtractResult.fromEither) + + case RtfExtract.rtfType => + RtfExtract.get(data).map(ExtractResult.fromEither) + + case OdfType(_) => + OdfExtract.get(data).map(ExtractResult.fromEither) + + case OcrType(_) => + TextExtract + .extractOCR(data, blocker, lang.iso3, cfg.ocr) + .compile + .lastOrError + .attempt + .map(ExtractResult.fromEither) + + case OdfType.container => + logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *> + OdfExtract.get(data).map(ExtractResult.fromEither) + + case mt => + ExtractResult.unsupportedFormat(mt).pure[F] + + } + } + } + +} diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala new file mode 100644 index 00000000..2489b391 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -0,0 +1,51 @@ +package docspell.extract + +import cats.implicits._ +import cats.effect._ +import fs2.Stream +import docspell.common.{Language, Logger} +import docspell.extract.ocr.{OcrConfig, TextExtract} +import docspell.extract.pdfbox.PdfboxExtract + +object PdfExtract { + + def get[F[_]: Sync: ContextShift]( + in: Stream[F, Byte], + blocker: Blocker, + lang: Language, + stripMinLen: Int, + ocrCfg: OcrConfig, + logger: Logger[F] + ): F[Either[Throwable, String]] = { + + val runOcr = + TextExtract.extractOCR(in, blocker, lang.iso3, ocrCfg).compile.lastOrError + + def chooseResult(ocrStr: String, strippedStr: String) = + if (ocrStr.length > strippedStr.length) + logger.info( + s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" + ) *> ocrStr.pure[F] + else + logger.info( + s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" + ) *> strippedStr.pure[F] + + //maybe better: inspect the pdf and decide whether ocr or not + for { + pdfboxRes <- PdfboxExtract.get[F](in) + res <- pdfboxRes.fold( + ex => + logger.info( + s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " + ) *> runOcr.attempt, + str => + if (str.length >= stripMinLen) str.pure[F].attempt + else + logger + .info(s"Stripping text from PDF is very small (${str.length}). Trying with OCR.") *> + runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt + ) + } yield res + } +} diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index 5cefcbc1..fbdece9c 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -16,7 +16,7 @@ object Ocr { pdf: Stream[F, Byte], blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => runGhostscript(pdf, config, wd, blocker) @@ -32,7 +32,7 @@ object Ocr { img: Stream[F, Byte], blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = runTesseractStdin(img, blocker, lang, config) @@ -40,7 +40,7 @@ object Ocr { pdf: Path, blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) @@ -54,7 +54,7 @@ object Ocr { img: Path, blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = runTesseractFile(img, blocker, lang, config) @@ -62,10 +62,10 @@ object Ocr { * files are stored to a temporary location on disk and returned. */ private[extract] def runGhostscript[F[_]: Sync: ContextShift]( - pdf: Stream[F, Byte], - cfg: Config, - wd: Path, - blocker: Blocker + pdf: Stream[F, Byte], + cfg: OcrConfig, + wd: Path, + blocker: Blocker ): Stream[F, Path] = { val xargs = if (cfg.pageRange.begin > 0) @@ -150,7 +150,7 @@ object Ocr { img: Path, blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = // tesseract cannot cope with absolute filenames // so use the parent as working dir @@ -168,7 +168,7 @@ object Ocr { img: Stream[F, Byte], blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = { val cmd = config.tesseract.command .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala similarity index 69% rename from modules/extract/src/main/scala/docspell/extract/ocr/Config.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala index 482c0e91..b08f46ba 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala @@ -4,26 +4,29 @@ import java.nio.file.{Path, Paths} import docspell.common._ -case class Config( +case class OcrConfig( allowedContentTypes: Set[MimeType], - ghostscript: Config.Ghostscript, - pageRange: Config.PageRange, - unpaper: Config.Unpaper, - tesseract: Config.Tesseract + ghostscript: OcrConfig.Ghostscript, + pageRange: OcrConfig.PageRange, + unpaper: OcrConfig.Unpaper, + tesseract: OcrConfig.Tesseract ) { def isAllowed(mt: MimeType): Boolean = allowedContentTypes contains mt } -object Config { +object OcrConfig { + case class PageRange(begin: Int) case class Ghostscript(command: SystemCommand.Config, workingDir: Path) + case class Tesseract(command: SystemCommand.Config) + case class Unpaper(command: SystemCommand.Config) - val default = Config( + val default = OcrConfig( allowedContentTypes = Set( MimeType.pdf, MimeType.png, @@ -46,9 +49,12 @@ object Config { ), Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") ), - unpaper = Unpaper(SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), + unpaper = Unpaper( + SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30)) + ), tesseract = Tesseract( - SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) + SystemCommand + .Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) ) ) } diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala new file mode 100644 index 00000000..f2effac6 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala @@ -0,0 +1,16 @@ +package docspell.extract.ocr + +import docspell.common.MimeType + +object OcrType { + + val jpeg = MimeType.jpeg + val png = MimeType.png + val tiff = MimeType.tiff + val pdf = MimeType.pdf + + val all = Set(jpeg, png, tiff, pdf) + + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(all.contains) +} diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index 51a7ca73..35031207 100644 --- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -11,7 +11,7 @@ object TextExtract { in: Stream[F, Byte], blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = extractOCR(in, blocker, lang, config) @@ -19,7 +19,7 @@ object TextExtract { in: Stream[F, Byte], blocker: Blocker, lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = Stream .eval(TikaMimetype.detect(in, MimeTypeHint.none)) diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala new file mode 100644 index 00000000..5519ddeb --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala @@ -0,0 +1,18 @@ +package docspell.extract.odf + +import docspell.common.MimeType + +object OdfType { + + val odt = MimeType.application("application/vnd.oasis.opendocument.text") + val ods = MimeType.application("application/vnd.oasis.opendocument.spreadsheet") + val odtAlias = MimeType.application("application/x-vnd.oasis.opendocument.text") + val odsAlias = MimeType.application("application/x-vnd.oasis.opendocument.spreadsheet") + + val container = MimeType.zip + + val all = Set(odt, ods, odtAlias, odsAlias) + + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(all.contains) +} diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala index 68e1de18..48cd0638 100644 --- a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala @@ -21,22 +21,25 @@ import docspell.files.TikaMimetype object PoiExtract { def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] = - TikaMimetype.detect(data, hint).flatMap { - case PoiTypes.doc => + TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt)) + + def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] = + mime match { + case PoiType.doc => getDoc(data) - case PoiTypes.xls => + case PoiType.xls => getXls(data) - case PoiTypes.xlsx => + case PoiType.xlsx => getXlsx(data) - case PoiTypes.docx => + case PoiType.docx => getDocx(data) - case PoiTypes.msoffice => + case PoiType.msoffice => EitherT(getDoc[F](data)) .recoverWith({ case _ => EitherT(getXls[F](data)) }) .value - case PoiTypes.ooxml => + case PoiType.ooxml => EitherT(getDocx[F](data)) .recoverWith({ case _ => EitherT(getXlsx[F](data)) diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala similarity index 83% rename from modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala rename to modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala index f3795fc5..f77cccb5 100644 --- a/modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala @@ -2,7 +2,7 @@ package docspell.extract.poi import docspell.common.MimeType -object PoiTypes { +object PoiType { val msoffice = MimeType.application("x-tika-msoffice") val ooxml = MimeType.application("x-tika-ooxml") @@ -13,4 +13,7 @@ object PoiTypes { val all = Set(msoffice, ooxml, docx, xlsx, xls, doc) + def unapply(arg: MimeType): Option[MimeType] = + Some(arg).filter(all.contains) + } diff --git a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala index e2b5757b..c4a37fec 100644 --- a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala @@ -4,6 +4,7 @@ import java.io.{ByteArrayInputStream, InputStream} import cats.implicits._ import cats.effect.Sync +import docspell.common.MimeType import fs2.Stream import javax.swing.text.rtf.RTFEditorKit @@ -11,6 +12,8 @@ import scala.util.Try object RtfExtract { + val rtfType = MimeType.application("rtf") + def get(is: InputStream): Either[Throwable, String] = Try { val kit = new RTFEditorKit() diff --git a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index 8033200a..e63982ca 100644 --- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -10,7 +10,7 @@ object TextExtractionSuite extends SimpleTestSuite { test("extract english pdf") { ignore() val text = TextExtract - .extract[IO](letterSourceEN, blocker, "eng", Config.default) + .extract[IO](letterSourceEN, blocker, "eng", OcrConfig.default) .compile .lastOrError .unsafeRunSync() @@ -21,7 +21,7 @@ object TextExtractionSuite extends SimpleTestSuite { ignore() val expect = TestFiles.letterDEText val extract = TextExtract - .extract[IO](letterSourceDE, blocker, "deu", Config.default) + .extract[IO](letterSourceDE, blocker, "deu", OcrConfig.default) .compile .lastOrError .unsafeRunSync() diff --git a/modules/files/src/test/scala/docspell/files/Playing.scala b/modules/files/src/test/scala/docspell/files/Playing.scala new file mode 100644 index 00000000..ae87bd2d --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/Playing.scala @@ -0,0 +1,25 @@ +package docspell.files + +import cats.effect.{Blocker, ExitCode, IO, IOApp} +import docspell.common.MimeTypeHint + +import scala.concurrent.ExecutionContext + +object Playing extends IOApp { + val blocker = Blocker.liftExecutionContext(ExecutionContext.global) + + + def run(args: List[String]): IO[ExitCode] = IO { + //val ods = ExampleFiles.examples_sample_ods.readURL[IO](8192, blocker) + //val odt = ExampleFiles.examples_sample_odt.readURL[IO](8192, blocker) + val rtf = ExampleFiles.examples_sample_rtf.readURL[IO](8192, blocker) + + val x = for { + odsm1 <- TikaMimetype.detect(rtf, + MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last)) + odsm2 <- TikaMimetype.detect(rtf, MimeTypeHint.none) + } yield (odsm1, odsm2) + println(x.unsafeRunSync()) + ExitCode.Success + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index a5a9bc47..7e4de76c 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -3,7 +3,7 @@ package docspell.joex import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig -import docspell.extract.ocr.{Config => OcrConfig} +import docspell.extract.ocr.{OcrConfig => OcrConfig} import docspell.convert.ConvertConfig case class Config( diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index fa81774e..94b67109 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -7,7 +7,7 @@ import docspell.common._ import docspell.joex.scheduler.{Context, Task} import docspell.store.Store import docspell.store.records.{RAttachment, RAttachmentMeta} -import docspell.extract.ocr.{TextExtract, Config => OcrConfig} +import docspell.extract.ocr.{TextExtract, OcrConfig => OcrConfig} object TextExtraction { diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala index 7ce0b04a..a769103a 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala @@ -3,7 +3,7 @@ package docspell.joex.scheduler import cats.Functor import cats.effect.{Blocker, Concurrent} import cats.implicits._ -import docspell.common.Ident +import docspell.common._ import docspell.store.Store import docspell.store.records.RJob import docspell.common.syntax.all._ diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala index 353c4182..3ac4d441 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala @@ -5,17 +5,6 @@ import cats.effect.{Concurrent, Sync} import docspell.common._ import fs2.concurrent.Queue -trait Logger[F[_]] { - - def trace(msg: => String): F[Unit] - def debug(msg: => String): F[Unit] - def info(msg: => String): F[Unit] - def warn(msg: => String): F[Unit] - def error(ex: Throwable)(msg: => String): F[Unit] - def error(msg: => String): F[Unit] - -} - object Logger { def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] =