Move mimetype detection to docspell-files

This commit is contained in:
Eike Kettner 2020-02-14 11:24:05 +01:00
parent 5c3d2b2e28
commit 1309c8b7fa
4 changed files with 19 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
package docspell.text.ocr package docspell.common
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}

View File

@@ -1,8 +1,8 @@
package docspell.text.ocr package docspell.files
import cats.implicits._ import cats.implicits._
import cats.effect.Sync import cats.effect.Sync
import docspell.common.MimeType import docspell.common._
import fs2.Stream import fs2.Stream
import org.apache.tika.config.TikaConfig import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
@@ -35,7 +35,7 @@ object TikaMimetype {
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] = def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none)) data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
} }

View File

@@ -1,7 +1,8 @@
package docspell.text.ocr package docspell.text.ocr
import cats.effect.{Blocker, ContextShift, Sync} import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common.MimeType import docspell.common._
import docspell.files._
import fs2.Stream import fs2.Stream
object TextExtract { object TextExtract {
@@ -21,7 +22,7 @@ object TextExtract {
config: Config config: Config
): Stream[F, String] = ): Stream[F, String] =
Stream Stream
.eval(TikaMimetype.detect(in)) .eval(TikaMimetype.detect(in, MimeTypeHint.none))
.flatMap({ .flatMap({
case mt if !config.isAllowed(mt) => case mt if !config.isAllowed(mt) =>
raiseError(s"File `$mt` not allowed") raiseError(s"File `$mt` not allowed")

View File

@@ -1,6 +1,8 @@
package docspell.text.ocr package docspell.text.ocr
import cats.effect.IO import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.text.TestFiles import docspell.text.TestFiles
import minitest.SimpleTestSuite import minitest.SimpleTestSuite
@@ -28,4 +30,13 @@ object TextExtractionSuite extends SimpleTestSuite {
assertEquals(extract.trim, expect.trim) assertEquals(extract.trim, expect.trim)
} }
test("find mimetypes") {
  // Smoke test: runs mimetype detection over every bundled example file and
  // prints "<url>: <mimetype>" for each one. It makes no assertions, so it
  // only fails if detection (or reading the URL) throws.
  for (url <- docspell.examplefiles.ExampleFiles.all) {
    // No filename / advertised-type hint is supplied, so detection relies on
    // the file content alone.
    // NOTE(review): 8192 is presumably the read chunk size and `blocker` is
    // defined elsewhere in this suite -- confirm against readURL's signature.
    val report = TikaMimetype
      .detect(url.readURL[IO](8192, blocker), MimeTypeHint.none)
      .map(mt => println(s"${url.asString}: ${mt.asString}"))

    // Run the effect synchronously; explicit parentheses mark the side effect.
    report.unsafeRunSync()
  }
}
} }