mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-03-31 05:15:08 +00:00
Move mimetype detection to docspell-files
This commit is contained in:
parent
5c3d2b2e28
commit
1309c8b7fa
@ -1,4 +1,4 @@
|
|||||||
package docspell.text.ocr
|
package docspell.common
|
||||||
|
|
||||||
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
|
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
|
||||||
|
|
@ -1,8 +1,8 @@
|
|||||||
package docspell.text.ocr
|
package docspell.files
|
||||||
|
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
import cats.effect.Sync
|
import cats.effect.Sync
|
||||||
import docspell.common.MimeType
|
import docspell.common._
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
import org.apache.tika.config.TikaConfig
|
import org.apache.tika.config.TikaConfig
|
||||||
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
|
||||||
@ -35,7 +35,7 @@ object TikaMimetype {
|
|||||||
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
|
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
|
||||||
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
|
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
|
||||||
|
|
||||||
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
|
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
|
||||||
data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
|
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
|
||||||
|
|
||||||
}
|
}
|
@ -1,7 +1,8 @@
|
|||||||
package docspell.text.ocr
|
package docspell.text.ocr
|
||||||
|
|
||||||
import cats.effect.{Blocker, ContextShift, Sync}
|
import cats.effect.{Blocker, ContextShift, Sync}
|
||||||
import docspell.common.MimeType
|
import docspell.common._
|
||||||
|
import docspell.files._
|
||||||
import fs2.Stream
|
import fs2.Stream
|
||||||
|
|
||||||
object TextExtract {
|
object TextExtract {
|
||||||
@ -21,7 +22,7 @@ object TextExtract {
|
|||||||
config: Config
|
config: Config
|
||||||
): Stream[F, String] =
|
): Stream[F, String] =
|
||||||
Stream
|
Stream
|
||||||
.eval(TikaMimetype.detect(in))
|
.eval(TikaMimetype.detect(in, MimeTypeHint.none))
|
||||||
.flatMap({
|
.flatMap({
|
||||||
case mt if !config.isAllowed(mt) =>
|
case mt if !config.isAllowed(mt) =>
|
||||||
raiseError(s"File `$mt` not allowed")
|
raiseError(s"File `$mt` not allowed")
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
package docspell.text.ocr
|
package docspell.text.ocr
|
||||||
|
|
||||||
import cats.effect.IO
|
import cats.effect.IO
|
||||||
|
import docspell.common._
|
||||||
|
import docspell.files._
|
||||||
import docspell.text.TestFiles
|
import docspell.text.TestFiles
|
||||||
import minitest.SimpleTestSuite
|
import minitest.SimpleTestSuite
|
||||||
|
|
||||||
@ -28,4 +30,13 @@ object TextExtractionSuite extends SimpleTestSuite {
|
|||||||
|
|
||||||
assertEquals(extract.trim, expect.trim)
|
assertEquals(extract.trim, expect.trim)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find mimetypes") {
|
||||||
|
docspell.examplefiles.ExampleFiles.
|
||||||
|
all.foreach { url =>
|
||||||
|
TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
|
||||||
|
map(mt => println(url.asString + ": " + mt.asString)).
|
||||||
|
unsafeRunSync
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user