Move mimetype detection to docspell-files

2025-11-04 12:30:12 +00:00 · 2020-02-14 11:24:05 +01:00
parent 5c3d2b2e28
commit 1309c8b7fa
4 changed files with 19 additions and 7 deletions
--- a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala
+++ b/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala
@@ -1,7 +0,0 @@
-package docspell.text.ocr
-
-case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
-
-object MimeTypeHint {
-  val none = MimeTypeHint(None, None)
-}
--- a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala
+++ b/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala
@@ -1,7 +1,8 @@
 package docspell.text.ocr

 import cats.effect.{Blocker, ContextShift, Sync}
-import docspell.common.MimeType
+import docspell.common._
+import docspell.files._
 import fs2.Stream

 object TextExtract {
@@ -21,7 +22,7 @@ object TextExtract {
      config: Config
  ): Stream[F, String] =
    Stream
-      .eval(TikaMimetype.detect(in))
+      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
      .flatMap({
        case mt if !config.isAllowed(mt) =>
          raiseError(s"File `$mt` not allowed")
--- a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala
+++ b/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala
@@ -1,41 +0,0 @@
-package docspell.text.ocr
-
-import cats.implicits._
-import cats.effect.Sync
-import docspell.common.MimeType
-import fs2.Stream
-import org.apache.tika.config.TikaConfig
-import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
-import org.apache.tika.mime.MediaType
-
-object TikaMimetype {
-  private val tika = new TikaConfig().getDetector
-
-  private def convert(mt: MediaType): MimeType =
-    Option(mt)
-      .map(_.toString)
-      .map(MimeType.parse)
-      .flatMap(_.toOption)
-      .map(normalize)
-      .getOrElse(MimeType.octetStream)
-
-  private def makeMetadata(hint: MimeTypeHint): Metadata = {
-    val md = new Metadata
-    hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
-    hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
-    md
-  }
-
-  private def normalize(in: MimeType): MimeType = in match {
-    case MimeType(_, sub) if sub contains "xhtml" =>
-      MimeType.html
-    case _ => in
-  }
-
-  private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
-    convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
-
-  def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
-    data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
-
-}
--- a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala
+++ b/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala
@@ -1,6 +1,8 @@
 package docspell.text.ocr

 import cats.effect.IO
+import docspell.common._
+import docspell.files._
 import docspell.text.TestFiles
 import minitest.SimpleTestSuite

@@ -28,4 +30,13 @@ object TextExtractionSuite extends SimpleTestSuite {

    assertEquals(extract.trim, expect.trim)
  }
+
+  test("find mimetypes") {
+    docspell.examplefiles.ExampleFiles.
+      all.foreach { url =>
+        TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
+          map(mt => println(url.asString + ": " + mt.asString)).
+          unsafeRunSync
+      }
+  }
 }