From 1309c8b7faf0fad6af104671f8bec2a7ca4e213a Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Fri, 14 Feb 2020 11:24:05 +0100
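Subject: [PATCH] Move mimetype detection to docspell-files

`MimeTypeHint` moves to the common module (`docspell.common`) and
`TikaMimetype` to the files module (`docspell.files`).
`TikaMimetype.detect` now takes a `MimeTypeHint` argument and inspects only
the first 64 bytes of the stream (previously 1024).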

---
 .../main/scala/docspell/common}/MimeTypeHint.scala    |  2 +-
 .../src/main/scala/docspell/files}/TikaMimetype.scala |  8 ++++----
 .../main/scala/docspell/text/ocr/TextExtract.scala    |  5 +++--
 .../scala/docspell/text/ocr/TextExtractionSuite.scala | 11 +++++++++++
 4 files changed, 19 insertions(+), 7 deletions(-)
 rename modules/{text/src/main/scala/docspell/text/ocr => common/src/main/scala/docspell/common}/MimeTypeHint.scala (84%)
 rename modules/{text/src/main/scala/docspell/text/ocr => files/src/main/scala/docspell/files}/TikaMimetype.scala (82%)

diff --git a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala
similarity index 84%
rename from modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala
rename to modules/common/src/main/scala/docspell/common/MimeTypeHint.scala
index 23c39f16..f802b803 100644
--- a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala
+++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.common
 
 case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {}
 
diff --git a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
similarity index 82%
rename from modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala
rename to modules/files/src/main/scala/docspell/files/TikaMimetype.scala
index 5c90c728..3511859c 100644
--- a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala
+++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
@@ -1,8 +1,8 @@
-package docspell.text.ocr
+package docspell.files
 
 import cats.implicits._
 import cats.effect.Sync
-import docspell.common.MimeType
+import docspell.common._
 import fs2.Stream
 import org.apache.tika.config.TikaConfig
 import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
@@ -35,7 +35,7 @@ object TikaMimetype {
   private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType =
     convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
 
-  def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
-    data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
+  def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
+    for (head <- data.take(64).compile.toVector) yield fromBytes(head.toArray, hint)
 
 }
diff --git a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala b/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala
index 884a1581..dc43e524 100644
--- a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala
+++ b/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala
@@ -1,7 +1,8 @@
 package docspell.text.ocr
 
 import cats.effect.{Blocker, ContextShift, Sync}
-import docspell.common.MimeType
+import docspell.common._
+import docspell.files._
 import fs2.Stream
 
 object TextExtract {
@@ -21,7 +22,7 @@ object TextExtract {
       config: Config
   ): Stream[F, String] =
     Stream
-      .eval(TikaMimetype.detect(in))
+      .eval(TikaMimetype.detect(in, MimeTypeHint.none))
       .flatMap({
         case mt if !config.isAllowed(mt) =>
           raiseError(s"File `$mt` not allowed")
diff --git a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala b/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala
index f9e94ec2..ec25e9c4 100644
--- a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala
+++ b/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala
@@ -1,6 +1,8 @@
 package docspell.text.ocr
 
 import cats.effect.IO
+import docspell.common._
+import docspell.files._
 import docspell.text.TestFiles
 import minitest.SimpleTestSuite
 
@@ -28,4 +30,13 @@ object TextExtractionSuite extends SimpleTestSuite {
 
     assertEquals(extract.trim, expect.trim)
   }
+
+  test("find mimetypes") {
+    for (url <- docspell.examplefiles.ExampleFiles.all) {
+      TikaMimetype
+        .detect(url.readURL[IO](8192, blocker), MimeTypeHint.none)
+        .map(mt => println(s"${url.asString}: ${mt.asString}"))
+        .unsafeRunSync()
+    }
+  }
 }