From 1309c8b7faf0fad6af104671f8bec2a7ca4e213a Mon Sep 17 00:00:00 2001 From: Eike Kettner <eike.kettner@posteo.de> Date: Fri, 14 Feb 2020 11:24:05 +0100 Subject: [PATCH] Move mimetype detection to docspell-files --- .../main/scala/docspell/common}/MimeTypeHint.scala | 2 +- .../src/main/scala/docspell/files}/TikaMimetype.scala | 8 ++++---- .../main/scala/docspell/text/ocr/TextExtract.scala | 5 +++-- .../scala/docspell/text/ocr/TextExtractionSuite.scala | 11 +++++++++++ 4 files changed, 19 insertions(+), 7 deletions(-) rename modules/{text/src/main/scala/docspell/text/ocr => common/src/main/scala/docspell/common}/MimeTypeHint.scala (84%) rename modules/{text/src/main/scala/docspell/text/ocr => files/src/main/scala/docspell/files}/TikaMimetype.scala (82%) diff --git a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala similarity index 84% rename from modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala rename to modules/common/src/main/scala/docspell/common/MimeTypeHint.scala index 23c39f16..f802b803 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala +++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala @@ -1,4 +1,4 @@ -package docspell.text.ocr +package docspell.common case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} diff --git a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala similarity index 82% rename from modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala rename to modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 5c90c728..3511859c 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -1,8 +1,8 @@ -package docspell.text.ocr +package docspell.files import cats.implicits._ import cats.effect.Sync -import docspell.common.MimeType +import docspell.common._ import fs2.Stream import org.apache.tika.config.TikaConfig import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} @@ -35,7 +35,7 @@ object TikaMimetype { private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) - def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] = - data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none)) + def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = + data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) } diff --git a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala b/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala index 884a1581..dc43e524 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala +++ b/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala @@ -1,7 +1,8 @@ package docspell.text.ocr import cats.effect.{Blocker, ContextShift, Sync} -import docspell.common.MimeType +import docspell.common._ +import docspell.files._ import fs2.Stream object TextExtract { @@ -21,7 +22,7 @@ object TextExtract { config: Config ): Stream[F, String] = Stream - .eval(TikaMimetype.detect(in)) + .eval(TikaMimetype.detect(in, MimeTypeHint.none)) .flatMap({ case mt if !config.isAllowed(mt) => raiseError(s"File `$mt` not allowed") diff --git a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala b/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala index f9e94ec2..ec25e9c4 100644 --- a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala +++ b/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala @@ -1,6 +1,8 @@ package docspell.text.ocr import cats.effect.IO +import docspell.common._ +import docspell.files._ import docspell.text.TestFiles import minitest.SimpleTestSuite @@ -28,4 +30,13 @@ object TextExtractionSuite extends SimpleTestSuite { assertEquals(extract.trim, expect.trim) } + + test("find mimetypes") { + docspell.examplefiles.ExampleFiles. + all.foreach { url => + TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none). + map(mt => println(url.asString + ": " + mt.asString)). + unsafeRunSync + } + } }