Use existing mimetype detection when storing files

This commit is contained in:
eikek 2021-09-23 09:27:04 +02:00
parent 1761526e20
commit 071f4067bf
4 changed files with 28 additions and 26 deletions

View File

@ -381,7 +381,7 @@ val store = project
libraryDependencies ++= libraryDependencies ++=
Dependencies.testContainer.map(_ % Test) Dependencies.testContainer.map(_ % Test)
) )
.dependsOn(common, query.jvm, totp) .dependsOn(common, query.jvm, totp, files)
val extract = project val extract = project
.in(file("modules/extract")) .in(file("modules/extract"))

View File

@ -24,6 +24,7 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties} import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties}
import org.apache.tika.mime.MediaType import org.apache.tika.mime.MediaType
import org.apache.tika.parser.txt.Icu4jEncodingDetector import org.apache.tika.parser.txt.Icu4jEncodingDetector
import scodec.bits.ByteVector
object TikaMimetype { object TikaMimetype {
private val tika = new TikaConfig().getDetector private val tika = new TikaConfig().getDetector
@ -83,6 +84,9 @@ object TikaMimetype {
def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] =
data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint))
/** Detects the mime type of bytes that are already held in memory.
  *
  * The given `data` is converted to a byte array and handed to `fromBytes`
  * together with `hint`.
  */
def detect(data: ByteVector, hint: MimeTypeHint): MimeType = {
  val bytes = data.toArray
  fromBytes(bytes, hint)
}
def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] =
dt match { dt match {
case DataType.Exact(mt) => case DataType.Exact(mt) =>

View File

@ -13,12 +13,13 @@ import cats.effect._
import fs2.{Pipe, Stream} import fs2.{Pipe, Stream}
import docspell.common._ import docspell.common._
import docspell.files.TikaMimetype
import docspell.store.records.RFileMeta import docspell.store.records.RFileMeta
import binny._ import binny._
import binny.jdbc.{GenericJdbcStore, JdbcStoreConfig} import binny.jdbc.{GenericJdbcStore, JdbcStoreConfig}
import binny.tika.TikaContentTypeDetect
import doobie._ import doobie._
import scodec.bits.ByteVector
trait FileStore[F[_]] { trait FileStore[F[_]] {
@ -42,8 +43,9 @@ object FileStore {
chunkSize: Int chunkSize: Int
): FileStore[F] = { ): FileStore[F] = {
val attrStore = new AttributeStore[F](xa) val attrStore = new AttributeStore[F](xa)
val cfg = JdbcStoreConfig("filechunk", chunkSize, TikaContentTypeDetect.default) val cfg = JdbcStoreConfig("filechunk", chunkSize, TikaContentTypeDetect)
val binStore = GenericJdbcStore[F](ds, Log4sLogger[F](logger), cfg, attrStore) val log = Logger.log4s[F](logger)
val binStore = GenericJdbcStore[F](ds, LoggerAdapter(log), cfg, attrStore)
new Impl[F](binStore, attrStore) new Impl[F](binStore, attrStore)
} }
@ -66,27 +68,24 @@ object FileStore {
.andThen(_.map(bid => Ident.unsafe(bid.id))) .andThen(_.map(bid => Ident.unsafe(bid.id)))
} }
private object Log4sLogger { private object LoggerAdapter {
def apply[F[_]](log: Logger[F]): binny.util.Logger[F] =
def apply[F[_]: Sync](log: org.log4s.Logger): binny.util.Logger[F] =
new binny.util.Logger[F] { new binny.util.Logger[F] {
override def trace(msg: => String): F[Unit] = override def trace(msg: => String): F[Unit] = log.trace(msg)
Sync[F].delay(log.trace(msg)) override def debug(msg: => String): F[Unit] = log.debug(msg)
override def info(msg: => String): F[Unit] = log.info(msg)
override def debug(msg: => String): F[Unit] = override def warn(msg: => String): F[Unit] = log.warn(msg)
Sync[F].delay(log.debug(msg)) override def error(msg: => String): F[Unit] = log.error(msg)
override def error(ex: Throwable)(msg: => String): F[Unit] = log.error(ex)(msg)
override def info(msg: => String): F[Unit] =
Sync[F].delay(log.info(msg))
override def warn(msg: => String): F[Unit] =
Sync[F].delay(log.warn(msg))
override def error(msg: => String): F[Unit] =
Sync[F].delay(log.error(msg))
override def error(ex: Throwable)(msg: => String): F[Unit] =
Sync[F].delay(log.error(ex)(msg))
} }
} }
/** A `ContentTypeDetect` that delegates to `TikaMimetype.detect`.
  *
  * The incoming `Hint` is translated into a `MimeTypeHint` from its
  * `filename` and `advertisedType`; the detected mime type is returned as a
  * `SimpleContentType` built from its string form.
  */
private object TikaContentTypeDetect extends ContentTypeDetect {
  override def detect(data: ByteVector, hint: Hint): SimpleContentType = {
    val mimeHint = MimeTypeHint(hint.filename, hint.advertisedType)
    val detected = TikaMimetype.detect(data, mimeHint)
    SimpleContentType(detected.asString)
  }
}
} }

View File

@ -275,8 +275,7 @@ object Dependencies {
val binny = Seq( val binny = Seq(
"com.github.eikek" %% "binny-core" % BinnyVersion, "com.github.eikek" %% "binny-core" % BinnyVersion,
"com.github.eikek" %% "binny-jdbc" % BinnyVersion, "com.github.eikek" %% "binny-jdbc" % BinnyVersion
"com.github.eikek" %% "binny-tika-detect" % BinnyVersion
) )
// https://github.com/flyway/flyway // https://github.com/flyway/flyway