mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Improve handling of encodings
HTML and text files are no longer assumed to be UTF-8. The encoding is now detected, which may not work for all files; the default/fallback is UTF-8. There is still a problem with mails that contain HTML parts that are not UTF-8 encoded: the mail text is always returned as a string, so the original encoding is lost. The HTML is then stored as UTF-8 bytes, but wkhtmltopdf reads it as Latin-1. It seems that the `--encoding` setting doesn't override the encoding provided by the document.
This commit is contained in:
@ -31,7 +31,7 @@ object Extraction {
|
||||
lang: Language
|
||||
): F[ExtractResult] =
|
||||
TikaMimetype.resolve(dataType, data).flatMap {
|
||||
case MimeType.pdf =>
|
||||
case MimeType.PdfMatch(_) =>
|
||||
PdfExtract
|
||||
.get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger)
|
||||
.map(ExtractResult.fromEither)
|
||||
@ -75,14 +75,15 @@ object Extraction {
|
||||
doExtract
|
||||
}
|
||||
|
||||
case OdfType.container =>
|
||||
case OdfType.ContainerMatch(_) =>
|
||||
logger
|
||||
.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *>
|
||||
OdfExtract.get(data).map(ExtractResult.fromEither)
|
||||
|
||||
case mt @ MimeType("text", sub) if !sub.contains("html") =>
|
||||
case mt @ MimeType("text", sub, _) if !sub.contains("html") =>
|
||||
val cs = mt.charsetOrUtf8
|
||||
logger.info(s"File detected as ${mt.asString}. Returning itself as text.") *>
|
||||
data.through(fs2.text.utf8Decode).compile.last.map { txt =>
|
||||
data.through(Binary.decode(cs)).foldMonoid.compile.last.map { txt =>
|
||||
ExtractResult.success(txt.getOrElse("").trim)
|
||||
}
|
||||
|
||||
|
@ -12,5 +12,5 @@ object OcrType {
|
||||
val all = Set(jpeg, png, tiff, pdf)
|
||||
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(all.contains)
|
||||
Some(mt).map(_.baseType).filter(all.contains)
|
||||
}
|
||||
|
@ -14,5 +14,10 @@ object OdfType {
|
||||
val all = Set(odt, ods, odtAlias, odsAlias)
|
||||
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(all.contains)
|
||||
Some(mt).map(_.baseType).filter(all.contains)
|
||||
|
||||
object ContainerMatch {
|
||||
def unapply(mt: MimeType): Option[MimeType] =
|
||||
Some(mt).filter(_.matches(container))
|
||||
}
|
||||
}
|
||||
|
@ -14,6 +14,6 @@ object PoiType {
|
||||
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
|
||||
|
||||
def unapply(arg: MimeType): Option[MimeType] =
|
||||
Some(arg).filter(all.contains)
|
||||
Some(arg).map(_.baseType).filter(all.contains)
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user