Adding extraction primitives

Eike Kettner 2020-02-16 21:37:26 +01:00
parent 851ee7ef0f
commit 8143a4edcc
46 changed files with 2731 additions and 83 deletions


@@ -205,7 +205,9 @@ val extract = project.in(file("modules/extract")).
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.pdfbox ++
-Dependencies.poi
+Dependencies.poi ++
+Dependencies.commonsIO ++
+Dependencies.julOverSlf4j
).dependsOn(common, files % "compile->compile;test->test")
val convert = project.in(file("modules/convert")).


@@ -1,21 +0,0 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
lazy val letterDEText =
ExampleFiles.letter_de_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
lazy val letterENText =
ExampleFiles.letter_en_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
}


@@ -1,6 +1,6 @@
package docspell.analysis.date
-import docspell.analysis.TestFiles
+import docspell.files.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language


@@ -1,7 +1,7 @@
package docspell.analysis.nlp
import minitest.SimpleTestSuite
-import docspell.analysis.TestFiles
+import docspell.files.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {
@@ -12,25 +12,23 @@ object TextAnalyserSuite extends SimpleTestSuite {
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
NerLabel("Treesville", NerTag.Person, 27, 37),
-NerLabel("Derek", NerTag.Person, 69, 74),
-NerLabel("Jeter", NerTag.Person, 75, 80),
-NerLabel("Treesville", NerTag.Location, 96, 106),
-NerLabel("M.", NerTag.Person, 142, 144),
-NerLabel("Leat", NerTag.Person, 145, 149),
-NerLabel("Syrup", NerTag.Organization, 160, 165),
-NerLabel("Production", NerTag.Organization, 166, 176),
-NerLabel("Old", NerTag.Organization, 177, 180),
-NerLabel("Sticky", NerTag.Organization, 181, 187),
-NerLabel("Pancake", NerTag.Organization, 188, 195),
-NerLabel("Company", NerTag.Organization, 196, 203),
-NerLabel("Maple", NerTag.Location, 208, 213),
-NerLabel("Lane", NerTag.Location, 214, 218),
-NerLabel("Forest", NerTag.Location, 220, 226),
-NerLabel("Hemptown", NerTag.Location, 241, 249),
-NerLabel("Little", NerTag.Organization, 349, 355),
-NerLabel("League", NerTag.Organization, 356, 362),
-NerLabel("Derek", NerTag.Person, 1119, 1124),
-NerLabel("Jeter", NerTag.Person, 1125, 1130)
+NerLabel("Derek", NerTag.Person, 68, 73),
+NerLabel("Jeter", NerTag.Person, 74, 79),
+NerLabel("Treesville", NerTag.Location, 95, 105),
+NerLabel("Syrup", NerTag.Organization, 159, 164),
+NerLabel("Production", NerTag.Organization, 165, 175),
+NerLabel("Old", NerTag.Organization, 176, 179),
+NerLabel("Sticky", NerTag.Organization, 180, 186),
+NerLabel("Pancake", NerTag.Organization, 187, 194),
+NerLabel("Company", NerTag.Organization, 195, 202),
+NerLabel("Maple", NerTag.Location, 207, 212),
+NerLabel("Lane", NerTag.Location, 213, 217),
+NerLabel("Forest", NerTag.Location, 219, 225),
+NerLabel("Hemptown", NerTag.Location, 239, 247),
+NerLabel("Little", NerTag.Organization, 347, 353),
+NerLabel("League", NerTag.Organization, 354, 360),
+NerLabel("Derek", NerTag.Person, 1117, 1122),
+NerLabel("Jeter", NerTag.Person, 1123, 1128)
)
assertEquals(labels, expect)
}


@@ -0,0 +1,69 @@
package docspell.convert.flexmark
import java.io.{InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.util
import cats.effect.Sync
import cats.implicits._
import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension
import com.vladsch.flexmark.ext.tables.TablesExtension
import com.vladsch.flexmark.html.HtmlRenderer
import com.vladsch.flexmark.parser.Parser
import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
import fs2.Stream
import scala.util.Try
object Markdown {
def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
val p = createParser()
val r = createRenderer()
Try {
val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
val doc = p.parseReader(reader)
wrapHtml(r.render(doc), cfg)
}.toEither
}
def toHtml(md: String, cfg: MarkdownConfig): String = {
val p = createParser()
val r = createRenderer()
val doc = p.parse(md)
wrapHtml(r.render(doc), cfg)
}
def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
data.through(fs2.text.utf8Decode).compile.foldMonoid.
map(str => toHtml(str, cfg))
private def wrapHtml(body: String, cfg: MarkdownConfig): String = {
s"""<html>
|<head>
|<style>
|${cfg.internalCss}
|</style>
|</head>
|<body>
|$body
|</body>
|</html>
|""".stripMargin
}
private def createParser(): Parser = {
val opts = new MutableDataSet()
opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
util.Arrays.asList(TablesExtension.create(),
StrikethroughExtension.create()));
Parser.builder(opts).build()
}
private def createRenderer(): HtmlRenderer = {
val opts = new MutableDataSet()
HtmlRenderer.builder(opts).build()
}
}


@@ -0,0 +1,3 @@
package docspell.convert.flexmark
case class MarkdownConfig(internalCss: String)
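A minimal usage sketch tying the two pieces above together (illustration only, not part of the commit; the CSS string is a made-up placeholder):

import docspell.convert.flexmark.{Markdown, MarkdownConfig}

object MarkdownExample extends App {
  // hypothetical CSS, purely for illustration
  val cfg = MarkdownConfig(internalCss = "body { font-family: sans-serif; }")

  // renders the markdown and wraps it in the <html>/<head>/<body>
  // scaffold produced by wrapHtml
  val html = Markdown.toHtml("# Hello\n\nSome *markdown* text.", cfg)
  println(html)
}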

modules/extract/NOTICE (new file)

@@ -0,0 +1,11 @@
The Java source files in docspell-extract are unmodified copies of
those found in the Apache Tika parser project. The NOTICE.txt file
from the Apache Tika parsers follows:
Apache Tika parsers
Copyright 2007-2019 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).


@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
/**
* Content handler decorator that:<ul>
* <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
* <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
* </ul>
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
"http://openoffice.org/2000/";
private static final String NEW_NS =
"urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
"-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
}
private String mapOldNS(String ns) {
if (ns != null && ns.startsWith(OLD_NS)) {
return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
} else {
return ns;
}
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes atts) throws SAXException {
AttributesImpl natts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
natts.addAttribute(
mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i), atts.getType(i), atts.getValue(i));
}
super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
}
@Override
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
super.endElement(mapOldNS(namespaceURI), localName, qName);
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
super.startPrefixMapping(prefix, mapOldNS(uri));
}
/**
 * Do not load any DTDs (may be requested by the parser). Fake the DTD by
 * returning an empty string as the InputSource.
 */
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
return super.resolveEntity(publicId, systemId);
}
}
}
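As a quick illustration of the mapOldNS rule above, here is a standalone Scala sketch (illustration only, not from the commit):

// old OpenOffice 1.0 namespaces are rewritten to their OpenDocument form
val OldNs = "http://openoffice.org/2000/"
val NewNs = "urn:oasis:names:tc:opendocument:xmlns:"

def mapOldNS(ns: String): String =
  if (ns != null && ns.startsWith(OldNs)) NewNs + ns.substring(OldNs.length) + ":1.0"
  else ns

assert(mapOldNS("http://openoffice.org/2000/text")
  == "urn:oasis:names:tc:opendocument:xmlns:text:1.0")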


@@ -0,0 +1,606 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
private interface Style {
}
private static class TextStyle implements Style {
public boolean italic;
public boolean bold;
public boolean underlined;
@Override
public String toString() {
return "TextStyle{" +
"italic=" + italic +
", bold=" + bold +
", underlined=" + underlined +
'}';
}
}
private static class ListStyle implements Style {
public boolean ordered;
public String getTag() {
return ordered ? "ol" : "ul";
}
}
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
private static final char[] SPACE = new char[]{ ' '};
private static final String CLASS = "class";
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
private static Attributes buildAttributes(String key, String value) {
AttributesImpl attrs = new AttributesImpl();
attrs.addAttribute("", key, key, "CDATA", value);
return attrs;
}
private final ContentHandler handler;
private final BitSet textNodeStack = new BitSet();
private int nodeDepth = 0;
private int completelyFiltered = 0;
private Stack<String> headingStack = new Stack<String>();
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
private String currParagraphStyleName; //paragraph style name
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName;
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
private ListStyle listStyle;
// True if we are currently in the named style:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
//have we written the start style tags
//yet for the current text style
boolean hasWrittenStartStyleTags = false;
private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
//we need to track our pDepth and only output <p> if we're at the main level
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
super(handler, mappings);
this.handler = handler;
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
// only forward content of tags from text:-namespace
if (completelyFiltered == 0 && nodeDepth > 0
&& textNodeStack.get(nodeDepth - 1)) {
if (!hasWrittenStartStyleTags) {
updateStyleTags();
hasWrittenStartStyleTags = true;
}
super.characters(ch, start, length);
}
}
// helper for checking tags which need complete filtering
// (with sub-tags)
private boolean needsCompleteFiltering(
String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI)) {
return localName.endsWith("-template")
|| localName.endsWith("-style");
}
return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
}
// map the heading level to <hX> HTML tags
private String getXHTMLHeaderTagName(Attributes atts) {
String depthStr = atts.getValue(TEXT_NS, "outline-level");
if (depthStr == null) {
return "h1";
}
int depth = Integer.parseInt(depthStr);
if (depth >= 6) {
return "h6";
} else if (depth <= 1) {
return "h1";
} else {
return "h" + depth;
}
}
/**
* Check if a node is a text node
*/
private boolean isTextNode(String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
return true;
}
if (SVG_NS.equals(namespaceURI)) {
return "title".equals(localName) ||
"desc".equals(localName);
}
return false;
}
private void startList(String name) throws SAXException {
String elementName = "ul";
if (name != null) {
ListStyle style = listStyleMap.get(name);
elementName = style != null ? style.getTag() : "ul";
listStyleStack.push(style);
}
handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
private void endList() throws SAXException {
String elementName = "ul";
if (!listStyleStack.isEmpty()) {
ListStyle style = listStyleStack.pop();
elementName = style != null ? style.getTag() : "ul";
}
handler.endElement(XHTML, elementName, elementName);
}
private void startSpan(String name) throws SAXException {
if (name == null) {
return;
}
currTextStyle = textStyleMap.get(name);
hasWrittenStartStyleTags = false;
}
private void startParagraph(String styleName) throws SAXException {
if (pDepth == 0) {
handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
if (styleName != null) {
currTextStyle = paragraphTextStyleMap.get(styleName);
}
hasWrittenStartStyleTags = false;
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth++;
}
private void endParagraph() throws SAXException {
closeStyleTags();
if (pDepth == 1) {
handler.endElement(XHTML, "p", "p");
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth--;
}
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close s and i tags
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close s tag
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
private void endSpan() throws SAXException {
updateStyleTags();
}
private void closeStyleTags() throws SAXException {
// Close any still open style tags
if (curUnderlined) {
handler.endElement(XHTML,"u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML,"i", "i");
curItalic = false;
}
if (curBold) {
handler.endElement(XHTML,"b", "b");
curBold = false;
}
currTextStyle = null;
hasWrittenStartStyleTags = false;
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
// Keep track of the current node type. If it is a text node,
// a bit at the current depth is set in textNodeStack.
// characters() checks the top bit to determine if the
// actual node is a text node to print out. nodeDepth contains
// the depth of the current node and also marks the top of the stack.
assert nodeDepth >= 0;
// Set styles
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
String family = attrs.getValue(STYLE_NS, "family");
if ("text".equals(family)) {
currTextStyle = new TextStyle();
currTextStyleName = attrs.getValue(STYLE_NS, "name");
} else if ("paragraph".equals(family)) {
currTextStyle = new TextStyle();
currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = new ListStyle();
String name = attrs.getValue(STYLE_NS, "name");
listStyleMap.put(name, listStyle);
} else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
&& "text-properties".equals(localName)) {
String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
currTextStyle.italic = true;
}
String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
|| (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
&& Integer.valueOf(fontWeight) > 500)) {
currTextStyle.bold = true;
}
String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
if (underlineStyle != null && !underlineStyle.equals("none")) {
currTextStyle.underlined = true;
}
} else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
if ("list-level-style-bullet".equals(localName)) {
listStyle.ordered = false;
} else if ("list-level-style-number".equals(localName)) {
listStyle.ordered = true;
}
}
textNodeStack.set(nodeDepth++,
isTextNode(namespaceURI, localName));
// filter *all* content of some tags
assert completelyFiltered >= 0;
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered++;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
// to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
startList(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
startSpan(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
startParagraph(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
handler.characters(SPACE, 0, 1);
} else if ("annotation".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
} else if ("note".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
} else if ("notes".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
}
}
@Override
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
if (currTextStyle != null && currTextStyleName != null) {
textStyleMap.put(currTextStyleName, currTextStyle);
currTextStyleName = null;
currTextStyle = null;
} else if (currTextStyle != null && currParagraphStyleName != null) {
paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
currParagraphStyleName = null;
currTextStyle = null;
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = null;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
// to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
handler.endElement(XHTMLContentHandler.XHTML, el, el);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
endList();
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
currTextStyle = null;
hasWrittenStartStyleTags = false;
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
endParagraph();
} else if ("annotation".equals(localName) || "note".equals(localName) ||
"notes".equals(localName)) {
closeStyleTags();
handler.endElement("", localName, localName);
} else {
super.endElement(namespaceURI, localName, qName);
}
// special handling of tabulators
if (TEXT_NS.equals(namespaceURI)
&& ("tab-stop".equals(localName)
|| "tab".equals(localName))) {
this.characters(TAB, 0, TAB.length);
}
}
// revert filter for *all* content of some tags
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered--;
}
assert completelyFiltered >= 0;
// reduce current node depth
nodeDepth--;
assert nodeDepth >= 0;
}
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
}
public static final String TEXT_NS =
"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
protected static final char[] TAB = new char[]{'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
* Mappings between ODF tag names and XHTML tag names
* (including attributes). All other tag names/attributes are ignored
* and left out from event stream.
*/
private static final HashMap<QName, TargetElement> MAPPINGS =
new HashMap<QName, TargetElement>();
static {
// general mappings of text:-tags
MAPPINGS.put(
new QName(TEXT_NS, "p"),
new TargetElement(XHTML, "p"));
// text:h-tags are mapped specifically in startElement/endElement
MAPPINGS.put(
new QName(TEXT_NS, "line-break"),
new TargetElement(XHTML, "br"));
MAPPINGS.put(
new QName(TEXT_NS, "list-item"),
new TargetElement(XHTML, "li"));
MAPPINGS.put(
new QName(TEXT_NS, "note"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(OFFICE_NS, "annotation"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(PRESENTATION_NS, "notes"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(DRAW_NS, "object"),
new TargetElement(XHTML, "object"));
MAPPINGS.put(
new QName(DRAW_NS, "text-box"),
new TargetElement(XHTML, "div"));
MAPPINGS.put(
new QName(SVG_NS, "title"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(SVG_NS, "desc"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(TEXT_NS, "span"),
new TargetElement(XHTML, "span"));
final HashMap<QName, QName> aAttsMapping =
new HashMap<QName, QName>();
aAttsMapping.put(
new QName(XLINK_NS, "href"),
new QName("href"));
aAttsMapping.put(
new QName(XLINK_NS, "title"),
new QName("title"));
MAPPINGS.put(
new QName(TEXT_NS, "a"),
new TargetElement(XHTML, "a", aAttsMapping));
// create HTML tables from table:-tags
MAPPINGS.put(
new QName(TABLE_NS, "table"),
new TargetElement(XHTML, "table"));
// repeating of rows is ignored; for columns, see below!
MAPPINGS.put(
new QName(TABLE_NS, "table-row"),
new TargetElement(XHTML, "tr"));
// special mapping for rowspan/colspan attributes
final HashMap<QName, QName> tableCellAttsMapping =
new HashMap<QName, QName>();
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-spanned"),
new QName("colspan"));
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-rows-spanned"),
new QName("rowspan"));
/* TODO: The following is not correct, the cell should be repeated not spanned!
* Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
* Problems may occur when both spanning and repeating is given, which is not allowed by spec.
* Cell spanning instead of repeating is not a problem, because OpenOffice uses it
* only for empty cells.
*/
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-repeated"),
new QName("colspan"));
MAPPINGS.put(
new QName(TABLE_NS, "table-cell"),
new TargetElement(XHTML, "td", tableCellAttsMapping));
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
new XHTMLContentHandler(handler, metadata),
metadata, context);
}
void parseInternal(
InputStream stream, final ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
new NSNormalizerContentHandler(dh)),
context);
}
}
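One detail worth noting: getXHTMLHeaderTagName above clamps the ODF text:outline-level into the h1..h6 range. A standalone Scala sketch of that rule (illustration only):

// text:outline-level -> XHTML header tag, clamped to h1..h6
def headerTag(outlineLevel: Option[String]): String =
  outlineLevel.map(_.toInt) match {
    case None              => "h1"
    case Some(d) if d >= 6 => "h6"
    case Some(d) if d <= 1 => "h1"
    case Some(d)           => "h" + d
  }

assert(headerTag(None) == "h1")      // missing level defaults to h1
assert(headerTag(Some("9")) == "h6") // deep outlines are capped at h6
assert(headerTag(Some("3")) == "h3")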


@@ -0,0 +1,199 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.apache.tika.parser.xml.AttributeMetadataHandler;
import org.apache.tika.parser.xml.ElementMetadataHandler;
import org.apache.tika.parser.xml.MetadataHandler;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
*/
public class OpenDocumentMetaParser extends XMLParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -8739250869531737584L;
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
/**
* @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
Property.composite(Office.INITIAL_AUTHOR,
new Property[]{Property.externalText("initial-creator")});
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getUserDefined(
ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, Property property, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
// Process the Dublin Core Attributes
ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
ch = getUserDefined(ch, md);
// Process the OO Statistics Attributes
ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
ch = getStatistic(ch, md, "nbPara", "paragraph-count");
ch = getStatistic(ch, md, "nbWord", "word-count");
ch = getStatistic(ch, md, "nbCharacter", "character-count");
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
@Override
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
}
}
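A hedged usage sketch for the meta parser above; the meta.xml snippet is made up, and the expected values follow from the handlers wired up in getContentHandler (user-defined entries should land under Tika's "custom:" prefix):

import java.io.ByteArrayInputStream
import org.apache.tika.metadata.{Metadata, TikaCoreProperties}
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentMetaParser
import org.xml.sax.helpers.DefaultHandler

object OdfMetaExample extends App {
  val xml =
    """<?xml version="1.0"?>
      |<office:document-meta
      |    xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
      |    xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
      |    xmlns:dc="http://purl.org/dc/elements/1.1/">
      |  <office:meta>
      |    <dc:title>Quarterly Report</dc:title>
      |    <meta:user-defined meta:name="Info1">Text1</meta:user-defined>
      |  </office:meta>
      |</office:document-meta>""".stripMargin

  val md = new Metadata
  new OpenDocumentMetaParser().parse(
    new ByteArrayInputStream(xml.getBytes("UTF-8")),
    new DefaultHandler, md, new ParseContext)

  println(md.get(TikaCoreProperties.TITLE)) // expected: Quarterly Report
  println(md.get("custom:Info1"))           // expected: Text1
}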


@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* OpenOffice parser
*/
public class OpenDocumentParser extends AbstractParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
MediaType.application("vnd.oasis.opendocument.chart"),
MediaType.application("vnd.oasis.opendocument.image"),
MediaType.application("vnd.oasis.opendocument.formula"),
MediaType.application("vnd.oasis.opendocument.text-master"),
MediaType.application("vnd.oasis.opendocument.text-web"),
MediaType.application("vnd.oasis.opendocument.text-template"),
MediaType.application("vnd.oasis.opendocument.graphics-template"),
MediaType.application("vnd.oasis.opendocument.presentation-template"),
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("vnd.oasis.opendocument.chart-template"),
MediaType.application("vnd.oasis.opendocument.image-template"),
MediaType.application("vnd.oasis.opendocument.formula-template"),
MediaType.application("x-vnd.oasis.opendocument.text"),
MediaType.application("x-vnd.oasis.opendocument.graphics"),
MediaType.application("x-vnd.oasis.opendocument.presentation"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
MediaType.application("x-vnd.oasis.opendocument.chart"),
MediaType.application("x-vnd.oasis.opendocument.image"),
MediaType.application("x-vnd.oasis.opendocument.formula"),
MediaType.application("x-vnd.oasis.opendocument.text-master"),
MediaType.application("x-vnd.oasis.opendocument.text-web"),
MediaType.application("x-vnd.oasis.opendocument.text-template"),
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
public Parser getMetaParser() {
return meta;
}
public void setMetaParser(Parser meta) {
this.meta = meta;
}
public Parser getContentParser() {
return content;
}
public void setContentParser(Parser content) {
this.content = content;
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
ZipInputStream zipStream = null;
if (stream instanceof TikaInputStream) {
TikaInputStream tis = (TikaInputStream) stream;
Object container = ((TikaInputStream) stream).getOpenContainer();
if (container instanceof ZipFile) {
zipFile = (ZipFile) container;
} else if (tis.hasFile()) {
zipFile = new ZipFile(tis.getFile());
} else {
zipStream = new ZipInputStream(stream);
}
} else {
zipStream = new ZipInputStream(stream);
}
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
}
}
// Only now call the end document
if (handler.getEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
} while (entry != null);
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, TikaException, SAXException {
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
ZipEntry entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
/* if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
}*/
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
}
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
}
}
}
}
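A hedged usage sketch for the parser above (the file name is a placeholder; BodyContentHandler is Tika's standard text-collecting handler):

import java.io.FileInputStream
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler

object OdfParseExample extends App {
  val parser   = new OpenDocumentParser
  val handler  = new BodyContentHandler
  val metadata = new Metadata
  val in       = new FileInputStream("example.odt") // placeholder path

  try parser.parse(in, handler, metadata, new ParseContext)
  finally in.close()

  println(metadata.get(Metadata.CONTENT_TYPE)) // read from the "mimetype" zip entry
  println(handler.toString)                    // extracted text content
}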


@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
import java.util.Arrays;
import java.util.List;
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
* @since Apache Tika 0.10
*/
class AbstractMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
protected AbstractMetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
protected AbstractMetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
/**
* Adds the given metadata value. The value is ignored if it is
* <code>null</code> or empty. If the metadata entry already exists,
* then the given value is appended to it with a comma as the separator.
*
* @param value metadata value
*/
protected void addMetadata(String value) {
if (value != null && value.length() > 0) {
if (metadata.isMultiValued(name)) {
// Add the value, assuming it's not already there
List<String> previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
if (property != null) {
metadata.add(property, value);
} else {
metadata.add(name, value);
}
}
} else {
// Set the value, assuming it's not already there
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
if (property != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
// Replace the existing value if isMultiValuePermitted is false
metadata.set(property, value);
}
} else {
metadata.add(name, value);
}
}
} else {
if (property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
}
}
}


@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
* value, and the Metadata name is taken from
* an attribute, with a prefix if required.
*/
public class AttributeDependantMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final String nameHoldingAttribute;
private final String namePrefix;
private String name;
private final StringBuilder buffer = new StringBuilder();
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
}
public void addMetadata(String value) {
if(name == null || name.length() == 0) {
// We didn't find the attribute which holds the name
return;
}
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
metadata.set(name, value);
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
if (namePrefix == null) {
this.name = rawName;
} else {
this.name = namePrefix + rawName;
}
}
// All other attributes are ignored
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}


@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String uri;
private final String localName;
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getURI(i).equals(this.uri)
&& attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}
}
}


@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
/** Serial version UID */
private static final long serialVersionUID = 4905318835463880819L;
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TeeContentHandler(
super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
}
}
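A small usage sketch for DcXMLParser (illustration only; the XML snippet is made up):

import java.io.ByteArrayInputStream
import org.apache.tika.metadata.{Metadata, TikaCoreProperties}
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.xml.DcXMLParser
import org.apache.tika.sax.BodyContentHandler

object DcExample extends App {
  val xml =
    """<?xml version="1.0"?>
      |<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
      |  <dc:title>Sample Document</dc:title>
      |  <dc:creator>Jane Doe</dc:creator>
      |</metadata>""".stripMargin

  val md = new Metadata
  new DcXMLParser().parse(
    new ByteArrayInputStream(xml.getBytes("UTF-8")),
    new BodyContentHandler, md, new ParseContext)

  println(md.get(TikaCoreProperties.TITLE))   // expected: Sample Document
  println(md.get(TikaCoreProperties.CREATOR)) // expected: Jane Doe
}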


@@ -0,0 +1,241 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import java.util.Arrays;
/**
* SAX event handler that maps the contents of an XML element into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class ElementMetadataHandler extends AbstractMetadataHandler {
private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);
private static final String LOCAL_NAME_RDF_BAG = "Bag";
private static final String LOCAL_NAME_RDF_LI = "li";
private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private final String uri;
private final String localName;
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
private boolean isBagless = true;
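// nesting depth inside matching elements and inside the matching parent element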
private int matchLevel = 0;
private int parentMatchLevel = 0;
/**
* Constructor for string metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for Property metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created property handler for {}", this.name);
}
/**
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created property handler for {}", this.name);
}
protected boolean isMatchingParentElement(String uri, String localName) {
return (uri.equals(this.uri) && localName.equals(this.localName));
}
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel++;
}
}
@Override
public void endElement(String uri, String localName, String name) {
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel--;
}
if (isMatchingElement(uri, localName)) {
matchLevel--;
if (matchLevel == 2) {
// we're inside a bag li element, add the bagged buffer
addMetadata(bufferBagged.toString().trim());
bufferBagged.setLength(0);
isBagless = false;
}
if (matchLevel == 0 && isBagless) {
String valueBagless = bufferBagless.toString();
if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
// we're in a standard element, add the bagless buffer
addMetadata(valueBagless.trim());
bufferBagless.setLength(0);
}
isBagless = true;
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
// We need to append to both buffers since we don't know if we're inside a bag until we're done
if (parentMatchLevel > 0 && matchLevel > 2) {
bufferBagged.append(ch, start, length);
}
if (parentMatchLevel > 0 && matchLevel > 0) {
bufferBagless.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) {
characters(ch, start, length);
}
@Override
protected void addMetadata(String value) {
LOG.trace("adding {}={}", name, value);
if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
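// multi-valued property: apply the allowEmptyValues/allowDuplicateValues rules before adding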
if ((value != null && value.length() > 0) || allowEmptyValues) {
if (value == null || value.length() == 0 && allowEmptyValues) {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
} else {
super.addMetadata(value);
}
}
}

View File

@@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-fictionbook+xml"));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
return new BinaryElementsDataHandler(
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
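// FictionBook (FB2) embeds files as base64 text inside <binary> elements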
private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
private Metadata metadata;
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (binaryMode) {
try {
partExtractor.parseEmbedded(
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler,
metadata,
true
);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}
binaryMode = false;
binaryData.setLength(0);
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (!binaryMode) {
handler.characters(ch, start, length);
} else {
binaryData.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
handler.ignorableWhitespace(ch, start, length);
}
}
}

View File

@@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds Metadata entries with a specified name for
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
*
* @deprecated Use the {@link AttributeMetadataHandler} and
* {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
private final StringBuilder buffer = new StringBuilder();
public MetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
public MetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
public void addMetadata(String value) {
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
if (this.property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}

View File

@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = -6028836725280212837L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("xml"),
MediaType.image("svg+xml"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
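// tag exceptions from the client handler so they can be rethrown as-is in the catch block below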
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))), context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
} finally {
xhtml.endElement("p");
xhtml.endDocument();
}
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TextContentHandler(handler, true);
}
}

View File

@@ -0,0 +1,29 @@
package docspell.extract
import docspell.common.MimeType
import scala.util.Try
sealed trait ExtractResult {
def textOption: Option[String]
}
object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None
}
case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None
}
case class Success(text: String) extends ExtractResult {
val textOption = Some(text)
}
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
}
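A minimal usage sketch (not part of this commit): the object and helper name below are hypothetical, illustrating how a caller can fold the three result cases into an `Either`.
package docspell.extract
object ExtractResultExample {
  // Hypothetical helper: collapse the three cases into an Either,
  // turning the unsupported-format case into a Throwable as well.
  def toEither(r: ExtractResult): Either[Throwable, String] =
    r match {
      case ExtractResult.Success(text) => Right(text)
      case ExtractResult.Failure(ex)   => Left(ex)
      case ExtractResult.UnsupportedFormat(mt) =>
        Left(new Exception(s"Unsupported format: ${mt.asString}"))
    }
}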

View File

@@ -0,0 +1,30 @@
package docspell.extract.odf
import cats.effect._
import cats.implicits._
import fs2.Stream
import java.io.{ByteArrayInputStream, InputStream}
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler
import scala.util.Try
object OdfExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
def get(is: InputStream): Either[Throwable, String] = Try {
val handler = new BodyContentHandler()
val pctx = new ParseContext()
val meta = new Metadata()
val ooparser = new OpenDocumentParser()
ooparser.parse(is, handler, meta, pctx)
handler.toString.trim
}.toEither
}

View File

@@ -0,0 +1,34 @@
package docspell.extract.pdfbox
import java.io.InputStream
import java.nio.file.Path
import cats.implicits._
import cats.effect.Sync
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import scala.util.{Try, Using}
import fs2.Stream
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map { bytes =>
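// Using closes the PDDocument after readText; toEither.flatten merges load/close errors with extraction errors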
Using(PDDocument.load(bytes))(readText).toEither.flatten
}
def get(is: InputStream): Either[Throwable, String] =
Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, String] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, String] =
Try {
val stripper = new PDFTextStripper()
stripper.setAddMoreFormatting(true)
stripper.setLineSeparator("\n")
stripper.getText(doc).trim // trim here already
}.toEither
}

View File

@@ -0,0 +1,85 @@
package docspell.extract.poi
import java.io.{ByteArrayInputStream, InputStream}
import cats.data.EitherT
import cats.implicits._
import cats.effect.Sync
import org.apache.poi.hssf.extractor.ExcelExtractor
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.hwpf.extractor.WordExtractor
import org.apache.poi.xssf.extractor.XSSFExcelExtractor
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import fs2.Stream
import scala.util.Try
import docspell.common._
import docspell.files.TikaMimetype
object PoiExtract {
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
TikaMimetype.detect(data, hint).flatMap {
case PoiTypes.doc =>
getDoc(data)
case PoiTypes.xls =>
getXls(data)
case PoiTypes.xlsx =>
getXlsx(data)
case PoiTypes.docx =>
getDocx(data)
case PoiTypes.msoffice =>
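// generic msoffice type is ambiguous: try Word (doc) first, fall back to Excel (xls)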
EitherT(getDoc[F](data))
.recoverWith({
case _ => EitherT(getXls[F](data))
})
.value
case PoiTypes.ooxml =>
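// generic ooxml type is ambiguous: try Word (docx) first, fall back to Excel (xlsx)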
EitherT(getDocx[F](data))
.recoverWith({
case _ => EitherT(getXlsx[F](data))
})
.value
case mt =>
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
}
def getDocx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XWPFWordExtractor(new XWPFDocument(is))
xt.getText.trim
}.toEither
def getDoc(is: InputStream): Either[Throwable, String] =
Try {
val xt = new WordExtractor(is)
xt.getText.trim
}.toEither
def getXlsx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
xt.getText.trim
}.toEither
def getXls(is: InputStream): Either[Throwable, String] =
Try {
val xt = new ExcelExtractor(new HSSFWorkbook(is))
xt.getText.trim
}.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
}

View File

@@ -0,0 +1,16 @@
package docspell.extract.poi
import docspell.common.MimeType
object PoiTypes {
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
}

View File

@@ -0,0 +1,24 @@
package docspell.extract.rtf
import java.io.{ByteArrayInputStream, InputStream}
import cats.implicits._
import cats.effect.Sync
import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit
import scala.util.Try
object RtfExtract {
def get(is: InputStream): Either[Throwable, String] =
Try {
val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument()
kit.read(is, doc, 0)
doc.getText(0, doc.getLength).trim
}.toEither
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
}

View File

@@ -1,9 +1,7 @@
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import docspell.files.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {
@@ -30,13 +28,4 @@ object TextExtractionSuite extends SimpleTestSuite {
assertEquals(extract.trim, expect.trim)
}
test("find mimetypes") {
ExampleFiles.
all.foreach { url =>
TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
map(mt => println(url.asString + ": " + mt.asString)).
unsafeRunSync
}
}
}

View File

@@ -0,0 +1,28 @@
package docspell.extract.odf
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object OdfExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val files = List(
ExampleFiles.examples_sample_odt -> 6372,
ExampleFiles.examples_sample_ods -> 717
)
test("test extract from odt") {
files.foreach { case (file, len) =>
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str1 = OdfExtract.get(is).fold(throw _, identity)
assertEquals(str1.length, len)
val data = file.readURL[IO](8192, blocker)
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
assertEquals(str2, str1)
}
}
}

View File

@@ -0,0 +1,48 @@
package docspell.extract.pdfbox
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PdfboxExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val textPDFs = List(
ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
ExampleFiles.letter_en_pdf -> TestFiles.letterENText
)
test("extract text from text PDFs by inputstream") {
textPDFs.foreach {
case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from text PDFs via Stream") {
textPDFs.foreach {
case (file, txt) =>
val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
assertEquals(str, "")
}
private def removeFormatting(str: String): String =
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
}

View File

@@ -0,0 +1,39 @@
package docspell.extract.poi
import cats.effect._
import docspell.common.MimeTypeHint
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PoiExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val officeFiles = List(
ExampleFiles.examples_sample_doc -> 6241,
ExampleFiles.examples_sample_docx -> 6179,
ExampleFiles.examples_sample_xlsx -> 660,
ExampleFiles.examples_sample_xls -> 660
)
test("extract text from ms office files") {
officeFiles.foreach {
case (file, len) =>
val str1 = PoiExtract
.get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
.unsafeRunSync()
.fold(throw _, identity)
val str2 = PoiExtract
.get[IO](
file.readURL[IO](8192, blocker),
MimeTypeHint(Some(file.path.segments.last), None)
)
.unsafeRunSync()
.fold(throw _, identity)
assertEquals(str1, str2)
assertEquals(str1.length, len)
}
}
}

View File

@@ -0,0 +1,14 @@
package docspell.extract.rtf
import docspell.files.ExampleFiles
import minitest.SimpleTestSuite
object RtfExtractTest extends SimpleTestSuite {
test("extract text from rtf using java input-stream") {
val file = ExampleFiles.examples_sample_rtf
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str = RtfExtract.get(is).fold(throw _, identity)
assertEquals(str.length, 7342)
}
}

View File

@@ -0,0 +1,7 @@
package docspell.files
case class Dimension(width: Int, height: Int) {
def toAwtDimension: java.awt.Dimension =
new java.awt.Dimension(width, height)
}

View File

@@ -0,0 +1,61 @@
package docspell.files
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.file.Path
import cats.implicits._
import cats.effect._
import fs2.Stream
import javax.imageio.stream.{FileImageInputStream, ImageInputStream}
import javax.imageio.{ImageIO, ImageReader}
import scala.jdk.CollectionConverters._
import scala.util.{Try, Using}
object ImageSize {
/** Return the image size from its header without reading
* the whole image into memory.
*/
def get(file: Path): Option[Dimension] =
Using(new FileImageInputStream(file.toFile))(getDimension).toOption.flatten
/** Return the image size from its header without reading
* the whole image into memory.
*/
def get(in: InputStream): Option[Dimension] =
Option(ImageIO.createImageInputStream(in)).flatMap(getDimension)
/** Return the image size from its header without reading
* the whole image into memory.
*/
def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = {
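// assumption: the header bytes carrying the size information fit into the first 768 bytes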
data.take(768).compile.to(Array).map(ar => {
val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar))
if (iis == null) sys.error("no reader given for the array")
else getDimension(iis)
})
}
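// try every ImageReader registered for this stream; the first one that reports a size wins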
private def getDimension(in: ImageInputStream): Option[Dimension] =
ImageIO
.getImageReaders(in)
.asScala
.to(LazyList)
.collectFirst(Function.unlift { reader =>
val dim = getDimension(in, reader).toOption
reader.dispose()
dim
})
private def getDimension(
in: ImageInputStream,
reader: ImageReader
): Either[Throwable, Dimension] =
Try {
reader.setInput(in)
val width = reader.getWidth(reader.getMinIndex)
val height = reader.getHeight(reader.getMinIndex)
Dimension(width, height)
}.toEither
}

Binary file not shown. (Size: 1.5 MiB)

Binary file not shown. (Size: 48 KiB)

Binary file not shown. (Size: 2.2 MiB)

Binary file not shown. (Size: 1.2 MiB)

View File

@@ -2,18 +2,18 @@ Derek Jeter
123 Elm Ave.
Treesville, ON MI1N 2P3
Treesville, ON M1N 2P3
November 7, 2016
Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016
Mr. M. Leat
Mr. M. Leaf
Chief of Syrup Production
Old Sticky Pancake Company
456 Maple Lane
Forest, ON 7TW8 9Y0
Forest, ON 7W8 9Y0
Hemptown, September 3, 2019
Dear Mr. Leaf,

View File

@@ -3,12 +3,12 @@
<withJansi>true</withJansi>
<encoder>
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
<pattern>%highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<logger name="docspell" level="debug" />
<root level="INFO">
<root level="error">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@@ -0,0 +1,46 @@
package docspell.files
import cats.implicits._
import cats.effect.{Blocker, IO}
import minitest.SimpleTestSuite
import scala.concurrent.ExecutionContext
import scala.util.Using
object ImageSizeTest extends SimpleTestSuite {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
// tiff files are not supported by the jdk by default
// an external library is required
val files = List(
ExampleFiles.camera_letter_en_jpg -> Dimension(1695, 2378),
ExampleFiles.camera_letter_en_png -> Dimension(1695, 2378),
// ExampleFiles.camera_letter_en_tiff -> Dimension(1695, 2378),
ExampleFiles.scanner_jfif_jpg -> Dimension(2480, 3514),
ExampleFiles.bombs_20K_gray_jpeg -> Dimension(20000, 20000),
ExampleFiles.bombs_20K_gray_png -> Dimension(20000, 20000),
ExampleFiles.bombs_20K_rgb_jpeg -> Dimension(20000, 20000),
ExampleFiles.bombs_20K_rgb_png -> Dimension(20000, 20000)
)
test("get sizes from input-stream") {
files.foreach {
case (uri, expect) =>
val url = uri.toJavaUrl.fold(sys.error, identity)
Using.resource(url.openStream()) { in =>
val dim = ImageSize.get(in)
assertEquals(dim, expect.some)
}
}
}
test("get sizes from stream") {
files.foreach {
case (uri, expect) =>
val stream = uri.readURL[IO](8192, blocker)
val dim = ImageSize.get(stream).unsafeRunSync()
assertEquals(dim, expect.some)
}
}
}

View File

@@ -1,8 +1,7 @@
package docspell.extract
package docspell.files
import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._
import fs2.Stream
import scala.concurrent.ExecutionContext
@@ -12,19 +11,19 @@ object TestFiles {
val letterSourceDE: Stream[IO, Byte] =
ExampleFiles.letter_de_pdf
.readURL[IO](16 * 1024, blocker)
.readURL[IO](8 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
ExampleFiles.letter_en_pdf
.readURL[IO](16 * 1024, blocker)
.readURL[IO](8 * 1024, blocker)
lazy val letterDEText =
ExampleFiles.letter_de_txt
.readText[IO](16 * 1024, blocker)
.readText[IO](8 * 1024, blocker)
.unsafeRunSync
lazy val letterENText =
ExampleFiles.letter_en_txt
.readText[IO](16 * 1024, blocker)
.readText[IO](8 * 1024, blocker)
.unsafeRunSync
}

View File

@@ -11,3 +11,8 @@ title: ADRs
- [0004 ISO8601 vs Unix](adr/0004_iso8601vsEpoch)
- [0005 Job Executor](adr/0005_job-executor)
- [0006 More File Types](adr/0006_more-file-types)
- [0007 Convert HTML](adr/0007_convert_html_files)
- [0008 Convert Text](adr/0008_convert_plain_text)
- [0009 Convert Office Files](adr/0009_convert_office_docs)
- [0010 Convert Image Files](adr/0010_convert_image_files)
- [0011 Extract Text](adr/0011_extract_text)

View File

@@ -112,7 +112,7 @@ If conversion is not supported for the input file, it is skipped. If
conversion fails, the error is propagated to let the retry mechanism
take care of it.
### What types?
#### What types?
Which file types should be supported? At a first step, all major
office documents, common images, plain text (i.e. markdown) and html
@@ -123,6 +123,12 @@ There is always the preference to use jvm internal libraries in order
to be more platform independent and to reduce external dependencies.
But this is not always possible (like doing OCR).
<div class="thumbnail">
<img src="./img/process-files.png" title="Overview processing files">
</div>
#### Conversion
- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
unoconv (see [ADR 9](0009_convert_office_docs))
- HTML (`html`): wkhtmltopdf (see [ADR 7](0007_convert_html_files))
@@ -130,9 +136,19 @@ But this is not always possible (like doing OCR).
- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
10](0010_convert_image_files))
#### Text Extraction
- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache POI
- Office documents (`odt`, `ods`): Apache Tika (including the sources)
- HTML: not supported; text is extracted from the converted PDF
- Images (`jpg`, `png`, `tif`): Tesseract
- Text/Markdown: n.a.
- PDF: Apache PDFBox or Tesseract
## Links
* [Convert HTML Files](0007_convert_html_files)
* [Convert Plain Text](0008_convert_plain_text)
* [Convert Office Documents](0009_convert_office_docs)
* [Convert Image Files](0010_convert_image_files)
* [Extract Text from Files](0011_extract_text)

View File

@@ -0,0 +1,77 @@
---
layout: docs
title: Extract Text from Files
---
# Extract Text from Files
## Context and Problem Statement
With support for more file types there must be a way to extract text
from all of them. It is better to extract text from the source files,
rather than from the converted PDF file.
There are multiple options and multiple file types. Again, the
priority is to use a Java/Scala library to reduce external
dependencies.
## Considered Options
### MS Office Documents
There is only one library I know of: [Apache
POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
However, it doesn't support the open-document format (odt and ods).
### OpenDocument Format
There are two libraries:
- [Apache Tika Parser](https://tika.apache.org/)
- [ODFToolkit](https://github.com/tdf/odftoolkit)
*Tika:* The tika-parsers package contains an opendocument parser for
extracting text. But it has a huge dependency tree, since it is a
super-package containing a parser for almost every common file type.
*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
and also pulls in quite a few dependencies (though not as many as
tika-parsers). That is understandable, since it is a library for
manipulating opendocument files; but all I need here is to extract
text. I created tests that extracted text from my odt/ods files. It
worked at first sight, but running the tests in a loop resulted in
strange NullPointerExceptions (only the first run worked).
### Richtext
Richtext is supported by the JDK (using `RTFEditorKit` from
Swing).
### PDF
For "image" pdf files, tesseract is used. For "text" PDF files, the
library [Apache PDFBox](https://pdfbox.apache.org) can be used.
There is also [iText](https://github.com/itext/itext7), which has an
AGPL license.
### Images
For images and "image" PDF files, there is already tesseract in place.
### HTML
HTML must be converted into a PDF file before text can be extracted.
### Text/Markdown
These files can be used as-is, obviously.
## Decision Outcome
- MS Office files: POI library
- Open Document files: Tika, but only by integrating the few source
files that make up its opendocument parser. Due to its huge
dependency tree, the tika-parsers library itself is not added.
- PDF: Apache PDFBox. I know this library better than iText.

Binary file not shown. (Size: 49 KiB)

View File

@@ -0,0 +1,43 @@
@startuml
scale 1200 width
title: Processing Files
skinparam monochrome true
skinparam backgroundColor white
skinparam rectangle {
roundCorner<<Input>> 25
roundCorner<<Output>> 5
}
rectangle Input <<Input>> {
file "html"
file "plaintext"
file "image"
file "msoffice"
file "rtf"
file "odf"
file "pdf"
}
node toBoth [
PDF + TXT
]
node toPdf [
PDF
]
node toTxt [
TXT
]
image --> toBoth:<tesseract>
html --> toPdf:<wkhtmltopdf>
toPdf --> toTxt:[pdfbox]
plaintext --> html:[flexmark]
msoffice --> toPdf:<unoconv>
msoffice --> toTxt:[poi]
rtf --> toTxt:[jdk]
rtf --> toPdf:<unoconv>
odf --> toTxt:[tika]
odf --> toPdf:<unoconv>
pdf --> toTxt:<tesseract>
pdf --> toTxt:[pdfbox]
plaintext -> toTxt:[identity]
@enduml

View File

@@ -36,15 +36,20 @@ object Dependencies {
val ViewerJSVersion = "0.5.8"
val jclOverSlf4j = Seq(
"org.slf4j" % "jcl-over-slf4j" % Slf4jVersion
)
val julOverSlf4j = Seq(
"org.slf4j" % "jul-to-slf4j" % Slf4jVersion
)
val poi = Seq(
"org.apache.poi" % "poi" % PoiVersion,
"org.apache.poi" % "poi-ooxml" % PoiVersion,
"org.slf4j" % "slf4j-log4j12" % Slf4jVersion,
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
"org.apache.poi" % "poi-scratchpad" % PoiVersion,
).map(_.excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("log4j")
))
ExclusionRule("commons-logging")
)) ++ jclOverSlf4j
// https://github.com/vsch/flexmark-java
// BSD 2-Clause
@@ -57,18 +62,17 @@ object Dependencies {
ExclusionRule("hamcrest-core")
))
val twelvemonkeys = Seq(
"com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
"com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
)
// val twelvemonkeys = Seq(
// "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
// "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
// )
val pdfbox = Seq(
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("org.bouncycastle")
),
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
)
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
ExclusionRule("org.bouncycastle"),
ExclusionRule("commons-logging")
)
) ++ jclOverSlf4j
val emil = Seq(
"com.github.eikek" %% "emil-common" % EmilVersion,
@@ -100,6 +104,12 @@ object Dependencies {
val tika = Seq(
"org.apache.tika" % "tika-core" % TikaVersion
)
val commonsIO = Seq(
"commons-io" % "commons-io" % "2.6"
)
val tikaParser = Seq(
"org.apache.tika" % "tika-parsers" % TikaVersion
)
val bcrypt = Seq(
"org.mindrot" % "jbcrypt" % BcryptVersion