listStyleStack = new Stack<>();
+ private ListStyle listStyle;
+ // True if we are currently in the named style:
+ private boolean curUnderlined;
+ private boolean curBold;
+ private boolean curItalic;
+ private int pDepth = 0;
+ /**
+ * Creates the body handler.
+ *
+ * @param handler      downstream handler; handed to the superclass together with MAPPINGS and
+ *                     also kept in a field for the events this class emits directly
+ * @param parseContext parse context; kept so that processBinaryData() can look up the
+ *                     embedded document extractor
+ */
+ OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext) {
+ super(handler, MAPPINGS);
+ this.handler = handler;
+ this.parseContext = parseContext;
+ }
+
+    /**
+     * Builds an attribute list holding exactly one un-namespaced CDATA attribute.
+     *
+     * @param key   used as both local name and qualified name of the attribute
+     * @param value attribute value
+     * @return a fresh attribute list containing only that entry
+     */
+    private static Attributes buildAttributes(String key, String value) {
+        final AttributesImpl single = new AttributesImpl();
+        // empty namespace URI; the key doubles as local and qualified name
+        single.addAttribute("", key, key, "CDATA", value);
+        return single;
+    }
+
+    /**
+     * Receives character data. While inside a binary-data element the characters are
+     * collected into the base64 buffer; otherwise they are forwarded only when nothing is
+     * being filtered and the innermost open element is flagged as a text node.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (inBinaryData) {
+            base64BinaryDataBuffer.append(ch, start, length);
+            return;
+        }
+        // only forward content of tags from text:-namespace
+        if (completelyFiltered != 0 || nodeDepth <= 0 || !textNodeStack.get(nodeDepth - 1)) {
+            return;
+        }
+        // style tags are written lazily, just before the first text they apply to
+        if (!hasWrittenStartStyleTags) {
+            updateStyleTags();
+            hasWrittenStartStyleTags = true;
+        }
+        super.characters(ch, start, length);
+    }
+
+    /**
+     * Tells whether an element must be dropped together with all of its sub-tags:
+     * text-namespace elements whose name ends in "-template" or "-style", and
+     * table-namespace "covered-table-cell" elements.
+     */
+    private boolean needsCompleteFiltering(String namespaceURI, String localName) {
+        if (!TEXT_NS.equals(namespaceURI)) {
+            return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+        }
+        return localName.endsWith("-template") || localName.endsWith("-style");
+    }
+ // <p> can appear inside comments and other things that are already inside <p>;
+ // we need to track our pDepth and only output <p> if we're at the main level
+
+    /**
+     * Maps the {@code outline-level} attribute (text namespace) of a heading to an XHTML
+     * heading tag name.
+     *
+     * @param atts attributes of the heading element
+     * @return "h1" .. "h6"; levels below 1 map to "h1", levels above 6 to "h6", and a missing
+     *         or non-numeric level also yields "h1"
+     */
+    private String getXHTMLHeaderTagName(Attributes atts) {
+        String depthStr = atts.getValue(TEXT_NS, "outline-level");
+        if (depthStr == null) {
+            return "h1";
+        }
+
+        int depth;
+        try {
+            depth = Integer.parseInt(depthStr.trim());
+        } catch (NumberFormatException e) {
+            // the value comes straight from the document; a malformed level must not abort
+            // the whole parse with an unchecked exception, so use the default heading
+            return "h1";
+        }
+        // clamp to the six heading levels XHTML offers
+        return "h" + Math.max(1, Math.min(6, depth));
+    }
+
+    /**
+     * Decides whether character content directly inside the given element is forwarded:
+     * true for every text-namespace element except page-number and page-count, and for the
+     * SVG title and desc elements.
+     */
+    private boolean isTextNode(String namespaceURI, String localName) {
+        if (TEXT_NS.equals(namespaceURI)) {
+            boolean pageField =
+                    localName.equals("page-number") || localName.equals("page-count");
+            if (!pageField) {
+                return true;
+            }
+        }
+        if (!SVG_NS.equals(namespaceURI)) {
+            return false;
+        }
+        return "title".equals(localName) || "desc".equals(localName);
+    }
+
+    /**
+     * Opens an XHTML list for a text:list element.
+     *
+     * @param name list style name, or null when the list carries none; an unknown or
+     *             missing style produces "ul"
+     */
+    private void startList(String name) throws SAXException {
+        ListStyle style = name == null ? null : listStyleMap.get(name);
+        String elementName = style != null ? style.getTag() : "ul";
+        // Always push, even a null style: endList() pops once per list, so a list without a
+        // style name (typically a nested one) must not consume its parent's entry and make
+        // the parent close with the wrong tag.
+        listStyleStack.push(style);
+        handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+    }
+
+    /**
+     * Closes the innermost list, using the tag of the style on top of the list style stack;
+     * falls back to "ul" when the stack is empty or the stored style is null.
+     */
+    private void endList() throws SAXException {
+        final ListStyle closing = listStyleStack.isEmpty() ? null : listStyleStack.pop();
+        final String tag = closing == null ? "ul" : closing.getTag();
+        handler.endElement(XHTML, tag, tag);
+    }
+
+    /**
+     * Selects the text style for a span. A span without a style name leaves the current
+     * style untouched; otherwise the named style (possibly null when unknown) becomes
+     * current and its tags are marked as not yet written.
+     */
+    private void startSpan(String name) throws SAXException {
+        if (name != null) {
+            currTextStyle = textStyleMap.get(name);
+            hasWrittenStartStyleTags = false;
+        }
+    }
+
+    /**
+     * Handles the start of a paragraph. Only an outermost paragraph opens an XHTML p element
+     * and picks up its paragraph text style; a nested one is replaced by a single space.
+     *
+     * @param styleName paragraph style name, may be null
+     */
+    private void startParagraph(String styleName) throws SAXException {
+        if (pDepth != 0) {
+            // already inside a paragraph: separate the nested text with whitespace only
+            handler.characters(SPACE, 0, SPACE.length);
+        } else {
+            handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
+            if (styleName != null) {
+                currTextStyle = paragraphTextStyleMap.get(styleName);
+            }
+            hasWrittenStartStyleTags = false;
+        }
+        pDepth++;
+    }
+
+    /**
+     * Handles the end of a paragraph: closes any open style tags, then closes the XHTML p
+     * element when this is the outermost paragraph or emits a single space otherwise.
+     */
+    private void endParagraph() throws SAXException {
+        closeStyleTags();
+        if (pDepth != 1) {
+            handler.characters(SPACE, 0, SPACE.length);
+        } else {
+            handler.endElement(XHTML, "p", "p");
+        }
+        pDepth--;
+    }
+
+    /**
+     * Brings the open b / i / u tags in line with the current text style. Tags nest as
+     * b &gt; i &gt; u, so changing an outer tag first closes every inner tag that is open; the
+     * later steps then reopen the inner tags the style still asks for. With no current style
+     * all open style tags are closed.
+     */
+    private void updateStyleTags() throws SAXException {
+        final TextStyle target = currTextStyle;
+        if (target == null) {
+            closeStyleTags();
+            return;
+        }
+
+        // bold is outermost: close u and i before touching it
+        if (target.bold != curBold) {
+            if (curUnderlined) {
+                handler.endElement(XHTML, "u", "u");
+                curUnderlined = false;
+            }
+            if (curItalic) {
+                handler.endElement(XHTML, "i", "i");
+                curItalic = false;
+            }
+            if (!target.bold) {
+                handler.endElement(XHTML, "b", "b");
+            } else {
+                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+            }
+            curBold = target.bold;
+        }
+
+        // italic sits in the middle: only u has to be closed first
+        if (target.italic != curItalic) {
+            if (curUnderlined) {
+                handler.endElement(XHTML, "u", "u");
+                curUnderlined = false;
+            }
+            if (!target.italic) {
+                handler.endElement(XHTML, "i", "i");
+            } else {
+                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+            }
+            curItalic = target.italic;
+        }
+
+        // underline is innermost and can be toggled on its own
+        if (target.underlined != curUnderlined) {
+            if (!target.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            } else {
+                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+            }
+            curUnderlined = target.underlined;
+        }
+    }
+
+ // Re-synchronises the open b/i/u tags with currTextStyle by delegating to updateStyleTags().
+ // NOTE(review): no caller is visible in this part of the file; endElement() resets the span
+ // state inline instead -- confirm whether this helper is still needed.
+ private void endSpan() throws SAXException {
+ updateStyleTags();
+ }
+
+    /**
+     * Closes whichever of the u, i and b tags are still open, innermost first, then forgets
+     * the current text style and marks its start tags as not written.
+     */
+    private void closeStyleTags() throws SAXException {
+        // innermost tag first: u, then i, then b
+        if (curUnderlined) {
+            handler.endElement(XHTML, "u", "u");
+            curUnderlined = false;
+        }
+        if (curItalic) {
+            handler.endElement(XHTML, "i", "i");
+            curItalic = false;
+        }
+        if (curBold) {
+            handler.endElement(XHTML, "b", "b");
+            curBold = false;
+        }
+        hasWrittenStartStyleTags = false;
+        currTextStyle = null;
+    }
+
+    /**
+     * Handles the start of an ODF element. In order it:
+     * emits an XHTML img for draw:image; switches to base64 collection for binary-data
+     * elements; records text / paragraph / list style definitions; tracks whether the new
+     * element is a text node; updates the complete-filtering counter; and, when nothing is
+     * filtered, emits the XHTML start event (headings, lists, spans, paragraphs, spaces and
+     * annotations are handled here, everything else goes through the superclass mapping).
+     */
+    @Override
+    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
+            throws SAXException {
+
+        if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
+            String link = attrs.getValue(XLINK_NS, "href");
+            AttributesImpl attr = new AttributesImpl();
+            if (!StringUtils.isEmpty(link)) {
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
+            }
+            handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
+            handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
+        }
+
+        if (BINARY_DATA.equals(localName)) {
+            // content is collected by characters() and processed in endElement();
+            // the element is deliberately not counted in nodeDepth
+            inBinaryData = true;
+            return;
+        }
+        // keep track of current node type. If it is a text node,
+        // a bit at the current depth its set in textNodeStack.
+        // characters() checks the top bit to determine, if the
+        // actual node is a text node to print out nodeDepth contains
+        // the depth of the current node and also marks top of stack.
+        assert nodeDepth >= 0;
+
+        // Set styles
+        if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+            String family = attrs.getValue(STYLE_NS, "family");
+            if ("text".equals(family)) {
+                currTextStyle = new TextStyle();
+                currTextStyleName = attrs.getValue(STYLE_NS, "name");
+            } else if ("paragraph".equals(family)) {
+                currTextStyle = new TextStyle();
+                currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
+            }
+        } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+            listStyle = new ListStyle();
+            String name = attrs.getValue(STYLE_NS, "name");
+            listStyleMap.put(name, listStyle);
+        } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) &&
+                "text-properties".equals(localName)) {
+            String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+            if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+                currTextStyle.italic = true;
+            }
+            // tolerant check: an empty or malformed weight must not abort the parse
+            if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) {
+                currTextStyle.bold = true;
+            }
+            String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+            if (underlineStyle != null && !underlineStyle.equals("none")) {
+                currTextStyle.underlined = true;
+            }
+        } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+            if ("list-level-style-bullet".equals(localName)) {
+                listStyle.ordered = false;
+            } else if ("list-level-style-number".equals(localName)) {
+                listStyle.ordered = true;
+            }
+        }
+
+        textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName));
+        // filter *all* content of some tags
+        assert completelyFiltered >= 0;
+
+        if (needsCompleteFiltering(namespaceURI, localName)) {
+            completelyFiltered++;
+        }
+        // call next handler if no filtering
+        if (completelyFiltered == 0) {
+            // special handling of text:h, that are directly passed
+            // to incoming handler
+            if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+                handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+            } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                startList(attrs.getValue(TEXT_NS, "style-name"));
+            } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                startSpan(attrs.getValue(TEXT_NS, "style-name"));
+            } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+                startParagraph(attrs.getValue(TEXT_NS, "style-name"));
+            } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+                handler.characters(SPACE, 0, 1);
+            } else if ("annotation".equals(localName)) {
+                closeStyleTags();
+                handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
+            } else if ("note".equals(localName)) {
+                closeStyleTags();
+                handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
+            } else if ("notes".equals(localName)) {
+                closeStyleTags();
+                handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
+            } else {
+                super.startElement(namespaceURI, localName, qName, attrs);
+            }
+        }
+    }
+
+    /**
+     * Decides whether a font-weight attribute value denotes bold text: the keywords "bold"
+     * and "bolder", or a numeric weight above 500. A null, empty or unparseable value is
+     * treated as not bold instead of raising an exception.
+     */
+    private static boolean isBoldFontWeight(String fontWeight) {
+        if (fontWeight == null || fontWeight.isEmpty()) {
+            return false;
+        }
+        if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
+            return true;
+        }
+        if (!Character.isDigit(fontWeight.charAt(0))) {
+            return false;
+        }
+        try {
+            return Integer.parseInt(fontWeight) > 500;
+        } catch (NumberFormatException e) {
+            // starts with a digit but is not a plain integer
+            return false;
+        }
+    }
+
+    /**
+     * Handles the end of an ODF element. A binary-data element triggers processing of the
+     * collected base64 content and returns. Otherwise a finished style definition is stored,
+     * the XHTML end event matching what startElement() opened is emitted (unless content is
+     * being filtered), a tab character is written for tab elements, and the filtering
+     * counter and node depth are unwound.
+     *
+     * @throws SAXException on downstream handler failure, or wrapping an IOException raised
+     *                      while processing embedded binary data
+     */
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        if (BINARY_DATA.equals(localName)) {
+            inBinaryData = false;
+            try {
+                processBinaryData();
+            } catch (IOException e) {
+                throw new SAXException(e);
+            }
+            return;
+        }
+        if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+            if (currTextStyle != null && currTextStyleName != null) {
+                textStyleMap.put(currTextStyleName, currTextStyle);
+                currTextStyleName = null;
+                currTextStyle = null;
+            } else if (currTextStyle != null && currParagraphStyleName != null) {
+                paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
+                currParagraphStyleName = null;
+                currTextStyle = null;
+            }
+        } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+            listStyle = null;
+        }
+
+        // call next handler if no filtering
+        if (completelyFiltered == 0) {
+            // special handling of text:h, that are directly passed
+            // to incoming handler
+            if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                final String el = headingStack.pop();
+                // the heading was opened in the XHTML namespace, so close it there as well
+                handler.endElement(XHTMLContentHandler.XHTML, el, el);
+            } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                endList();
+            } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                currTextStyle = null;
+                hasWrittenStartStyleTags = false;
+            } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+                endParagraph();
+            } else if ("annotation".equals(localName) || "note".equals(localName) ||
+                    "notes".equals(localName)) {
+                closeStyleTags();
+                // mirror the start event exactly: these elements are opened as an XHTML
+                // span (with qName "p"), not under their ODF names
+                handler.endElement(XHTML, "span", "p");
+            } else {
+                super.endElement(namespaceURI, localName, qName);
+            }
+
+            // special handling of tabulators
+            if (TEXT_NS.equals(namespaceURI) &&
+                    ("tab-stop".equals(localName) || "tab".equals(localName))) {
+                this.characters(TAB, 0, TAB.length);
+            }
+        }
+
+        // revert filter for *all* content of some tags
+        if (needsCompleteFiltering(namespaceURI, localName)) {
+            completelyFiltered--;
+        }
+        assert completelyFiltered >= 0;
+
+        // reduce current node depth
+        nodeDepth--;
+        assert nodeDepth >= 0;
+    }
+
+    /**
+     * Decodes the base64 text collected for a binary-data element and, if the embedded
+     * document extractor accepts it, hands the bytes over for embedded parsing. The buffer
+     * and the binary-data flag are reset before the extractor runs.
+     *
+     * @throws IOException  if reading the decoded bytes fails
+     * @throws SAXException if the embedded parse reports a SAX error
+     */
+    private void processBinaryData() throws IOException, SAXException {
+
+        //TODO: figure out whether we're in an inline image or a regular
+        //attachment and add that info to the embedded metadata
+
+        final String encoded = base64BinaryDataBuffer.toString();
+        final byte[] bytes = Base64.decodeBase64(encoded);
+        //clear state before parsing
+        base64BinaryDataBuffer.setLength(0);
+        inBinaryData = false;
+
+        // the extractor is looked up lazily, on the first embedded payload
+        if (embeddedDocumentExtractor == null) {
+            embeddedDocumentExtractor =
+                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+        }
+        final Metadata embeddedMetadata = new Metadata();
+        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+        try (InputStream is = TikaInputStream.get(bytes)) {
+            embeddedDocumentExtractor.parseEmbedded(is, handler, embeddedMetadata, false);
+        }
+    }
+
+ /**
+ * Intentionally empty: the prefix mapping is swallowed here and not passed on.
+ */
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ /**
+ * Intentionally empty: the end of a prefix mapping is swallowed here and not passed on.
+ */
+ @Override
+ public void endPrefixMapping(String prefix) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ /** Marker interface without members, implemented by TextStyle and ListStyle. */
+ private interface Style {
+ }
+
+    /** Character formatting flags collected from a style definition; all default to false. */
+    private static class TextStyle implements Style {
+        public boolean italic;
+        public boolean bold;
+        public boolean underlined;
+
+        /** Renders the three flags for debugging. */
+        @Override
+        public String toString() {
+            final StringBuilder text = new StringBuilder("TextStyle{");
+            text.append("italic=").append(italic);
+            text.append(", bold=").append(bold);
+            text.append(", underlined=").append(underlined);
+            return text.append('}').toString();
+        }
+    }
+
+    /** List formatting collected from a list style definition; unordered by default. */
+    private static class ListStyle implements Style {
+        public boolean ordered;
+
+        /** Returns the XHTML list tag for this style: "ol" when ordered, otherwise "ul". */
+        public String getTag() {
+            if (ordered) {
+                return "ol";
+            }
+            return "ul";
+        }
+    }
+
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index 066f3e95..e99053d9 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -16,591 +16,47 @@
*/
package org.apache.tika.parser.odf;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.ElementMappingContentHandler;
-import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-import org.xml.sax.helpers.DefaultHandler;
-
-import javax.xml.namespace.QName;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
- private interface Style {
- }
-
- private static class TextStyle implements Style {
- public boolean italic;
- public boolean bold;
- public boolean underlined;
-
- @Override
- public String toString() {
- return "TextStyle{" +
- "italic=" + italic +
- ", bold=" + bold +
- ", underlined=" + underlined +
- '}';
- }
- }
-
- private static class ListStyle implements Style {
- public boolean ordered;
-
- public String getTag() {
- return ordered ? "ol" : "ul";
- }
- }
-
- private static final class OpenDocumentElementMappingContentHandler extends
- ElementMappingContentHandler {
- private static final char[] SPACE = new char[]{ ' '};
- private static final String CLASS = "class";
- private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
- private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
- private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
-
- private static Attributes buildAttributes(String key, String value) {
- AttributesImpl attrs = new AttributesImpl();
- attrs.addAttribute("", key, key, "CDATA", value);
- return attrs;
- }
-
- private final ContentHandler handler;
- private final BitSet textNodeStack = new BitSet();
- private int nodeDepth = 0;
- private int completelyFiltered = 0;
- private Stack headingStack = new Stack();
- private Map paragraphTextStyleMap = new HashMap();
- private Map textStyleMap = new HashMap();
- private Map listStyleMap = new HashMap();
- private String currParagraphStyleName; //paragraph style name
- private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
- private String currTextStyleName;
-
- private Stack listStyleStack = new Stack();
- private ListStyle listStyle;
-
- // True if we are currently in the named style:
- private boolean curUnderlined;
- private boolean curBold;
- private boolean curItalic;
-
- //have we written the start style tags
- //yet for the current text style
- boolean hasWrittenStartStyleTags = false;
-
- private int pDepth = 0; // can appear inside comments and other things that are already inside
- //we need to track our pDepth and only output
if we're at the main level
-
-
- private OpenDocumentElementMappingContentHandler(ContentHandler handler,
- Map mappings) {
- super(handler, mappings);
- this.handler = handler;
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- // only forward content of tags from text:-namespace
- if (completelyFiltered == 0 && nodeDepth > 0
- && textNodeStack.get(nodeDepth - 1)) {
- if (!hasWrittenStartStyleTags) {
- updateStyleTags();
- hasWrittenStartStyleTags = true;
- }
- super.characters(ch, start, length);
- }
- }
-
- // helper for checking tags which need complete filtering
- // (with sub-tags)
- private boolean needsCompleteFiltering(
- String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI)) {
- return localName.endsWith("-template")
- || localName.endsWith("-style");
- }
- return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
- }
-
- // map the heading level to HTML tags
- private String getXHTMLHeaderTagName(Attributes atts) {
- String depthStr = atts.getValue(TEXT_NS, "outline-level");
- if (depthStr == null) {
- return "h1";
- }
-
- int depth = Integer.parseInt(depthStr);
- if (depth >= 6) {
- return "h6";
- } else if (depth <= 1) {
- return "h1";
- } else {
- return "h" + depth;
- }
- }
-
- /**
- * Check if a node is a text node
- */
- private boolean isTextNode(String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
- return true;
- }
- if (SVG_NS.equals(namespaceURI)) {
- return "title".equals(localName) ||
- "desc".equals(localName);
- }
- return false;
- }
-
- private void startList(String name) throws SAXException {
- String elementName = "ul";
- if (name != null) {
- ListStyle style = listStyleMap.get(name);
- elementName = style != null ? style.getTag() : "ul";
- listStyleStack.push(style);
- }
- handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
- }
-
- private void endList() throws SAXException {
- String elementName = "ul";
- if (!listStyleStack.isEmpty()) {
- ListStyle style = listStyleStack.pop();
- elementName = style != null ? style.getTag() : "ul";
- }
- handler.endElement(XHTML, elementName, elementName);
- }
-
- private void startSpan(String name) throws SAXException {
- if (name == null) {
- return;
- }
- currTextStyle = textStyleMap.get(name);
- hasWrittenStartStyleTags = false;
- }
-
- private void startParagraph(String styleName) throws SAXException {
- if (pDepth == 0) {
- handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
- if (styleName != null) {
- currTextStyle = paragraphTextStyleMap.get(styleName);
- }
- hasWrittenStartStyleTags = false;
- } else {
- handler.characters(SPACE, 0, SPACE.length);
- }
- pDepth++;
- }
-
- private void endParagraph() throws SAXException {
- closeStyleTags();
- if (pDepth == 1) {
- handler.endElement(XHTML, "p", "p");
- } else {
- handler.characters(SPACE, 0, SPACE.length);
- }
- pDepth--;
-
- }
-
- private void updateStyleTags() throws SAXException {
-
- if (currTextStyle == null) {
- closeStyleTags();
- return;
- }
- if (currTextStyle.bold != curBold) {
- // Enforce nesting -- must close s and i tags
- if (curUnderlined) {
- handler.endElement(XHTML, "u", "u");
- curUnderlined = false;
- }
- if (curItalic) {
- handler.endElement(XHTML, "i", "i");
- curItalic = false;
- }
- if (currTextStyle.bold) {
- handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
- } else {
- handler.endElement(XHTML, "b", "b");
- }
- curBold = currTextStyle.bold;
- }
-
- if (currTextStyle.italic != curItalic) {
- // Enforce nesting -- must close s tag
- if (curUnderlined) {
- handler.endElement(XHTML, "u", "u");
- curUnderlined = false;
- }
- if (currTextStyle.italic) {
- handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
- } else {
- handler.endElement(XHTML, "i", "i");
- }
- curItalic = currTextStyle.italic;
- }
-
- if (currTextStyle.underlined != curUnderlined) {
- if (currTextStyle.underlined) {
- handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
- } else {
- handler.endElement(XHTML, "u", "u");
- }
- curUnderlined = currTextStyle.underlined;
- }
- }
-
- private void endSpan() throws SAXException {
- updateStyleTags();
- }
-
- private void closeStyleTags() throws SAXException {
- // Close any still open style tags
- if (curUnderlined) {
- handler.endElement(XHTML,"u", "u");
- curUnderlined = false;
- }
- if (curItalic) {
- handler.endElement(XHTML,"i", "i");
- curItalic = false;
- }
- if (curBold) {
- handler.endElement(XHTML,"b", "b");
- curBold = false;
- }
- currTextStyle = null;
- hasWrittenStartStyleTags = false;
- }
-
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes attrs) throws SAXException {
- // keep track of current node type. If it is a text node,
- // a bit at the current depth its set in textNodeStack.
- // characters() checks the top bit to determine, if the
- // actual node is a text node to print out nodeDepth contains
- // the depth of the current node and also marks top of stack.
- assert nodeDepth >= 0;
-
- // Set styles
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- String family = attrs.getValue(STYLE_NS, "family");
- if ("text".equals(family)) {
- currTextStyle = new TextStyle();
- currTextStyleName = attrs.getValue(STYLE_NS, "name");
- } else if ("paragraph".equals(family)) {
- currTextStyle = new TextStyle();
- currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
- }
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = new ListStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- listStyleMap.put(name, listStyle);
- } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
- && "text-properties".equals(localName)) {
- String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
- if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
- currTextStyle.italic = true;
- }
- String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
- if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
- || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
- && Integer.valueOf(fontWeight) > 500)) {
- currTextStyle.bold = true;
- }
- String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
- if (underlineStyle != null && !underlineStyle.equals("none")) {
- currTextStyle.underlined = true;
- }
- } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
- if ("list-level-style-bullet".equals(localName)) {
- listStyle.ordered = false;
- } else if ("list-level-style-number".equals(localName)) {
- listStyle.ordered = true;
- }
- }
-
- textNodeStack.set(nodeDepth++,
- isTextNode(namespaceURI, localName));
- // filter *all* content of some tags
- assert completelyFiltered >= 0;
-
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered++;
- }
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
- handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- startList(attrs.getValue(TEXT_NS, "style-name"));
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- startSpan(attrs.getValue(TEXT_NS, "style-name"));
- } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
- startParagraph(attrs.getValue(TEXT_NS, "style-name"));
- } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
- handler.characters(SPACE, 0, 1);
- } else if ("annotation".equals(localName)) {
- closeStyleTags();
- handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
- } else if ("note".equals(localName)) {
- closeStyleTags();
- handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
- } else if ("notes".equals(localName)) {
- closeStyleTags();
- handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
- } else {
- super.startElement(namespaceURI, localName, qName, attrs);
- }
- }
- }
-
- @Override
- public void endElement(
- String namespaceURI, String localName, String qName)
- throws SAXException {
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- if (currTextStyle != null && currTextStyleName != null) {
- textStyleMap.put(currTextStyleName, currTextStyle);
- currTextStyleName = null;
- currTextStyle = null;
- } else if (currTextStyle != null && currParagraphStyleName != null) {
- paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
- currParagraphStyleName = null;
- currTextStyle = null;
- }
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = null;
- }
-
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.pop();
- handler.endElement(XHTMLContentHandler.XHTML, el, el);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- endList();
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- currTextStyle = null;
- hasWrittenStartStyleTags = false;
- } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
- endParagraph();
- } else if ("annotation".equals(localName) || "note".equals(localName) ||
- "notes".equals(localName)) {
- closeStyleTags();
- handler.endElement("", localName, localName);
- } else {
- super.endElement(namespaceURI, localName, qName);
- }
-
- // special handling of tabulators
- if (TEXT_NS.equals(namespaceURI)
- && ("tab-stop".equals(localName)
- || "tab".equals(localName))) {
- this.characters(TAB, 0, TAB.length);
- }
- }
-
- // revert filter for *all* content of some tags
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered--;
- }
- assert completelyFiltered >= 0;
-
- // reduce current node depth
- nodeDepth--;
- assert nodeDepth >= 0;
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- // remove prefix mappings as they should not occur in XHTML
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- // remove prefix mappings as they should not occur in XHTML
- }
- }
-
- public static final String TEXT_NS =
- "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
-
- public static final String TABLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
-
- public static final String STYLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
-
- public static final String FORMATTING_OBJECTS_NS =
- "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
-
- public static final String OFFICE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
-
- public static final String SVG_NS =
- "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
-
- public static final String PRESENTATION_NS =
- "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
-
- public static final String DRAW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
-
- public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
-
- protected static final char[] TAB = new char[]{'\t'};
-
- private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
-
- /**
- * Mappings between ODF tag names and XHTML tag names
- * (including attributes). All other tag names/attributes are ignored
- * and left out from event stream.
- */
- private static final HashMap MAPPINGS =
- new HashMap();
-
- static {
- // general mappings of text:-tags
- MAPPINGS.put(
- new QName(TEXT_NS, "p"),
- new TargetElement(XHTML, "p"));
- // text:h-tags are mapped specifically in startElement/endElement
- MAPPINGS.put(
- new QName(TEXT_NS, "line-break"),
- new TargetElement(XHTML, "br"));
- MAPPINGS.put(
- new QName(TEXT_NS, "list-item"),
- new TargetElement(XHTML, "li"));
- MAPPINGS.put(
- new QName(TEXT_NS, "note"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(OFFICE_NS, "annotation"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(PRESENTATION_NS, "notes"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(DRAW_NS, "object"),
- new TargetElement(XHTML, "object"));
- MAPPINGS.put(
- new QName(DRAW_NS, "text-box"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(SVG_NS, "title"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(SVG_NS, "desc"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(TEXT_NS, "span"),
- new TargetElement(XHTML, "span"));
-
- final HashMap aAttsMapping =
- new HashMap();
- aAttsMapping.put(
- new QName(XLINK_NS, "href"),
- new QName("href"));
- aAttsMapping.put(
- new QName(XLINK_NS, "title"),
- new QName("title"));
- MAPPINGS.put(
- new QName(TEXT_NS, "a"),
- new TargetElement(XHTML, "a", aAttsMapping));
-
- // create HTML tables from table:-tags
- MAPPINGS.put(
- new QName(TABLE_NS, "table"),
- new TargetElement(XHTML, "table"));
- // repeating of rows is ignored; for columns, see below!
- MAPPINGS.put(
- new QName(TABLE_NS, "table-row"),
- new TargetElement(XHTML, "tr"));
- // special mapping for rowspan/colspan attributes
- final HashMap tableCellAttsMapping =
- new HashMap();
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-spanned"),
- new QName("colspan"));
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-rows-spanned"),
- new QName("rowspan"));
- /* TODO: The following is not correct, the cell should be repeated not spanned!
- * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
- * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
- * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
- * only for empty cells.
- */
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-repeated"),
- new QName("colspan"));
- MAPPINGS.put(
- new QName(TABLE_NS, "table-cell"),
- new TargetElement(XHTML, "td", tableCellAttsMapping));
- }
public Set getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- parseInternal(stream,
- new XHTMLContentHandler(handler, metadata),
- metadata, context);
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context);
}
- void parseInternal(
- InputStream stream, final ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
- DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+ DefaultHandler dh = new OpenDocumentBodyHandler(handler, context);
- XMLReaderUtils.parseSAX(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(
- new NSNormalizerContentHandler(dh)),
- context);
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new NSNormalizerContentHandler(dh)), context);
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
new file mode 100644
index 00000000..3214e96f
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+
+
+class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler {
+    // SAX handler for one macro module file; macro state and helpers come from the superclass.
+    OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
+        super(contentHandler, parseContext);
+    }
+    // Opening of the module element: flag that we are inside a macro and remember its name.
+    @Override
+    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
+            throws SAXException {
+        //in the compressed odf, there should only be one element in this file.
+        if (MODULE.equalsIgnoreCase(localName)) {
+            inMacro = true;
+            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
+        }
+    }
+    // Closing of the module element: call handleMacro(). The name test is case-insensitive so
+    // every element accepted by startElement is also flushed and reset here, never dropped.
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        if (MODULE.equalsIgnoreCase(localName)) {
+            try {
+                handleMacro();
+            } catch (IOException e) {
+                throw new SAXException(e);
+            } finally {
+                //this shouldn't be necessary in the compressed odf files
+                resetMacroState();
+            }
+        }
+    }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
new file mode 100644
index 00000000..47f49e57
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.sax.ContentHandlerDecorator;
+
+/**
+ * For now, this only looks for any encryption-data elements.
+ * If found this will throw an EncryptedDocumentException wrapped
+ * in a SAXException.
+ *
+ * If desired, we can add to this to actually extract information
+ * necessary for decryption. Please open an issue or pull
+ * request for this added functionality.
+ *
+ */
+class OpenDocumentManifestHandler extends ContentHandlerDecorator {
+    // An encryption-data element marks the package as encrypted: abort the SAX parse.
+    @Override
+    public void startElement(String namespaceURI, String localName, String qName,
+                             Attributes attrs) throws SAXException {
+        if (!localName.equals("encryption-data")) {
+            return; // every other element is ignored (nothing is forwarded)
+        }
+        throw new SAXException(new EncryptedDocumentException());
+    }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 11922d7d..d717d13d 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -16,12 +16,21 @@
*/
package org.apache.tika.parser.odf;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -36,11 +45,6 @@ import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.io.InputStream;
/**
* Parser for OpenDocument meta.xml
files.
@@ -54,68 +58,54 @@ public class OpenDocumentMetaParser extends XMLParser {
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
- /**
- * @see OfficeOpenXMLCore#SUBJECT
- * @deprecated use OfficeOpenXMLCore#SUBJECT
- */
- @Deprecated
- private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
- Property.composite(Office.INITIAL_AUTHOR,
- new Property[]{Property.externalText("initial-creator")});
-
- private static ContentHandler getDublinCoreHandler(
- Metadata metadata, Property property, String element) {
- return new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, element,
- metadata, property);
+ private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
+ String element) {
+ return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
- private static ContentHandler getMeta(
- ContentHandler ch, Metadata md, Property property, String element) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:" + element),
- META_XPATH.parse("//meta:" + element + "//text()"));
+ private static ContentHandler getMeta(ContentHandler ch, Metadata md, Property property,
+ String element) {
+ Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:" + element),
+ META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
- private static ContentHandler getUserDefined(
- ContentHandler ch, Metadata md) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:user-defined/@meta:name"),
- META_XPATH.parse("//meta:user-defined//text()"));
- // eg Text1 becomes custom:Info1=Text1
+ private static ContentHandler getUserDefined(ContentHandler ch, Metadata md) {
+ Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:user-defined/@meta:name"),
+ META_XPATH.parse("//meta:user-defined//text()"));
+ // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes
+ // custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
- new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
- matcher);
+ new AttributeDependantMetadataHandler(md, "meta:name",
+ Office.USER_DEFINED_METADATA_NAME_PREFIX), matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, String name, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name,
+ String attribute) {
+ Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+ new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, Property property, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ private static ContentHandler getStatistic(ContentHandler ch, Metadata md, Property property,
+ String attribute) {
+ Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
- protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+ static ContentHandler getContentHandler(Metadata md, ParseContext context,
+ ContentHandler... handlers) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
- // Process the Dublin Core Attributes
- ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
- getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+ // Process the Dublin Core Attributes
+ ContentHandler ch =
+ new TeeContentHandler(getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
@@ -129,19 +119,20 @@ public class OpenDocumentMetaParser extends XMLParser {
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "date",
- md, TikaCoreProperties.MODIFIED));
+ ch = new TeeContentHandler(ch,
+ new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "date", md,
+ TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "subject",
- md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
- ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+ ch = new TeeContentHandler(ch,
+ new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "subject", md,
+ OfficeOpenXMLCore.SUBJECT));
- ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Office.KEYWORDS, "keyword");
+
+ ch = getMeta(ch, md, OfficeOpenXMLExtended.TOTAL_TIME, "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
- ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+ ch = getMeta(ch, md, TikaCoreProperties.CREATOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
@@ -157,43 +148,48 @@ public class OpenDocumentMetaParser extends XMLParser {
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
- // Legacy, Tika-1.0 style attributes
- // TODO Remove these in Tika 2.0
- ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
- ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
- ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
-
- // Legacy Statistics Attributes, replaced with real keys above
- // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
- ch = getStatistic(ch, md, "nbPage", "page-count");
- ch = getStatistic(ch, md, "nbPara", "paragraph-count");
- ch = getStatistic(ch, md, "nbWord", "word-count");
- ch = getStatistic(ch, md, "nbCharacter", "character-count");
- ch = getStatistic(ch, md, "nbTab", "table-count");
- ch = getStatistic(ch, md, "nbObject", "object-count");
- ch = getStatistic(ch, md, "nbImg", "image-count");
-
+ if (handlers != null && handlers.length > 0) {
+ ContentHandler[] newHandlers = new ContentHandler[handlers.length + 1];
+ newHandlers[0] = ch;
+ System.arraycopy(handlers, 0, newHandlers, 1, handlers.length);
+ ch = new TeeContentHandler(newHandlers);
+ }
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md,
+ ParseContext context) {
+ return getContentHandler(md, context, super.getContentHandler(ch, md, context));
+ }
+
@Override
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
         super.parse(stream, handler, metadata, context);
         // Copy subject to description for OO2
         String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
         if (odfSubject != null && !odfSubject.equals("") &&
-                (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+                (metadata.get(TikaCoreProperties.DESCRIPTION) == null ||
+                        metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
             metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
         }
+        //reset the dc:subject to include both keywords and subject
+        //We can't rely on composite keys in the MatchingContentHandlers
+        //because those are "setting" not "adding" to the Metadata object
+        List<String> subjects = new ArrayList<>();
+        if (metadata.getValues(Office.KEYWORDS) != null) {
+            subjects.addAll(Arrays.asList(metadata.getValues(Office.KEYWORDS)));
+        }
+
+        if (metadata.getValues(OfficeOpenXMLCore.SUBJECT) != null) {
+            subjects.addAll(Arrays.asList(metadata.getValues(OfficeOpenXMLCore.SUBJECT)));
+        }
+
+        if (subjects.size() > 0) {
+            metadata.set(TikaCoreProperties.SUBJECT, subjects.toArray(new String[0]));
+        }
     }
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 6ba5281f..3d6b467d 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -16,37 +16,44 @@
*/
package org.apache.tika.parser.odf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipInputStream;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* OpenOffice parser
@@ -58,47 +65,48 @@ public class OpenDocumentParser extends AbstractParser {
*/
private static final long serialVersionUID = -6410276875438618287L;
- private static final Set SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet(Arrays.asList(
- MediaType.application("vnd.sun.xml.writer"),
- MediaType.application("vnd.oasis.opendocument.text"),
- MediaType.application("vnd.oasis.opendocument.graphics"),
- MediaType.application("vnd.oasis.opendocument.presentation"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("vnd.oasis.opendocument.chart"),
- MediaType.application("vnd.oasis.opendocument.image"),
- MediaType.application("vnd.oasis.opendocument.formula"),
- MediaType.application("vnd.oasis.opendocument.text-master"),
- MediaType.application("vnd.oasis.opendocument.text-web"),
- MediaType.application("vnd.oasis.opendocument.text-template"),
- MediaType.application("vnd.oasis.opendocument.graphics-template"),
- MediaType.application("vnd.oasis.opendocument.presentation-template"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("vnd.oasis.opendocument.chart-template"),
- MediaType.application("vnd.oasis.opendocument.image-template"),
- MediaType.application("vnd.oasis.opendocument.formula-template"),
- MediaType.application("x-vnd.oasis.opendocument.text"),
- MediaType.application("x-vnd.oasis.opendocument.graphics"),
- MediaType.application("x-vnd.oasis.opendocument.presentation"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("x-vnd.oasis.opendocument.chart"),
- MediaType.application("x-vnd.oasis.opendocument.image"),
- MediaType.application("x-vnd.oasis.opendocument.formula"),
- MediaType.application("x-vnd.oasis.opendocument.text-master"),
- MediaType.application("x-vnd.oasis.opendocument.text-web"),
- MediaType.application("x-vnd.oasis.opendocument.text-template"),
- MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
- MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("x-vnd.oasis.opendocument.chart-template"),
- MediaType.application("x-vnd.oasis.opendocument.image-template"),
- MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+ private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<>(Arrays.asList(MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+ MediaType.application("vnd.oasis.opendocument.presentation"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+ MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+ MediaType.application("vnd.oasis.opendocument.text-template"),
+ MediaType.application("vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("vnd.oasis.opendocument.chart-template"),
+ MediaType.application("vnd.oasis.opendocument.image-template"),
+ MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+ MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+ MediaType.application("x-vnd.oasis.opendocument.text-template"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+ MediaType.application("x-vnd.oasis.opendocument.image-template"),
+ MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
+ private static final String MANIFEST_NAME = "META-INF/manifest.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
+ private boolean extractMacros = false;
public Parser getMetaParser() {
return meta;
@@ -120,10 +128,10 @@ public class OpenDocumentParser extends AbstractParser {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler baseHandler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
@@ -145,85 +153,129 @@ public class OpenDocumentParser extends AbstractParser {
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
-
+ xhtml.startDocument();
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
- EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(xhtml);
+ EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
+ try {
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipStream.close();
+ }
}
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
+ } catch (SAXException e) {
+ if (e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause();
}
+ throw e;
}
// Only now call the end document
- if (handler.getEndDocumentWasCalled()) {
+ if (handler.isEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
- private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
- ZipEntry entry = zipStream.getNextEntry();
- if (entry == null) {
- throw new IOException("No entries found in ZipInputStream");
- }
- do {
- handleZipEntry(entry, zipStream, metadata, context, handler);
- entry = zipStream.getNextEntry();
- } while (entry != null);
+ @Field
+ public void setExtractMacros(boolean extractMacros) {
+ this.extractMacros = extractMacros;
}
- private void handleZipFile(ZipFile zipFile, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, TikaException, SAXException {
+    // Handles every entry of a streamed package. A SAXException without an encrypted-document
+    // cause is recorded so later entries are still read; the first one is rethrown at the end.
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
+                                 EndDocumentShieldingContentHandler handler,
+                                 EmbeddedDocumentUtil embeddedDocumentUtil)
+            throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        if (entry == null) {
+            throw new IOException("No entries found in ZipInputStream");
+        }
+        List<SAXException> exceptions = new ArrayList<>();
+        do {
+            try {
+                handleZipEntry(entry, zipStream, metadata, context, handler,
+                        embeddedDocumentUtil);
+            } catch (SAXException e) {
+                WriteLimitReachedException.throwIfWriteLimitReached(e);
+                if (e.getCause() instanceof EncryptedDocumentException) {
+                    throw (EncryptedDocumentException) e.getCause();
+                }
+                exceptions.add(e);
+            }
+            entry = zipStream.getNextEntry();
+        } while (entry != null);
+        if (!exceptions.isEmpty()) {
+            throw exceptions.get(0);
+        }
+    }
+
+    private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
+                               EndDocumentShieldingContentHandler handler,
+                               EmbeddedDocumentUtil embeddedDocumentUtil)
+            throws IOException, TikaException, SAXException {
         // If we can, process the metadata first, then the
         // rest of the file afterwards (TIKA-1353)
         // Only possible to guarantee that when opened from a file not a stream
-        ZipEntry entry = zipFile.getEntry(META_NAME);
+        ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
         if (entry != null) {
-            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
+                    handler, embeddedDocumentUtil);
+        }
+        entry = zipFile.getEntry(META_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
+                    handler, embeddedDocumentUtil);
         }
         Enumeration<? extends ZipEntry> entries = zipFile.entries();
         while (entries.hasMoreElements()) {
             entry = entries.nextElement();
-            if (!META_NAME.equals(entry.getName())) {
-                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            // manifest and meta were already handled above; do not process them a second time
+            if (!META_NAME.equals(entry.getName()) && !MANIFEST_NAME.equals(entry.getName())) {
+                handleZipEntry(entry, zipFile.getInputStream(entry), metadata,
+                        context, handler, embeddedDocumentUtil);
             }
         }
     }
- private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, SAXException, TikaException {
- if (entry == null) return;
- if (entry.getName().equals("mimetype")) {
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+ ParseContext context, ContentHandler handler,
+ EmbeddedDocumentUtil embeddedDocumentUtil)
+ throws IOException, SAXException, TikaException {
+
+
+ if (entry.getName().contains("manifest.xml")) {
+ checkForEncryption(zip, context);
+ } else if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ ((OpenDocumentContentParser) content)
+ .parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ ((OpenDocumentContentParser) content)
+ .parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
@@ -231,26 +283,87 @@ public class OpenDocumentParser extends AbstractParser {
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
- if (embeddedName.contains("Thumbnails/") ||
- embeddedName.contains("Pictures/")) {
- EmbeddedDocumentExtractor embeddedDocumentExtractor =
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) {
+
Metadata embeddedMetadata = new Metadata();
- embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
- /* if (embeddedName.startsWith("Thumbnails/")) {
+ TikaInputStream stream = TikaInputStream.get(zip);
+
+ embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName());
+ if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
- }*/
+ TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
+ }
+
if (embeddedName.contains("Pictures/")) {
- embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+ MediaType embeddedMimeType =
+ embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
+ if (embeddedMimeType != null) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
+ }
+ stream.reset();
}
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- embeddedDocumentExtractor.parseEmbedded(zip,
- new EmbeddedContentHandler(handler), embeddedMetadata, false);
+
+ if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+ embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler),
+ embeddedMetadata, false);
}
+ } else if (extractMacros && embeddedName.contains("Basic/")) {
+ //process all files under Basic/; let maybeHandleMacro figure
+ //out if it is a macro or not
+ maybeHandleMacro(zip, embeddedName, handler, context);
}
}
}
+
+ private void maybeHandleMacro(InputStream is, String embeddedName, ContentHandler handler,
+ ParseContext context)
+ throws TikaException, IOException, SAXException {
+ //TODO: consider running XMLRootExtractor on the input stream,
+ //or reading the macro manifest for the names of the macros,
+ //rather than relying on the script file name
+ if (ignoreScriptFile(embeddedName)) {
+ return;
+ }
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ handler = new OpenDocumentMacroHandler(handler, context);
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(is),
+ new OfflineContentHandler(new EmbeddedContentHandler(handler)), context);
+ }
+
+ private void checkForEncryption(InputStream stream, ParseContext context)
+ throws SAXException, TikaException, IOException {
+ try {
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ new OpenDocumentManifestHandler())), context);
+ } catch (SAXException e) {
+ if (e.getCause() != null
+ && e.getCause() instanceof EncryptedDocumentException) {
+ throw (EncryptedDocumentException)e.getCause();
+ }
+ //any other SAXException is swallowed; only an EncryptedDocumentException cause is rethrown
+ }
+ }
+
+ private boolean ignoreScriptFile(String embeddedName) {
+ if (embeddedName.contains("Basic/")) {
+ if (embeddedName.contains("script-lb.xml")) {
+ return true;
+ } else if (embeddedName.contains("script-lc.xml")) {
+ return true;
+ }
+ } else {
+ //shouldn't ever get here, but if it isn't under Basic/, ignore it
+ return true;
+ }
+ return false;
+ }
+
+
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
index cbff35e7..8051bffb 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.xml.sax.helpers.DefaultHandler;
-
import java.util.Arrays;
import java.util.List;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
@@ -39,11 +40,12 @@ class AbstractMetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
+
protected AbstractMetadataHandler(Metadata metadata, Property property) {
- this.metadata = metadata;
- this.property = property;
- this.name = property.getName();
- }
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
/**
* Adds the given metadata value. The value is ignored if it is
@@ -59,9 +61,9 @@ class AbstractMetadataHandler extends DefaultHandler {
List previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
if (property != null) {
- metadata.add(property, value);
+ metadata.add(property, value);
} else {
- metadata.add(name, value);
+ metadata.add(name, value);
}
}
} else {
@@ -69,23 +71,23 @@ class AbstractMetadataHandler extends DefaultHandler {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
- if (property != null) {
- if (property.isMultiValuePermitted()) {
- metadata.add(property, value);
- } else {
- // Replace the existing value if isMultiValuePermitted is false
- metadata.set(property, value);
- }
- } else {
- metadata.add(name, value);
- }
+ if (property != null) {
+ if (property.isMultiValuePermitted()) {
+ metadata.add(property, value);
+ } else {
+ // Replace the existing value if isMultiValuePermitted is false
+ metadata.set(property, value);
+ }
+ } else {
+ metadata.add(name, value);
+ }
}
} else {
- if (property != null) {
- metadata.set(property, value);
- } else {
- metadata.set(name, value);
- }
+ if (property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
}
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
index c1795fad..1f6e1d70 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
@@ -16,15 +16,16 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.metadata.Metadata;
+
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
- * value, and the Metadata name is taken from
- * an attribute, with a prefix if required.
+ * value, and the Metadata name is taken from
+ * an attribute, with a prefix if required.
*/
public class AttributeDependantMetadataHandler extends DefaultHandler {
@@ -32,20 +33,20 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
private final String nameHoldingAttribute;
private final String namePrefix;
+ private final StringBuilder buffer = new StringBuilder();
private String name;
- private final StringBuilder buffer = new StringBuilder();
-
- public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+ public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute,
+ String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
}
public void addMetadata(String value) {
- if(name == null || name.length() == 0) {
- // We didn't find the attribute which holds the name
- return;
+ if (name == null || name.length() == 0) {
+ // We didn't find the attribute which holds the name
+ return;
}
if (value.length() > 0) {
String previous = metadata.get(name);
@@ -61,20 +62,19 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
+ public void startElement(String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
- if (namePrefix == null) {
- this.name = rawName;
- } else {
- this.name = namePrefix + rawName;
- }
+ if (namePrefix == null) {
+ this.name = rawName;
+ } else {
+ this.name = namePrefix + rawName;
+ }
}
// All other attributes are ignored
}
-
+
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
index dba5e4cb..cd5d4f3f 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
@@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
@@ -33,26 +34,25 @@ public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String localName;
- public AttributeMetadataHandler(
- String uri, String localName, Metadata metadata, String name) {
+ public AttributeMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
- public AttributeMetadataHandler(
- String uri, String localName, Metadata metadata, Property property) {
- super(metadata, property);
- this.uri = uri;
- this.localName = localName;
- }
+
+ public AttributeMetadataHandler(String uri, String localName, Metadata metadata,
+ Property property) {
+ super(metadata, property);
+ this.uri = uri;
+ this.localName = localName;
+ }
@Override
- public void startElement(
- String uri, String localName, String qName, Attributes attributes)
- throws SAXException {
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
- if (attributes.getURI(i).equals(this.uri)
- && attributes.getLocalName(i).equals(this.localName)) {
+ if (attributes.getURI(i).equals(this.uri) &&
+ attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 5999773e..0a847a23 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -16,45 +16,45 @@
*/
package org.apache.tika.parser.xml;
+import org.xml.sax.ContentHandler;
+
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
-import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 4905318835463880819L;
- private static ContentHandler getDublinCoreHandler(
- Metadata metadata, Property property, String element) {
- return new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, element,
- metadata, property);
+ private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
+ String element) {
+ return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
- protected ContentHandler getContentHandler(
- ContentHandler handler, Metadata metadata, ParseContext context) {
- return new TeeContentHandler(
- super.getContentHandler(handler, metadata, context),
- getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
- getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
- getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
- getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
- getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
- getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
- getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
- getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
- getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
- getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
+ ParseContext context) {
+ return new TeeContentHandler(super.getContentHandler(handler, metadata, context),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.SUBJECT, "subject"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
index d7a81dc4..6a43315a 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
@@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
+import java.util.Arrays;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
-import java.util.Arrays;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
/**
* SAX event handler that maps the contents of an XML element into
@@ -44,21 +45,17 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
private final Metadata metadata;
private final String name;
- private Property targetProperty;
-
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
-
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
-
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
-
+ private Property targetProperty;
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
@@ -70,13 +67,12 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
/**
* Constructor for string metadata keys.
*
- * @param uri the uri of the namespace of the element
+ * @param uri the uri of the namespace of the element
* @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param name the Tika metadata field key
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
*/
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, String name) {
+ public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@@ -91,15 +87,15 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param name the Tika metadata field key
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
- * @param allowEmptyValues add empty values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
*/
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name,
+ boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@@ -113,13 +109,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
/**
* Constructor for Property metadata keys.
*
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, Property targetProperty) {
+ public ElementMetadataHandler(String uri, String localName, Metadata metadata,
+ Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@@ -135,15 +131,16 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
- * @param uri the uri of the namespace of the element
- * @param localName the local name of the element
- * @param metadata the Tika metadata object to populate
- * @param targetProperty the Tika metadata Property key
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
- * @param allowEmptyValues add empty values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
*/
- public ElementMetadataHandler(
- String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ public ElementMetadataHandler(String uri, String localName, Metadata metadata,
+ Property targetProperty, boolean allowDuplicateValues,
+ boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@@ -162,16 +159,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
- (parentMatchLevel > 0 &&
- ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
- (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
- )
- );
+ (parentMatchLevel > 0 &&
+ ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+ (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))));
}
@Override
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
+ public void startElement(String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
@@ -230,7 +224,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
value = "";
}
String[] previous = metadata.getValues(name);
- if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+ if (previous == null || !Arrays.asList(previous).contains(value) ||
+ allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
index 1f396901..47699b4c 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -16,64 +16,68 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
+import org.apache.commons.codec.binary.Base64;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("x-fictionbook+xml"));
+ Collections.singleton(MediaType.application("x-fictionbook+xml"));
+
@Override
public Set getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
- protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
+ ParseContext context) {
return new BinaryElementsDataHandler(
- EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
-
- private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
-
+ private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
+ private boolean binaryMode = false;
private Metadata metadata;
- private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
- private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+ private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor,
+ ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
- public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
- metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@@ -83,11 +87,8 @@ public class FictionBookParser extends XMLParser {
if (binaryMode) {
try {
partExtractor.parseEmbedded(
- new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
- handler,
- metadata,
- true
- );
+ new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+ handler, metadata, true);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
index 3fee00a3..e21aebe2 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
@@ -16,19 +16,20 @@
*/
package org.apache.tika.parser.xml;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
/**
* This adds Metadata entries with a specified name for
- * the textual content of a node (if present), and
- * all attribute values passed through the matcher
- * (but not their names).
+ * the textual content of a node (if present), and
+ * all attribute values passed through the matcher
+ * (but not their names).
*
* @deprecated Use the {@link AttributeMetadataHandler} and
- * {@link ElementMetadataHandler} classes instead
+ * {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
@@ -44,11 +45,12 @@ public class MetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
+
public MetadataHandler(Metadata metadata, Property property) {
- this.metadata = metadata;
- this.property = property;
- this.name = property.getName();
- }
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
public void addMetadata(String value) {
if (value.length() > 0) {
@@ -56,11 +58,11 @@ public class MetadataHandler extends DefaultHandler {
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
-
+
if (this.property != null) {
- metadata.set(property, value);
+ metadata.set(property, value);
} else {
- metadata.set(name, value);
+ metadata.set(name, value);
}
}
}
@@ -70,14 +72,13 @@ public class MetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
- public void startElement(
- String uri, String localName, String name, Attributes attributes) {
+ public void startElement(String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}
}
-
+
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/TextAndAttributeXMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/TextAndAttributeXMLParser.java
new file mode 100644
index 00000000..9d4b34cc
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/TextAndAttributeXMLParser.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TextAndAttributeContentHandler;
+
+public class TextAndAttributeXMLParser extends XMLParser {
+
+ private static final long serialVersionUID = 7796914007312429473L;
+
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
+ ParseContext context) {
+ return new TextAndAttributeContentHandler(handler, true);
+ }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java
index e247a6c4..34d782cd 100644
--- a/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java
@@ -16,7 +16,17 @@
*/
package org.apache.tika.parser.xml;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -28,52 +38,41 @@ import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -6028836725280212837L;
- private static final Set SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet(Arrays.asList(
- MediaType.application("xml"),
- MediaType.image("svg+xml"))));
+ private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<>(
+ Arrays.asList(MediaType.application("xml"), MediaType.image("svg+xml"))));
public Set getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
- final XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
+ final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
- XMLReaderUtils.parseSAX(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(new EmbeddedContentHandler(
- getContentHandler(tagged, metadata, context))), context);
+ XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler(
+ new EmbeddedContentHandler(
+ getContentHandler(tagged, metadata, context))),
+ context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
@@ -83,8 +82,8 @@ public class XMLParser extends AbstractParser {
}
}
- protected ContentHandler getContentHandler(
- ContentHandler handler, Metadata metadata, ParseContext context) {
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
+ ParseContext context) {
return new TextContentHandler(handler, true);
}
}
diff --git a/modules/extract/src/main/java/org/apache/tika/utils/StringUtils.java b/modules/extract/src/main/java/org/apache/tika/utils/StringUtils.java
new file mode 100644
index 00000000..53fd47ce
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/utils/StringUtils.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+public class StringUtils {
+
+ /**
+ * The empty String {@code ""}; returned by the {@code repeat} methods when the count is zero or negative.
+ *
+ * @since 2.0
+ */
+ public static final String EMPTY = "";
+
+ /**
+ * A String for a space character; substituted by {@link #leftPad(String, int, String)} when its pad String is null or empty.
+ *
+ * @since 3.2
+ */
+ public static final String SPACE = " ";
+
+ static int PAD_LIMIT = 10000; // count threshold up to which single-char padding/repeating is delegated to repeat(char, int). NOTE(review): mutable package-private static - consider making it final
+
+ public static boolean isEmpty(final CharSequence cs) {
+ return cs == null || cs.length() == 0; // null and zero-length both count as empty; whitespace-only input does not
+ }
+
+ public static boolean isBlank(final String s) {
+ return s == null || s.trim().length() == 0; // blank = null, empty, or nothing left after String.trim()
+ }
+
+ /**
+ * Left pad a String with a specified String.
+ *
+ * Pad to a size of {@code size}.
+ *
+ * <pre>
+ * StringUtils.leftPad(null, *, *) = null
+ * StringUtils.leftPad("", 3, "z") = "zzz"
+ * StringUtils.leftPad("bat", 3, "yz") = "bat"
+ * StringUtils.leftPad("bat", 5, "yz") = "yzbat"
+ * StringUtils.leftPad("bat", 8, "yz") = "yzyzybat"
+ * StringUtils.leftPad("bat", 1, "yz") = "bat"
+ * StringUtils.leftPad("bat", -1, "yz") = "bat"
+ * StringUtils.leftPad("bat", 5, null) = "  bat"
+ * StringUtils.leftPad("bat", 5, "") = "  bat"
+ * </pre>
+ *
+ * @param str the String to pad out, may be null
+ * @param size the size to pad to
+ * @param padStr the String to pad with, null or empty treated as single space
+ * @return left padded String or original String if no padding is necessary,
+ * {@code null} if null String input
+ */
+ public static String leftPad(final String str, final int size, String padStr) {
+ if (str == null) {
+ return null;
+ }
+ if (isEmpty(padStr)) {
+ padStr = SPACE;
+ }
+ final int padLen = padStr.length();
+ final int strLen = str.length();
+ final int pads = size - strLen; // number of pad characters still needed
+ if (pads <= 0) {
+ return str; // returns original String when possible
+ }
+ if (padLen == 1 && pads <= PAD_LIMIT) { // single-char pad within the limit: use the char overload
+ return leftPad(str, size, padStr.charAt(0));
+ }
+
+ if (pads == padLen) {
+ return padStr.concat(str);
+ } else if (pads < padLen) {
+ return padStr.substring(0, pads).concat(str); // only a prefix of padStr fits
+ } else {
+ final char[] padding = new char[pads];
+ final char[] padChars = padStr.toCharArray();
+ for (int i = 0; i < pads; i++) {
+ padding[i] = padChars[i % padLen]; // cycle through padStr, truncating the last repetition if needed
+ }
+ return new String(padding).concat(str);
+ }
+ }
+
+ /** Left pads {@code str} with {@code padChar} up to {@code size}; null input gives null, and str is returned as-is when it is already long enough. */
+ public static String leftPad(final String str, final int size, final char padChar) {
+ if (str == null) {
+ return null;
+ }
+ final int pads = size - str.length();
+ if (pads <= 0) {
+ return str; // returns original String when possible
+ }
+ if (pads > PAD_LIMIT) { // above the limit the String overload builds the padding itself instead of calling back here
+ return leftPad(str, size, String.valueOf(padChar));
+ }
+ return repeat(padChar, pads).concat(str);
+ }
+
+ /**
+ * Returns padding using the specified delimiter repeated
+ * to a given length.
+ *
+ * <pre>
+ * StringUtils.repeat('e', 0) = ""
+ * StringUtils.repeat('e', 3) = "eee"
+ * StringUtils.repeat('e', -2) = ""
+ * </pre>
+ *
+ * Note: this method does not support padding with
+ * Unicode Supplementary Characters
+ * as they require a pair of {@code char}s to be represented.
+ * If you need to support full I18N in your applications
+ * consider using {@link #repeat(String, int)} instead.
+ *
+ *
+ * @param ch character to repeat
+ * @param repeat number of times to repeat char, negative treated as zero
+ * @return String with repeated character
+ * @see #repeat(String, int)
+ */
+ public static String repeat(final char ch, final int repeat) {
+ if (repeat <= 0) {
+ return EMPTY;
+ }
+ final char[] buf = new char[repeat];
+ for (int i = repeat - 1; i >= 0; i--) { // fill every slot, walking from the last index down to 0
+ buf[i] = ch;
+ }
+ return new String(buf);
+ }
+
+ // Padding
+ //-----------------------------------------------------------------------
+
+ /**
+ * Repeat a String {@code repeat} times to form a
+ * new String.
+ *
+ * <pre>
+ * StringUtils.repeat(null, 2) = null
+ * StringUtils.repeat("", 0) = ""
+ * StringUtils.repeat("", 2) = ""
+ * StringUtils.repeat("a", 3) = "aaa"
+ * StringUtils.repeat("ab", 2) = "abab"
+ * StringUtils.repeat("a", -2) = ""
+ * </pre>
+ *
+ * @param str the String to repeat, may be null
+ * @param repeat number of times to repeat str, negative treated as zero
+ * @return a new String consisting of the original String repeated,
+ * {@code null} if null String input
+ */
+ public static String repeat(final String str, final int repeat) {
+ // Performance tuned for 2.0 (JDK1.4)
+
+ if (str == null) {
+ return null;
+ }
+ if (repeat <= 0) {
+ return EMPTY;
+ }
+ final int inputLength = str.length();
+ if (repeat == 1 || inputLength == 0) {
+ return str; // nothing to build: a single copy, or an empty String repeated
+ }
+ if (inputLength == 1 && repeat <= PAD_LIMIT) {
+ return repeat(str.charAt(0), repeat);
+ }
+
+ final int outputLength = inputLength * repeat; // NOTE(review): plain int product, not guarded against overflow for very large inputs
+ switch (inputLength) {
+ case 1: // only reached when repeat > PAD_LIMIT; smaller counts already returned above
+ return repeat(str.charAt(0), repeat);
+ case 2:
+ final char ch0 = str.charAt(0);
+ final char ch1 = str.charAt(1);
+ final char[] output2 = new char[outputLength];
+ for (int i = repeat * 2 - 2; i >= 0; i--, i--) { // write the two-char pair back to front, stepping by 2
+ output2[i] = ch0;
+ output2[i + 1] = ch1;
+ }
+ return new String(output2);
+ default:
+ final StringBuilder buf = new StringBuilder(outputLength); // pre-sized to the final length
+ for (int i = 0; i < repeat; i++) {
+ buf.append(str);
+ }
+ return buf.toString();
+ }
+ }
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
index 3a260fcc..3fae1e1f 100644
--- a/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
+++ b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
@@ -16,7 +16,7 @@ import munit._
class OdfExtractTest extends FunSuite {
val files = List(
- ExampleFiles.examples_sample_odt -> 6372,
+ ExampleFiles.examples_sample_odt -> 6367,
ExampleFiles.examples_sample_ods -> 717
)
diff --git a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
index 588d37cc..f9bd0912 100644
--- a/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
+++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala
@@ -20,7 +20,7 @@ import fs2.Stream
import docspell.common._
import org.apache.tika.config.TikaConfig
-import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
+import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties}
import org.apache.tika.mime.MediaType
import org.apache.tika.parser.txt.Icu4jEncodingDetector
@@ -40,7 +40,7 @@ object TikaMimetype {
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
- hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
+ hint.filename.foreach(md.set(TikaCoreProperties.RESOURCE_NAME_KEY, _))
hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 09184b7f..a7f77d0a 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -38,7 +38,7 @@ object Dependencies {
val ScalaJavaTimeVersion = "2.3.0"
val Slf4jVersion = "1.7.31"
val StanfordNlpVersion = "4.2.2"
- val TikaVersion = "1.27"
+ val TikaVersion = "2.0.0"
val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "3.51.1"
val TwelveMonkeysVersion = "3.7.0"