Update tika-core to 2.0.0

Include new ODF parser from tika-2.0.0
This commit is contained in:
Scala Steward 2021-07-19 14:25:26 +02:00 committed by eikek
parent bde8af8d58
commit 558007235b
22 changed files with 1653 additions and 983 deletions

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.exception;
import org.xml.sax.SAXException;
/**
 * {@link SAXException} whose message reports that a document contained more
 * characters than the requested write limit.
 * <p>
 * The static helpers inspect an exception and its chain of causes for an
 * instance of this class.
 */
public class WriteLimitReachedException extends SAXException {

    //in case of (hopefully impossible) cyclic exception
    private static final int MAX_DEPTH = 100;

    /** The limit that is reported in {@link #getMessage()}. */
    private final int writeLimit;

    /**
     * @param writeLimit the character limit to report in the message
     */
    public WriteLimitReachedException(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    @Override
    public String getMessage() {
        return "Your document contained more than " + writeLimit
                + " characters, and so your requested limit has been"
                + " reached. To receive the full text of the document,"
                + " increase your limit. (Text up to the limit is"
                + " however available).";
    }

    /**
     * Checks whether the given exception (or any of its root causes) was
     * thrown by this handler as a signal of reaching the write limit.
     *
     * @param t throwable
     * @return <code>true</code> if the write limit was reached,
     * <code>false</code> otherwise
     * @since Apache Tika 2.0
     */
    public static boolean isWriteLimitReached(Throwable t) {
        return findWriteLimitReached(t) != null;
    }

    /**
     * Rethrows the {@link WriteLimitReachedException} found in the given
     * exception or in its chain of causes; returns normally if there is none.
     *
     * @param ex exception to inspect, may be <code>null</code>
     * @throws SAXException the {@link WriteLimitReachedException} instance
     *                      found in the chain
     */
    public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
        WriteLimitReachedException found = findWriteLimitReached(ex);
        if (found != null) {
            throw found;
        }
    }

    /**
     * Walks the cause chain starting at <code>t</code> and returns the first
     * {@link WriteLimitReachedException} encountered.
     *
     * @param t throwable to start from, may be <code>null</code>
     * @return the matching exception, or <code>null</code> if none is found
     * within {@link #MAX_DEPTH} causes
     */
    private static WriteLimitReachedException findWriteLimitReached(Throwable t) {
        Throwable current = t;
        // depth 0 is the throwable itself; the bound stops a cyclic cause chain
        for (int depth = 0; current != null && depth <= MAX_DEPTH; depth++) {
            if (current instanceof WriteLimitReachedException) {
                return (WriteLimitReachedException) current;
            }
            current = current.getCause();
        }
        return null;
    }
}

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * Handler for macros in flat open documents.
 * <p>
 * Text found inside <code>source-code</code> elements is buffered and, when the
 * element ends, handed to the {@link EmbeddedDocumentExtractor} as an embedded
 * resource of type MACRO. The name is taken from the <code>name</code> attribute
 * of the most recently started <code>module</code> element.
 * <p>
 * The overridden SAX callbacks do not call the superclass implementations, so
 * events are not forwarded to the wrapped handler; it is only passed on to the
 * extractor when a macro is parsed.
 */
class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator {

    /** Local name of the element that carries the macro's name attribute. */
    static final String MODULE = "module";

    /** Attribute read from a {@link #MODULE} element to name the macro. */
    static final String NAME = "name";

    /** Local name of the element whose character content is the macro source. */
    private static final String SOURCE_CODE = "source-code";

    private final ContentHandler contentHandler;
    private final ParseContext parseContext;

    /** Accumulates the character data of the current source-code element. */
    private final StringBuilder macroBuffer = new StringBuilder();

    String macroName = null;
    boolean inMacro = false;

    /** Looked up from the parse context on first use in {@link #handleMacro()}. */
    private EmbeddedDocumentExtractor embeddedDocumentExtractor;

    /**
     * @param contentHandler handler given to the embedded document extractor
     *                       when a macro is parsed
     * @param parseContext   context used to look up the embedded document extractor
     */
    FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
        super(contentHandler);
        this.contentHandler = contentHandler;
        this.parseContext = parseContext;
    }

    /**
     * Records the macro name on a module element and starts buffering on a
     * source-code element. Matching is done on the local name only.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
            throws SAXException {
        if (MODULE.equals(localName)) {
            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
        } else if (SOURCE_CODE.equals(localName)) {
            inMacro = true;
        }
    }

    /**
     * Buffers character data while inside a source-code element; all other
     * character data is dropped.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (inMacro) {
            macroBuffer.append(ch, start, length);
        }
    }

    /**
     * On the end of a source-code element, hands the buffered macro to
     * {@link #handleMacro()} and then clears the macro state.
     *
     * @throws SAXException if handling the macro fails; an {@link IOException}
     *                      is wrapped in a {@link SAXException}
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        if (SOURCE_CODE.equals(localName)) {
            try {
                handleMacro();
            } catch (IOException e) {
                throw new SAXException(e);
            } finally {
                // reset even on failure so the next macro starts clean
                resetMacroState();
            }
        }
    }

    /** Clears the buffer, the macro name and the in-macro flag. */
    protected void resetMacroState() {
        macroBuffer.setLength(0);
        macroName = null;
        inMacro = false;
    }

    /**
     * Passes the buffered macro source, encoded as UTF-8, to the embedded
     * document extractor if the extractor accepts it.
     *
     * @throws IOException  if reading or parsing the embedded stream fails
     * @throws SAXException if the extractor reports a SAX error
     */
    protected void handleMacro() throws IOException, SAXException {
        byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8);

        if (embeddedDocumentExtractor == null) {
            embeddedDocumentExtractor =
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
        }
        Metadata embeddedMetadata = new Metadata();
        if (!isBlank(macroName)) {
            embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName);
        }
        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            try (InputStream is = TikaInputStream.get(bytes)) {
                embeddedDocumentExtractor
                        .parseEmbedded(is, contentHandler, embeddedMetadata, false);
            }
        }
    }

    /** Returns true if the string is null or contains only whitespace. */
    private static boolean isBlank(String s) {
        return s == null || s.trim().isEmpty();
    }
}

View File

@ -1,31 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.apache.tika.sax.ContentHandlerDecorator;
/**
* Content handler decorator that:<ul>
@ -35,14 +36,11 @@ import java.util.Locale;
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
"http://openoffice.org/2000/";
private static final String OLD_NS = "http://openoffice.org/2000/";
private static final String NEW_NS =
"urn:oasis:names:tc:opendocument:xmlns:";
private static final String NEW_NS = "urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
"-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
private static final String DTD_PUBLIC_ID = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
@ -57,27 +55,24 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator {
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes atts) throws SAXException {
public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
throws SAXException {
AttributesImpl natts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
natts.addAttribute(
mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i), atts.getType(i), atts.getValue(i));
natts.addAttribute(mapOldNS(atts.getURI(i)), atts.getLocalName(i), atts.getQName(i),
atts.getType(i), atts.getValue(i));
}
super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
}
@Override
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
throws SAXException {
super.endElement(mapOldNS(namespaceURI), localName, qName);
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
public void startPrefixMapping(String prefix, String uri) throws SAXException {
super.startPrefixMapping(prefix, mapOldNS(uri));
}
@ -87,13 +82,13 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator {
*/
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) ||
DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
return super.resolveEntity(publicId, systemId);
}
}
}
}

View File

@ -0,0 +1,564 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import javax.xml.namespace.QName;
import org.apache.commons.codec.binary.Base64;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
/**
 * Handler for the body content of flat ODT files and for the content.xml of
 * traditional compressed ODT files.
 * <p>
 * ODF elements are translated to XHTML through the static {@link #MAPPINGS}
 * table, with special handling in {@link #startElement} and {@link #endElement}
 * for headings, lists, spans, paragraphs, annotations/notes, images and
 * base64 <code>binary-data</code> payloads. Text and paragraph styles declared
 * in the document are collected so that bold, italic and underline can be
 * emitted as <code>b</code>, <code>i</code> and <code>u</code> elements.
 */
class OpenDocumentBodyHandler extends ElementMappingContentHandler {

    public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
    public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
    public static final String STYLE_NS = "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
    public static final String FORMATTING_OBJECTS_NS =
            "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
    public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
    public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
    public static final String PRESENTATION_NS =
            "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
    public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
    protected static final char[] TAB = new char[]{'\t'};
    private static final String BINARY_DATA = "binary-data";
    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
    /**
     * Mappings between ODF tag names and XHTML tag names
     * (including attributes). All other tag names/attributes are ignored
     * and left out from event stream.
     */
    private static final HashMap<QName, TargetElement> MAPPINGS =
            new HashMap<>();
    private static final char[] SPACE = new char[]{' '};
    private static final String CLASS = "class";
    private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
    private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
    private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");

    static {
        // general mappings of text:-tags
        MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
        // text:h-tags are mapped specifically in startElement/endElement
        MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
        MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
        MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span"));
        MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML,
                "span"));
        MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML,
                "span"));
        MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML,
                "object"));
        MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
        MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
        MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
        MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));

        final HashMap<QName, QName> aAttsMapping = new HashMap<>();
        aAttsMapping.put(new QName(XLINK_NS, "href"), new QName("href"));
        aAttsMapping.put(new QName(XLINK_NS, "title"), new QName("title"));
        MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a",
                aAttsMapping));
        MAPPINGS.put(new QName(DRAW_NS, "a"), new TargetElement(XHTML, "a",
                aAttsMapping));

        // create HTML tables from table:-tags
        MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
        // repeating of rows is ignored; for columns, see below!
        MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));
        // special mapping for rowspan/colspan attributes
        final HashMap<QName, QName> tableCellAttsMapping = new HashMap<>();
        tableCellAttsMapping
                .put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
        tableCellAttsMapping.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
        /* TODO: The following is not correct, the cell should be repeated not spanned!
         * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
         * Problems may occur when both spanning and repeating is given, which is not allowed by
         * spec.
         * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
         * only for empty cells.
         */
        tableCellAttsMapping
                .put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
        MAPPINGS.put(new QName(TABLE_NS, "table-cell"),
                new TargetElement(XHTML, "td", tableCellAttsMapping));
    }

    private final ContentHandler handler;
    private final ParseContext parseContext;
    // bit i is set when the open element at depth i is a text node
    private final BitSet textNodeStack = new BitSet();
    //have we written the start style tags
    //yet for the current text style
    boolean hasWrittenStartStyleTags = false;
    //if we're in a binary-data tag
    boolean inBinaryData = false;
    private EmbeddedDocumentExtractor embeddedDocumentExtractor;
    private StringBuilder base64BinaryDataBuffer = new StringBuilder();
    private int nodeDepth = 0;
    // number of currently open elements whose whole content is filtered out
    private int completelyFiltered = 0;
    private Stack<String> headingStack = new Stack<>();
    private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<>();
    private Map<String, TextStyle> textStyleMap = new HashMap<>();
    private Map<String, ListStyle> listStyleMap = new HashMap<>();
    private String currParagraphStyleName; //paragraph style name
    private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
    private String currTextStyleName;
    // one entry per open list; the entry is null when the list has no known style
    private Stack<ListStyle> listStyleStack = new Stack<>();
    private ListStyle listStyle;
    // True if we are currently in the named style:
    private boolean curUnderlined;
    private boolean curBold;
    private boolean curItalic;
    //<p> can appear inside comments and other things that are already inside <p>
    //we need to track our pDepth and only output <p> if we're at the main level
    private int pDepth = 0;

    /**
     * @param handler      handler that receives the generated XHTML events
     * @param parseContext context used to look up the embedded document extractor
     */
    OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext) {
        super(handler, MAPPINGS);
        this.handler = handler;
        this.parseContext = parseContext;
    }

    /** Builds an attribute list holding a single un-namespaced CDATA attribute. */
    private static Attributes buildAttributes(String key, String value) {
        AttributesImpl attrs = new AttributesImpl();
        attrs.addAttribute("", key, key, "CDATA", value);
        return attrs;
    }

    /**
     * Decides whether an <code>fo:font-weight</code> value denotes bold text:
     * the keywords "bold" and "bolder", or a number greater than 500.
     * Null, empty and non-numeric values are treated as not bold instead of
     * raising an unchecked exception on malformed documents.
     */
    private static boolean isBoldFontWeight(String fontWeight) {
        if (fontWeight == null) {
            return false;
        }
        if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
            return true;
        }
        if (fontWeight.isEmpty() || !Character.isDigit(fontWeight.charAt(0))) {
            return false;
        }
        try {
            return Integer.parseInt(fontWeight) > 500;
        } catch (NumberFormatException ignored) {
            // starts with a digit but is not an integer (e.g. "700x"): not bold
            return false;
        }
    }

    /**
     * Buffers base64 data while inside a binary-data element; otherwise
     * forwards text only when nothing is being filtered and the innermost
     * open element is a text node, writing pending style tags first.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (inBinaryData) {
            base64BinaryDataBuffer.append(ch, start, length);
            return;
        }
        // only forward content of tags from text:-namespace
        if (completelyFiltered == 0 && nodeDepth > 0 && textNodeStack.get(nodeDepth - 1)) {
            if (!hasWrittenStartStyleTags) {
                updateStyleTags();
                hasWrittenStartStyleTags = true;
            }
            super.characters(ch, start, length);
        }
    }

    // helper for checking tags which need complete filtering
    // (with sub-tags)
    private boolean needsCompleteFiltering(String namespaceURI, String localName) {
        if (TEXT_NS.equals(namespaceURI)) {
            return localName.endsWith("-template") || localName.endsWith("-style");
        }
        return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
    }

    /**
     * Maps the <code>text:outline-level</code> attribute to an XHTML heading
     * tag name between "h1" and "h6". A missing or non-numeric level yields
     * "h1".
     */
    private String getXHTMLHeaderTagName(Attributes atts) {
        String depthStr = atts.getValue(TEXT_NS, "outline-level");
        if (depthStr == null) {
            return "h1";
        }

        int depth;
        try {
            depth = Integer.parseInt(depthStr);
        } catch (NumberFormatException ignored) {
            // malformed level in the document: fall back like a missing attribute
            return "h1";
        }
        if (depth >= 6) {
            return "h6";
        } else if (depth <= 1) {
            return "h1";
        } else {
            return "h" + depth;
        }
    }

    /**
     * Check if a node is a text node
     */
    private boolean isTextNode(String namespaceURI, String localName) {
        if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") &&
                !localName.equals("page-count")) {
            return true;
        }
        if (SVG_NS.equals(namespaceURI)) {
            return "title".equals(localName) || "desc".equals(localName);
        }
        return false;
    }

    /**
     * Opens an XHTML list element for a <code>text:list</code>. The tag is
     * taken from the named list style when it is known, otherwise "ul".
     * An entry is always pushed (null when there is no known style) so that
     * {@link #endList()} pops exactly the entry belonging to this list, even
     * for an unstyled list nested inside a styled one.
     */
    private void startList(String name) throws SAXException {
        ListStyle style = name != null ? listStyleMap.get(name) : null;
        String elementName = style != null ? style.getTag() : "ul";
        listStyleStack.push(style);
        handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
    }

    /** Closes the list element opened by the matching {@link #startList(String)}. */
    private void endList() throws SAXException {
        String elementName = "ul";
        if (!listStyleStack.isEmpty()) {
            ListStyle style = listStyleStack.pop();
            elementName = style != null ? style.getTag() : "ul";
        }
        handler.endElement(XHTML, elementName, elementName);
    }

    /** Selects the text style for a span; style tags are written lazily on text. */
    private void startSpan(String name) throws SAXException {
        if (name == null) {
            return;
        }

        currTextStyle = textStyleMap.get(name);
        hasWrittenStartStyleTags = false;
    }

    /**
     * Opens a paragraph. Only the outermost paragraph produces a
     * <code>p</code> element; nested ones are replaced by a space.
     */
    private void startParagraph(String styleName) throws SAXException {
        if (pDepth == 0) {
            handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
            if (styleName != null) {
                currTextStyle = paragraphTextStyleMap.get(styleName);
            }
            hasWrittenStartStyleTags = false;
        } else {
            handler.characters(SPACE, 0, SPACE.length);
        }
        pDepth++;
    }

    /** Closes open style tags and ends the paragraph (or emits a space if nested). */
    private void endParagraph() throws SAXException {
        closeStyleTags();
        if (pDepth == 1) {
            handler.endElement(XHTML, "p", "p");
        } else {
            handler.characters(SPACE, 0, SPACE.length);
        }
        pDepth--;
    }

    /**
     * Brings the open b/i/u elements in line with the current text style,
     * keeping them nested in the order b, i, u.
     */
    private void updateStyleTags() throws SAXException {

        if (currTextStyle == null) {
            closeStyleTags();
            return;
        }
        if (currTextStyle.bold != curBold) {
            // Enforce nesting -- must close u and i tags
            if (curUnderlined) {
                handler.endElement(XHTML, "u", "u");
                curUnderlined = false;
            }
            if (curItalic) {
                handler.endElement(XHTML, "i", "i");
                curItalic = false;
            }
            if (currTextStyle.bold) {
                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
            } else {
                handler.endElement(XHTML, "b", "b");
            }
            curBold = currTextStyle.bold;
        }

        if (currTextStyle.italic != curItalic) {
            // Enforce nesting -- must close u tag
            if (curUnderlined) {
                handler.endElement(XHTML, "u", "u");
                curUnderlined = false;
            }
            if (currTextStyle.italic) {
                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
            } else {
                handler.endElement(XHTML, "i", "i");
            }
            curItalic = currTextStyle.italic;
        }

        if (currTextStyle.underlined != curUnderlined) {
            if (currTextStyle.underlined) {
                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
            } else {
                handler.endElement(XHTML, "u", "u");
            }
            curUnderlined = currTextStyle.underlined;
        }
    }

    // NOTE(review): not called anywhere in this class
    private void endSpan() throws SAXException {
        updateStyleTags();
    }

    /** Closes any open u/i/b elements and forgets the current text style. */
    private void closeStyleTags() throws SAXException {
        // Close any still open style tags
        if (curUnderlined) {
            handler.endElement(XHTML, "u", "u");
            curUnderlined = false;
        }
        if (curItalic) {
            handler.endElement(XHTML, "i", "i");
            curItalic = false;
        }
        if (curBold) {
            handler.endElement(XHTML, "b", "b");
            curBold = false;
        }
        currTextStyle = null;
        hasWrittenStartStyleTags = false;
    }

    /**
     * Handles the start of an ODF element: emits <code>img</code> for
     * draw:image, switches to base64 buffering for binary-data, records
     * style definitions, tracks text-node depth and filtering, and finally
     * emits the XHTML start event for the element.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
            throws SAXException {

        if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
            String link = attrs.getValue(XLINK_NS, "href");
            AttributesImpl attr = new AttributesImpl();
            if (!StringUtils.isEmpty(link)) {
                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
            }
            handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
            handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
        }

        // binary-data returns before nodeDepth is touched; endElement mirrors this
        if (BINARY_DATA.equals(localName)) {
            inBinaryData = true;
            return;
        }
        // keep track of current node type. If it is a text node,
        // a bit at the current depth its set in textNodeStack.
        // characters() checks the top bit to determine, if the
        // actual node is a text node to print out nodeDepth contains
        // the depth of the current node and also marks top of stack.
        assert nodeDepth >= 0;

        // Set styles
        if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
            String family = attrs.getValue(STYLE_NS, "family");
            if ("text".equals(family)) {
                currTextStyle = new TextStyle();
                currTextStyleName = attrs.getValue(STYLE_NS, "name");
            } else if ("paragraph".equals(family)) {
                currTextStyle = new TextStyle();
                currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
            }
        } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
            listStyle = new ListStyle();
            String name = attrs.getValue(STYLE_NS, "name");
            listStyleMap.put(name, listStyle);
        } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) &&
                "text-properties".equals(localName)) {
            String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
            if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
                currTextStyle.italic = true;
            }
            String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
            if (isBoldFontWeight(fontWeight)) {
                currTextStyle.bold = true;
            }
            String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
            if (underlineStyle != null && !underlineStyle.equals("none")) {
                currTextStyle.underlined = true;
            }
        } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
            if ("list-level-style-bullet".equals(localName)) {
                listStyle.ordered = false;
            } else if ("list-level-style-number".equals(localName)) {
                listStyle.ordered = true;
            }

        }

        textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName));
        // filter *all* content of some tags
        assert completelyFiltered >= 0;

        if (needsCompleteFiltering(namespaceURI, localName)) {
            completelyFiltered++;
        }
        // call next handler if no filtering
        if (completelyFiltered == 0) {
            // special handling of text:h, that are directly passed
            // to incoming handler
            if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
                handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
            } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
                startList(attrs.getValue(TEXT_NS, "style-name"));
            } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
                startSpan(attrs.getValue(TEXT_NS, "style-name"));
            } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
                startParagraph(attrs.getValue(TEXT_NS, "style-name"));
            } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
                handler.characters(SPACE, 0, 1);
            } else if ("annotation".equals(localName)) {
                // NOTE(review): local name is "span" but qName is "p" here, and
                // endElement closes with the ODF local name -- confirm intended
                closeStyleTags();
                handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
            } else if ("note".equals(localName)) {
                closeStyleTags();
                handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
            } else if ("notes".equals(localName)) {
                closeStyleTags();
                handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
            } else {
                super.startElement(namespaceURI, localName, qName, attrs);
            }
        }
    }

    /**
     * Handles the end of an ODF element: processes buffered binary data,
     * stores finished style definitions, emits the XHTML end event and
     * unwinds the filtering and depth counters.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        if (BINARY_DATA.equals(localName)) {
            inBinaryData = false;
            try {
                processBinaryData();
            } catch (IOException e) {
                throw new SAXException(e);
            }
            return;
        }
        if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
            if (currTextStyle != null && currTextStyleName != null) {
                textStyleMap.put(currTextStyleName, currTextStyle);
                currTextStyleName = null;
                currTextStyle = null;
            } else if (currTextStyle != null && currParagraphStyleName != null) {
                paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
                currParagraphStyleName = null;
                currTextStyle = null;
            }
        } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
            listStyle = null;
        }

        // call next handler if no filtering
        if (completelyFiltered == 0) {
            // special handling of text:h, that are directly passed
            // to incoming handler
            if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                final String el = headingStack.pop();
                // NOTE(review): closed with the ODF namespace although the start
                // event used the XHTML namespace -- confirm intended
                handler.endElement(namespaceURI, el, el);
            } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
                endList();
            } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
                currTextStyle = null;
                hasWrittenStartStyleTags = false;
            } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
                endParagraph();
            } else if ("annotation".equals(localName) || "note".equals(localName) ||
                    "notes".equals(localName)) {
                closeStyleTags();
                handler.endElement(namespaceURI, localName, localName);
            } else {
                super.endElement(namespaceURI, localName, qName);
            }

            // special handling of tabulators
            if (TEXT_NS.equals(namespaceURI) &&
                    ("tab-stop".equals(localName) || "tab".equals(localName))) {
                this.characters(TAB, 0, TAB.length);
            }
        }

        // revert filter for *all* content of some tags
        if (needsCompleteFiltering(namespaceURI, localName)) {
            completelyFiltered--;
        }
        assert completelyFiltered >= 0;

        // reduce current node depth
        nodeDepth--;
        assert nodeDepth >= 0;
    }

    /**
     * Decodes the buffered base64 text and passes the bytes to the embedded
     * document extractor if the extractor accepts them.
     *
     * @throws IOException  if reading or parsing the embedded stream fails
     * @throws SAXException if the extractor reports a SAX error
     */
    private void processBinaryData() throws IOException, SAXException {

        //TODO: figure out whether we're in an inline image or a regular
        //attachment and add that info to the embedded metadata

        byte[] bytes = Base64.decodeBase64(base64BinaryDataBuffer.toString());
        //clear state before parsing
        base64BinaryDataBuffer.setLength(0);
        inBinaryData = false;

        if (embeddedDocumentExtractor == null) {
            embeddedDocumentExtractor =
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
        }
        Metadata embeddedMetadata = new Metadata();
        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            try (InputStream is = TikaInputStream.get(bytes)) {
                embeddedDocumentExtractor.parseEmbedded(is, handler, embeddedMetadata, false);
            }
        }
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) {
        // remove prefix mappings as they should not occur in XHTML
    }

    @Override
    public void endPrefixMapping(String prefix) {
        // remove prefix mappings as they should not occur in XHTML
    }

    /** Marker interface for the style holders below. */
    private interface Style {
    }

    /** Bold/italic/underline flags collected from a style's text-properties. */
    private static class TextStyle implements Style {
        public boolean italic;
        public boolean bold;
        public boolean underlined;

        @Override
        public String toString() {
            return "TextStyle{" + "italic=" + italic + ", bold=" + bold + ", underlined=" +
                    underlined + '}';
        }
    }

    /** Records whether a list style is numbered or bulleted. */
    private static class ListStyle implements Style {
        public boolean ordered;

        /** Returns "ol" for an ordered list style and "ul" otherwise. */
        public String getTag() {
            return ordered ? "ol" : "ul";
        }
    }
}

View File

@ -16,591 +16,47 @@
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
/** Marker interface implemented by TextStyle and ListStyle; it declares no members. */
private interface Style {
}
/** Holds the italic, bold and underline flags of a text or paragraph style. */
private static class TextStyle implements Style {
    public boolean italic;
    public boolean bold;
    public boolean underlined;

    /** Renders the three flags in the form TextStyle{italic=..., bold=..., underlined=...}. */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("TextStyle{");
        text.append("italic=").append(italic);
        text.append(", bold=").append(bold);
        text.append(", underlined=").append(underlined);
        text.append('}');
        return text.toString();
    }
}
/** Records whether a list style is ordered and supplies the matching XHTML tag. */
private static class ListStyle implements Style {
    public boolean ordered;

    /** Returns "ol" when the style is ordered and "ul" otherwise. */
    public String getTag() {
        if (ordered) {
            return "ol";
        }
        return "ul";
    }
}
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
// One-character buffer holding a single space; handed to characters() where a separator is written.
private static final char[] SPACE = new char[]{ ' '};
// Name of the attribute carried by the attribute sets below.
private static final String CLASS = "class";
// Pre-built single-attribute sets (class="annotation" / "note" / "notes") passed to
// handler.startElement() for annotation, note and notes elements.
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
/**
 * Creates an attribute list that contains exactly one attribute.
 *
 * @param key   used as both the local name and the qualified name; the
 *              namespace URI is the empty string
 * @param value the attribute value, registered with type {@code CDATA}
 * @return a new list holding only that attribute
 */
private static Attributes buildAttributes(String key, String value) {
    final AttributesImpl single = new AttributesImpl();
    single.addAttribute("", key, key, "CDATA", value);
    return single;
}
// The handler given to the constructor; XHTML elements that bypass the
// element mapping (headings, lists, <p>, style tags) are written to it directly.
private final ContentHandler handler;
// One bit per open element depth: set in startElement() from isTextNode(),
// read in characters() for the innermost open element.
private final BitSet textNodeStack = new BitSet();
// Number of currently open elements; incremented in startElement(), decremented in endElement().
private int nodeDepth = 0;
// Number of currently open elements for which needsCompleteFiltering() was true;
// output is forwarded only while this is 0.
private int completelyFiltered = 0;
// XHTML heading tag names pushed on text:h start and popped on text:h end.
private Stack<String> headingStack = new Stack<String>();
// Styles collected from style:style definitions, keyed by style name:
// family "paragraph" goes to paragraphTextStyleMap, family "text" to textStyleMap.
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
// List styles registered when a text:list-style element starts, keyed by style name.
private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
private String currParagraphStyleName; // name of the paragraph style definition being read
// Dual use: the style object being filled while a style:style definition is open,
// and the style in effect for the current span/paragraph while reading body content.
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName; // name of the text style definition being read
// Styles of the currently open lists; pushed in startList(), popped in endList().
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
// The list style definition currently being read; reset to null when text:list-style ends.
private ListStyle listStyle;
// True while the corresponding XHTML tag (<u>, <b>, <i>) is open in the output:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
// Whether updateStyleTags() has already been run for the current text style;
// characters() runs it lazily before the first text is forwarded.
boolean hasWrittenStartStyleTags = false;
private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
//we need to track our pDepth and only output <p> if we're at the main level
/**
 * Creates the mapping handler.
 *
 * @param handler  receives the output; passed to the superclass and also
 *                 kept in {@code this.handler} for elements written directly
 * @param mappings element mappings passed through to the superclass
 */
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
super(handler, mappings);
this.handler = handler;
}
/**
 * Forwards character data only when no completely-filtered element is open
 * and the innermost open element was flagged as a text node. Before the
 * first text of a style run is forwarded, the pending style tags are
 * brought up to date.
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    // short-circuit order matters: the bit lookup must not run at depth 0
    final boolean suppressed = completelyFiltered != 0
            || nodeDepth <= 0
            || !textNodeStack.get(nodeDepth - 1);
    if (suppressed) {
        return;
    }
    if (!hasWrittenStartStyleTags) {
        updateStyleTags();
        hasWrittenStartStyleTags = true;
    }
    super.characters(ch, start, length);
}
/**
 * Tells whether an element must be dropped together with all of its
 * descendants.
 *
 * @return {@code true} for text-namespace elements whose local name ends in
 *         {@code -template} or {@code -style}, and for
 *         {@code covered-table-cell} in the table namespace
 */
private boolean needsCompleteFiltering(String namespaceURI, String localName) {
    if (!TEXT_NS.equals(namespaceURI)) {
        return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
    }
    return localName.endsWith("-template") || localName.endsWith("-style");
}
/**
 * Maps the {@code text:outline-level} attribute of a heading to an XHTML
 * heading tag name.
 *
 * @param atts attributes of the {@code text:h} element
 * @return {@code "h1"} .. {@code "h6"}; levels outside 1..6 are clamped, and
 *         a missing or non-numeric level yields {@code "h1"}
 */
private String getXHTMLHeaderTagName(Attributes atts) {
    String depthStr = atts.getValue(TEXT_NS, "outline-level");
    if (depthStr == null) {
        return "h1";
    }
    int depth;
    try {
        depth = Integer.parseInt(depthStr.trim());
    } catch (NumberFormatException e) {
        // The value comes straight from the document; a malformed level must
        // not abort the whole parse, so treat it like a missing one.
        return "h1";
    }
    return "h" + Math.max(1, Math.min(6, depth));
}
/**
 * Decides whether character data directly inside the given element is
 * forwarded.
 *
 * @return {@code true} for every text-namespace element except
 *         {@code page-number} and {@code page-count}, and for {@code title}
 *         and {@code desc} in the SVG namespace; {@code false} otherwise
 */
private boolean isTextNode(String namespaceURI, String localName) {
    if (TEXT_NS.equals(namespaceURI)) {
        boolean pageField = localName.equals("page-number") || localName.equals("page-count");
        return !pageField;
    }
    return SVG_NS.equals(namespaceURI)
            && ("title".equals(localName) || "desc".equals(localName));
}
/**
 * Opens an XHTML list for a {@code text:list} element.
 *
 * <p>An entry is pushed onto {@code listStyleStack} for every list, including
 * lists without a style name or with an unknown one (a {@code null} entry).
 * {@code endList()} pops one entry per closed list, so pushing only for named
 * lists would let a nested unnamed list consume its parent's entry and close
 * with the wrong tag.
 *
 * @param name value of {@code text:style-name}, may be {@code null}
 */
private void startList(String name) throws SAXException {
    ListStyle style = name == null ? null : listStyleMap.get(name);
    listStyleStack.push(style);
    String elementName = style != null ? style.getTag() : "ul";
    handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
/**
 * Closes the XHTML list for a {@code text:list} element. The tag is taken
 * from the entry popped off {@code listStyleStack}; an empty stack or a
 * {@code null} entry yields {@code ul}.
 */
private void endList() throws SAXException {
    ListStyle closing = listStyleStack.isEmpty() ? null : listStyleStack.pop();
    String tag = closing == null ? "ul" : closing.getTag();
    handler.endElement(XHTML, tag, tag);
}
/**
 * Selects the text style for a starting {@code text:span}. A span without a
 * style name leaves the current state untouched.
 *
 * @param name value of {@code text:style-name}, may be {@code null}
 */
private void startSpan(String name) throws SAXException {
    if (name != null) {
        currTextStyle = textStyleMap.get(name);
        // style tags are written lazily, on the next characters() call
        hasWrittenStartStyleTags = false;
    }
}
/**
 * Handles the start of a {@code text:p}. Only the outermost paragraph opens
 * an XHTML {@code p} and selects a paragraph style; a nested one writes a
 * single space instead.
 *
 * @param styleName value of {@code text:style-name}, may be {@code null}
 */
private void startParagraph(String styleName) throws SAXException {
    if (pDepth != 0) {
        handler.characters(SPACE, 0, SPACE.length);
        pDepth++;
        return;
    }
    handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
    if (styleName != null) {
        currTextStyle = paragraphTextStyleMap.get(styleName);
    }
    hasWrittenStartStyleTags = false;
    pDepth++;
}
/**
 * Handles the end of a {@code text:p}: closes any open style tags, then
 * closes the XHTML {@code p} for the outermost paragraph or writes a single
 * space for a nested one.
 */
private void endParagraph() throws SAXException {
    closeStyleTags();
    final boolean outermost = (pDepth == 1);
    if (!outermost) {
        handler.characters(SPACE, 0, SPACE.length);
    } else {
        handler.endElement(XHTML, "p", "p");
    }
    pDepth -= 1;
}
/**
 * Brings the open XHTML style tags in line with {@code currTextStyle}.
 * Tags are kept nested in the fixed order b, i, u (outermost first), so
 * changing an outer tag first closes the inner ones; the later comparisons
 * in this method then reopen them if the style still asks for them.
 * With no current style, all open style tags are closed.
 */
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close u and i tags (both are nested inside b)
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close u tag (nested inside i)
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
// u is the innermost tag, so it can be toggled without touching the others
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
/**
 * Re-synchronises the open style tags with the current text style by
 * delegating to {@code updateStyleTags()}.
 * NOTE(review): no caller is visible in this class; {@code endElement}
 * handles the end of {@code text:span} inline -- confirm whether this is dead code.
 */
private void endSpan() throws SAXException {
updateStyleTags();
}
/**
 * Closes whichever of the {@code u}, {@code i} and {@code b} tags are still
 * open (innermost first), then clears the current text style so that the
 * next text run starts from a clean state.
 */
private void closeStyleTags() throws SAXException {
    if (curUnderlined) {
        handler.endElement(XHTML, "u", "u");
    }
    curUnderlined = false;

    if (curItalic) {
        handler.endElement(XHTML, "i", "i");
    }
    curItalic = false;

    if (curBold) {
        handler.endElement(XHTML, "b", "b");
    }
    curBold = false;

    hasWrittenStartStyleTags = false;
    currTextStyle = null;
}
/**
 * Handles the start of an ODF element in three steps:
 * <ol>
 *   <li>collect style information from {@code style:style},
 *       {@code text:list-style}, {@code style:text-properties} and the
 *       list-level elements;</li>
 *   <li>record at the current depth whether the element is a text node and
 *       whether it starts a completely filtered subtree;</li>
 *   <li>unless filtered, write the XHTML counterpart: headings, lists,
 *       spans, paragraphs, {@code text:s} and annotation/note elements are
 *       handled here, everything else goes through the superclass mapping.</li>
 * </ol>
 */
@Override
public void startElement(
        String namespaceURI, String localName, String qName,
        Attributes attrs) throws SAXException {
    // keep track of current node type. If it is a text node,
    // a bit at the current depth is set in textNodeStack.
    // characters() checks the top bit to determine whether the
    // current node is a text node to print out. nodeDepth contains
    // the depth of the current node and also marks top of stack.
    assert nodeDepth >= 0;

    // Set styles
    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
        String family = attrs.getValue(STYLE_NS, "family");
        if ("text".equals(family)) {
            currTextStyle = new TextStyle();
            currTextStyleName = attrs.getValue(STYLE_NS, "name");
        } else if ("paragraph".equals(family)) {
            currTextStyle = new TextStyle();
            currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
        }
    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
        listStyle = new ListStyle();
        String name = attrs.getValue(STYLE_NS, "name");
        listStyleMap.put(name, listStyle);
    } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
            && "text-properties".equals(localName)) {
        String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
        if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
            currTextStyle.italic = true;
        }
        if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) {
            currTextStyle.bold = true;
        }
        String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
        if (underlineStyle != null && !underlineStyle.equals("none")) {
            currTextStyle.underlined = true;
        }
    } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
        if ("list-level-style-bullet".equals(localName)) {
            listStyle.ordered = false;
        } else if ("list-level-style-number".equals(localName)) {
            listStyle.ordered = true;
        }
    }

    textNodeStack.set(nodeDepth++,
            isTextNode(namespaceURI, localName));

    // filter *all* content of some tags
    assert completelyFiltered >= 0;
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered++;
    }

    // call next handler if no filtering
    if (completelyFiltered == 0) {
        // special handling of text:h, that are directly passed
        // to incoming handler
        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
            final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
            handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
            startList(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
            startSpan(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
            startParagraph(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
            handler.characters(SPACE, 0, 1);
        } else if ("annotation".equals(localName)) {
            // NOTE(review): the qualified name passed here is "p" although the
            // local name is "span" -- kept as is, confirm this is intended.
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
        } else if ("note".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
        } else if ("notes".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
        } else {
            super.startElement(namespaceURI, localName, qName, attrs);
        }
    }
}

/**
 * Interprets an {@code fo:font-weight} value. The value comes from the
 * document, so empty or malformed input (for example {@code ""} or
 * {@code "700.5"}) is treated as "not bold" instead of throwing.
 *
 * @param fontWeight raw attribute value, may be {@code null}
 * @return {@code true} for {@code bold}, {@code bolder}, or an integer
 *         weight greater than 500
 */
private static boolean isBoldFontWeight(String fontWeight) {
    if (fontWeight == null || fontWeight.isEmpty()) {
        return false;
    }
    if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
        return true;
    }
    if (!Character.isDigit(fontWeight.charAt(0))) {
        return false;
    }
    try {
        return Integer.parseInt(fontWeight) > 500;
    } catch (NumberFormatException e) {
        return false;
    }
}
/**
 * Handles the end of an ODF element, mirroring {@code startElement}:
 * stores a finished style definition, writes the closing XHTML (unless a
 * completely filtered subtree is open), then undoes the filter counter and
 * the node depth. The filter counter is decremented only after the output
 * step, so the closing tag of a filtered element is itself suppressed.
 */
@Override
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
// A style:style definition is complete: file it under its name in the map
// for its family and clear the "definition in progress" state.
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
if (currTextStyle != null && currTextStyleName != null) {
textStyleMap.put(currTextStyleName, currTextStyle);
currTextStyleName = null;
currTextStyle = null;
} else if (currTextStyle != null && currParagraphStyleName != null) {
paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
currParagraphStyleName = null;
currTextStyle = null;
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = null;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
// to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
handler.endElement(XHTMLContentHandler.XHTML, el, el);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
endList();
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
// leaving a span: drop its style; tags are re-synchronised lazily in characters()
currTextStyle = null;
hasWrittenStartStyleTags = false;
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
endParagraph();
} else if ("annotation".equals(localName) || "note".equals(localName) ||
"notes".equals(localName)) {
closeStyleTags();
// NOTE(review): startElement opens these as XHTML "span", but the close is issued
// with an empty namespace and the ODF local name -- confirm the handler tolerates this.
handler.endElement("", localName, localName);
} else {
super.endElement(namespaceURI, localName, qName);
}
// special handling of tabulators
if (TEXT_NS.equals(namespaceURI)
&& ("tab-stop".equals(localName)
|| "tab".equals(localName))) {
this.characters(TAB, 0, TAB.length);
}
}
// revert filter for *all* content of some tags
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered--;
}
assert completelyFiltered >= 0;
// reduce current node depth
nodeDepth--;
assert nodeDepth >= 0;
}
/**
 * Intentionally empty: the prefix mapping event is not passed on.
 */
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
/**
 * Intentionally empty: the prefix mapping event is not passed on.
 */
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
}
// XML namespace URIs that the handler compares element and attribute namespaces against.
public static final String TEXT_NS =
"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
// One-character buffer holding a tab; written after text:tab and text:tab-stop elements end.
protected static final char[] TAB = new char[]{'\t'};
// Shared empty attribute list for XHTML elements written without attributes.
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
 * Translation table from ODF element names to the XHTML elements (and, where
 * given, attributes) they are written as. Element and attribute names that
 * are not listed are ignored and left out of the event stream.
 */
private static final HashMap<QName, TargetElement> MAPPINGS =
        new HashMap<QName, TargetElement>();

static {
    // Elements with a direct XHTML counterpart.
    // text:h is absent on purpose: headings are mapped in startElement/endElement.
    MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
    MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
    MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
    MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object"));
    MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
    MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));

    // Hyperlinks: xlink:href / xlink:title become plain href / title.
    final HashMap<QName, QName> linkAttributes = new HashMap<QName, QName>();
    linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
    linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
    MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", linkAttributes));

    // Tables. Repeating of rows is ignored; for columns, see the TODO below.
    MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
    MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));

    // Cells: span attributes are carried over as rowspan/colspan.
    final HashMap<QName, QName> cellAttributes = new HashMap<QName, QName>();
    cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
    cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
    /* TODO: The following is not correct, the cell should be repeated, not spanned!
     * The code generates one HTML cell spanning all repeated columns so that the cell
     * looks correct. Problems may occur when both spanning and repeating are given,
     * which the spec does not allow. Spanning instead of repeating is not a problem
     * in practice, because OpenOffice uses repetition only for empty cells.
     */
    cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
    MAPPINGS.put(new QName(TABLE_NS, "table-cell"), new TargetElement(XHTML, "td", cellAttributes));
}
/**
 * @return always an empty set, so this parser advertises no media types
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
new XHTMLContentHandler(handler, metadata),
metadata, context);
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context);
}
void parseInternal(
InputStream stream, final ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
DefaultHandler dh = new OpenDocumentBodyHandler(handler, context);
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
new NSNormalizerContentHandler(dh)),
context);
XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream),
new OfflineContentHandler(new NSNormalizerContentHandler(dh)), context);
}
}

View File

@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * Macro handler for the compressed ODF variant. It reuses the state and
 * helpers of {@link FlatOpenDocumentMacroHandler} ({@code MODULE},
 * {@code NAME}, {@code inMacro}, {@code macroName}, {@code handleMacro()},
 * {@code resetMacroState()}) and only changes which elements open and close
 * a macro.
 */
class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler {

    OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
        super(contentHandler, parseContext);
    }

    /**
     * Marks the start of a macro when a module element opens and records the
     * value of its name attribute.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
            throws SAXException {
        //in the compressed odf, there should only be one element in this file.
        if (MODULE.equalsIgnoreCase(localName)) {
            inMacro = true;
            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
        }
    }

    /**
     * Processes the collected macro when the module element closes.
     *
     * @throws SAXException wrapping any {@link IOException} raised by
     *                      {@code handleMacro()}
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        // Must use the same case-insensitive match as startElement; otherwise an
        // element that opened a macro could end without being handled or reset.
        if (MODULE.equalsIgnoreCase(localName)) {
            try {
                handleMacro();
            } catch (IOException e) {
                throw new SAXException(e);
            } finally {
                //this shouldn't be necessary in the compressed odf files
                resetMacroState();
            }
        }
    }
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.sax.ContentHandlerDecorator;
/**
 * Manifest handler that, for now, does a single thing: it watches for an
 * <code>encryption-data</code> element and, on finding one, throws an
 * {@link EncryptedDocumentException} wrapped in a {@link SAXException}.
 * Start-element events for all other elements are ignored here.
 *
 * If desired, this can be extended to actually extract the information
 * needed for decryption. Please open an issue or pull request for that
 * added functionality.
 */
class OpenDocumentManifestHandler extends ContentHandlerDecorator {

    @Override
    public void startElement(String namespaceURI, String localName, String qName,
                             Attributes attrs) throws SAXException {
        final boolean encryptionMarker = localName.equals("encryption-data");
        if (!encryptionMarker) {
            return;
        }
        throw new SAXException(new EncryptedDocumentException());
    }
}

View File

@ -16,12 +16,21 @@
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@ -36,11 +45,6 @@ import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
@ -54,68 +58,54 @@ public class OpenDocumentMetaParser extends XMLParser {
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
/**
* @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
Property.composite(Office.INITIAL_AUTHOR,
new Property[]{Property.externalText("initial-creator")});
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
String element) {
return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
private static ContentHandler getMeta(ContentHandler ch, Metadata md, Property property,
String element) {
Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getUserDefined(
ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
private static ContentHandler getUserDefined(ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes
// custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
matcher);
new AttributeDependantMetadataHandler(md, "meta:name",
Office.USER_DEFINED_METADATA_NAME_PREFIX), matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name,
String attribute) {
Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, Property property, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, Property property,
String attribute) {
Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
static ContentHandler getContentHandler(Metadata md, ParseContext context,
ContentHandler... handlers) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
// Process the Dublin Core Attributes
ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
// Process the Dublin Core Attributes
ContentHandler ch =
new TeeContentHandler(getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
@ -129,19 +119,20 @@ public class OpenDocumentMetaParser extends XMLParser {
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
ch = new TeeContentHandler(ch,
new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "date", md,
TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
ch = new TeeContentHandler(ch,
new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "subject", md,
OfficeOpenXMLCore.SUBJECT));
ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
ch = getMeta(ch, md, Office.KEYWORDS, "keyword");
ch = getMeta(ch, md, OfficeOpenXMLExtended.TOTAL_TIME, "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
ch = getMeta(ch, md, TikaCoreProperties.CREATOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
@ -157,43 +148,48 @@ public class OpenDocumentMetaParser extends XMLParser {
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
ch = getStatistic(ch, md, "nbPara", "paragraph-count");
ch = getStatistic(ch, md, "nbWord", "word-count");
ch = getStatistic(ch, md, "nbCharacter", "character-count");
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
if (handlers != null && handlers.length > 0) {
ContentHandler[] newHandlers = new ContentHandler[handlers.length + 1];
newHandlers[0] = ch;
System.arraycopy(handlers, 0, newHandlers, 1, handlers.length);
ch = new TeeContentHandler(newHandlers);
}
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
/**
 * Instance-level variant: delegates to the static
 * {@code getContentHandler(Metadata, ParseContext, ContentHandler...)},
 * passing the handler returned by {@code super.getContentHandler(ch, md, context)}
 * as the single additional handler.
 */
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md,
ParseContext context) {
return getContentHandler(md, context, super.getContentHandler(ch, md, context));
}
@Override
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
(metadata.get(TikaCoreProperties.DESCRIPTION) == null ||
metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
//reset the dc:subject to include both keywords and subject
//We can't rely on composite keys in the MatchingContentHandlers
//because those are "setting" not "adding" to the Metadata object
List<String> subjects = new ArrayList<>();
if (metadata.getValues(Office.KEYWORDS) != null) {
subjects.addAll(Arrays.asList(metadata.getValues(Office.KEYWORDS)));
}
if (metadata.getValues(OfficeOpenXMLCore.SUBJECT) != null) {
subjects.addAll(Arrays.asList(metadata.getValues(OfficeOpenXMLCore.SUBJECT)));
}
if (subjects.size() > 0) {
metadata.set(TikaCoreProperties.SUBJECT, subjects.toArray(new String[0]));
}
}
}

View File

@ -16,37 +16,44 @@
*/
package org.apache.tika.parser.odf;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import static java.nio.charset.StandardCharsets.UTF_8;
import org.apache.tika.utils.XMLReaderUtils;
/**
* OpenOffice parser
@ -58,47 +65,48 @@ public class OpenDocumentParser extends AbstractParser {
*/
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
MediaType.application("vnd.oasis.opendocument.chart"),
MediaType.application("vnd.oasis.opendocument.image"),
MediaType.application("vnd.oasis.opendocument.formula"),
MediaType.application("vnd.oasis.opendocument.text-master"),
MediaType.application("vnd.oasis.opendocument.text-web"),
MediaType.application("vnd.oasis.opendocument.text-template"),
MediaType.application("vnd.oasis.opendocument.graphics-template"),
MediaType.application("vnd.oasis.opendocument.presentation-template"),
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("vnd.oasis.opendocument.chart-template"),
MediaType.application("vnd.oasis.opendocument.image-template"),
MediaType.application("vnd.oasis.opendocument.formula-template"),
MediaType.application("x-vnd.oasis.opendocument.text"),
MediaType.application("x-vnd.oasis.opendocument.graphics"),
MediaType.application("x-vnd.oasis.opendocument.presentation"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
MediaType.application("x-vnd.oasis.opendocument.chart"),
MediaType.application("x-vnd.oasis.opendocument.image"),
MediaType.application("x-vnd.oasis.opendocument.formula"),
MediaType.application("x-vnd.oasis.opendocument.text-master"),
MediaType.application("x-vnd.oasis.opendocument.text-web"),
MediaType.application("x-vnd.oasis.opendocument.text-template"),
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
MediaType.application("vnd.oasis.opendocument.chart"),
MediaType.application("vnd.oasis.opendocument.image"),
MediaType.application("vnd.oasis.opendocument.formula"),
MediaType.application("vnd.oasis.opendocument.text-master"),
MediaType.application("vnd.oasis.opendocument.text-web"),
MediaType.application("vnd.oasis.opendocument.text-template"),
MediaType.application("vnd.oasis.opendocument.graphics-template"),
MediaType.application("vnd.oasis.opendocument.presentation-template"),
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("vnd.oasis.opendocument.chart-template"),
MediaType.application("vnd.oasis.opendocument.image-template"),
MediaType.application("vnd.oasis.opendocument.formula-template"),
MediaType.application("x-vnd.oasis.opendocument.text"),
MediaType.application("x-vnd.oasis.opendocument.graphics"),
MediaType.application("x-vnd.oasis.opendocument.presentation"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
MediaType.application("x-vnd.oasis.opendocument.chart"),
MediaType.application("x-vnd.oasis.opendocument.image"),
MediaType.application("x-vnd.oasis.opendocument.formula"),
MediaType.application("x-vnd.oasis.opendocument.text-master"),
MediaType.application("x-vnd.oasis.opendocument.text-web"),
MediaType.application("x-vnd.oasis.opendocument.text-template"),
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
private static final String MANIFEST_NAME = "META-INF/manifest.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
private boolean extractMacros = false;
public Parser getMetaParser() {
return meta;
@ -120,10 +128,10 @@ public class OpenDocumentParser extends AbstractParser {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
@ -145,85 +153,129 @@ public class OpenDocumentParser extends AbstractParser {
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
xhtml.startDocument();
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
try {
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
}
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
} catch (SAXException e) {
if (e.getCause() instanceof EncryptedDocumentException) {
throw (EncryptedDocumentException)e.getCause();
}
throw e;
}
// Only now call the end document
if (handler.getEndDocumentWasCalled()) {
if (handler.isEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
} while (entry != null);
/**
 * Turns macro extraction on or off for this parser instance.
 * <p>
 * The flag is initialised to {@code false}. When it is {@code true}, zip
 * entries whose name contains {@code "Basic/"} are handed to
 * {@code maybeHandleMacro}; otherwise those entries are not processed.
 *
 * @param extractMacros {@code true} to process macro files, {@code false} to skip them
 */
// NOTE(review): @Field presumably exposes this setter to Tika's config loading - confirm.
@Field
public void setExtractMacros(boolean extractMacros) {
this.extractMacros = extractMacros;
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, TikaException, SAXException {
/**
 * Handles an ODF package that is only available as a stream, passing each
 * entry to {@code handleZipEntry} in the order the stream yields them.
 * <p>
 * A {@link SAXException} raised for one entry does not stop the walk: the
 * remaining entries are still handled and the first such exception is thrown
 * once the stream is exhausted. Two cases abort immediately instead: whatever
 * {@code WriteLimitReachedException.throwIfWriteLimitReached} decides to
 * rethrow, and a failure whose cause is an {@code EncryptedDocumentException}
 * (the cause itself is thrown).
 *
 * @param zipStream            the zip stream to read entries from
 * @param metadata             metadata object passed on to the entry handler
 * @param context              parse context passed on to the entry handler
 * @param handler              content handler passed on to the entry handler
 * @param embeddedDocumentUtil helper passed on to the entry handler
 * @throws IOException   if the stream has no entries or reading fails
 * @throws TikaException if an entry handler fails, or the document is encrypted
 * @throws SAXException  the first deferred SAX failure, if there was one
 */
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
        EndDocumentShieldingContentHandler handler,
        EmbeddedDocumentUtil embeddedDocumentUtil)
        throws IOException, TikaException, SAXException {
    ZipEntry current = zipStream.getNextEntry();
    if (current == null) {
        throw new IOException("No entries found in ZipInputStream");
    }

    // Only the earliest deferred failure is ever rethrown, so one slot is enough.
    SAXException firstFailure = null;
    for (; current != null; current = zipStream.getNextEntry()) {
        try {
            handleZipEntry(current, zipStream, metadata, context, handler,
                    embeddedDocumentUtil);
        } catch (SAXException e) {
            WriteLimitReachedException.throwIfWriteLimitReached(e);
            Throwable cause = e.getCause();
            if (cause instanceof EncryptedDocumentException) {
                throw (EncryptedDocumentException) cause;
            }
            if (firstFailure == null) {
                firstFailure = e;
            }
        }
    }

    if (firstFailure != null) {
        throw firstFailure;
    }
}
/**
 * Handles an ODF package that is available as a random-access {@link ZipFile}.
 * <p>
 * Because entries can be fetched by name, selected entries are looked up and
 * passed to {@code handleZipEntry} ahead of the full enumeration, so that
 * metadata is handled before the rest of the package (TIKA-1353). Every
 * entry returned by {@code zipFile.entries()} is then handled, except the
 * one named {@code META_NAME}.
 *
 * @param zipFile              the opened package
 * @param metadata             metadata object passed on to the entry handler
 * @param context              parse context passed on to the entry handler
 * @param handler              content handler passed on to the entry handler
 * @param embeddedDocumentUtil helper passed on to the entry handler
 * @throws IOException   if an entry stream cannot be opened or read
 * @throws TikaException if an entry handler fails
 * @throws SAXException  if an entry handler fails with a SAX error
 */
private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, TikaException, SAXException {
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
ZipEntry entry = zipFile.getEntry(META_NAME);
ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
handler, embeddedDocumentUtil);
}
entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
handler, embeddedDocumentUtil);
}
// NOTE(review): the filter below excludes only META_NAME, so the entry looked
// up via MANIFEST_NAME above is visited again by this loop - confirm that
// handling the manifest twice is intended.
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata,
context, handler, embeddedDocumentUtil);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
if (entry.getName().equals("mimetype")) {
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, ContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, SAXException, TikaException {
if (entry.getName().contains("manifest.xml")) {
checkForEncryption(zip, context);
} else if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
((OpenDocumentContentParser) content)
.parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
((OpenDocumentContentParser) content)
.parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
@ -231,26 +283,87 @@ public class OpenDocumentParser extends AbstractParser {
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
/* if (embeddedName.startsWith("Thumbnails/")) {
TikaInputStream stream = TikaInputStream.get(zip);
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName());
if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
}*/
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
}
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
MediaType embeddedMimeType =
embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
if (embeddedMimeType != null) {
embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
}
stream.reset();
}
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler),
embeddedMetadata, false);
}
} else if (extractMacros && embeddedName.contains("Basic/")) {
//process all files under Basic/; let maybeHandleMacro figure
//out if it is a macro or not
maybeHandleMacro(zip, embeddedName, handler, context);
}
}
}
/**
 * Parses a candidate macro file and forwards its SAX events to an
 * {@code OpenDocumentMacroHandler} wrapped around the supplied handler.
 * <p>
 * Entries rejected by {@code ignoreScriptFile} are skipped without being
 * read. The stream is wrapped in a {@code CloseShieldInputStream} before it
 * is given to the SAX parser, so closing is left to the caller.
 *
 * @param is           stream positioned at the start of the entry's content
 * @param embeddedName name of the zip entry, used only to decide whether to skip it
 * @param handler      handler the macro handler is layered on top of
 * @param context      parse context passed to the macro handler and the SAX parser
 * @throws TikaException if the SAX parse fails with a Tika error
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the XML cannot be parsed or a handler fails
 */
private void maybeHandleMacro(InputStream is, String embeddedName, ContentHandler handler,
        ParseContext context)
        throws TikaException, IOException, SAXException {
    //should probably run XMLRootExtractor on the inputstream
    //or read the macro manifest for the names of the macros
    //rather than relying on the script file name
    if (ignoreScriptFile(embeddedName)) {
        return;
    }
    // Keep the caller's handler untouched and layer the macro handler on top of it.
    ContentHandler macroHandler = new OpenDocumentMacroHandler(handler, context);
    XMLReaderUtils.parseSAX(new CloseShieldInputStream(is),
            new OfflineContentHandler(new EmbeddedContentHandler(macroHandler)), context);
}
/**
 * Runs the manifest stream through an {@code OpenDocumentManifestHandler}
 * so that an encrypted package is reported (going by the method name, this
 * is the only purpose of the parse).
 * <p>
 * If parsing fails with a {@link SAXException} whose cause is an
 * {@code EncryptedDocumentException}, that cause is thrown. Any other
 * {@link SAXException} is deliberately ignored.
 *
 * @param stream  stream positioned at the start of the manifest; it is shielded
 *                from being closed by the parser
 * @param context parse context passed to the SAX parser
 * @throws SAXException  declared for callers; SAX failures from the parse are handled here
 * @throws TikaException if the document is encrypted or the parse fails with a Tika error
 * @throws IOException   if reading the stream fails
 */
private void checkForEncryption(InputStream stream, ParseContext context)
        throws SAXException, TikaException, IOException {
    try {
        InputStream shielded = new CloseShieldInputStream(stream);
        OfflineContentHandler manifestHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(new OpenDocumentManifestHandler()));
        XMLReaderUtils.parseSAX(shielded, manifestHandler, context);
    } catch (SAXException e) {
        // instanceof is false for null, so no separate null check is needed
        Throwable cause = e.getCause();
        if (cause instanceof EncryptedDocumentException) {
            throw (EncryptedDocumentException) cause;
        }
        //otherwise...swallow
    }
}
/**
 * Decides whether a zip entry should be skipped when looking for macros.
 * <p>
 * An entry is skipped when its name does not contain {@code "Basic/"}, or
 * when it contains {@code "script-lb.xml"} or {@code "script-lc.xml"}.
 *
 * @param embeddedName name of the zip entry
 * @return {@code true} if the entry must be ignored, {@code false} if it
 *         should be parsed as a possible macro
 */
private boolean ignoreScriptFile(String embeddedName) {
    //shouldn't ever get here, but if it isn't under Basic/, ignore it
    if (!embeddedName.contains("Basic/")) {
        return true;
    }
    return embeddedName.contains("script-lb.xml")
            || embeddedName.contains("script-lc.xml");
}
}

View File

@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
import java.util.Arrays;
import java.util.List;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
@ -39,11 +40,12 @@ class AbstractMetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
protected AbstractMetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
/**
* Adds the given metadata value. The value is ignored if it is
@ -59,9 +61,9 @@ class AbstractMetadataHandler extends DefaultHandler {
List<String> previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
if (property != null) {
metadata.add(property, value);
metadata.add(property, value);
} else {
metadata.add(name, value);
metadata.add(name, value);
}
}
} else {
@ -69,23 +71,23 @@ class AbstractMetadataHandler extends DefaultHandler {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
if (property != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
// Replace the existing value if isMultiValuePermitted is false
metadata.set(property, value);
}
} else {
metadata.add(name, value);
}
if (property != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
// Replace the existing value if isMultiValuePermitted is false
metadata.set(property, value);
}
} else {
metadata.add(name, value);
}
}
} else {
if (property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
if (property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
}

View File

@ -16,15 +16,16 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
* value, and the Metadata name is taken from
* an attribute, with a prefix if required.
* value, and the Metadata name is taken from
* an attribute, with a prefix if required.
*/
public class AttributeDependantMetadataHandler extends DefaultHandler {
@ -32,20 +33,20 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
private final String nameHoldingAttribute;
private final String namePrefix;
private final StringBuilder buffer = new StringBuilder();
private String name;
private final StringBuilder buffer = new StringBuilder();
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute,
String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
}
public void addMetadata(String value) {
if(name == null || name.length() == 0) {
// We didn't find the attribute which holds the name
return;
if (name == null || name.length() == 0) {
// We didn't find the attribute which holds the name
return;
}
if (value.length() > 0) {
String previous = metadata.get(name);
@ -61,20 +62,19 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
if (namePrefix == null) {
this.name = rawName;
} else {
this.name = namePrefix + rawName;
}
if (namePrefix == null) {
this.name = rawName;
} else {
this.name = namePrefix + rawName;
}
}
// All other attributes are ignored
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}

View File

@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
@ -33,26 +34,25 @@ public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String localName;
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
public AttributeMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(String uri, String localName, Metadata metadata,
Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getURI(i).equals(this.uri)
&& attributes.getLocalName(i).equals(this.localName)) {
if (attributes.getURI(i).equals(this.uri) &&
attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}

View File

@ -16,45 +16,45 @@
*/
package org.apache.tika.parser.xml;
import org.xml.sax.ContentHandler;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
/** Serial version UID */
/**
* Serial version UID
*/
private static final long serialVersionUID = 4905318835463880819L;
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
String element) {
return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TeeContentHandler(
super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new TeeContentHandler(super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.SUBJECT, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
}
}

View File

@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import java.util.Arrays;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* SAX event handler that maps the contents of an XML element into
@ -44,21 +45,17 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
private Property targetProperty;
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
@ -70,13 +67,12 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
/**
* Constructor for string metadata keys.
*
* @param uri the uri of the namespace of the element
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@ -91,15 +87,15 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name,
boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@ -113,13 +109,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
/**
* Constructor for Property metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata,
Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@ -135,15 +131,16 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata,
Property targetProperty, boolean allowDuplicateValues,
boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@ -162,16 +159,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))));
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
@ -230,7 +224,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
if (previous == null || !Arrays.asList(previous).contains(value) ||
allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}

View File

@ -16,64 +16,68 @@
*/
package org.apache.tika.parser.xml;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.codec.binary.Base64;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-fictionbook+xml"));
Collections.singleton(MediaType.application("x-fictionbook+xml"));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new BinaryElementsDataHandler(
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
private boolean binaryMode = false;
private Metadata metadata;
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor,
ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@ -83,11 +87,8 @@ public class FictionBookParser extends XMLParser {
if (binaryMode) {
try {
partExtractor.parseEmbedded(
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler,
metadata,
true
);
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler, metadata, true);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}

View File

@ -16,19 +16,20 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* This adds Metadata entries with a specified name for
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
*
* @deprecated Use the {@link AttributeMetadataHandler} and
* {@link ElementMetadataHandler} classes instead
* {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
@ -44,11 +45,12 @@ public class MetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
public MetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
public void addMetadata(String value) {
if (value.length() > 0) {
@ -56,11 +58,11 @@ public class MetadataHandler extends DefaultHandler {
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
if (this.property != null) {
metadata.set(property, value);
metadata.set(property, value);
} else {
metadata.set(name, value);
metadata.set(name, value);
}
}
}
@ -70,14 +72,13 @@ public class MetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.xml.sax.ContentHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TextAndAttributeContentHandler;
/**
 * {@link XMLParser} variant that routes parsed content through a
 * {@link TextAndAttributeContentHandler} instead of the handler chosen by
 * the parent class.
 */
public class TextAndAttributeXMLParser extends XMLParser {

    private static final long serialVersionUID = 7796914007312429473L;

    /**
     * Wraps the downstream handler in a {@link TextAndAttributeContentHandler}.
     *
     * @param handler  downstream handler that receives the produced events
     * @param metadata document metadata; not consulted here
     * @param context  parse context; not consulted here
     * @return the wrapping handler
     */
    @Override
    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
                                               ParseContext context) {
        // NOTE(review): the boolean flag is passed as true; its exact meaning is
        // defined by TextAndAttributeContentHandler - confirm against that class.
        final ContentHandler textAndAttributes =
                new TextAndAttributeContentHandler(handler, true);
        return textAndAttributes;
    }
}

View File

@ -16,7 +16,17 @@
*/
package org.apache.tika.parser.xml;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@ -28,52 +38,41 @@ import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
/** Serial version UID */
/**
* Serial version UID
*/
private static final long serialVersionUID = -6028836725280212837L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("xml"),
MediaType.image("svg+xml"))));
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(
Arrays.asList(MediaType.application("xml"), MediaType.image("svg+xml"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))), context);
XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler(
new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))),
context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
@ -83,8 +82,8 @@ public class XMLParser extends AbstractParser {
}
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new TextContentHandler(handler, true);
}
}

View File

@ -0,0 +1,206 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
/**
 * Small collection of null-tolerant String helpers (emptiness checks,
 * left padding and repetition).
 */
public class StringUtils {

    /**
     * The empty String {@code ""}.
     */
    public static final String EMPTY = "";

    /**
     * A String for a space character.
     */
    public static final String SPACE = " ";

    /**
     * Threshold used to pick between the single-character and the
     * String-based padding/repeat code paths.
     */
    static int PAD_LIMIT = 10000;

    /**
     * Checks whether a CharSequence is {@code null} or has length zero.
     *
     * @param cs the CharSequence to check, may be null
     * @return {@code true} if {@code cs} is null or empty
     */
    public static boolean isEmpty(final CharSequence cs) {
        return cs == null || cs.length() == 0;
    }

    /**
     * Checks whether a String is {@code null}, empty, or empty after
     * {@link String#trim()}.
     *
     * @param s the String to check, may be null
     * @return {@code true} if {@code s} is null or trims to the empty String
     */
    public static boolean isBlank(final String s) {
        return s == null || s.trim().length() == 0;
    }

    /**
     * <p>Left pad a String with a specified String.</p>
     *
     * <p>Pad to a size of {@code size}.</p>
     *
     * <pre>
     * StringUtils.leftPad(null, *, *)      = null
     * StringUtils.leftPad("", 3, "z")      = "zzz"
     * StringUtils.leftPad("bat", 3, "yz")  = "bat"
     * StringUtils.leftPad("bat", 5, "yz")  = "yzbat"
     * StringUtils.leftPad("bat", 8, "yz")  = "yzyzybat"
     * StringUtils.leftPad("bat", 1, "yz")  = "bat"
     * StringUtils.leftPad("bat", -1, "yz") = "bat"
     * StringUtils.leftPad("bat", 5, null)  = "  bat"
     * StringUtils.leftPad("bat", 5, "")    = "  bat"
     * </pre>
     *
     * @param str    the String to pad out, may be null
     * @param size   the size to pad to
     * @param padStr the String to pad with, null or empty treated as single space
     * @return left padded String or original String if no padding is necessary,
     * {@code null} if null String input
     */
    public static String leftPad(final String str, final int size, String padStr) {
        if (str == null) {
            return null;
        }
        final int strLen = str.length();
        // Compare before subtracting: for a very negative size the difference
        // size - strLen wraps around to a huge positive pad count.
        if (size <= strLen) {
            return str; // returns original String when possible
        }
        if (isEmpty(padStr)) {
            padStr = SPACE;
        }
        final int padLen = padStr.length();
        final int pads = size - strLen; // 0 < pads <= size, cannot overflow here
        if (padLen == 1 && pads <= PAD_LIMIT) {
            return leftPad(str, size, padStr.charAt(0));
        }

        if (pads == padLen) {
            return padStr.concat(str);
        } else if (pads < padLen) {
            return padStr.substring(0, pads).concat(str);
        } else {
            // Cycle through the pad String until the requested width is filled.
            final char[] padding = new char[pads];
            final char[] padChars = padStr.toCharArray();
            for (int i = 0; i < pads; i++) {
                padding[i] = padChars[i % padLen];
            }
            return new String(padding).concat(str);
        }
    }

    /**
     * <p>Left pad a String with a specified character.</p>
     *
     * <pre>
     * StringUtils.leftPad(null, *, *)     = null
     * StringUtils.leftPad("", 3, 'z')     = "zzz"
     * StringUtils.leftPad("bat", 3, 'z')  = "bat"
     * StringUtils.leftPad("bat", 5, 'z')  = "zzbat"
     * StringUtils.leftPad("bat", -1, 'z') = "bat"
     * </pre>
     *
     * @param str     the String to pad out, may be null
     * @param size    the size to pad to
     * @param padChar the character to pad with
     * @return left padded String or original String if no padding is necessary,
     * {@code null} if null String input
     */
    public static String leftPad(final String str, final int size, final char padChar) {
        if (str == null) {
            return null;
        }
        final int strLen = str.length();
        // Same overflow-safe guard as the String overload.
        if (size <= strLen) {
            return str; // returns original String when possible
        }
        final int pads = size - strLen;
        if (pads > PAD_LIMIT) {
            return leftPad(str, size, String.valueOf(padChar));
        }
        return repeat(padChar, pads).concat(str);
    }

    /**
     * <p>Returns padding using the specified delimiter repeated
     * to a given length.</p>
     *
     * <pre>
     * StringUtils.repeat('e', 0)  = ""
     * StringUtils.repeat('e', 3)  = "eee"
     * StringUtils.repeat('e', -2) = ""
     * </pre>
     *
     * <p>Note: this method does not support padding with
     * <a href="http://www.unicode.org/glossary/#supplementary_character">Unicode Supplementary Characters</a>
     * as they require a pair of {@code char}s to be represented.
     * If you are needing to support full I18N of your applications
     * consider using {@link #repeat(String, int)} instead.
     * </p>
     *
     * @param ch     character to repeat
     * @param repeat number of times to repeat char, negative treated as zero
     * @return String with repeated character
     * @see #repeat(String, int)
     */
    public static String repeat(final char ch, final int repeat) {
        if (repeat <= 0) {
            return EMPTY;
        }
        final char[] buf = new char[repeat];
        for (int i = 0; i < repeat; i++) {
            buf[i] = ch;
        }
        return new String(buf);
    }

    /**
     * <p>Repeat a String {@code repeat} times to form a
     * new String.</p>
     *
     * <pre>
     * StringUtils.repeat(null, 2) = null
     * StringUtils.repeat("", 0)   = ""
     * StringUtils.repeat("", 2)   = ""
     * StringUtils.repeat("a", 3)  = "aaa"
     * StringUtils.repeat("ab", 2) = "abab"
     * StringUtils.repeat("a", -2) = ""
     * </pre>
     *
     * @param str    the String to repeat, may be null
     * @param repeat number of times to repeat str, negative treated as zero
     * @return a new String consisting of the original String repeated,
     * {@code null} if null String input
     * @throws IllegalArgumentException if the resulting length would exceed
     *                                  {@link Integer#MAX_VALUE}
     */
    public static String repeat(final String str, final int repeat) {
        if (str == null) {
            return null;
        }
        if (repeat <= 0) {
            return EMPTY;
        }
        final int inputLength = str.length();
        if (repeat == 1 || inputLength == 0) {
            return str;
        }
        if (inputLength == 1) {
            return repeat(str.charAt(0), repeat);
        }

        // Multiply in long: the int product can wrap to a negative or to a
        // deceptively small positive capacity.
        final long requestedLength = (long) inputLength * repeat;
        if (requestedLength > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "Repeated String would be too long: " + inputLength + " * " + repeat
                            + " exceeds " + Integer.MAX_VALUE);
        }
        final int outputLength = (int) requestedLength;

        if (inputLength == 2) {
            // Two-character inputs are written pairwise into a pre-sized array.
            final char ch0 = str.charAt(0);
            final char ch1 = str.charAt(1);
            final char[] output2 = new char[outputLength];
            for (int i = 0; i < outputLength; i += 2) {
                output2[i] = ch0;
                output2[i + 1] = ch1;
            }
            return new String(output2);
        }

        final StringBuilder buf = new StringBuilder(outputLength);
        for (int i = 0; i < repeat; i++) {
            buf.append(str);
        }
        return buf.toString();
    }
}

View File

@ -16,7 +16,7 @@ import munit._
class OdfExtractTest extends FunSuite {
val files = List(
ExampleFiles.examples_sample_odt -> 6372,
ExampleFiles.examples_sample_odt -> 6367,
ExampleFiles.examples_sample_ods -> 717
)

View File

@ -20,7 +20,7 @@ import fs2.Stream
import docspell.common._
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties}
import org.apache.tika.mime.MediaType
import org.apache.tika.parser.txt.Icu4jEncodingDetector
@ -40,7 +40,7 @@ object TikaMimetype {
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.filename.foreach(md.set(TikaCoreProperties.RESOURCE_NAME_KEY, _))
hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}

View File

@ -38,7 +38,7 @@ object Dependencies {
val ScalaJavaTimeVersion = "2.3.0"
val Slf4jVersion = "1.7.31"
val StanfordNlpVersion = "4.2.2"
val TikaVersion = "1.27"
val TikaVersion = "2.0.0"
val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "3.51.1"
val TwelveMonkeysVersion = "3.7.0"