Merge pull request #944 from scala-steward/update/tika-core-2.0.0

Update tika-core to 2.0.0
This commit is contained in:
mergify[bot] 2021-07-25 11:18:53 +00:00 committed by GitHub
commit 1851c5b7af
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 1653 additions and 983 deletions

View File

@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.exception;
import org.xml.sax.SAXException;
/**
 * Exception signalling that the configured write limit was reached while
 * content was being written.
 * <p>
 * The static helpers inspect an exception and its chain of causes, looking
 * for an instance of this class. At most {@code MAX_DEPTH + 1} links of the
 * chain are inspected so that a cyclic cause chain cannot loop forever.
 */
public class WriteLimitReachedException extends SAXException {

    // Bound on the cause-chain walk, in case of a (hopefully impossible)
    // cyclic exception.
    private static final int MAX_DEPTH = 100;

    private final int writeLimit;

    /**
     * @param writeLimit the limit (in characters) that was reached; only used
     *                   to build the message
     */
    public WriteLimitReachedException(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    @Override
    public String getMessage() {
        return "Your document contained more than " + writeLimit
                + " characters, and so your requested limit has been"
                + " reached. To receive the full text of the document,"
                + " increase your limit. (Text up to the limit is"
                + " however available).";
    }

    /**
     * Checks whether the given exception (or any of its root causes) is a
     * {@link WriteLimitReachedException}, i.e. a signal that the write limit
     * was reached.
     *
     * @param t throwable, may be <code>null</code>
     * @return <code>true</code> if the write limit was reached,
     * <code>false</code> otherwise
     * @since Apache Tika 2.0
     */
    public static boolean isWriteLimitReached(Throwable t) {
        return findWriteLimitReached(t) != null;
    }

    /**
     * Rethrows the {@link WriteLimitReachedException} found in the given
     * exception or in its chain of causes; returns normally if there is none.
     * <p>
     * Previously only the top-level exception was examined: the causes were
     * checked but the result of that check was discarded, so a wrapped
     * write-limit exception was silently ignored.
     *
     * @param ex exception to inspect, may be <code>null</code>
     * @throws SAXException the write-limit exception found in the chain
     */
    public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
        WriteLimitReachedException reached = findWriteLimitReached(ex);
        if (reached != null) {
            throw reached;
        }
    }

    /**
     * Walks the cause chain starting at <code>t</code> (depths 0 through
     * MAX_DEPTH inclusive) and returns the first write-limit exception found,
     * or <code>null</code> if there is none within that bound.
     */
    private static WriteLimitReachedException findWriteLimitReached(Throwable t) {
        Throwable current = t;
        for (int depth = 0; current != null && depth <= MAX_DEPTH; depth++) {
            if (current instanceof WriteLimitReachedException) {
                return (WriteLimitReachedException) current;
            }
            current = current.getCause();
        }
        return null;
    }
}

View File

@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * SAX handler that pulls macros out of flat OpenDocument files.
 * <p>
 * The <code>name</code> attribute of each <code>module</code> element is
 * remembered, the character content of each <code>source-code</code> element
 * is buffered, and when the <code>source-code</code> element closes the
 * buffered text is offered to the {@link EmbeddedDocumentExtractor} as an
 * embedded resource of type MACRO. No SAX event is forwarded to the wrapped
 * handler by the overridden callbacks themselves.
 */
class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator {
    static String MODULE = "module";
    static String NAME = "name";
    private static String SOURCE_CODE = "source-code";

    // handler that receives the output of the embedded-document parse
    private final ContentHandler contentHandler;
    private final ParseContext parseContext;
    // text collected since the current source-code element started
    private final StringBuilder macroBuffer = new StringBuilder();
    String macroName = null;
    boolean inMacro = false;
    // obtained on first use in handleMacro()
    private EmbeddedDocumentExtractor embeddedDocumentExtractor;

    FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
        super(contentHandler);
        this.contentHandler = contentHandler;
        this.parseContext = parseContext;
    }

    /**
     * Records the module name, or switches buffering on when a source-code
     * element opens.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
            throws SAXException {
        if (MODULE.equals(localName)) {
            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
            return;
        }
        if (SOURCE_CODE.equals(localName)) {
            inMacro = true;
        }
    }

    /**
     * Buffers character data while inside a source-code element; ignores it otherwise.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (!inMacro) {
            return;
        }
        macroBuffer.append(ch, start, length);
    }

    /**
     * On the end of a source-code element, hands the buffered macro over and
     * then always clears the per-macro state, even when handling fails.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        if (!SOURCE_CODE.equals(localName)) {
            return;
        }
        try {
            handleMacro();
        } catch (IOException ioe) {
            throw new SAXException(ioe);
        } finally {
            resetMacroState();
        }
    }

    /**
     * Clears the buffer, the remembered module name and the in-macro flag.
     */
    protected void resetMacroState() {
        inMacro = false;
        macroName = null;
        macroBuffer.setLength(0);
    }

    /**
     * Offers the buffered macro text (encoded as UTF-8) to the embedded
     * document extractor, tagged as a MACRO resource and, when the module
     * name is not blank, named after it.
     */
    protected void handleMacro() throws IOException, SAXException {
        final String macroText = macroBuffer.toString();
        final byte[] bytes = macroText.getBytes(StandardCharsets.UTF_8);

        if (embeddedDocumentExtractor == null) {
            embeddedDocumentExtractor =
                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
        }

        Metadata embeddedMetadata = new Metadata();
        boolean hasName = macroName != null && !macroName.trim().isEmpty();
        if (hasName) {
            embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName);
        }
        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.MACRO.toString());

        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        try (InputStream is = TikaInputStream.get(bytes)) {
            embeddedDocumentExtractor.parseEmbedded(is, contentHandler, embeddedMetadata, false);
        }
    }
}

View File

@ -16,16 +16,17 @@
*/
package org.apache.tika.parser.odf;
import org.apache.tika.sax.ContentHandlerDecorator;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.apache.tika.sax.ContentHandlerDecorator;
/**
* Content handler decorator that:<ul>
@ -35,14 +36,11 @@ import java.util.Locale;
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
"http://openoffice.org/2000/";
private static final String OLD_NS = "http://openoffice.org/2000/";
private static final String NEW_NS =
"urn:oasis:names:tc:opendocument:xmlns:";
private static final String NEW_NS = "urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
"-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
private static final String DTD_PUBLIC_ID = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
@ -57,14 +55,12 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator {
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes atts) throws SAXException {
public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
throws SAXException {
AttributesImpl natts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
natts.addAttribute(
mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i), atts.getType(i), atts.getValue(i));
natts.addAttribute(mapOldNS(atts.getURI(i)), atts.getLocalName(i), atts.getQName(i),
atts.getType(i), atts.getValue(i));
}
super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
}
@ -76,8 +72,7 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator {
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
public void startPrefixMapping(String prefix, String uri) throws SAXException {
super.startPrefixMapping(prefix, mapOldNS(uri));
}
@ -88,8 +83,8 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator {
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) ||
DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
return super.resolveEntity(publicId, systemId);

View File

@ -0,0 +1,564 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import javax.xml.namespace.QName;
import org.apache.commons.codec.binary.Base64;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
/*
Handler for the body element of flat ODT files and for the content.xml of
traditional compressed ODT files
*/
class OpenDocumentBodyHandler extends ElementMappingContentHandler {
// Namespace URIs that incoming element and attribute names are compared against.
public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS = "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
// Single tab character, emitted by endElement for text:tab-stop and text:tab.
protected static final char[] TAB = new char[]{'\t'};
// Local name of the element whose character content is buffered as Base64.
private static final String BINARY_DATA = "binary-data";
// Shared empty attribute list passed with the XHTML elements this handler generates.
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
 * Mappings between ODF tag names and XHTML tag names
 * (including attributes). All other tag names/attributes are ignored
 * and left out from event stream. Populated by the static initializer
 * and handed to the superclass in the constructor.
 */
private static final HashMap<QName, TargetElement> MAPPINGS =
new HashMap<>();
// Single space, emitted for text:s and around nested paragraphs.
private static final char[] SPACE = new char[]{' '};
// Name of the attribute carried by the annotation/note/notes elements below.
private static final String CLASS = "class";
// Prebuilt class="..." attribute lists used when annotation, note and notes elements start.
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
static {
    // Plain text structure. text:h is intentionally not registered here:
    // headings are translated by hand in startElement/endElement.
    MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
    MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
    MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
    MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span"));

    // Annotations, presentation notes and drawing content.
    MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object"));
    MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
    MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));

    // Hyperlinks: xlink:href / xlink:title become plain href / title. The same
    // attribute map instance is shared by both anchor mappings.
    final HashMap<QName, QName> linkAttributes = new HashMap<>();
    linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
    linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
    MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", linkAttributes));
    MAPPINGS.put(new QName(DRAW_NS, "a"), new TargetElement(XHTML, "a", linkAttributes));

    // Tables. Repetition of rows is ignored; for columns see the TODO below.
    MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
    MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));

    // Cell attributes: column/row spans map onto colspan/rowspan.
    final HashMap<QName, QName> cellAttributes = new HashMap<>();
    cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
    cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
    /* TODO: The following is not correct, the cell should be repeated not spanned!
     * The code generates an HTML cell spanning all repeated columns so that the cell
     * looks correct. Problems may occur when both spanning and repeating are given,
     * which is not allowed by the spec. Spanning instead of repeating is not a problem
     * in practice, because OpenOffice uses repetition only for empty cells.
     */
    cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
    MAPPINGS.put(new QName(TABLE_NS, "table-cell"),
            new TargetElement(XHTML, "td", cellAttributes));
}
// The downstream handler; also passed to the superclass. Events generated by
// hand (headings, lists, paragraphs, style tags) are sent to it directly.
private final ContentHandler handler;
// Used to obtain the embedded document extractor for binary-data content.
private final ParseContext parseContext;
// One bit per open element depth: set when the element at that depth is a
// text node (see isTextNode); characters() consults the bit at nodeDepth - 1.
private final BitSet textNodeStack = new BitSet();
//have we written the start style tags
//yet for the current text style
boolean hasWrittenStartStyleTags = false;
//if we're in a binary-data tag
boolean inBinaryData = false;
// Obtained on first use in processBinaryData().
private EmbeddedDocumentExtractor embeddedDocumentExtractor;
// Base64 text collected while inBinaryData is true.
private StringBuilder base64BinaryDataBuffer = new StringBuilder();
// Number of currently open elements (binary-data elements are not counted).
private int nodeDepth = 0;
// Number of currently open elements that require complete filtering;
// nothing is forwarded while this is non-zero.
private int completelyFiltered = 0;
// XHTML heading tag names (h1..h6) of the currently open text:h elements.
private Stack<String> headingStack = new Stack<>();
// Styles collected from style:style elements, keyed by style name.
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<>();
private Map<String, TextStyle> textStyleMap = new HashMap<>();
// List styles collected from text:list-style elements, keyed by style name.
private Map<String, ListStyle> listStyleMap = new HashMap<>();
private String currParagraphStyleName; //paragraph style name
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName;
// Styles of the currently open lists; entries may be null when a style name
// has no entry in listStyleMap.
private Stack<ListStyle> listStyleStack = new Stack<>();
// The list style currently being defined (between start and end of text:list-style).
private ListStyle listStyle;
// True if we are currently in the named style:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
// Nesting depth of text:p elements; an XHTML <p> is only emitted at depth 0.
private int pDepth = 0;
/**
 * Creates a body handler that forwards mapped events to the given handler.
 *
 * @param handler      receives the generated XHTML events; also handed to the
 *                     superclass together with the static element mappings
 * @param parseContext stored for the later lookup of the embedded document extractor
 */
OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext) {
super(handler, MAPPINGS);
this.handler = handler;
this.parseContext = parseContext;
}
/**
 * Builds an attribute list holding exactly one namespace-less CDATA attribute
 * whose local name and qualified name are both <code>key</code>.
 */
private static Attributes buildAttributes(String key, String value) {
    final AttributesImpl single = new AttributesImpl();
    final String noNamespace = "";
    single.addAttribute(noNamespace, key, key, "CDATA", value);
    return single;
}
/**
 * Buffers character data while inside a binary-data element; otherwise forwards
 * it only when nothing is being filtered and the innermost open element is a
 * text node. Pending style tags are written before the first forwarded text.
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    if (inBinaryData) {
        base64BinaryDataBuffer.append(ch, start, length);
        return;
    }
    // only forward content of tags from text:-namespace
    if (completelyFiltered != 0 || nodeDepth <= 0 || !textNodeStack.get(nodeDepth - 1)) {
        return;
    }
    if (!hasWrittenStartStyleTags) {
        updateStyleTags();
        hasWrittenStartStyleTags = true;
    }
    super.characters(ch, start, length);
}
// Tells whether an element must be dropped together with all of its
// sub-elements: text:*-template, text:*-style and table:covered-table-cell.
private boolean needsCompleteFiltering(String namespaceURI, String localName) {
    if (!TEXT_NS.equals(namespaceURI)) {
        return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
    }
    boolean isTemplate = localName.endsWith("-template");
    return isTemplate || localName.endsWith("-style");
}
// Note on the pDepth field: <p> can appear inside comments and other things that are
// already inside <p>, so we track pDepth and only output <p> when we're at the main level.
// Maps the text:outline-level attribute of a heading to an XHTML heading tag
// name. Levels are clamped to the range h1..h6. A missing level yields "h1",
// and so does a level that is not a valid integer: the value comes straight
// from the document, and a NumberFormatException here would abort the parse.
private String getXHTMLHeaderTagName(Attributes atts) {
    String depthStr = atts.getValue(TEXT_NS, "outline-level");
    if (depthStr == null) {
        return "h1";
    }
    int depth;
    try {
        depth = Integer.parseInt(depthStr.trim());
    } catch (NumberFormatException e) {
        // malformed outline level: treat like a missing one
        return "h1";
    }
    if (depth >= 6) {
        return "h6";
    } else if (depth <= 1) {
        return "h1";
    } else {
        return "h" + depth;
    }
}
/**
 * Tells whether character content directly inside the given element should
 * be treated as text: every text: element except page-number and page-count,
 * plus svg:title and svg:desc.
 */
private boolean isTextNode(String namespaceURI, String localName) {
    if (TEXT_NS.equals(namespaceURI)) {
        boolean pageField = localName.equals("page-number") || localName.equals("page-count");
        if (!pageField) {
            return true;
        }
    }
    return SVG_NS.equals(namespaceURI)
            && ("title".equals(localName) || "desc".equals(localName));
}
/**
 * Opens an XHTML list for a text:list element. The tag is the one of the
 * list style registered under <code>name</code>, or "ul" when there is no
 * name or no such style.
 * <p>
 * An entry (possibly <code>null</code>) is pushed for every list, including
 * lists without a style name. endList() pops whenever the stack is not
 * empty, so pushing only for named lists let an unnamed nested list pop its
 * parent's entry and made both lists close with the wrong tag.
 *
 * @param name value of the text:style-name attribute, may be <code>null</code>
 */
private void startList(String name) throws SAXException {
    ListStyle style = name != null ? listStyleMap.get(name) : null;
    listStyleStack.push(style);
    String elementName = style != null ? style.getTag() : "ul";
    handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
/**
 * Closes the XHTML list element for a text:list. The tag comes from the
 * style popped off the list style stack, or is "ul" when the stack is empty
 * or the popped entry is null.
 */
private void endList() throws SAXException {
    ListStyle popped = listStyleStack.isEmpty() ? null : listStyleStack.pop();
    final String tag = (popped == null) ? "ul" : popped.getTag();
    handler.endElement(XHTML, tag, tag);
}
/**
 * Makes the text style registered under <code>name</code> the current one and
 * marks its style tags as not yet written. Does nothing when the span has no
 * style name.
 */
private void startSpan(String name) throws SAXException {
    if (name != null) {
        currTextStyle = textStyleMap.get(name);
        hasWrittenStartStyleTags = false;
    }
}
/**
 * Handles the start of a text:p element. Only a paragraph at depth 0 opens an
 * XHTML p element (and selects its paragraph text style, if named); a nested
 * paragraph is represented by a single space instead.
 */
private void startParagraph(String styleName) throws SAXException {
    if (pDepth != 0) {
        // nested paragraph: just separate it from the surrounding text
        handler.characters(SPACE, 0, SPACE.length);
    } else {
        handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
        if (styleName != null) {
            currTextStyle = paragraphTextStyleMap.get(styleName);
        }
        hasWrittenStartStyleTags = false;
    }
    pDepth++;
}
/**
 * Handles the end of a text:p element: closes any open style tags, then
 * closes the XHTML p element for the outermost paragraph or emits a single
 * space for a nested one.
 */
private void endParagraph() throws SAXException {
    closeStyleTags();
    if (pDepth != 1) {
        handler.characters(SPACE, 0, SPACE.length);
    } else {
        handler.endElement(XHTML, "p", "p");
    }
    pDepth--;
}
/**
 * Brings the open XHTML style tags (b, i, u) in line with currTextStyle.
 * Tags are kept nested in the order b, then i, then u, so changing an outer
 * tag first closes the inner ones that are open; the checks that follow
 * reopen them if the current style still asks for them. With no current text
 * style, all open style tags are closed.
 */
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close u and i tags
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close u tag
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
// u is the innermost tag, so it can be toggled without touching b or i
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
// Re-synchronizes the open style tags with the current text style.
// NOTE(review): no call to this method is visible in this class; endElement
// resets the span state directly instead -- confirm whether it can be removed.
private void endSpan() throws SAXException {
updateStyleTags();
}
/**
 * Closes whichever of the u, i and b tags are currently open, innermost
 * first, then forgets the current text style and marks its start tags as
 * not written.
 */
private void closeStyleTags() throws SAXException {
    if (curUnderlined) {
        handler.endElement(XHTML, "u", "u");
        curUnderlined = false;
    }
    if (curItalic) {
        handler.endElement(XHTML, "i", "i");
        curItalic = false;
    }
    if (curBold) {
        handler.endElement(XHTML, "b", "b");
        curBold = false;
    }
    // nothing is open any more: reset the style bookkeeping
    hasWrittenStartStyleTags = false;
    currTextStyle = null;
}
/**
 * Handles the start of an ODF element.
 * <ol>
 * <li>draw:image is turned into an empty XHTML img element.</li>
 * <li>binary-data switches on Base64 buffering and is otherwise ignored.</li>
 * <li>Style definitions (style:style, text:list-style, style:text-properties,
 * list level styles) update the style bookkeeping.</li>
 * <li>The element is recorded on the text-node stack and, unless content is
 * being completely filtered, translated to XHTML either by hand (headings,
 * lists, spans, paragraphs, text:s, annotations and notes) or through the
 * superclass mappings.</li>
 * </ol>
 */
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
        throws SAXException {
    if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) {
        String link = attrs.getValue(XLINK_NS, "href");
        AttributesImpl attr = new AttributesImpl();
        if (!StringUtils.isEmpty(link)) {
            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link);
        }
        handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr);
        handler.endElement(XHTMLContentHandler.XHTML, "img", "img");
    }
    // binary-data is not counted in nodeDepth; endElement returns early for it too
    if (BINARY_DATA.equals(localName)) {
        inBinaryData = true;
        return;
    }
    // Keep track of the current node type. If it is a text node, the bit at
    // the current depth is set in textNodeStack. characters() checks the top
    // bit to determine whether the current node is a text node to print out.
    // nodeDepth contains the depth of the current node and also marks the
    // top of the stack.
    assert nodeDepth >= 0;
    // Set styles
    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
        String family = attrs.getValue(STYLE_NS, "family");
        if ("text".equals(family)) {
            currTextStyle = new TextStyle();
            currTextStyleName = attrs.getValue(STYLE_NS, "name");
        } else if ("paragraph".equals(family)) {
            currTextStyle = new TextStyle();
            currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
        }
    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
        listStyle = new ListStyle();
        String name = attrs.getValue(STYLE_NS, "name");
        listStyleMap.put(name, listStyle);
    } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) &&
            "text-properties".equals(localName)) {
        String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
        if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
            currTextStyle.italic = true;
        }
        if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) {
            currTextStyle.bold = true;
        }
        String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
        if (underlineStyle != null && !underlineStyle.equals("none")) {
            currTextStyle.underlined = true;
        }
    } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
        if ("list-level-style-bullet".equals(localName)) {
            listStyle.ordered = false;
        } else if ("list-level-style-number".equals(localName)) {
            listStyle.ordered = true;
        }
    }
    textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName));
    // filter *all* content of some tags
    assert completelyFiltered >= 0;
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered++;
    }
    // call next handler if no filtering
    if (completelyFiltered == 0) {
        // special handling of text:h, that are directly passed
        // to incoming handler
        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
            final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
            handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
            startList(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
            startSpan(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
            startParagraph(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
            handler.characters(SPACE, 0, 1);
        } else if ("annotation".equals(localName)) {
            // NOTE(review): local name "span" is paired with qName "p" here and in
            // the two branches below -- confirm this is intended.
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
        } else if ("note".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
        } else if ("notes".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
        } else {
            super.startElement(namespaceURI, localName, qName, attrs);
        }
    }
}

/**
 * Tells whether a fo:font-weight value denotes bold text: "bold", "bolder",
 * or a number greater than 500. Missing, empty or malformed values are not
 * bold. The value comes straight from the document, so the numeric parse is
 * guarded: an empty string or a value such as "700x" used to raise an
 * unchecked exception that aborted the parse.
 */
private static boolean isBoldFontWeight(String fontWeight) {
    if (fontWeight == null || fontWeight.isEmpty()) {
        return false;
    }
    if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
        return true;
    }
    if (!Character.isDigit(fontWeight.charAt(0))) {
        return false;
    }
    try {
        return Integer.parseInt(fontWeight) > 500;
    } catch (NumberFormatException e) {
        return false;
    }
}
/**
 * Handles the end of an ODF element, mirroring startElement: finishes
 * binary-data buffering, stores completed style definitions, emits the
 * matching XHTML end events unless content is being completely filtered,
 * and finally unwinds the filter counter and the node depth.
 */
@Override
public void endElement(String namespaceURI, String localName, String qName)
        throws SAXException {
    // binary-data was not counted in nodeDepth by startElement, so return early
    if (BINARY_DATA.equals(localName)) {
        inBinaryData = false;
        try {
            processBinaryData();
        } catch (IOException e) {
            throw new SAXException(e);
        }
        return;
    }
    // a style definition is complete: register it under its name
    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
        if (currTextStyle != null && currTextStyleName != null) {
            textStyleMap.put(currTextStyleName, currTextStyle);
            currTextStyleName = null;
            currTextStyle = null;
        } else if (currTextStyle != null && currParagraphStyleName != null) {
            paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
            currParagraphStyleName = null;
            currTextStyle = null;
        }
    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
        listStyle = null;
    }
    // call next handler if no filtering
    if (completelyFiltered == 0) {
        // special handling of text:h, that are directly passed
        // to incoming handler
        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
            final String el = headingStack.pop();
            // The heading was opened in the XHTML namespace by startElement, so
            // it must be closed in the same namespace, not the source text: one.
            handler.endElement(XHTMLContentHandler.XHTML, el, el);
        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
            endList();
        } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
            currTextStyle = null;
            hasWrittenStartStyleTags = false;
        } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
            endParagraph();
        } else if ("annotation".equals(localName) || "note".equals(localName) ||
                "notes".equals(localName)) {
            closeStyleTags();
            // NOTE(review): startElement opens these as an XHTML "span" with qName
            // "p", but the end event below carries the source namespace and name --
            // confirm whether downstream handlers depend on this.
            handler.endElement(namespaceURI, localName, localName);
        } else {
            super.endElement(namespaceURI, localName, qName);
        }
        // special handling of tabulators
        if (TEXT_NS.equals(namespaceURI) &&
                ("tab-stop".equals(localName) || "tab".equals(localName))) {
            this.characters(TAB, 0, TAB.length);
        }
    }
    // revert filter for *all* content of some tags
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered--;
    }
    assert completelyFiltered >= 0;
    // reduce current node depth
    nodeDepth--;
    assert nodeDepth >= 0;
}
/**
 * Decodes the buffered Base64 text and offers the resulting bytes to the
 * embedded document extractor. The buffer and the binary-data flag are
 * cleared before the embedded parse starts.
 */
private void processBinaryData() throws IOException, SAXException {
    //TODO: figure out whether we're in an inline image or a regular
    //attachment and add that info to the embedded metadata
    final String encoded = base64BinaryDataBuffer.toString();
    final byte[] bytes = Base64.decodeBase64(encoded);

    //clear state before parsing
    base64BinaryDataBuffer.setLength(0);
    inBinaryData = false;

    if (embeddedDocumentExtractor == null) {
        embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
    }
    Metadata embeddedMetadata = new Metadata();
    if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }
    try (InputStream is = TikaInputStream.get(bytes)) {
        embeddedDocumentExtractor.parseEmbedded(is, handler, embeddedMetadata, false);
    }
}
/**
 * Intentionally empty: prefix mappings are swallowed rather than forwarded.
 */
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
/**
 * Intentionally empty: counterpart of startPrefixMapping, which forwards nothing either.
 */
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
// Marker interface implemented by the style holders TextStyle and ListStyle; declares no members.
private interface Style {
}
/**
 * Character formatting collected from a style definition: three independent
 * flags, all false until set.
 */
private static class TextStyle implements Style {
    public boolean italic;
    public boolean bold;
    public boolean underlined;

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("TextStyle{");
        text.append("italic=").append(italic);
        text.append(", bold=").append(bold);
        text.append(", underlined=").append(underlined);
        return text.append('}').toString();
    }
}
/**
 * List formatting collected from a list style definition.
 */
private static class ListStyle implements Style {
    public boolean ordered;

    /** Returns the XHTML list tag for this style: "ol" when ordered, "ul" otherwise. */
    public String getTag() {
        if (ordered) {
            return "ol";
        }
        return "ul";
    }
}
}

View File

@ -16,591 +16,47 @@
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
// Marker interface implemented by the style holders TextStyle and ListStyle; declares no members.
private interface Style {
}
/**
 * Character formatting flags for a text style; all false until set.
 */
private static class TextStyle implements Style {
    public boolean italic;
    public boolean bold;
    public boolean underlined;

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("TextStyle{");
        text.append("italic=").append(italic);
        text.append(", bold=").append(bold);
        text.append(", underlined=").append(underlined);
        return text.append('}').toString();
    }
}
/**
 * List formatting for a list style.
 */
private static class ListStyle implements Style {
    public boolean ordered;

    /** Returns the XHTML list tag for this style: "ol" when ordered, "ul" otherwise. */
    public String getTag() {
        if (ordered) {
            return "ol";
        }
        return "ul";
    }
}
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
// Single space character.
private static final char[] SPACE = new char[]{ ' '};
// Name of the attribute carried by the three prebuilt attribute lists below.
private static final String CLASS = "class";
// Prebuilt class="annotation" / class="note" / class="notes" attribute lists.
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
/**
 * Builds an attribute list holding exactly one namespace-less CDATA attribute
 * whose local name and qualified name are both <code>key</code>.
 */
private static Attributes buildAttributes(String key, String value) {
    final AttributesImpl single = new AttributesImpl();
    final String noNamespace = "";
    single.addAttribute(noNamespace, key, key, "CDATA", value);
    return single;
}
// The downstream handler; also passed to the superclass by the constructor.
private final ContentHandler handler;
// One bit per element depth; characters() consults the bit at nodeDepth - 1.
private final BitSet textNodeStack = new BitSet();
// Depth counter used by characters() as the top-of-stack index into textNodeStack.
private int nodeDepth = 0;
// characters() forwards nothing while this is non-zero.
private int completelyFiltered = 0;
private Stack<String> headingStack = new Stack<String>();
// Style lookups keyed by name.
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
private String currParagraphStyleName; //paragraph style name
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName;
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
private ListStyle listStyle;
// True if we are currently in the named style:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
//have we written the start style tags
//yet for the current text style
boolean hasWrittenStartStyleTags = false;
private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
//we need to track our pDepth and only output <p> if we're at the main level
/**
 * Creates the handler.
 *
 * @param handler  receiver of the generated events; passed to the superclass
 *                 and also kept locally because several methods of this class
 *                 emit start/end/characters events on it directly
 * @param mappings element mappings handed to the superclass
 */
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
super(handler, mappings);
this.handler = handler;
}
/**
 * Forwards character data only while no completely-filtered element is open
 * and the innermost open element was recorded as a text node. Pending style
 * tags are written before the first forwarded characters.
 */
@Override
public void characters(char[] ch, int start, int length)
        throws SAXException {
    // inside a completely filtered subtree, or outside any element
    if (completelyFiltered != 0 || nodeDepth <= 0) {
        return;
    }
    // the enclosing element is not a text node
    if (!textNodeStack.get(nodeDepth - 1)) {
        return;
    }
    if (!hasWrittenStartStyleTags) {
        updateStyleTags();
        hasWrittenStartStyleTags = true;
    }
    super.characters(ch, start, length);
}
/**
 * Tells whether an element and all of its descendants must be dropped.
 * That is the case for text-namespace elements whose local name ends in
 * "-template" or "-style", and for table "covered-table-cell" elements.
 */
private boolean needsCompleteFiltering(
        String namespaceURI, String localName) {
    if (!TEXT_NS.equals(namespaceURI)) {
        return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
    }
    boolean isTemplate = localName.endsWith("-template");
    return isTemplate || localName.endsWith("-style");
}
/**
 * Maps the heading level of a text:h element to an XHTML heading tag name.
 * <p>
 * The level is read from the text-namespace "outline-level" attribute and
 * clamped to the range h1..h6. A missing or non-numeric level yields "h1"
 * rather than letting a NumberFormatException escape from the SAX callback,
 * since the value comes straight from the parsed document.
 *
 * @param atts attributes of the heading element
 * @return a tag name between "h1" and "h6"
 */
private String getXHTMLHeaderTagName(Attributes atts) {
    String depthStr = atts.getValue(TEXT_NS, "outline-level");
    if (depthStr == null) {
        return "h1";
    }
    int depth;
    try {
        depth = Integer.parseInt(depthStr.trim());
    } catch (NumberFormatException ignored) {
        // malformed level in the document: treat like a missing level
        return "h1";
    }
    // clamp to the heading levels that exist in XHTML
    return "h" + Math.max(1, Math.min(6, depth));
}
/**
 * Decides whether character data directly inside the given element should
 * be treated as text: every text-namespace element except "page-number"
 * and "page-count", plus the svg "title" and "desc" elements.
 */
private boolean isTextNode(String namespaceURI, String localName) {
    if (TEXT_NS.equals(namespaceURI)) {
        boolean pageField =
                localName.equals("page-number") || localName.equals("page-count");
        return !pageField;
    }
    return SVG_NS.equals(namespaceURI)
            && ("title".equals(localName) || "desc".equals(localName));
}
/**
 * Opens an XHTML list element for a text:list.
 * <p>
 * An entry is pushed onto {@code listStyleStack} for every list, including
 * lists without a style name or with an unknown one (a {@code null} entry).
 * endList() pops whenever the stack is non-empty, so pushing only for named
 * lists would let an unnamed inner list pop the style of its styled parent
 * and produce mismatched ol/ul end tags.
 *
 * @param name the list's style name, may be {@code null}
 * @throws SAXException if the downstream handler fails
 */
private void startList(String name) throws SAXException {
    ListStyle style = (name == null) ? null : listStyleMap.get(name);
    // always push, so that pushes and pops stay paired per list element
    listStyleStack.push(style);
    String elementName = (style != null) ? style.getTag() : "ul";
    handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
/**
 * Closes the XHTML list element for a text:list. The tag is taken from the
 * style popped off {@code listStyleStack}; "ul" is used when the stack is
 * empty or the popped entry is {@code null}.
 */
private void endList() throws SAXException {
    ListStyle closing = listStyleStack.isEmpty() ? null : listStyleStack.pop();
    String tag = (closing == null) ? "ul" : closing.getTag();
    handler.endElement(XHTML, tag, tag);
}
/**
 * Selects the text style for a text:span. Spans without a style name leave
 * the current style untouched.
 *
 * @param name the span's style name, may be {@code null}
 */
private void startSpan(String name) throws SAXException {
    if (name != null) {
        currTextStyle = textStyleMap.get(name);
        // style tags are written lazily with the next characters
        hasWrittenStartStyleTags = false;
    }
}
/**
 * Handles the start of a text:p. Only an outermost paragraph produces an
 * XHTML p element and selects a paragraph text style; a paragraph nested
 * in another one is represented by a single space.
 *
 * @param styleName the paragraph's style name, may be {@code null}
 */
private void startParagraph(String styleName) throws SAXException {
    if (pDepth != 0) {
        // nested paragraph: separate its text from the surrounding text
        handler.characters(SPACE, 0, SPACE.length);
    } else {
        handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
        if (styleName != null) {
            currTextStyle = paragraphTextStyleMap.get(styleName);
        }
        hasWrittenStartStyleTags = false;
    }
    pDepth++;
}
/**
 * Handles the end of a text:p. Open style tags are closed first; then the
 * XHTML p element is closed for an outermost paragraph, or a space is
 * emitted for a nested one.
 */
private void endParagraph() throws SAXException {
    closeStyleTags();
    if (pDepth != 1) {
        handler.characters(SPACE, 0, SPACE.length);
    } else {
        handler.endElement(XHTML, "p", "p");
    }
    pDepth--;
}
/**
 * Brings the open b/i/u tags in line with {@code currTextStyle}.
 * <p>
 * Tags are opened in the fixed order b, i, u, so u is always innermost and
 * b outermost. To keep the output well nested, changing an outer tag first
 * closes every inner tag that is open; the later checks reopen those inner
 * tags if the current style still asks for them. With no current style,
 * all open style tags are simply closed.
 */
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close the inner u and i tags first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close the inner u tag first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
// u is innermost, so it can be toggled without touching other tags
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
/**
 * Re-synchronises the open style tags with the current text style.
 * NOTE(review): no method of this class calls this; endElement() handles
 * the end of text:span inline. Confirm whether it can be removed.
 */
private void endSpan() throws SAXException {
updateStyleTags();
}
/**
 * Closes whichever of the u, i and b tags are currently open, innermost
 * first, then clears the current text style and marks the style tags as
 * not yet written.
 */
private void closeStyleTags() throws SAXException {
    // innermost tag
    if (curUnderlined) {
        handler.endElement(XHTML, "u", "u");
        curUnderlined = false;
    }
    if (curItalic) {
        handler.endElement(XHTML, "i", "i");
        curItalic = false;
    }
    // outermost tag
    if (curBold) {
        handler.endElement(XHTML, "b", "b");
        curBold = false;
    }
    hasWrittenStartStyleTags = false;
    currTextStyle = null;
}
/**
 * Handles the start of an ODF element in three steps:
 * <ol>
 * <li>collects style definitions (text/paragraph styles, their
 * text-properties, list styles and their level styles);</li>
 * <li>records at the current depth whether the element is a text node and
 * bumps the filter counter for elements that are dropped completely;</li>
 * <li>unless filtered, emits the XHTML start event, either directly for
 * the specially handled elements or through the superclass mapping.</li>
 * </ol>
 *
 * @throws SAXException if a downstream handler fails
 */
@Override
public void startElement(
        String namespaceURI, String localName, String qName,
        Attributes attrs) throws SAXException {
    // Keep track of the current node type. If it is a text node, the bit
    // at the current depth is set in textNodeStack. characters() checks
    // the top bit to decide whether the current node is a text node whose
    // content is forwarded. nodeDepth holds the depth of the current node
    // and also marks the top of the stack.
    assert nodeDepth >= 0;

    // Set styles
    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
        String family = attrs.getValue(STYLE_NS, "family");
        if ("text".equals(family)) {
            currTextStyle = new TextStyle();
            currTextStyleName = attrs.getValue(STYLE_NS, "name");
        } else if ("paragraph".equals(family)) {
            currTextStyle = new TextStyle();
            currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
        }
    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
        listStyle = new ListStyle();
        String name = attrs.getValue(STYLE_NS, "name");
        listStyleMap.put(name, listStyle);
    } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
            && "text-properties".equals(localName)) {
        String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
        if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
            currTextStyle.italic = true;
        }
        if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) {
            currTextStyle.bold = true;
        }
        String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
        if (underlineStyle != null && !underlineStyle.equals("none")) {
            currTextStyle.underlined = true;
        }
    } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
        if ("list-level-style-bullet".equals(localName)) {
            listStyle.ordered = false;
        } else if ("list-level-style-number".equals(localName)) {
            listStyle.ordered = true;
        }
    }

    textNodeStack.set(nodeDepth++,
            isTextNode(namespaceURI, localName));

    // filter *all* content of some tags
    assert completelyFiltered >= 0;
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered++;
    }

    // call next handler if no filtering
    if (completelyFiltered == 0) {
        // special handling of text:h, which is passed directly
        // to the incoming handler
        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
            final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
            handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
            startList(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
            startSpan(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
            startParagraph(attrs.getValue(TEXT_NS, "style-name"));
        } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
            handler.characters(SPACE, 0, 1);
        } else if ("annotation".equals(localName)) {
            // NOTE(review): local name "span" is paired with qName "p" in
            // the three calls below -- confirm whether "span" was intended.
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
        } else if ("note".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
        } else if ("notes".equals(localName)) {
            closeStyleTags();
            handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
        } else {
            super.startElement(namespaceURI, localName, qName, attrs);
        }
    }
}

/**
 * Interprets an fo:font-weight value. "bold", "bolder" and numeric weights
 * above 500 count as bold. Missing, empty or unparseable values (for
 * example "700px") count as not bold instead of raising an unchecked
 * exception from the SAX callback.
 */
private static boolean isBoldFontWeight(String fontWeight) {
    if (fontWeight == null || fontWeight.isEmpty()) {
        return false;
    }
    if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
        return true;
    }
    if (!Character.isDigit(fontWeight.charAt(0))) {
        return false;
    }
    try {
        return Integer.parseInt(fontWeight) > 500;
    } catch (NumberFormatException ignored) {
        // starts with a digit but is not a plain integer
        return false;
    }
}
/**
 * Handles the end of an ODF element: stores a finished style definition,
 * emits the matching XHTML end event unless the element is filtered, adds
 * a tab character after tab elements, and finally unwinds the filter
 * counter and the node depth.
 *
 * @throws SAXException if a downstream handler fails
 */
@Override
public void endElement(
        String namespaceURI, String localName, String qName)
        throws SAXException {
    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
        // a style definition is complete: file it under its name
        if (currTextStyle != null && currTextStyleName != null) {
            textStyleMap.put(currTextStyleName, currTextStyle);
            currTextStyleName = null;
            currTextStyle = null;
        } else if (currTextStyle != null && currParagraphStyleName != null) {
            paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
            currParagraphStyleName = null;
            currTextStyle = null;
        }
    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
        listStyle = null;
    }

    // call next handler if no filtering
    if (completelyFiltered == 0) {
        // special handling of text:h, which is passed directly
        // to the incoming handler
        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
            final String el = headingStack.pop();
            handler.endElement(XHTMLContentHandler.XHTML, el, el);
        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
            endList();
        } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
            currTextStyle = null;
            hasWrittenStartStyleTags = false;
        } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
            endParagraph();
        } else if ("annotation".equals(localName) || "note".equals(localName) ||
                "notes".equals(localName)) {
            closeStyleTags();
            // startElement() opens these as (XHTML, "span", "p"); the end
            // event must carry the same names, not the ODF local name in
            // the empty namespace, or the output is unbalanced.
            handler.endElement(XHTML, "span", "p");
        } else {
            super.endElement(namespaceURI, localName, qName);
        }

        // special handling of tabulators
        if (TEXT_NS.equals(namespaceURI)
                && ("tab-stop".equals(localName)
                || "tab".equals(localName))) {
            this.characters(TAB, 0, TAB.length);
        }
    }

    // revert filter for *all* content of some tags
    if (needsCompleteFiltering(namespaceURI, localName)) {
        completelyFiltered--;
    }
    assert completelyFiltered >= 0;

    // reduce current node depth
    nodeDepth--;
    assert nodeDepth >= 0;
}
/**
 * Intentionally empty: prefix-mapping start events are dropped here and not
 * forwarded.
 */
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
/**
 * Intentionally empty: prefix-mapping end events are dropped here and not
 * forwarded.
 */
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
}
public static final String TEXT_NS =
"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
protected static final char[] TAB = new char[]{'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
* Mappings between ODF tag names and XHTML tag names
* (including attributes). All other tag names/attributes are ignored
* and left out from event stream.
*/
private static final HashMap<QName, TargetElement> MAPPINGS =
new HashMap<QName, TargetElement>();
static {
    // --- text: elements -------------------------------------------------
    // (text:h is not listed; it is mapped in startElement/endElement)
    MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
    MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
    MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
    MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object"));
    MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
    MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
    MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));

    // --- links: xlink attributes become plain href/title ----------------
    final HashMap<QName, QName> linkAttributes = new HashMap<QName, QName>();
    linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
    linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
    MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", linkAttributes));

    // --- tables ---------------------------------------------------------
    MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
    // repeating of rows is ignored; for columns, see below!
    MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));

    // special mapping for rowspan/colspan attributes
    final HashMap<QName, QName> cellAttributes = new HashMap<QName, QName>();
    cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
    cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
    /* TODO: The following is not correct, the cell should be repeated not spanned!
     * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
     * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
     * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
     * only for empty cells.
     */
    cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
    MAPPINGS.put(new QName(TABLE_NS, "table-cell"),
            new TargetElement(XHTML, "td", cellAttributes));
}
/**
 * Reports no supported media types.
 *
 * @param context the parse context (not used here)
 * @return always an empty set
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
new XHTMLContentHandler(handler, metadata),
metadata, context);
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context);
}
void parseInternal(
InputStream stream, final ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
DefaultHandler dh = new OpenDocumentBodyHandler(handler, context);
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
new NSNormalizerContentHandler(dh)),
context);
XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream),
new OfflineContentHandler(new NSNormalizerContentHandler(dh)), context);
}
}

View File

@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * Macro handler for the compressed (zip) ODF variant. It reacts to the
 * element whose local name equals {@code MODULE}, ignoring case, and reuses
 * the macro state and helpers of {@link FlatOpenDocumentMacroHandler}.
 * NOTE(review): MODULE, NAME, inMacro, macroName, handleMacro() and
 * resetMacroState() are presumably inherited from that superclass; confirm.
 */
class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler {

    OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) {
        super(contentHandler, parseContext);
    }

    /**
     * Marks the start of a macro module and remembers its name attribute.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes attrs)
            throws SAXException {
        //in the compressed odf, there should only be one element in this file.
        if (MODULE.equalsIgnoreCase(localName)) {
            inMacro = true;
            macroName = XMLReaderUtils.getAttrValue(NAME, attrs);
        }
    }

    /**
     * Emits the collected macro when the module element ends and always
     * resets the macro state afterwards.
     *
     * @throws SAXException wrapping any IOException raised while handling the macro
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName)
            throws SAXException {
        // Same case-insensitive match as startElement(): otherwise a module
        // element in a different case would start a macro that is never
        // handled and whose state is never reset.
        if (MODULE.equalsIgnoreCase(localName)) {
            try {
                handleMacro();
            } catch (IOException e) {
                throw new SAXException(e);
            } finally {
                //this shouldn't be necessary in the compressed odf files
                resetMacroState();
            }
        }
    }
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.sax.ContentHandlerDecorator;
/**
 * Manifest handler that, for now, only watches for encryption-data
 * elements. When one is seen, an EncryptedDocumentException wrapped in a
 * SAXException is thrown; all other elements are ignored.
 *
 * This could be extended to extract the information needed for
 * decryption. Please open an issue or pull request for that added
 * functionality.
 */
class OpenDocumentManifestHandler extends ContentHandlerDecorator {

    /** Local name of the element this handler reacts to. */
    private static final String ENCRYPTION_DATA = "encryption-data";

    @Override
    public void startElement(
            String namespaceURI, String localName, String qName,
            Attributes attrs) throws SAXException {
        if (!localName.equals(ENCRYPTION_DATA)) {
            return;
        }
        throw new SAXException(new EncryptedDocumentException());
    }
}

View File

@ -16,12 +16,21 @@
*/
package org.apache.tika.parser.odf;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@ -36,11 +45,6 @@ import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
@ -54,68 +58,54 @@ public class OpenDocumentMetaParser extends XMLParser {
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
/**
* @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
Property.composite(Office.INITIAL_AUTHOR,
new Property[]{Property.externalText("initial-creator")});
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
String element) {
return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
private static ContentHandler getMeta(ContentHandler ch, Metadata md, Property property,
String element) {
Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getUserDefined(
ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:user-defined/@meta:name"),
private static ContentHandler getUserDefined(ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes
// custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
matcher);
new AttributeDependantMetadataHandler(md, "meta:name",
Office.USER_DEFINED_METADATA_NAME_PREFIX), matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name,
String attribute) {
Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, Property property, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
private static ContentHandler getStatistic(ContentHandler ch, Metadata md, Property property,
String attribute) {
Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
static ContentHandler getContentHandler(Metadata md, ParseContext context,
ContentHandler... handlers) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
// Process the Dublin Core Attributes
ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
ContentHandler ch =
new TeeContentHandler(getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
@ -129,19 +119,20 @@ public class OpenDocumentMetaParser extends XMLParser {
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
ch = new TeeContentHandler(ch,
new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "date", md,
TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
ch = new TeeContentHandler(ch,
new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "subject", md,
OfficeOpenXMLCore.SUBJECT));
ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
ch = getMeta(ch, md, Office.KEYWORDS, "keyword");
ch = getMeta(ch, md, OfficeOpenXMLExtended.TOTAL_TIME, "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
ch = getMeta(ch, md, TikaCoreProperties.CREATOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
@ -157,43 +148,48 @@ public class OpenDocumentMetaParser extends XMLParser {
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
ch = getStatistic(ch, md, "nbPara", "paragraph-count");
ch = getStatistic(ch, md, "nbWord", "word-count");
ch = getStatistic(ch, md, "nbCharacter", "character-count");
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
if (handlers != null && handlers.length > 0) {
ContentHandler[] newHandlers = new ContentHandler[handlers.length + 1];
newHandlers[0] = ch;
System.arraycopy(handlers, 0, newHandlers, 1, handlers.length);
ch = new TeeContentHandler(newHandlers);
}
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md,
ParseContext context) {
return getContentHandler(md, context, super.getContentHandler(ch, md, context));
}
@Override
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
(metadata.get(TikaCoreProperties.DESCRIPTION) == null ||
metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
//reset the dc:subject to include both keywords and subject
//We can't rely on composite keys in the MatchingContentHandlers
//because those are "setting" not "adding" to the Metadata object
List<String> subjects = new ArrayList<>();
if (metadata.getValues(Office.KEYWORDS) != null) {
subjects.addAll(Arrays.asList(metadata.getValues(Office.KEYWORDS)));
}
if (metadata.getValues(OfficeOpenXMLCore.SUBJECT) != null) {
subjects.addAll(Arrays.asList(metadata.getValues(OfficeOpenXMLCore.SUBJECT)));
}
if (subjects.size() > 0) {
metadata.set(TikaCoreProperties.SUBJECT, subjects.toArray(new String[0]));
}
}
}

View File

@ -16,37 +16,44 @@
*/
package org.apache.tika.parser.odf;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import static java.nio.charset.StandardCharsets.UTF_8;
import org.apache.tika.utils.XMLReaderUtils;
/**
* OpenOffice parser
@ -58,9 +65,8 @@ public class OpenDocumentParser extends AbstractParser {
*/
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
@ -95,10 +101,12 @@ public class OpenDocumentParser extends AbstractParser {
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
private static final String MANIFEST_NAME = "META-INF/manifest.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
private boolean extractMacros = false;
public Parser getMetaParser() {
return meta;
@ -120,10 +128,10 @@ public class OpenDocumentParser extends AbstractParser {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
@ -145,85 +153,129 @@ public class OpenDocumentParser extends AbstractParser {
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
xhtml.startDocument();
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
try {
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
}
}
} catch (SAXException e) {
if (e.getCause() instanceof EncryptedDocumentException) {
throw (EncryptedDocumentException)e.getCause();
}
throw e;
}
// Only now call the end document
if (handler.getEndDocumentWasCalled()) {
if (handler.isEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
/**
 * Sets the flag that controls macro extraction. The flag is initialised to
 * {@code false}; when it is {@code true}, zip entries whose name contains
 * {@code Basic/} are passed on to the macro handling code.
 * NOTE(review): the {@code @Field} annotation presumably exposes this setter
 * to Tika's configuration mechanism - confirm.
 *
 * @param extractMacros {@code true} to process entries under {@code Basic/}
 */
@Field
public void setExtractMacros(boolean extractMacros) {
this.extractMacros = extractMacros;
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
List<SAXException> exceptions = new ArrayList<>();
do {
handleZipEntry(entry, zipStream, metadata, context, handler);
try {
handleZipEntry(entry, zipStream, metadata, context, handler,
embeddedDocumentUtil);
} catch (SAXException e) {
WriteLimitReachedException.throwIfWriteLimitReached(e);
if (e.getCause() instanceof EncryptedDocumentException) {
throw (EncryptedDocumentException)e.getCause();
} else {
exceptions.add(e);
}
}
entry = zipStream.getNextEntry();
} while (entry != null);
if (exceptions.size() > 0) {
throw exceptions.get(0);
}
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context,
EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, TikaException, SAXException {
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
ZipEntry entry = zipFile.getEntry(META_NAME);
ZipEntry entry = zipFile.getEntry(MANIFEST_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
handler, embeddedDocumentUtil);
}
entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context,
handler, embeddedDocumentUtil);
}
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
handleZipEntry(entry, zipFile.getInputStream(entry), metadata,
context, handler, embeddedDocumentUtil);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
if (entry.getName().equals("mimetype")) {
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, ContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, SAXException, TikaException {
if (entry.getName().contains("manifest.xml")) {
checkForEncryption(zip, context);
} else if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
((OpenDocumentContentParser) content)
.parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
((OpenDocumentContentParser) content)
.parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
@ -231,26 +283,87 @@ public class OpenDocumentParser extends AbstractParser {
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
/* if (embeddedName.startsWith("Thumbnails/")) {
TikaInputStream stream = TikaInputStream.get(zip);
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName());
if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
}*/
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString());
}
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
MediaType embeddedMimeType =
embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata);
if (embeddedMimeType != null) {
embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString());
}
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
stream.reset();
}
if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler),
embeddedMetadata, false);
}
} else if (extractMacros && embeddedName.contains("Basic/")) {
//process all files under Basic/; let maybeHandleMacro figure
//out if it is a macro or not
maybeHandleMacro(zip, embeddedName, handler, context);
}
}
}
/**
 * Parses one zip entry as XML through an {@link OpenDocumentMacroHandler},
 * unless {@link #ignoreScriptFile(String)} rejects the entry name.
 *
 * @param is           stream positioned at the entry; it is wrapped in a
 *                     {@link CloseShieldInputStream} before being parsed
 * @param embeddedName name of the zip entry, used only to decide whether to skip it
 * @param handler      downstream handler that the macro handler wraps
 * @param context      parse context handed to the macro handler and the SAX parse
 * @throws TikaException declared by the underlying SAX parse
 * @throws IOException   declared by the underlying SAX parse
 * @throws SAXException  declared by the underlying SAX parse
 */
private void maybeHandleMacro(InputStream is, String embeddedName, ContentHandler handler,
                              ParseContext context)
        throws TikaException, IOException, SAXException {
    //should probably run XMLRootExtractor on the inputstream
    //or read the macro manifest for the names of the macros
    //rather than relying on the script file name
    if (ignoreScriptFile(embeddedName)) {
        return;
    }
    // NOTE(review): no Metadata (e.g. an embedded resource type of MACRO) is
    // attached to the macro output here; confirm whether
    // OpenDocumentMacroHandler takes care of that itself.
    ContentHandler macroHandler = new OpenDocumentMacroHandler(handler, context);
    XMLReaderUtils.parseSAX(new CloseShieldInputStream(is),
            new OfflineContentHandler(new EmbeddedContentHandler(macroHandler)), context);
}
/**
 * Runs the given stream through an {@link OpenDocumentManifestHandler}.
 * If the SAX parse fails and the failure was caused by an
 * {@link EncryptedDocumentException}, that exception is rethrown; any other
 * {@link SAXException} is deliberately ignored.
 *
 * @param stream  manifest stream; wrapped in a {@link CloseShieldInputStream}
 *                before being parsed
 * @param context parse context passed to the SAX parse
 */
private void checkForEncryption(InputStream stream, ParseContext context)
        throws SAXException, TikaException, IOException {
    try {
        XMLReaderUtils.parseSAX(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(new OpenDocumentManifestHandler())),
                context);
    } catch (SAXException saxFailure) {
        // instanceof is false for a null cause, so no separate null check is needed
        Throwable rootCause = saxFailure.getCause();
        if (rootCause instanceof EncryptedDocumentException) {
            throw (EncryptedDocumentException) rootCause;
        }
        //otherwise...swallow
    }
}
/**
 * Decides whether an entry should be skipped by the macro handling code.
 *
 * @param embeddedName name of the zip entry
 * @return {@code true} if the name does not contain {@code Basic/}, or if it
 *         contains {@code script-lb.xml} or {@code script-lc.xml};
 *         {@code false} otherwise
 */
private boolean ignoreScriptFile(String embeddedName) {
    //shouldn't ever get here, but if it isn't under Basic/, ignore it
    if (!embeddedName.contains("Basic/")) {
        return true;
    }
    return embeddedName.contains("script-lb.xml") || embeddedName.contains("script-lc.xml");
}
}

View File

@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
import java.util.Arrays;
import java.util.List;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
@ -39,6 +40,7 @@ class AbstractMetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
protected AbstractMetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;

View File

@ -16,10 +16,11 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
@ -32,11 +33,11 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
private final String nameHoldingAttribute;
private final String namePrefix;
private final StringBuilder buffer = new StringBuilder();
private String name;
private final StringBuilder buffer = new StringBuilder();
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute,
String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
@ -61,8 +62,7 @@ public class AttributeDependantMetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
if (namePrefix == null) {

View File

@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
@ -33,26 +34,25 @@ public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String localName;
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
public AttributeMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, Property property) {
public AttributeMetadataHandler(String uri, String localName, Metadata metadata,
Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getURI(i).equals(this.uri)
&& attributes.getLocalName(i).equals(this.localName)) {
if (attributes.getURI(i).equals(this.uri) &&
attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}

View File

@ -16,35 +16,35 @@
*/
package org.apache.tika.parser.xml;
import org.xml.sax.ContentHandler;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
/** Serial version UID */
/**
* Serial version UID
*/
private static final long serialVersionUID = 4905318835463880819L;
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
String element) {
return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TeeContentHandler(
super.getContentHandler(handler, metadata, context),
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new TeeContentHandler(super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.SUBJECT, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),

View File

@ -16,13 +16,14 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import java.util.Arrays;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* SAX event handler that maps the contents of an XML element into
@ -44,21 +45,17 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
private Property targetProperty;
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
@ -75,8 +72,7 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@ -98,8 +94,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name,
boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
@ -118,8 +114,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata,
Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@ -142,8 +138,9 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
public ElementMetadataHandler(String uri, String localName, Metadata metadata,
Property targetProperty, boolean allowDuplicateValues,
boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
@ -164,14 +161,11 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))));
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
@ -230,7 +224,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
if (previous == null || !Arrays.asList(previous).contains(value) ||
allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}

View File

@ -16,64 +16,68 @@
*/
package org.apache.tika.parser.xml;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-fictionbook+xml"));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new BinaryElementsDataHandler(
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
private boolean binaryMode = false;
private Metadata metadata;
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor,
ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@ -84,10 +88,7 @@ public class FictionBookParser extends XMLParser {
try {
partExtractor.parseEmbedded(
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler,
metadata,
true
);
handler, metadata, true);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}

View File

@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
/**
* This adds Metadata entries with a specified name for
* the textual content of a node (if present), and
@ -44,6 +45,7 @@ public class MetadataHandler extends DefaultHandler {
this.property = null;
this.name = name;
}
public MetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
@ -70,8 +72,7 @@ public class MetadataHandler extends DefaultHandler {
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
public void startElement(String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.xml.sax.ContentHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TextAndAttributeContentHandler;
/**
 * {@link XMLParser} variant that overrides the content handler factory so that
 * SAX events are routed through a {@link TextAndAttributeContentHandler}.
 */
public class TextAndAttributeXMLParser extends XMLParser {

    private static final long serialVersionUID = 7796914007312429473L;

    /**
     * Wraps the given handler in a {@link TextAndAttributeContentHandler}.
     * The {@code metadata} and {@code context} arguments are not used here.
     *
     * @param handler  downstream handler to wrap
     * @param metadata unused
     * @param context  unused
     * @return the wrapping handler
     */
    @Override
    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
                                               ParseContext context) {
        // NOTE(review): the meaning of the boolean flag is defined by
        // TextAndAttributeContentHandler, which is not visible here.
        final ContentHandler wrapped = new TextAndAttributeContentHandler(handler, true);
        return wrapped;
    }
}

View File

@ -16,7 +16,17 @@
*/
package org.apache.tika.parser.xml;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@ -28,52 +38,41 @@ import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
/** Serial version UID */
/**
* Serial version UID
*/
private static final long serialVersionUID = -6028836725280212837L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("xml"),
MediaType.image("svg+xml"))));
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(
Arrays.asList(MediaType.application("xml"), MediaType.image("svg+xml"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))), context);
XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler(
new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))),
context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
@ -83,8 +82,8 @@ public class XMLParser extends AbstractParser {
}
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata,
ParseContext context) {
return new TextContentHandler(handler, true);
}
}

View File

@ -0,0 +1,206 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
public class StringUtils {
/**
* The empty String {@code ""}.
*
* @since 2.0
*/
public static final String EMPTY = "";
/**
* A String for a space character.
*
* @since 3.2
*/
public static final String SPACE = " ";
static int PAD_LIMIT = 10000;
public static boolean isEmpty(final CharSequence cs) {
return cs == null || cs.length() == 0;
}
public static boolean isBlank(final String s) {
return s == null || s.trim().length() == 0;
}
/**
* <p>Left pad a String with a specified String.</p>
*
* <p>Pad to a size of {@code size}.</p>
*
* <pre>
* StringUtils.leftPad(null, *, *) = null
* StringUtils.leftPad("", 3, "z") = "zzz"
* StringUtils.leftPad("bat", 3, "yz") = "bat"
* StringUtils.leftPad("bat", 5, "yz") = "yzbat"
* StringUtils.leftPad("bat", 8, "yz") = "yzyzybat"
* StringUtils.leftPad("bat", 1, "yz") = "bat"
* StringUtils.leftPad("bat", -1, "yz") = "bat"
* StringUtils.leftPad("bat", 5, null) = " bat"
* StringUtils.leftPad("bat", 5, "") = " bat"
* </pre>
*
* @param str the String to pad out, may be null
* @param size the size to pad to
* @param padStr the String to pad with, null or empty treated as single space
* @return left padded String or original String if no padding is necessary,
* {@code null} if null String input
*/
public static String leftPad(final String str, final int size, String padStr) {
if (str == null) {
return null;
}
if (isEmpty(padStr)) {
padStr = SPACE;
}
final int padLen = padStr.length();
final int strLen = str.length();
final int pads = size - strLen;
if (pads <= 0) {
return str; // returns original String when possible
}
if (padLen == 1 && pads <= PAD_LIMIT) {
return leftPad(str, size, padStr.charAt(0));
}
if (pads == padLen) {
return padStr.concat(str);
} else if (pads < padLen) {
return padStr.substring(0, pads).concat(str);
} else {
final char[] padding = new char[pads];
final char[] padChars = padStr.toCharArray();
for (int i = 0; i < pads; i++) {
padding[i] = padChars[i % padLen];
}
return new String(padding).concat(str);
}
}
public static String leftPad(final String str, final int size, final char padChar) {
if (str == null) {
return null;
}
final int pads = size - str.length();
if (pads <= 0) {
return str; // returns original String when possible
}
if (pads > PAD_LIMIT) {
return leftPad(str, size, String.valueOf(padChar));
}
return repeat(padChar, pads).concat(str);
}
/**
* <p>Returns padding using the specified delimiter repeated
* to a given length.</p>
*
* <pre>
* StringUtils.repeat('e', 0) = ""
* StringUtils.repeat('e', 3) = "eee"
* StringUtils.repeat('e', -2) = ""
* </pre>
*
* <p>Note: this method does not support padding with
* <a href="http://www.unicode.org/glossary/#supplementary_character">Unicode Supplementary Characters</a>
* as they require a pair of {@code char}s to be represented.
* If you are needing to support full I18N of your applications
* consider using {@link #repeat(String, int)} instead.
* </p>
*
* @param ch character to repeat
* @param repeat number of times to repeat char, negative treated as zero
* @return String with repeated character
* @see #repeat(String, int)
*/
public static String repeat(final char ch, final int repeat) {
if (repeat <= 0) {
return EMPTY;
}
final char[] buf = new char[repeat];
for (int i = repeat - 1; i >= 0; i--) {
buf[i] = ch;
}
return new String(buf);
}
// Padding
//-----------------------------------------------------------------------
/**
 * <p>Concatenates {@code repeat} copies of a String into a new String.</p>
 *
 * <pre>
 * StringUtils.repeat(null, 2) = null
 * StringUtils.repeat("", 0)   = ""
 * StringUtils.repeat("", 2)   = ""
 * StringUtils.repeat("a", 3)  = "aaa"
 * StringUtils.repeat("ab", 2) = "abab"
 * StringUtils.repeat("a", -2) = ""
 * </pre>
 *
 * @param str  the String to repeat, may be null
 * @param repeat  number of times to repeat str, negative treated as zero
 * @return a new String consisting of the original String repeated,
 *  {@code null} if null String input
 */
public static String repeat(final String str, final int repeat) {
    if (str == null) {
        return null;
    }
    if (repeat <= 0) {
        return EMPTY;
    }

    final int unitLength = str.length();
    if (unitLength == 0 || repeat == 1) {
        // Repeating "" or repeating once yields the input itself.
        return str;
    }
    if (unitLength == 1) {
        // One-character input is always served by the char-based overload.
        return repeat(str.charAt(0), repeat);
    }

    final int totalLength = unitLength * repeat;
    if (unitLength == 2) {
        // Two-character input: write the pair directly into a char array.
        final char first = str.charAt(0);
        final char second = str.charAt(1);
        final char[] pairs = new char[totalLength];
        for (int at = 0; at < totalLength; at += 2) {
            pairs[at] = first;
            pairs[at + 1] = second;
        }
        return new String(pairs);
    }

    // Longer input: append the String repeatedly to a pre-sized builder.
    final StringBuilder joined = new StringBuilder(totalLength);
    int remaining = repeat;
    while (remaining-- > 0) {
        joined.append(str);
    }
    return joined.toString();
}
}

View File

@ -16,7 +16,7 @@ import munit._
class OdfExtractTest extends FunSuite {
val files = List(
ExampleFiles.examples_sample_odt -> 6372,
ExampleFiles.examples_sample_odt -> 6367,
ExampleFiles.examples_sample_ods -> 717
)

View File

@ -20,7 +20,7 @@ import fs2.Stream
import docspell.common._
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties}
import org.apache.tika.mime.MediaType
import org.apache.tika.parser.txt.Icu4jEncodingDetector
@ -40,7 +40,7 @@ object TikaMimetype {
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.filename.foreach(md.set(TikaCoreProperties.RESOURCE_NAME_KEY, _))
hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}

View File

@ -38,7 +38,7 @@ object Dependencies {
val ScalaJavaTimeVersion = "2.3.0"
val Slf4jVersion = "1.7.32"
val StanfordNlpVersion = "4.2.2"
val TikaVersion = "1.27"
val TikaVersion = "2.0.0"
val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "3.51.1"
val TwelveMonkeysVersion = "3.7.0"