mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 09:30:12 +00:00 
			
		
		
		
	Update tika-core to 2.0.0
Include new ODF parser from tika-2.0.0
This commit is contained in:
		| @@ -0,0 +1,83 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.exception; | ||||
|  | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
/**
 * Exception signalling that more characters were produced than the
 * requested write limit allows. The text written up to the limit remains
 * available to the caller.
 */
public class WriteLimitReachedException extends SAXException {

    //in case of (hopefully impossible) cyclic exception
    private static final int MAX_DEPTH = 100;

    private final int writeLimit;

    /**
     * @param writeLimit the character limit that was reached; only used to
     *                   build the exception message
     */
    public WriteLimitReachedException(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    @Override
    public String getMessage() {
        return "Your document contained more than " + writeLimit
            + " characters, and so your requested limit has been"
            + " reached. To receive the full text of the document,"
            + " increase your limit. (Text up to the limit is"
            + " however available).";
    }

    /**
     * Checks whether the given exception (or any of its root causes) was
     * thrown by this handler as a signal of reaching the write limit.
     *
     * @param t throwable
     * @return <code>true</code> if the write limit was reached,
     * <code>false</code> otherwise
     * @since Apache Tika 2.0
     */
    public static boolean isWriteLimitReached(Throwable t) {
        return findWriteLimitReached(t) != null;
    }

    /**
     * Rethrows the {@link WriteLimitReachedException} found in the given
     * exception or anywhere in its cause chain; returns normally when there
     * is none (including when <code>ex</code> is <code>null</code>).
     *
     * @param ex exception to inspect, may be <code>null</code>
     * @throws SAXException the {@link WriteLimitReachedException} instance
     *                      found in the chain
     */
    public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
        WriteLimitReachedException found = findWriteLimitReached(ex);
        if (found != null) {
            throw found;
        }
    }

    /**
     * Walks the cause chain starting at <code>t</code> and returns the first
     * {@link WriteLimitReachedException} encountered, or <code>null</code>.
     * At most MAX_DEPTH + 1 throwables are examined so that a cyclic cause
     * chain cannot loop forever.
     */
    private static WriteLimitReachedException findWriteLimitReached(Throwable t) {
        Throwable current = t;
        for (int depth = 0; current != null && depth <= MAX_DEPTH; depth++) {
            if (current instanceof WriteLimitReachedException) {
                return (WriteLimitReachedException) current;
            }
            current = current.getCause();
        }
        return null;
    }
}
| @@ -0,0 +1,120 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.nio.charset.StandardCharsets; | ||||
|  | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentUtil; | ||||
| import org.apache.tika.io.TikaInputStream; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.TikaCoreProperties; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.sax.ContentHandlerDecorator; | ||||
| import org.apache.tika.utils.XMLReaderUtils; | ||||
|  | ||||
| /** | ||||
|  * Handler for macros in flat open documents | ||||
|  */ | ||||
| class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { | ||||
|  | ||||
|     static String MODULE = "module"; | ||||
|     static String NAME = "name"; | ||||
|     private static String SOURCE_CODE = "source-code"; | ||||
|     private final ContentHandler contentHandler; | ||||
|     private final ParseContext parseContext; | ||||
|     private final StringBuilder macroBuffer = new StringBuilder(); | ||||
|     String macroName = null; | ||||
|     boolean inMacro = false; | ||||
|     private EmbeddedDocumentExtractor embeddedDocumentExtractor; | ||||
|  | ||||
|     FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) { | ||||
|         super(contentHandler); | ||||
|         this.contentHandler = contentHandler; | ||||
|         this.parseContext = parseContext; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) | ||||
|         throws SAXException { | ||||
|         if (MODULE.equals(localName)) { | ||||
|             macroName = XMLReaderUtils.getAttrValue(NAME, attrs); | ||||
|         } else if (SOURCE_CODE.equals(localName)) { | ||||
|             inMacro = true; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void characters(char[] ch, int start, int length) throws SAXException { | ||||
|         if (inMacro) { | ||||
|             macroBuffer.append(ch, start, length); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void endElement(String namespaceURI, String localName, String qName) | ||||
|         throws SAXException { | ||||
|         if (SOURCE_CODE.equals(localName)) { | ||||
|             try { | ||||
|                 handleMacro(); | ||||
|             } catch (IOException e) { | ||||
|                 throw new SAXException(e); | ||||
|             } finally { | ||||
|                 resetMacroState(); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected void resetMacroState() { | ||||
|         macroBuffer.setLength(0); | ||||
|         macroName = null; | ||||
|         inMacro = false; | ||||
|     } | ||||
|  | ||||
|     protected void handleMacro() throws IOException, SAXException { | ||||
|  | ||||
|         byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8); | ||||
|  | ||||
|         if (embeddedDocumentExtractor == null) { | ||||
|             embeddedDocumentExtractor = | ||||
|                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); | ||||
|         } | ||||
|         Metadata embeddedMetadata = new Metadata(); | ||||
|         if (!isBlank(macroName)) { | ||||
|             embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName); | ||||
|         } | ||||
|         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | ||||
|             TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); | ||||
|  | ||||
|         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { | ||||
|             try (InputStream is = TikaInputStream.get(bytes)) { | ||||
|                 embeddedDocumentExtractor | ||||
|                     .parseEmbedded(is, contentHandler, embeddedMetadata, false); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private static boolean isBlank(String s) { | ||||
|         return s == null || s.trim().isEmpty(); | ||||
|     } | ||||
| } | ||||
| @@ -1,31 +1,32 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|     * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|     * contributor license agreements.  See the NOTICE file distributed with | ||||
|     * this work for additional information regarding copyright ownership. | ||||
|     * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|     * (the "License"); you may not use this file except in compliance with | ||||
|     * the License.  You may obtain a copy of the License at | ||||
|     * | ||||
|     *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|     * | ||||
|     * Unless required by applicable law or agreed to in writing, software | ||||
|     * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|     * See the License for the specific language governing permissions and | ||||
|     * limitations under the License. | ||||
|     */ | ||||
|     package org.apache.tika.parser.odf; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.StringReader; | ||||
| import java.util.Locale; | ||||
|  | ||||
| import org.apache.tika.sax.ContentHandlerDecorator; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.InputSource; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.AttributesImpl; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.StringReader; | ||||
| import java.util.Locale; | ||||
| import org.apache.tika.sax.ContentHandlerDecorator; | ||||
|  | ||||
| /** | ||||
|  * Content handler decorator that:<ul> | ||||
| @@ -35,14 +36,11 @@ import java.util.Locale; | ||||
|  */ | ||||
| public class NSNormalizerContentHandler extends ContentHandlerDecorator { | ||||
|  | ||||
|     private static final String OLD_NS = | ||||
|             "http://openoffice.org/2000/"; | ||||
|     private static final String OLD_NS = "http://openoffice.org/2000/"; | ||||
|  | ||||
|     private static final String NEW_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:"; | ||||
|     private static final String NEW_NS = "urn:oasis:names:tc:opendocument:xmlns:"; | ||||
|  | ||||
|     private static final String DTD_PUBLIC_ID = | ||||
|             "-//OpenOffice.org//DTD OfficeDocument 1.0//EN"; | ||||
|     private static final String DTD_PUBLIC_ID = "-//OpenOffice.org//DTD OfficeDocument 1.0//EN"; | ||||
|  | ||||
|     public NSNormalizerContentHandler(ContentHandler handler) { | ||||
|         super(handler); | ||||
| @@ -57,27 +55,24 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator { | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void startElement( | ||||
|             String namespaceURI, String localName, String qName, | ||||
|             Attributes atts) throws SAXException { | ||||
|     public void startElement(String namespaceURI, String localName, String qName, Attributes atts) | ||||
|         throws SAXException { | ||||
|         AttributesImpl natts = new AttributesImpl(); | ||||
|         for (int i = 0; i < atts.getLength(); i++) { | ||||
|             natts.addAttribute( | ||||
|                     mapOldNS(atts.getURI(i)), atts.getLocalName(i), | ||||
|                     atts.getQName(i), atts.getType(i), atts.getValue(i)); | ||||
|             natts.addAttribute(mapOldNS(atts.getURI(i)), atts.getLocalName(i), atts.getQName(i), | ||||
|                 atts.getType(i), atts.getValue(i)); | ||||
|         } | ||||
|         super.startElement(mapOldNS(namespaceURI), localName, qName, atts); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void endElement(String namespaceURI, String localName, String qName) | ||||
|             throws SAXException { | ||||
|         throws SAXException { | ||||
|         super.endElement(mapOldNS(namespaceURI), localName, qName); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void startPrefixMapping(String prefix, String uri) | ||||
|             throws SAXException { | ||||
|     public void startPrefixMapping(String prefix, String uri) throws SAXException { | ||||
|         super.startPrefixMapping(prefix, mapOldNS(uri)); | ||||
|     } | ||||
|  | ||||
| @@ -87,13 +82,13 @@ public class NSNormalizerContentHandler extends ContentHandlerDecorator { | ||||
|      */ | ||||
|     @Override | ||||
|     public InputSource resolveEntity(String publicId, String systemId) | ||||
|             throws IOException, SAXException { | ||||
|         if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) | ||||
|                 || DTD_PUBLIC_ID.equals(publicId)) { | ||||
|         throws IOException, SAXException { | ||||
|         if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) || | ||||
|             DTD_PUBLIC_ID.equals(publicId)) { | ||||
|             return new InputSource(new StringReader("")); | ||||
|         } else { | ||||
|             return super.resolveEntity(publicId, systemId); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| } | ||||
| } | ||||
| @@ -0,0 +1,564 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import static org.apache.tika.sax.XHTMLContentHandler.XHTML; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.BitSet; | ||||
| import java.util.HashMap; | ||||
| import java.util.Map; | ||||
| import java.util.Stack; | ||||
| import javax.xml.namespace.QName; | ||||
|  | ||||
| import org.apache.commons.codec.binary.Base64; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.AttributesImpl; | ||||
|  | ||||
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentUtil; | ||||
| import org.apache.tika.io.TikaInputStream; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.sax.ElementMappingContentHandler; | ||||
| import org.apache.tika.sax.XHTMLContentHandler; | ||||
| import org.apache.tika.utils.StringUtils; | ||||
|  | ||||
| /* | ||||
|     Handler for the body element or odt flat files and content.xml of | ||||
|     traditional compressed odt files | ||||
|  */ | ||||
| class OpenDocumentBodyHandler extends ElementMappingContentHandler { | ||||
|  | ||||
|     public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; | ||||
|     public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; | ||||
|     public static final String STYLE_NS = "urn:oasis:names:tc:opendocument:xmlns:style:1.0"; | ||||
|     public static final String FORMATTING_OBJECTS_NS = | ||||
|         "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"; | ||||
|     public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0"; | ||||
|     public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"; | ||||
|     public static final String PRESENTATION_NS = | ||||
|         "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"; | ||||
|     public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"; | ||||
|     public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; | ||||
|     protected static final char[] TAB = new char[]{'\t'}; | ||||
|     private static final String BINARY_DATA = "binary-data"; | ||||
|     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); | ||||
|     /** | ||||
|      * Mappings between ODF tag names and XHTML tag names | ||||
|      * (including attributes). All other tag names/attributes are ignored | ||||
|      * and left out from event stream. | ||||
|      */ | ||||
|     private static final HashMap<QName, TargetElement> MAPPINGS = | ||||
|         new HashMap<>(); | ||||
|     private static final char[] SPACE = new char[]{' '}; | ||||
|     private static final String CLASS = "class"; | ||||
|     private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation"); | ||||
|     private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note"); | ||||
|     private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes"); | ||||
|  | ||||
|     static { | ||||
|         // general mappings of text:-tags | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p")); | ||||
|         // text:h-tags are mapped specifically in startElement/endElement | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, | ||||
|             "span")); | ||||
|         MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, | ||||
|             "span")); | ||||
|         MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, | ||||
|             "object")); | ||||
|         MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div")); | ||||
|         MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span")); | ||||
|  | ||||
|         final HashMap<QName, QName> aAttsMapping = new HashMap<>(); | ||||
|         aAttsMapping.put(new QName(XLINK_NS, "href"), new QName("href")); | ||||
|         aAttsMapping.put(new QName(XLINK_NS, "title"), new QName("title")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", | ||||
|             aAttsMapping)); | ||||
|         MAPPINGS.put(new QName(DRAW_NS, "a"), new TargetElement(XHTML, "a", | ||||
|             aAttsMapping)); | ||||
|  | ||||
|         // create HTML tables from table:-tags | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table")); | ||||
|         // repeating of rows is ignored; for columns, see below! | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr")); | ||||
|         // special mapping for rowspan/colspan attributes | ||||
|         final HashMap<QName, QName> tableCellAttsMapping = new HashMap<>(); | ||||
|         tableCellAttsMapping | ||||
|             .put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan")); | ||||
|         tableCellAttsMapping.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan")); | ||||
|         /* TODO: The following is not correct, the cell should be repeated not spanned! | ||||
|          * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct. | ||||
|          * Problems may occur when both spanning and repeating is given, which is not allowed by | ||||
|          *  spec. | ||||
|          * Cell spanning instead of repeating  is not a problem, because OpenOffice uses it | ||||
|          * only for empty cells. | ||||
|          */ | ||||
|         tableCellAttsMapping | ||||
|             .put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan")); | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table-cell"), | ||||
|             new TargetElement(XHTML, "td", tableCellAttsMapping)); | ||||
|     } | ||||
|  | ||||
|     private final ContentHandler handler; | ||||
|     private final ParseContext parseContext; | ||||
|     private final BitSet textNodeStack = new BitSet(); | ||||
|     //have we written the start style tags | ||||
|     //yet for the current text style | ||||
|     boolean hasWrittenStartStyleTags = false; | ||||
|     //if we're in a binary-data tag | ||||
|     boolean inBinaryData = false; | ||||
|     private EmbeddedDocumentExtractor embeddedDocumentExtractor; | ||||
|     private StringBuilder base64BinaryDataBuffer = new StringBuilder(); | ||||
|     private int nodeDepth = 0; | ||||
|     private int completelyFiltered = 0; | ||||
|     private Stack<String> headingStack = new Stack<>(); | ||||
|     private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<>(); | ||||
|     private Map<String, TextStyle> textStyleMap = new HashMap<>(); | ||||
|     private Map<String, ListStyle> listStyleMap = new HashMap<>(); | ||||
|     private String currParagraphStyleName; //paragraph style name | ||||
|     private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs | ||||
|     private String currTextStyleName; | ||||
|     private Stack<ListStyle> listStyleStack = new Stack<>(); | ||||
|     private ListStyle listStyle; | ||||
|     // True if we are currently in the named style: | ||||
|     private boolean curUnderlined; | ||||
|     private boolean curBold; | ||||
|     private boolean curItalic; | ||||
|     private int pDepth = 0; | ||||
|     OpenDocumentBodyHandler(ContentHandler handler, ParseContext parseContext) { | ||||
|         super(handler, MAPPINGS); | ||||
|         this.handler = handler; | ||||
|         this.parseContext = parseContext; | ||||
|     } | ||||
|  | ||||
|     private static Attributes buildAttributes(String key, String value) { | ||||
|         AttributesImpl attrs = new AttributesImpl(); | ||||
|         attrs.addAttribute("", key, key, "CDATA", value); | ||||
|         return attrs; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void characters(char[] ch, int start, int length) throws SAXException { | ||||
|         if (inBinaryData) { | ||||
|             base64BinaryDataBuffer.append(ch, start, length); | ||||
|             return; | ||||
|         } | ||||
|         // only forward content of tags from text:-namespace | ||||
|         if (completelyFiltered == 0 && nodeDepth > 0 && textNodeStack.get(nodeDepth - 1)) { | ||||
|             if (!hasWrittenStartStyleTags) { | ||||
|                 updateStyleTags(); | ||||
|                 hasWrittenStartStyleTags = true; | ||||
|             } | ||||
|             super.characters(ch, start, length); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // helper for checking tags which need complete filtering | ||||
|     // (with sub-tags) | ||||
|     private boolean needsCompleteFiltering(String namespaceURI, String localName) { | ||||
|         if (TEXT_NS.equals(namespaceURI)) { | ||||
|             return localName.endsWith("-template") || localName.endsWith("-style"); | ||||
|         } | ||||
|         return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName); | ||||
|     } | ||||
|     //<p> can appear inside comments and other things that are already inside <p> | ||||
|     //we need to track our pDepth and only output <p> if we're at the main level | ||||
|  | ||||
|     // map the heading level to <hX> HTML tags | ||||
|     private String getXHTMLHeaderTagName(Attributes atts) { | ||||
|         String depthStr = atts.getValue(TEXT_NS, "outline-level"); | ||||
|         if (depthStr == null) { | ||||
|             return "h1"; | ||||
|         } | ||||
|  | ||||
|         int depth = Integer.parseInt(depthStr); | ||||
|         if (depth >= 6) { | ||||
|             return "h6"; | ||||
|         } else if (depth <= 1) { | ||||
|             return "h1"; | ||||
|         } else { | ||||
|             return "h" + depth; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Check if a node is a text node | ||||
|      */ | ||||
|     private boolean isTextNode(String namespaceURI, String localName) { | ||||
|         if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && | ||||
|             !localName.equals("page-count")) { | ||||
|             return true; | ||||
|         } | ||||
|         if (SVG_NS.equals(namespaceURI)) { | ||||
|             return "title".equals(localName) || "desc".equals(localName); | ||||
|         } | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     private void startList(String name) throws SAXException { | ||||
|         String elementName = "ul"; | ||||
|         if (name != null) { | ||||
|             ListStyle style = listStyleMap.get(name); | ||||
|             elementName = style != null ? style.getTag() : "ul"; | ||||
|             listStyleStack.push(style); | ||||
|         } | ||||
|         handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES); | ||||
|     } | ||||
|  | ||||
|     private void endList() throws SAXException { | ||||
|         String elementName = "ul"; | ||||
|         if (!listStyleStack.isEmpty()) { | ||||
|             ListStyle style = listStyleStack.pop(); | ||||
|             elementName = style != null ? style.getTag() : "ul"; | ||||
|         } | ||||
|         handler.endElement(XHTML, elementName, elementName); | ||||
|     } | ||||
|  | ||||
|     private void startSpan(String name) throws SAXException { | ||||
|         if (name == null) { | ||||
|             return; | ||||
|         } | ||||
|         currTextStyle = textStyleMap.get(name); | ||||
|         hasWrittenStartStyleTags = false; | ||||
|     } | ||||
|  | ||||
|     private void startParagraph(String styleName) throws SAXException { | ||||
|         if (pDepth == 0) { | ||||
|             handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES); | ||||
|             if (styleName != null) { | ||||
|                 currTextStyle = paragraphTextStyleMap.get(styleName); | ||||
|             } | ||||
|             hasWrittenStartStyleTags = false; | ||||
|         } else { | ||||
|             handler.characters(SPACE, 0, SPACE.length); | ||||
|         } | ||||
|         pDepth++; | ||||
|     } | ||||
|  | ||||
|     private void endParagraph() throws SAXException { | ||||
|         closeStyleTags(); | ||||
|         if (pDepth == 1) { | ||||
|             handler.endElement(XHTML, "p", "p"); | ||||
|         } else { | ||||
|             handler.characters(SPACE, 0, SPACE.length); | ||||
|         } | ||||
|         pDepth--; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     private void updateStyleTags() throws SAXException { | ||||
|  | ||||
|         if (currTextStyle == null) { | ||||
|             closeStyleTags(); | ||||
|             return; | ||||
|         } | ||||
|         if (currTextStyle.bold != curBold) { | ||||
|             // Enforce nesting -- must close s and i tags | ||||
|             if (curUnderlined) { | ||||
|                 handler.endElement(XHTML, "u", "u"); | ||||
|                 curUnderlined = false; | ||||
|             } | ||||
|             if (curItalic) { | ||||
|                 handler.endElement(XHTML, "i", "i"); | ||||
|                 curItalic = false; | ||||
|             } | ||||
|             if (currTextStyle.bold) { | ||||
|                 handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES); | ||||
|             } else { | ||||
|                 handler.endElement(XHTML, "b", "b"); | ||||
|             } | ||||
|             curBold = currTextStyle.bold; | ||||
|         } | ||||
|  | ||||
|         if (currTextStyle.italic != curItalic) { | ||||
|             // Enforce nesting -- must close s tag | ||||
|             if (curUnderlined) { | ||||
|                 handler.endElement(XHTML, "u", "u"); | ||||
|                 curUnderlined = false; | ||||
|             } | ||||
|             if (currTextStyle.italic) { | ||||
|                 handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES); | ||||
|             } else { | ||||
|                 handler.endElement(XHTML, "i", "i"); | ||||
|             } | ||||
|             curItalic = currTextStyle.italic; | ||||
|         } | ||||
|  | ||||
|         if (currTextStyle.underlined != curUnderlined) { | ||||
|             if (currTextStyle.underlined) { | ||||
|                 handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES); | ||||
|             } else { | ||||
|                 handler.endElement(XHTML, "u", "u"); | ||||
|             } | ||||
|             curUnderlined = currTextStyle.underlined; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void endSpan() throws SAXException { | ||||
|         updateStyleTags(); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Closes every inline style element that is still open and resets the style state. | ||||
|      * <p> | ||||
|      * Elements are closed in the order {@code u}, {@code i}, {@code b}; each tracking | ||||
|      * flag is cleared right after its end event is sent. Finally the current text | ||||
|      * style is dropped and the "start tags written" flag is reset. | ||||
|      * | ||||
|      * @throws SAXException propagated from the wrapped handler | ||||
|      */ | ||||
|     private void closeStyleTags() throws SAXException { | ||||
|         // Close any still open style tags | ||||
|         if (curUnderlined) { | ||||
|             handler.endElement(XHTML, "u", "u"); | ||||
|             curUnderlined = false; | ||||
|         } | ||||
|         if (curItalic) { | ||||
|             handler.endElement(XHTML, "i", "i"); | ||||
|             curItalic = false; | ||||
|         } | ||||
|         if (curBold) { | ||||
|             handler.endElement(XHTML, "b", "b"); | ||||
|             curBold = false; | ||||
|         } | ||||
|         // forget the active style so the next characters() call starts from scratch | ||||
|         currTextStyle = null; | ||||
|         hasWrittenStartStyleTags = false; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Processes the start of an ODF element. | ||||
|      * <p> | ||||
|      * In order, this method: emits an {@code img} element for {@code draw:image}; | ||||
|      * switches into binary-data mode (and returns immediately) for binary data | ||||
|      * elements; records text, paragraph and list style definitions; pushes the | ||||
|      * "is text node" flag for the new depth; updates the complete-filtering counter; | ||||
|      * and, when nothing is being filtered, maps the element to XHTML output. | ||||
|      * | ||||
|      * @param namespaceURI namespace of the element | ||||
|      * @param localName    local name of the element | ||||
|      * @param qName        qualified name of the element | ||||
|      * @param attrs        attributes of the element | ||||
|      * @throws SAXException propagated from the wrapped handler | ||||
|      */ | ||||
|     @Override | ||||
|     public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) | ||||
|         throws SAXException { | ||||
|  | ||||
|         if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) { | ||||
|             String link = attrs.getValue(XLINK_NS, "href"); | ||||
|             AttributesImpl attr = new AttributesImpl(); | ||||
|             if (!StringUtils.isEmpty(link)) { | ||||
|                 attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link); | ||||
|             } | ||||
|             handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr); | ||||
|             handler.endElement(XHTMLContentHandler.XHTML, "img", "img"); | ||||
|         } | ||||
|  | ||||
|         if (BINARY_DATA.equals(localName)) { | ||||
|             // returns before nodeDepth is touched; endElement() returns equally early | ||||
|             inBinaryData = true; | ||||
|             return; | ||||
|         } | ||||
|         // Keep track of the current node type. If it is a text node, | ||||
|         // the bit at the current depth is set in textNodeStack. | ||||
|         // characters() checks the top bit to determine whether the | ||||
|         // current node is a text node to print out. nodeDepth contains | ||||
|         // the depth of the current node and also marks the top of the stack. | ||||
|         assert nodeDepth >= 0; | ||||
|  | ||||
|         // Set styles | ||||
|         if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { | ||||
|             String family = attrs.getValue(STYLE_NS, "family"); | ||||
|             if ("text".equals(family)) { | ||||
|                 currTextStyle = new TextStyle(); | ||||
|                 currTextStyleName = attrs.getValue(STYLE_NS, "name"); | ||||
|             } else if ("paragraph".equals(family)) { | ||||
|                 currTextStyle = new TextStyle(); | ||||
|                 currParagraphStyleName = attrs.getValue(STYLE_NS, "name"); | ||||
|             } | ||||
|         } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { | ||||
|             listStyle = new ListStyle(); | ||||
|             String name = attrs.getValue(STYLE_NS, "name"); | ||||
|             listStyleMap.put(name, listStyle); | ||||
|         } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) && | ||||
|             "text-properties".equals(localName)) { | ||||
|             String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style"); | ||||
|             if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) { | ||||
|                 currTextStyle.italic = true; | ||||
|             } | ||||
|             // tolerant parsing: empty or malformed weights must not abort the whole parse | ||||
|             if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) { | ||||
|                 currTextStyle.bold = true; | ||||
|             } | ||||
|             String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style"); | ||||
|             if (underlineStyle != null && !underlineStyle.equals("none")) { | ||||
|                 currTextStyle.underlined = true; | ||||
|             } | ||||
|         } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) { | ||||
|             if ("list-level-style-bullet".equals(localName)) { | ||||
|                 listStyle.ordered = false; | ||||
|             } else if ("list-level-style-number".equals(localName)) { | ||||
|                 listStyle.ordered = true; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName)); | ||||
|         // filter *all* content of some tags | ||||
|         assert completelyFiltered >= 0; | ||||
|  | ||||
|         if (needsCompleteFiltering(namespaceURI, localName)) { | ||||
|             completelyFiltered++; | ||||
|         } | ||||
|         // call next handler if no filtering | ||||
|         if (completelyFiltered == 0) { | ||||
|             // special handling of text:h, that are directly passed | ||||
|             // to incoming handler | ||||
|             if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { | ||||
|                 final String el = headingStack.push(getXHTMLHeaderTagName(attrs)); | ||||
|                 handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { | ||||
|                 startList(attrs.getValue(TEXT_NS, "style-name")); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { | ||||
|                 startSpan(attrs.getValue(TEXT_NS, "style-name")); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { | ||||
|                 startParagraph(attrs.getValue(TEXT_NS, "style-name")); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) { | ||||
|                 handler.characters(SPACE, 0, 1); | ||||
|             } else if ("annotation".equals(localName)) { | ||||
|                 closeStyleTags(); | ||||
|                 handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES); | ||||
|             } else if ("note".equals(localName)) { | ||||
|                 closeStyleTags(); | ||||
|                 handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES); | ||||
|             } else if ("notes".equals(localName)) { | ||||
|                 closeStyleTags(); | ||||
|                 handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES); | ||||
|             } else { | ||||
|                 super.startElement(namespaceURI, localName, qName, attrs); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Decides whether a {@code fo:font-weight} value denotes bold text. | ||||
|      * <p> | ||||
|      * {@code "bold"} and {@code "bolder"} are bold, as is any integer weight above | ||||
|      * 500. Missing, empty or unparseable values are treated as not bold. | ||||
|      * | ||||
|      * @param fontWeight raw attribute value, may be {@code null} | ||||
|      * @return {@code true} if the value denotes a bold weight | ||||
|      */ | ||||
|     private static boolean isBoldFontWeight(String fontWeight) { | ||||
|         if (fontWeight == null || fontWeight.isEmpty()) { | ||||
|             return false; | ||||
|         } | ||||
|         if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) { | ||||
|             return true; | ||||
|         } | ||||
|         if (!Character.isDigit(fontWeight.charAt(0))) { | ||||
|             return false; | ||||
|         } | ||||
|         try { | ||||
|             return Integer.parseInt(fontWeight) > 500; | ||||
|         } catch (NumberFormatException ignored) { | ||||
|             // starts with a digit but is not a plain integer (e.g. "7.5"): not bold | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Processes the end of an ODF element. | ||||
|      * <p> | ||||
|      * Binary data elements trigger extraction of the buffered payload and return | ||||
|      * early (their start never incremented {@code nodeDepth}). For every other | ||||
|      * element this method stores finished style definitions, maps the element end | ||||
|      * to XHTML output when nothing is being filtered, reverts the complete-filtering | ||||
|      * counter and decrements the node depth. | ||||
|      * | ||||
|      * @param namespaceURI namespace of the element | ||||
|      * @param localName    local name of the element | ||||
|      * @param qName        qualified name of the element | ||||
|      * @throws SAXException propagated from the wrapped handler, or wrapping an | ||||
|      *                      {@link IOException} raised while processing binary data | ||||
|      */ | ||||
|     @Override | ||||
|     public void endElement(String namespaceURI, String localName, String qName) | ||||
|         throws SAXException { | ||||
|         if (BINARY_DATA.equals(localName)) { | ||||
|             inBinaryData = false; | ||||
|             try { | ||||
|                 processBinaryData(); | ||||
|             } catch (IOException e) { | ||||
|                 throw new SAXException(e); | ||||
|             } | ||||
|             return; | ||||
|         } | ||||
|         if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { | ||||
|             if (currTextStyle != null && currTextStyleName != null) { | ||||
|                 textStyleMap.put(currTextStyleName, currTextStyle); | ||||
|                 currTextStyleName = null; | ||||
|                 currTextStyle = null; | ||||
|             } else if (currTextStyle != null && currParagraphStyleName != null) { | ||||
|                 paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle); | ||||
|                 currParagraphStyleName = null; | ||||
|                 currTextStyle = null; | ||||
|             } | ||||
|         } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { | ||||
|             listStyle = null; | ||||
|         } | ||||
|  | ||||
|         // call next handler if no filtering | ||||
|         if (completelyFiltered == 0) { | ||||
|             // special handling of text:h, that are directly passed | ||||
|             // to incoming handler | ||||
|             if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { | ||||
|                 final String el = headingStack.pop(); | ||||
|                 // the heading was opened in the XHTML namespace, so close it there too | ||||
|                 handler.endElement(XHTMLContentHandler.XHTML, el, el); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { | ||||
|                 endList(); | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { | ||||
|                 currTextStyle = null; | ||||
|                 hasWrittenStartStyleTags = false; | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { | ||||
|                 endParagraph(); | ||||
|             } else if ("annotation".equals(localName) || "note".equals(localName) || | ||||
|                 "notes".equals(localName)) { | ||||
|                 closeStyleTags(); | ||||
|                 // NOTE(review): startElement() opens these as an XHTML "span"; this end | ||||
|                 // event uses the source element's name instead -- confirm that downstream | ||||
|                 // handlers tolerate the mismatch before changing it. | ||||
|                 handler.endElement(namespaceURI, localName, localName); | ||||
|             } else { | ||||
|                 super.endElement(namespaceURI, localName, qName); | ||||
|             } | ||||
|  | ||||
|             // special handling of tabulators | ||||
|             if (TEXT_NS.equals(namespaceURI) && | ||||
|                 ("tab-stop".equals(localName) || "tab".equals(localName))) { | ||||
|                 this.characters(TAB, 0, TAB.length); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // revert filter for *all* content of some tags | ||||
|         if (needsCompleteFiltering(namespaceURI, localName)) { | ||||
|             completelyFiltered--; | ||||
|         } | ||||
|         assert completelyFiltered >= 0; | ||||
|  | ||||
|         // reduce current node depth | ||||
|         nodeDepth--; | ||||
|         assert nodeDepth >= 0; | ||||
|     } | ||||
|  | ||||
|     private void processBinaryData() throws IOException, SAXException { | ||||
|  | ||||
|         //TODO: figure out whether we're in an inline image or a regular | ||||
|         //attachment and add that info to the embedded metadata | ||||
|  | ||||
|         byte[] bytes = Base64.decodeBase64(base64BinaryDataBuffer.toString()); | ||||
|         //clear state before parsing | ||||
|         base64BinaryDataBuffer.setLength(0); | ||||
|         inBinaryData = false; | ||||
|  | ||||
|         if (embeddedDocumentExtractor == null) { | ||||
|             embeddedDocumentExtractor = | ||||
|                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); | ||||
|         } | ||||
|         Metadata embeddedMetadata = new Metadata(); | ||||
|         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { | ||||
|             try (InputStream is = TikaInputStream.get(bytes)) { | ||||
|                 embeddedDocumentExtractor.parseEmbedded(is, handler, embeddedMetadata, false); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Intentionally does nothing, so the prefix mapping is not forwarded. | ||||
|      * | ||||
|      * @param prefix the namespace prefix being declared (ignored) | ||||
|      * @param uri    the namespace URI the prefix maps to (ignored) | ||||
|      */ | ||||
|     @Override | ||||
|     public void startPrefixMapping(String prefix, String uri) { | ||||
|         // remove prefix mappings as they should not occur in XHTML | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Intentionally does nothing, so the end of the prefix mapping is not forwarded. | ||||
|      * | ||||
|      * @param prefix the namespace prefix going out of scope (ignored) | ||||
|      */ | ||||
|     @Override | ||||
|     public void endPrefixMapping(String prefix) { | ||||
|         // remove prefix mappings as they should not occur in XHTML | ||||
|     } | ||||
|  | ||||
|     /** Marker interface shared by the style holder classes; declares no members. */ | ||||
|     private interface Style { | ||||
|     } | ||||
|  | ||||
|     private static class TextStyle implements Style { | ||||
|         public boolean italic; | ||||
|         public boolean bold; | ||||
|         public boolean underlined; | ||||
|  | ||||
|         @Override | ||||
|         public String toString() { | ||||
|             return "TextStyle{" + "italic=" + italic + ", bold=" + bold + ", underlined=" + | ||||
|                 underlined + '}'; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Mutable holder recording whether a list style is ordered. */ | ||||
|     private static class ListStyle implements Style { | ||||
|         public boolean ordered; | ||||
|  | ||||
|         /** | ||||
|          * @return {@code "ol"} for an ordered list style, {@code "ul"} otherwise | ||||
|          */ | ||||
|         public String getTag() { | ||||
|             if (ordered) { | ||||
|                 return "ol"; | ||||
|             } | ||||
|             return "ul"; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|  | ||||
| } | ||||
| @@ -16,591 +16,47 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.Collections; | ||||
| import java.util.Set; | ||||
|  | ||||
| import org.apache.commons.io.input.CloseShieldInputStream; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.exception.TikaException; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.mime.MediaType; | ||||
| import org.apache.tika.parser.AbstractParser; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.sax.ElementMappingContentHandler; | ||||
| import org.apache.tika.sax.ElementMappingContentHandler.TargetElement; | ||||
| import org.apache.tika.sax.OfflineContentHandler; | ||||
| import org.apache.tika.sax.XHTMLContentHandler; | ||||
| import org.apache.tika.utils.XMLReaderUtils; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.AttributesImpl; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import javax.xml.namespace.QName; | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.BitSet; | ||||
| import java.util.Collections; | ||||
| import java.util.HashMap; | ||||
| import java.util.Map; | ||||
| import java.util.Set; | ||||
| import java.util.Stack; | ||||
|  | ||||
| import static org.apache.tika.sax.XHTMLContentHandler.XHTML; | ||||
|  | ||||
| /** | ||||
|  * Parser for ODF <code>content.xml</code> files. | ||||
|  */ | ||||
| public class OpenDocumentContentParser extends AbstractParser { | ||||
|     /** Marker interface shared by the style holder classes; declares no members. */ | ||||
|     private interface Style { | ||||
|     } | ||||
|  | ||||
|     private static class TextStyle implements Style { | ||||
|         public boolean italic; | ||||
|         public boolean bold; | ||||
|         public boolean underlined; | ||||
|  | ||||
|         @Override | ||||
|         public String toString() { | ||||
|             return "TextStyle{" + | ||||
|                     "italic=" + italic + | ||||
|                     ", bold=" + bold + | ||||
|                     ", underlined=" + underlined + | ||||
|                     '}'; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** Mutable holder recording whether a list style is ordered. */ | ||||
|     private static class ListStyle implements Style { | ||||
|         public boolean ordered; | ||||
|  | ||||
|         /** | ||||
|          * @return {@code "ol"} for an ordered list style, {@code "ul"} otherwise | ||||
|          */ | ||||
|         public String getTag() { | ||||
|             if (ordered) { | ||||
|                 return "ol"; | ||||
|             } | ||||
|             return "ul"; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private static final class OpenDocumentElementMappingContentHandler extends | ||||
|             ElementMappingContentHandler { | ||||
|         // single space character emitted for text:s and for nested paragraph boundaries | ||||
|         private static final char[] SPACE = new char[]{ ' '}; | ||||
|         // attribute name used by the three pre-built attribute sets below | ||||
|         private static final String CLASS = "class"; | ||||
|         // class="annotation" / class="note" / class="notes" attribute sets for the emitted spans | ||||
|         private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation"); | ||||
|         private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note"); | ||||
|         private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes"); | ||||
|  | ||||
|         private static Attributes buildAttributes(String key, String value) { | ||||
|             AttributesImpl attrs = new AttributesImpl(); | ||||
|             attrs.addAttribute("", key, key, "CDATA", value); | ||||
|             return attrs; | ||||
|         } | ||||
|  | ||||
|         // handler that receives the XHTML events written directly by this class | ||||
|         private final ContentHandler handler; | ||||
|         // bit i is set when the element at depth i is a text node (see isTextNode) | ||||
|         private final BitSet textNodeStack = new BitSet(); | ||||
|         // depth of the current element; also the top of textNodeStack | ||||
|         private int nodeDepth = 0; | ||||
|         // number of currently open elements whose content is filtered completely | ||||
|         private int completelyFiltered = 0; | ||||
|         // XHTML heading tag names of the currently open text:h elements | ||||
|         private Stack<String> headingStack = new Stack<String>(); | ||||
|         // style definitions collected from style:style / text:list-style, keyed by style name | ||||
|         private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>(); | ||||
|         private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>(); | ||||
|         private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>(); | ||||
|         private String currParagraphStyleName; //paragraph style name | ||||
|         private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs | ||||
|         private String currTextStyleName; | ||||
|  | ||||
|         // styles of the currently open lists (pushed in startList, popped in endList) | ||||
|         private Stack<ListStyle> listStyleStack = new Stack<ListStyle>(); | ||||
|         // list style definition currently being read, or null outside text:list-style | ||||
|         private ListStyle listStyle; | ||||
|  | ||||
|         // True if we are currently in the named style: | ||||
|         private boolean curUnderlined; | ||||
|         private boolean curBold; | ||||
|         private boolean curItalic; | ||||
|  | ||||
|         //have we written the start style tags | ||||
|         //yet for the current text style | ||||
|         boolean hasWrittenStartStyleTags = false; | ||||
|  | ||||
|         private int pDepth = 0;  //<p> can appear inside comments and other things that are already inside <p> | ||||
|                                 //we need to track our pDepth and only output <p> if we're at the main level | ||||
|  | ||||
|  | ||||
|         /** | ||||
|          * Creates the handler. | ||||
|          * | ||||
|          * @param handler  passed to the superclass and also kept for events this class emits itself | ||||
|          * @param mappings element mappings passed to the superclass | ||||
|          */ | ||||
|         private OpenDocumentElementMappingContentHandler(ContentHandler handler, | ||||
|                                                          Map<QName, TargetElement> mappings) { | ||||
|             super(handler, mappings); | ||||
|             this.handler = handler; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Forwards character data only when nothing is being filtered and the | ||||
|          * innermost open element is flagged as a text node. Pending style start | ||||
|          * tags are written before the first characters of a styled run. | ||||
|          */ | ||||
|         @Override | ||||
|         public void characters(char[] ch, int start, int length) | ||||
|                 throws SAXException { | ||||
|             if (completelyFiltered != 0 || nodeDepth <= 0) { | ||||
|                 return; | ||||
|             } | ||||
|             // only forward content of tags from text:-namespace | ||||
|             if (!textNodeStack.get(nodeDepth - 1)) { | ||||
|                 return; | ||||
|             } | ||||
|             if (!hasWrittenStartStyleTags) { | ||||
|                 updateStyleTags(); | ||||
|                 hasWrittenStartStyleTags = true; | ||||
|             } | ||||
|             super.characters(ch, start, length); | ||||
|         } | ||||
|  | ||||
|         // helper for checking tags which need complete filtering | ||||
|         // (with sub-tags) | ||||
|         private boolean needsCompleteFiltering( | ||||
|                 String namespaceURI, String localName) { | ||||
|             if (TEXT_NS.equals(namespaceURI)) { | ||||
|                 return localName.endsWith("-template") | ||||
|                         || localName.endsWith("-style"); | ||||
|             } | ||||
|             return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName); | ||||
|         } | ||||
|  | ||||
|         // map the heading level to <hX> HTML tags | ||||
|         private String getXHTMLHeaderTagName(Attributes atts) { | ||||
|             String depthStr = atts.getValue(TEXT_NS, "outline-level"); | ||||
|             if (depthStr == null) { | ||||
|                 return "h1"; | ||||
|             } | ||||
|  | ||||
|             int depth = Integer.parseInt(depthStr); | ||||
|             if (depth >= 6) { | ||||
|                 return "h6"; | ||||
|             } else if (depth <= 1) { | ||||
|                 return "h1"; | ||||
|             } else { | ||||
|                 return "h" + depth; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Check if a node is a text node | ||||
|          */ | ||||
|         private boolean isTextNode(String namespaceURI, String localName) { | ||||
|             if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) { | ||||
|                 return true; | ||||
|             } | ||||
|             if (SVG_NS.equals(namespaceURI)) { | ||||
|                 return "title".equals(localName) || | ||||
|                         "desc".equals(localName); | ||||
|             } | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Opens an XHTML list element for a {@code text:list}. | ||||
|          * <p> | ||||
|          * The resolved style (possibly {@code null}) is always pushed, so that the | ||||
|          * single pop performed by endList() belongs to this list even when nested | ||||
|          * lists carry no style name. | ||||
|          * | ||||
|          * @param name list style name, may be {@code null} | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         private void startList(String name) throws SAXException { | ||||
|             ListStyle style = name != null ? listStyleMap.get(name) : null; | ||||
|             // unknown or missing styles fall back to an unordered list | ||||
|             String elementName = style != null ? style.getTag() : "ul"; | ||||
|             listStyleStack.push(style); | ||||
|             handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES); | ||||
|         } | ||||
|  | ||||
|         private void endList() throws SAXException { | ||||
|             String elementName = "ul"; | ||||
|             if (!listStyleStack.isEmpty()) { | ||||
|                 ListStyle style = listStyleStack.pop(); | ||||
|                 elementName = style != null ? style.getTag() : "ul"; | ||||
|             } | ||||
|             handler.endElement(XHTML, elementName, elementName); | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Selects the text style for a span; spans without a style name leave the | ||||
|          * current state untouched. | ||||
|          * | ||||
|          * @param name text style name, may be {@code null} | ||||
|          */ | ||||
|         private void startSpan(String name) throws SAXException { | ||||
|             if (name != null) { | ||||
|                 // may resolve to null when the style is unknown | ||||
|                 currTextStyle = textStyleMap.get(name); | ||||
|                 hasWrittenStartStyleTags = false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Starts a paragraph. Only an outermost paragraph produces a {@code p} | ||||
|          * element and picks up its paragraph style; a nested one is rendered as a | ||||
|          * single space. | ||||
|          * | ||||
|          * @param styleName paragraph style name, may be {@code null} | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         private void startParagraph(String styleName) throws SAXException { | ||||
|             if (pDepth != 0) { | ||||
|                 handler.characters(SPACE, 0, SPACE.length); | ||||
|             } else { | ||||
|                 handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES); | ||||
|                 if (styleName != null) { | ||||
|                     currTextStyle = paragraphTextStyleMap.get(styleName); | ||||
|                 } | ||||
|                 hasWrittenStartStyleTags = false; | ||||
|             } | ||||
|             pDepth++; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Ends a paragraph: closes open style tags, then closes the {@code p} | ||||
|          * element for an outermost paragraph or writes a space for a nested one. | ||||
|          * | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         private void endParagraph() throws SAXException { | ||||
|             closeStyleTags(); | ||||
|             if (pDepth != 1) { | ||||
|                 handler.characters(SPACE, 0, SPACE.length); | ||||
|             } else { | ||||
|                 handler.endElement(XHTML, "p", "p"); | ||||
|             } | ||||
|             pDepth--; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Brings the open b/i/u elements in line with the current text style. | ||||
|          * <p> | ||||
|          * Without a current style all open style tags are closed. Otherwise the | ||||
|          * flags are compared in the fixed nesting order b, i, u; before an outer | ||||
|          * element is changed, the inner ones are closed first so the output stays | ||||
|          * properly nested, and they are reopened further down if the style still | ||||
|          * asks for them. | ||||
|          * | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         private void updateStyleTags() throws SAXException { | ||||
|  | ||||
|             if (currTextStyle == null) { | ||||
|                 closeStyleTags(); | ||||
|                 return; | ||||
|             } | ||||
|             if (currTextStyle.bold != curBold) { | ||||
|                 // Enforce nesting -- must close u and i tags | ||||
|                 if (curUnderlined) { | ||||
|                     handler.endElement(XHTML, "u", "u"); | ||||
|                     curUnderlined = false; | ||||
|                 } | ||||
|                 if (curItalic) { | ||||
|                     handler.endElement(XHTML, "i", "i"); | ||||
|                     curItalic = false; | ||||
|                 } | ||||
|                 if (currTextStyle.bold) { | ||||
|                     handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES); | ||||
|                 } else { | ||||
|                     handler.endElement(XHTML, "b", "b"); | ||||
|                 } | ||||
|                 curBold = currTextStyle.bold; | ||||
|             } | ||||
|  | ||||
|             if (currTextStyle.italic != curItalic) { | ||||
|                 // Enforce nesting -- must close u tag | ||||
|                 if (curUnderlined) { | ||||
|                     handler.endElement(XHTML, "u", "u"); | ||||
|                     curUnderlined = false; | ||||
|                 } | ||||
|                 if (currTextStyle.italic) { | ||||
|                     handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES); | ||||
|                 } else { | ||||
|                     handler.endElement(XHTML, "i", "i"); | ||||
|                 } | ||||
|                 curItalic = currTextStyle.italic; | ||||
|             } | ||||
|  | ||||
|             // u is the innermost element, so nothing else has to be closed for it | ||||
|             if (currTextStyle.underlined != curUnderlined) { | ||||
|                 if (currTextStyle.underlined) { | ||||
|                     handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES); | ||||
|                 } else { | ||||
|                     handler.endElement(XHTML, "u", "u"); | ||||
|                 } | ||||
|                 curUnderlined = currTextStyle.underlined; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Handles the end of a span by delegating to {@code updateStyleTags()}. | ||||
|          * | ||||
|          * @throws SAXException propagated from {@code updateStyleTags()} | ||||
|          */ | ||||
|         private void endSpan() throws SAXException { | ||||
|             updateStyleTags(); | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Closes every inline style element that is still open and resets the style state. | ||||
|          * <p> | ||||
|          * Elements are closed in the order {@code u}, {@code i}, {@code b}, the | ||||
|          * reverse of the order in which updateStyleTags() opens them. Afterwards the | ||||
|          * current text style is dropped and the "start tags written" flag is reset. | ||||
|          * | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         private void closeStyleTags() throws SAXException { | ||||
|             // Close any still open style tags | ||||
|             if (curUnderlined) { | ||||
|                 handler.endElement(XHTML,"u", "u"); | ||||
|                 curUnderlined = false; | ||||
|             } | ||||
|             if (curItalic) { | ||||
|                 handler.endElement(XHTML,"i", "i"); | ||||
|                 curItalic = false; | ||||
|             } | ||||
|             if (curBold) { | ||||
|                 handler.endElement(XHTML,"b", "b"); | ||||
|                 curBold = false; | ||||
|             } | ||||
|             // forget the active style so the next characters() call starts from scratch | ||||
|             currTextStyle = null; | ||||
|             hasWrittenStartStyleTags = false; | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Processes the start of an ODF element: records text, paragraph and list | ||||
|          * style definitions, pushes the "is text node" flag for the new depth, | ||||
|          * updates the complete-filtering counter and, when nothing is being | ||||
|          * filtered, maps the element to XHTML output. | ||||
|          * | ||||
|          * @param namespaceURI namespace of the element | ||||
|          * @param localName    local name of the element | ||||
|          * @param qName        qualified name of the element | ||||
|          * @param attrs        attributes of the element | ||||
|          * @throws SAXException propagated from the wrapped handler | ||||
|          */ | ||||
|         @Override | ||||
|         public void startElement( | ||||
|                 String namespaceURI, String localName, String qName, | ||||
|                 Attributes attrs) throws SAXException { | ||||
|             // Keep track of the current node type. If it is a text node, | ||||
|             // the bit at the current depth is set in textNodeStack. | ||||
|             // characters() checks the top bit to determine whether the | ||||
|             // current node is a text node to print out. nodeDepth contains | ||||
|             // the depth of the current node and also marks the top of the stack. | ||||
|             assert nodeDepth >= 0; | ||||
|  | ||||
|             // Set styles | ||||
|             if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { | ||||
|                 String family = attrs.getValue(STYLE_NS, "family"); | ||||
|                 if ("text".equals(family)) { | ||||
|                     currTextStyle = new TextStyle(); | ||||
|                     currTextStyleName = attrs.getValue(STYLE_NS, "name"); | ||||
|                 } else if ("paragraph".equals(family)) { | ||||
|                     currTextStyle = new TextStyle(); | ||||
|                     currParagraphStyleName = attrs.getValue(STYLE_NS, "name"); | ||||
|                 } | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { | ||||
|                 listStyle = new ListStyle(); | ||||
|                 String name = attrs.getValue(STYLE_NS, "name"); | ||||
|                 listStyleMap.put(name, listStyle); | ||||
|             } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) | ||||
|                     && "text-properties".equals(localName)) { | ||||
|                 String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style"); | ||||
|                 if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) { | ||||
|                     currTextStyle.italic = true; | ||||
|                 } | ||||
|                 // tolerant parsing: empty or malformed weights must not abort the whole parse | ||||
|                 if (isBoldFontWeight(attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"))) { | ||||
|                     currTextStyle.bold = true; | ||||
|                 } | ||||
|                 String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style"); | ||||
|                 if (underlineStyle != null && !underlineStyle.equals("none")) { | ||||
|                     currTextStyle.underlined = true; | ||||
|                 } | ||||
|             } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) { | ||||
|                 if ("list-level-style-bullet".equals(localName)) { | ||||
|                     listStyle.ordered = false; | ||||
|                 } else if ("list-level-style-number".equals(localName)) { | ||||
|                     listStyle.ordered = true; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             textNodeStack.set(nodeDepth++, | ||||
|                     isTextNode(namespaceURI, localName)); | ||||
|             // filter *all* content of some tags | ||||
|             assert completelyFiltered >= 0; | ||||
|  | ||||
|             if (needsCompleteFiltering(namespaceURI, localName)) { | ||||
|                 completelyFiltered++; | ||||
|             } | ||||
|             // call next handler if no filtering | ||||
|             if (completelyFiltered == 0) { | ||||
|                 // special handling of text:h, that are directly passed | ||||
|                 // to incoming handler | ||||
|                 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { | ||||
|                     final String el = headingStack.push(getXHTMLHeaderTagName(attrs)); | ||||
|                     handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { | ||||
|                     startList(attrs.getValue(TEXT_NS, "style-name")); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { | ||||
|                     startSpan(attrs.getValue(TEXT_NS, "style-name")); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { | ||||
|                     startParagraph(attrs.getValue(TEXT_NS, "style-name")); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) { | ||||
|                     handler.characters(SPACE, 0, 1); | ||||
|                 } else if ("annotation".equals(localName)) { | ||||
|                     closeStyleTags(); | ||||
|                     handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES); | ||||
|                 } else if ("note".equals(localName)) { | ||||
|                     closeStyleTags(); | ||||
|                     handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES); | ||||
|                 } else if ("notes".equals(localName)) { | ||||
|                     closeStyleTags(); | ||||
|                     handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES); | ||||
|                 } else { | ||||
|                     super.startElement(namespaceURI, localName, qName, attrs); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /** | ||||
|          * Decides whether a {@code fo:font-weight} value denotes bold text. | ||||
|          * <p> | ||||
|          * {@code "bold"} and {@code "bolder"} are bold, as is any integer weight | ||||
|          * above 500. Missing, empty or unparseable values are treated as not bold. | ||||
|          * | ||||
|          * @param fontWeight raw attribute value, may be {@code null} | ||||
|          * @return {@code true} if the value denotes a bold weight | ||||
|          */ | ||||
|         private static boolean isBoldFontWeight(String fontWeight) { | ||||
|             if (fontWeight == null || fontWeight.isEmpty()) { | ||||
|                 return false; | ||||
|             } | ||||
|             if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) { | ||||
|                 return true; | ||||
|             } | ||||
|             if (!Character.isDigit(fontWeight.charAt(0))) { | ||||
|                 return false; | ||||
|             } | ||||
|             try { | ||||
|                 return Integer.parseInt(fontWeight) > 500; | ||||
|             } catch (NumberFormatException ignored) { | ||||
|                 // starts with a digit but is not a plain integer (e.g. "7.5"): not bold | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         @Override | ||||
|         public void endElement( | ||||
|                 String namespaceURI, String localName, String qName) | ||||
|                 throws SAXException { | ||||
|             if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { | ||||
|                 if (currTextStyle != null && currTextStyleName != null) { | ||||
|                     textStyleMap.put(currTextStyleName, currTextStyle); | ||||
|                     currTextStyleName = null; | ||||
|                     currTextStyle = null; | ||||
|                 } else if (currTextStyle != null && currParagraphStyleName != null) { | ||||
|                     paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle); | ||||
|                     currParagraphStyleName = null; | ||||
|                     currTextStyle = null; | ||||
|                 } | ||||
|             } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { | ||||
|                 listStyle = null; | ||||
|             } | ||||
|  | ||||
|             // call next handler if no filtering | ||||
|             if (completelyFiltered == 0) { | ||||
|                 // special handling of text:h, that are directly passed | ||||
|                 // to incoming handler | ||||
|                 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { | ||||
|                     final String el = headingStack.pop(); | ||||
|                     handler.endElement(XHTMLContentHandler.XHTML, el, el); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { | ||||
|                     endList(); | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { | ||||
|                     currTextStyle = null; | ||||
|                     hasWrittenStartStyleTags = false; | ||||
|                 } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { | ||||
|                     endParagraph(); | ||||
|                 } else if ("annotation".equals(localName) || "note".equals(localName) || | ||||
|                         "notes".equals(localName)) { | ||||
|                         closeStyleTags(); | ||||
|                         handler.endElement("", localName, localName); | ||||
|                 } else { | ||||
|                     super.endElement(namespaceURI, localName, qName); | ||||
|                 } | ||||
|  | ||||
|                 // special handling of tabulators | ||||
|                 if (TEXT_NS.equals(namespaceURI) | ||||
|                         && ("tab-stop".equals(localName) | ||||
|                         || "tab".equals(localName))) { | ||||
|                     this.characters(TAB, 0, TAB.length); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // revert filter for *all* content of some tags | ||||
|             if (needsCompleteFiltering(namespaceURI, localName)) { | ||||
|                 completelyFiltered--; | ||||
|             } | ||||
|             assert completelyFiltered >= 0; | ||||
|  | ||||
|             // reduce current node depth | ||||
|             nodeDepth--; | ||||
|             assert nodeDepth >= 0; | ||||
|         } | ||||
|  | ||||
|         @Override | ||||
|         public void startPrefixMapping(String prefix, String uri) { | ||||
|             // intentionally a no-op: prefix mappings are dropped because they should not occur in XHTML | ||||
|         } | ||||
|  | ||||
|         @Override | ||||
|         public void endPrefixMapping(String prefix) { | ||||
|             // intentionally a no-op: prefix mappings are dropped because they should not occur in XHTML | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     public static final String TEXT_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; /* OpenDocument text namespace URI */ | ||||
|  | ||||
|     public static final String TABLE_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; /* OpenDocument table namespace URI */ | ||||
|  | ||||
|     public static final String STYLE_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:style:1.0"; /* OpenDocument style namespace URI */ | ||||
|  | ||||
|     public static final String FORMATTING_OBJECTS_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"; /* OpenDocument XSL-FO compatible namespace URI */ | ||||
|  | ||||
|     public static final String OFFICE_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:office:1.0"; /* OpenDocument office namespace URI */ | ||||
|  | ||||
|     public static final String SVG_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"; /* OpenDocument SVG compatible namespace URI */ | ||||
|  | ||||
|     public static final String PRESENTATION_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"; /* OpenDocument presentation namespace URI */ | ||||
|  | ||||
|     public static final String DRAW_NS = | ||||
|             "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"; /* OpenDocument drawing namespace URI */ | ||||
|  | ||||
|     public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; /* XLink namespace URI */ | ||||
|  | ||||
|     protected static final char[] TAB = new char[]{'\t'}; /* single tab character, emitted for text:tab and text:tab-stop */ | ||||
|  | ||||
|     private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); /* attribute list with no entries */ | ||||
|  | ||||
|     /** | ||||
|      * Lookup table from ODF element names to the XHTML elements (and attribute | ||||
|      * renames) they are emitted as. Elements and attributes without an entry | ||||
|      * are ignored and left out of the event stream. | ||||
|      */ | ||||
|     private static final HashMap<QName, TargetElement> MAPPINGS = new HashMap<>(); | ||||
|  | ||||
|     static { | ||||
|         // text:* elements; text:h is mapped specifically in startElement/endElement | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object")); | ||||
|         MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div")); | ||||
|         MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span")); | ||||
|  | ||||
|         // text:a becomes an XHTML anchor; the xlink attributes lose their namespace | ||||
|         final HashMap<QName, QName> linkAttributes = new HashMap<>(); | ||||
|         linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href")); | ||||
|         linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title")); | ||||
|         MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", linkAttributes)); | ||||
|  | ||||
|         // create HTML tables from table:-tags | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table")); | ||||
|         // repeating of rows is ignored; for columns, see below! | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr")); | ||||
|  | ||||
|         // special mapping for rowspan/colspan attributes | ||||
|         final HashMap<QName, QName> cellAttributes = new HashMap<>(); | ||||
|         cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan")); | ||||
|         cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan")); | ||||
|         /* TODO: The following is not correct, the cell should be repeated, not spanned! | ||||
|          * The code generates an HTML cell spanning all repeated columns to make the cell look | ||||
|          * correct. Problems may occur when both spanning and repeating are given, which is not | ||||
|          * allowed by the spec. Spanning instead of repeating is not a problem, because | ||||
|          * OpenOffice uses it only for empty cells. | ||||
|          */ | ||||
|         cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan")); | ||||
|         MAPPINGS.put(new QName(TABLE_NS, "table-cell"), | ||||
|                 new TargetElement(XHTML, "td", cellAttributes)); | ||||
|     } | ||||
|  | ||||
|     public Set<MediaType> getSupportedTypes(ParseContext context) { | ||||
|         return Collections.emptySet(); // empty on purpose: this is not a top-level parser
|     } | ||||
|  | ||||
|     public void parse( | ||||
|             InputStream stream, ContentHandler handler, | ||||
|             Metadata metadata, ParseContext context) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|         parseInternal(stream, | ||||
|                 new XHTMLContentHandler(handler, metadata), | ||||
|                 metadata, context); | ||||
|     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, | ||||
|                       ParseContext context) throws IOException, SAXException, TikaException { | ||||
|         parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context); | ||||
|     } | ||||
|  | ||||
|     void parseInternal( | ||||
|             InputStream stream, final ContentHandler handler, | ||||
|             Metadata metadata, ParseContext context) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|     void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, | ||||
|                        ParseContext context) throws IOException, SAXException, TikaException { | ||||
|  | ||||
|         DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS); | ||||
|         DefaultHandler dh = new OpenDocumentBodyHandler(handler, context); | ||||
|  | ||||
|  | ||||
|         XMLReaderUtils.parseSAX( | ||||
|                 new CloseShieldInputStream(stream), | ||||
|                 new OfflineContentHandler( | ||||
|                         new NSNormalizerContentHandler(dh)), | ||||
|                 context); | ||||
|         XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), | ||||
|             new OfflineContentHandler(new NSNormalizerContentHandler(dh)), context); | ||||
|     } | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -0,0 +1,60 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import java.io.IOException; | ||||
|  | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.utils.XMLReaderUtils; | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * SAX handler for macro files inside compressed ODF documents. It tracks the | ||||
|  * module element and delegates macro state and {@code handleMacro()} to | ||||
|  * {@link FlatOpenDocumentMacroHandler}. | ||||
|  */ | ||||
| class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler { | ||||
|  | ||||
|     OpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) { | ||||
|         super(contentHandler, parseContext); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Marks the start of a macro when the module element opens and records its name. | ||||
|      * | ||||
|      * @throws SAXException declared by the SAX contract; not thrown by the visible code | ||||
|      */ | ||||
|     @Override | ||||
|     public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) | ||||
|         throws SAXException { | ||||
|         //in the compressed odf, there should only be one element in this file. | ||||
|         if (MODULE.equalsIgnoreCase(localName)) { | ||||
|             inMacro = true; | ||||
|             macroName = XMLReaderUtils.getAttrValue(NAME, attrs); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|  | ||||
|     /** | ||||
|      * Hands the collected macro to {@code handleMacro()} when the module element closes, | ||||
|      * then resets the macro state. | ||||
|      * | ||||
|      * @throws SAXException wrapping any {@link IOException} raised by {@code handleMacro()} | ||||
|      */ | ||||
|     @Override | ||||
|     public void endElement(String namespaceURI, String localName, String qName) | ||||
|         throws SAXException { | ||||
|         // match case-insensitively, exactly like startElement, so every module element | ||||
|         // that switched macro state on is also handled and reset here | ||||
|         if (MODULE.equalsIgnoreCase(localName)) { | ||||
|             try { | ||||
|                 handleMacro(); | ||||
|             } catch (IOException e) { | ||||
|                 throw new SAXException(e); | ||||
|             } finally { | ||||
|                 //this shouldn't be necessary in the compressed odf files | ||||
|                 resetMacroState(); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,45 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.exception.EncryptedDocumentException; | ||||
| import org.apache.tika.sax.ContentHandlerDecorator; | ||||
|  | ||||
| /** | ||||
|  * Manifest handler that, for now, only watches for {@code encryption-data} | ||||
|  * elements. When one is seen, an {@link EncryptedDocumentException} wrapped | ||||
|  * in a {@link SAXException} is thrown. | ||||
|  * <p> | ||||
|  * This could be extended to extract the information needed for decryption; | ||||
|  * please open an issue or pull request if that functionality is wanted. | ||||
|  */ | ||||
| class OpenDocumentManifestHandler extends ContentHandlerDecorator { | ||||
|  | ||||
|     /** Local name of the manifest element that triggers the encryption error. */ | ||||
|     private static final String ENCRYPTION_DATA = "encryption-data"; | ||||
|  | ||||
|     @Override | ||||
|     public void startElement(String namespaceURI, String localName, String qName, | ||||
|                              Attributes attrs) throws SAXException { | ||||
|         final boolean encrypted = localName.equals(ENCRYPTION_DATA); | ||||
|         if (!encrypted) { | ||||
|             // every other element is ignored | ||||
|             return; | ||||
|         } | ||||
|         throw new SAXException(new EncryptedDocumentException()); | ||||
|     } | ||||
| } | ||||
| @@ -16,12 +16,21 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.ArrayList; | ||||
| import java.util.Arrays; | ||||
| import java.util.List; | ||||
|  | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.exception.TikaException; | ||||
| import org.apache.tika.metadata.DublinCore; | ||||
| import org.apache.tika.metadata.MSOffice; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Office; | ||||
| import org.apache.tika.metadata.OfficeOpenXMLCore; | ||||
| import org.apache.tika.metadata.OfficeOpenXMLExtended; | ||||
| import org.apache.tika.metadata.PagedText; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import org.apache.tika.metadata.TikaCoreProperties; | ||||
| @@ -36,11 +45,6 @@ import org.apache.tika.sax.xpath.CompositeMatcher; | ||||
| import org.apache.tika.sax.xpath.Matcher; | ||||
| import org.apache.tika.sax.xpath.MatchingContentHandler; | ||||
| import org.apache.tika.sax.xpath.XPathParser; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
|  | ||||
| /** | ||||
|  * Parser for OpenDocument <code>meta.xml</code> files. | ||||
| @@ -54,68 +58,54 @@ public class OpenDocumentMetaParser extends XMLParser { | ||||
|     private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"; | ||||
|     private static final XPathParser META_XPATH = new XPathParser("meta", META_NS); | ||||
|  | ||||
|     /** | ||||
|      * @see OfficeOpenXMLCore#SUBJECT | ||||
|      * @deprecated use OfficeOpenXMLCore#SUBJECT | ||||
|      */ | ||||
|     @Deprecated | ||||
|     private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR = | ||||
|             Property.composite(Office.INITIAL_AUTHOR, | ||||
|                     new Property[]{Property.externalText("initial-creator")}); | ||||
|  | ||||
|     private static ContentHandler getDublinCoreHandler( | ||||
|             Metadata metadata, Property property, String element) { | ||||
|         return new ElementMetadataHandler( | ||||
|                 DublinCore.NAMESPACE_URI_DC, element, | ||||
|                 metadata, property); | ||||
|     private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property, | ||||
|                                                        String element) { | ||||
|         return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property); | ||||
|     } | ||||
|  | ||||
|     private static ContentHandler getMeta( | ||||
|             ContentHandler ch, Metadata md, Property property, String element) { | ||||
|         Matcher matcher = new CompositeMatcher( | ||||
|                 META_XPATH.parse("//meta:" + element), | ||||
|                 META_XPATH.parse("//meta:" + element + "//text()")); | ||||
|     private static ContentHandler getMeta(ContentHandler ch, Metadata md, Property property, | ||||
|                                           String element) { | ||||
|         Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:" + element), | ||||
|             META_XPATH.parse("//meta:" + element + "//text()")); | ||||
|         ContentHandler branch = | ||||
|                 new MatchingContentHandler(new MetadataHandler(md, property), matcher); | ||||
|             new MatchingContentHandler(new MetadataHandler(md, property), matcher); | ||||
|         return new TeeContentHandler(ch, branch); | ||||
|     } | ||||
|  | ||||
|     private static ContentHandler getUserDefined( | ||||
|             ContentHandler ch, Metadata md) { | ||||
|         Matcher matcher = new CompositeMatcher( | ||||
|                 META_XPATH.parse("//meta:user-defined/@meta:name"), | ||||
|                 META_XPATH.parse("//meta:user-defined//text()")); | ||||
|         // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1 | ||||
|     private static ContentHandler getUserDefined(ContentHandler ch, Metadata md) { | ||||
|         Matcher matcher = new CompositeMatcher(META_XPATH.parse("//meta:user-defined/@meta:name"), | ||||
|             META_XPATH.parse("//meta:user-defined//text()")); | ||||
|         // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes | ||||
|         // custom:Info1=Text1 | ||||
|         ContentHandler branch = new MatchingContentHandler( | ||||
|                 new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX), | ||||
|                 matcher); | ||||
|             new AttributeDependantMetadataHandler(md, "meta:name", | ||||
|                 Office.USER_DEFINED_METADATA_NAME_PREFIX), matcher); | ||||
|         return new TeeContentHandler(ch, branch); | ||||
|     } | ||||
|  | ||||
|     @Deprecated | ||||
|     private static ContentHandler getStatistic( | ||||
|             ContentHandler ch, Metadata md, String name, String attribute) { | ||||
|         Matcher matcher = | ||||
|                 META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); | ||||
|     private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name, | ||||
|                                                String attribute) { | ||||
|         Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); | ||||
|         ContentHandler branch = new MatchingContentHandler( | ||||
|                 new AttributeMetadataHandler(META_NS, attribute, md, name), matcher); | ||||
|             new AttributeMetadataHandler(META_NS, attribute, md, name), matcher); | ||||
|         return new TeeContentHandler(ch, branch); | ||||
|     } | ||||
|  | ||||
|     private static ContentHandler getStatistic( | ||||
|             ContentHandler ch, Metadata md, Property property, String attribute) { | ||||
|         Matcher matcher = | ||||
|                 META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); | ||||
|     private static ContentHandler getStatistic(ContentHandler ch, Metadata md, Property property, | ||||
|                                                String attribute) { | ||||
|         Matcher matcher = META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); | ||||
|         ContentHandler branch = new MatchingContentHandler( | ||||
|                 new AttributeMetadataHandler(META_NS, attribute, md, property), matcher); | ||||
|             new AttributeMetadataHandler(META_NS, attribute, md, property), matcher); | ||||
|         return new TeeContentHandler(ch, branch); | ||||
|     } | ||||
|  | ||||
|     protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) { | ||||
|     static ContentHandler getContentHandler(Metadata md, ParseContext context, | ||||
|                                             ContentHandler... handlers) { | ||||
|         // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date | ||||
|         // Process the Dublin Core Attributes  | ||||
|         ch = new TeeContentHandler(super.getContentHandler(ch, md, context), | ||||
|                 getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"), | ||||
|         // Process the Dublin Core Attributes | ||||
|         ContentHandler ch = | ||||
|             new TeeContentHandler(getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"), | ||||
|                 getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"), | ||||
|                 getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"), | ||||
|                 getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"), | ||||
| @@ -129,19 +119,20 @@ public class OpenDocumentMetaParser extends XMLParser { | ||||
|         // Process the OO Meta Attributes | ||||
|         ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date"); | ||||
|         // ODF uses dc:date for modified | ||||
|         ch = new TeeContentHandler(ch, new ElementMetadataHandler( | ||||
|                 DublinCore.NAMESPACE_URI_DC, "date", | ||||
|                 md, TikaCoreProperties.MODIFIED)); | ||||
|         ch = new TeeContentHandler(ch, | ||||
|             new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "date", md, | ||||
|                 TikaCoreProperties.MODIFIED)); | ||||
|  | ||||
|         // ODF uses dc:subject for description | ||||
|         ch = new TeeContentHandler(ch, new ElementMetadataHandler( | ||||
|                 DublinCore.NAMESPACE_URI_DC, "subject", | ||||
|                 md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT)); | ||||
|         ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword"); | ||||
|         ch = new TeeContentHandler(ch, | ||||
|             new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, "subject", md, | ||||
|                 OfficeOpenXMLCore.SUBJECT)); | ||||
|  | ||||
|         ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration"); | ||||
|         ch = getMeta(ch, md, Office.KEYWORDS, "keyword"); | ||||
|  | ||||
|         ch = getMeta(ch, md, OfficeOpenXMLExtended.TOTAL_TIME, "editing-duration"); | ||||
|         ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles"); | ||||
|         ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator"); | ||||
|         ch = getMeta(ch, md, TikaCoreProperties.CREATOR, "initial-creator"); | ||||
|         ch = getMeta(ch, md, Property.externalText("generator"), "generator"); | ||||
|  | ||||
|         // Process the user defined Meta Attributes | ||||
| @@ -157,43 +148,48 @@ public class OpenDocumentMetaParser extends XMLParser { | ||||
|         ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count"); | ||||
|         ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count"); | ||||
|  | ||||
|         // Legacy, Tika-1.0 style attributes | ||||
|         // TODO Remove these in Tika 2.0 | ||||
|         ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count"); | ||||
|         ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count"); | ||||
|  | ||||
|         // Legacy Statistics Attributes, replaced with real keys above | ||||
|         // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770) | ||||
|         ch = getStatistic(ch, md, "nbPage", "page-count"); | ||||
|         ch = getStatistic(ch, md, "nbPara", "paragraph-count"); | ||||
|         ch = getStatistic(ch, md, "nbWord", "word-count"); | ||||
|         ch = getStatistic(ch, md, "nbCharacter", "character-count"); | ||||
|         ch = getStatistic(ch, md, "nbTab", "table-count"); | ||||
|         ch = getStatistic(ch, md, "nbObject", "object-count"); | ||||
|         ch = getStatistic(ch, md, "nbImg", "image-count"); | ||||
|  | ||||
|         if (handlers != null && handlers.length > 0) { | ||||
|             ContentHandler[] newHandlers = new ContentHandler[handlers.length + 1]; | ||||
|             newHandlers[0] = ch; | ||||
|             System.arraycopy(handlers, 0, newHandlers, 1, handlers.length); | ||||
|             ch = new TeeContentHandler(newHandlers); | ||||
|         } | ||||
|         // Normalise the rest | ||||
|         ch = new NSNormalizerContentHandler(ch); | ||||
|         return ch; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Builds the handler chain for a meta.xml parse: the superclass handler is passed | ||||
|      * as the extra handler to the static | ||||
|      * {@link #getContentHandler(Metadata, ParseContext, ContentHandler...)}. | ||||
|      * | ||||
|      * @param ch      handler supplied by the caller | ||||
|      * @param md      metadata object the metadata handlers write into | ||||
|      * @param context parse context, passed through unchanged | ||||
|      * @return the combined content handler | ||||
|      */ | ||||
|     @Override | ||||
|     protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, | ||||
|                                                ParseContext context) { | ||||
|         return getContentHandler(md, context, super.getContentHandler(ch, md, context)); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void parse( | ||||
|             InputStream stream, ContentHandler handler, | ||||
|             Metadata metadata, ParseContext context) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, | ||||
|                       ParseContext context) throws IOException, SAXException, TikaException { | ||||
|         super.parse(stream, handler, metadata, context); | ||||
|         // Copy subject to description for OO2 | ||||
|         String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT); | ||||
|         if (odfSubject != null && !odfSubject.equals("") && | ||||
|                 (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) { | ||||
|             (metadata.get(TikaCoreProperties.DESCRIPTION) == null || | ||||
|                 metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) { | ||||
|             metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject); | ||||
|         } | ||||
|         //reset the dc:subject to include both keywords and subject | ||||
|         //We can't rely on composite keys in the MatchingContentHandlers | ||||
|         //because those are "setting" not "adding" to the Metadata object | ||||
|         List<String> subjects = new ArrayList<>(); | ||||
|         if (metadata.getValues(Office.KEYWORDS) != null) { | ||||
|             subjects.addAll(Arrays.asList(metadata.getValues(Office.KEYWORDS))); | ||||
|         } | ||||
|  | ||||
|         if (metadata.getValues(OfficeOpenXMLCore.SUBJECT) != null) { | ||||
|             subjects.addAll(Arrays.asList(metadata.getValues(OfficeOpenXMLCore.SUBJECT))); | ||||
|         } | ||||
|  | ||||
|         if (subjects.size() > 0) { | ||||
|             metadata.set(TikaCoreProperties.SUBJECT, subjects.toArray(new String[0])); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -16,37 +16,44 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.odf; | ||||
|  | ||||
| import static java.nio.charset.StandardCharsets.UTF_8; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.ArrayList; | ||||
| import java.util.Arrays; | ||||
| import java.util.Collections; | ||||
| import java.util.Enumeration; | ||||
| import java.util.HashSet; | ||||
| import java.util.List; | ||||
| import java.util.Set; | ||||
| import java.util.zip.ZipEntry; | ||||
| import java.util.zip.ZipFile; | ||||
| import java.util.zip.ZipInputStream; | ||||
|  | ||||
| import org.apache.commons.io.IOUtils; | ||||
| import org.apache.commons.io.input.CloseShieldInputStream; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.config.Field; | ||||
| import org.apache.tika.exception.EncryptedDocumentException; | ||||
| import org.apache.tika.exception.TikaException; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; | ||||
| import org.apache.tika.exception.WriteLimitReachedException; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentUtil; | ||||
| import org.apache.tika.io.TikaInputStream; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.TikaCoreProperties; | ||||
| import org.apache.tika.metadata.TikaMetadataKeys; | ||||
| import org.apache.tika.mime.MediaType; | ||||
| import org.apache.tika.parser.AbstractParser; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.parser.Parser; | ||||
| import org.apache.tika.sax.EmbeddedContentHandler; | ||||
| import org.apache.tika.sax.EndDocumentShieldingContentHandler; | ||||
| import org.apache.tika.sax.OfflineContentHandler; | ||||
| import org.apache.tika.sax.XHTMLContentHandler; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.Arrays; | ||||
| import java.util.Collections; | ||||
| import java.util.Enumeration; | ||||
| import java.util.HashSet; | ||||
| import java.util.Set; | ||||
| import java.util.zip.ZipEntry; | ||||
| import java.util.zip.ZipFile; | ||||
| import java.util.zip.ZipInputStream; | ||||
|  | ||||
| import static java.nio.charset.StandardCharsets.UTF_8; | ||||
| import org.apache.tika.utils.XMLReaderUtils; | ||||
|  | ||||
| /** | ||||
|  * OpenOffice parser | ||||
| @@ -58,47 +65,48 @@ public class OpenDocumentParser extends AbstractParser { | ||||
|      */ | ||||
|     private static final long serialVersionUID = -6410276875438618287L; | ||||
|  | ||||
|     private static final Set<MediaType> SUPPORTED_TYPES = | ||||
|             Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( | ||||
|                     MediaType.application("vnd.sun.xml.writer"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.text"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.graphics"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.presentation"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.spreadsheet"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.chart"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.image"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.formula"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.text-master"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.text-web"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.text-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.graphics-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.presentation-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.spreadsheet-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.chart-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.image-template"), | ||||
|                     MediaType.application("vnd.oasis.opendocument.formula-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.text"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.graphics"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.presentation"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.spreadsheet"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.chart"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.image"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.formula"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.text-master"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.text-web"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.text-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.graphics-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.presentation-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.chart-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.image-template"), | ||||
|                     MediaType.application("x-vnd.oasis.opendocument.formula-template")))); | ||||
|     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( | ||||
|         new HashSet<>(Arrays.asList(MediaType.application("vnd.sun.xml.writer"), | ||||
|             MediaType.application("vnd.oasis.opendocument.text"), | ||||
|             MediaType.application("vnd.oasis.opendocument.graphics"), | ||||
|             MediaType.application("vnd.oasis.opendocument.presentation"), | ||||
|             MediaType.application("vnd.oasis.opendocument.spreadsheet"), | ||||
|             MediaType.application("vnd.oasis.opendocument.chart"), | ||||
|             MediaType.application("vnd.oasis.opendocument.image"), | ||||
|             MediaType.application("vnd.oasis.opendocument.formula"), | ||||
|             MediaType.application("vnd.oasis.opendocument.text-master"), | ||||
|             MediaType.application("vnd.oasis.opendocument.text-web"), | ||||
|             MediaType.application("vnd.oasis.opendocument.text-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.graphics-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.presentation-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.spreadsheet-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.chart-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.image-template"), | ||||
|             MediaType.application("vnd.oasis.opendocument.formula-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.text"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.graphics"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.presentation"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.spreadsheet"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.chart"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.image"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.formula"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.text-master"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.text-web"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.text-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.graphics-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.presentation-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.chart-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.image-template"), | ||||
|             MediaType.application("x-vnd.oasis.opendocument.formula-template")))); | ||||
|  | ||||
|     private static final String META_NAME = "meta.xml"; | ||||
|     private static final String MANIFEST_NAME = "META-INF/manifest.xml"; | ||||
|  | ||||
|     private Parser meta = new OpenDocumentMetaParser(); | ||||
|  | ||||
|     private Parser content = new OpenDocumentContentParser(); | ||||
|     private boolean extractMacros = false; | ||||
|  | ||||
|     public Parser getMetaParser() { | ||||
|         return meta; | ||||
| @@ -120,10 +128,10 @@ public class OpenDocumentParser extends AbstractParser { | ||||
|         return SUPPORTED_TYPES; | ||||
|     } | ||||
|  | ||||
|     public void parse( | ||||
|             InputStream stream, ContentHandler baseHandler, | ||||
|             Metadata metadata, ParseContext context) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|     public void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, | ||||
|                       ParseContext context) throws IOException, SAXException, TikaException { | ||||
|  | ||||
|         EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context); | ||||
|  | ||||
|         // Open the Zip stream | ||||
|         // Use a File if we can, and an already open zip is even better | ||||
| @@ -145,85 +153,129 @@ public class OpenDocumentParser extends AbstractParser { | ||||
|  | ||||
|         // Prepare to handle the content | ||||
|         XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); | ||||
|  | ||||
|         xhtml.startDocument(); | ||||
|         // As we don't know which of the metadata or the content | ||||
|         //  we'll hit first, catch the endDocument call initially | ||||
|         EndDocumentShieldingContentHandler handler = | ||||
|                 new EndDocumentShieldingContentHandler(xhtml); | ||||
|         EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml); | ||||
|  | ||||
|         if (zipFile != null) { | ||||
|             try { | ||||
|                 handleZipFile(zipFile, metadata, context, handler); | ||||
|             } finally { | ||||
|                 //Do we want to close silently == catch an exception here? | ||||
|                 zipFile.close(); | ||||
|         try { | ||||
|             if (zipFile != null) { | ||||
|                 try { | ||||
|                     handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil); | ||||
|                 } finally { | ||||
|                     //Do we want to close silently == catch an exception here? | ||||
|                     zipFile.close(); | ||||
|                 } | ||||
|             } else { | ||||
|                 try { | ||||
|                     handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil); | ||||
|                 } finally { | ||||
|                     //Do we want to close silently == catch an exception here? | ||||
|                     zipStream.close(); | ||||
|                 } | ||||
|             } | ||||
|         } else { | ||||
|             try { | ||||
|                 handleZipStream(zipStream, metadata, context, handler); | ||||
|             } finally { | ||||
|                 //Do we want to close silently == catch an exception here? | ||||
|                 zipStream.close(); | ||||
|         } catch (SAXException e) { | ||||
|             if (e.getCause() instanceof EncryptedDocumentException) { | ||||
|                 throw (EncryptedDocumentException)e.getCause(); | ||||
|             } | ||||
|             throw e; | ||||
|         } | ||||
|  | ||||
|         // Only now call the end document | ||||
|         if (handler.getEndDocumentWasCalled()) { | ||||
|         if (handler.isEndDocumentWasCalled()) { | ||||
|             handler.reallyEndDocument(); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException { | ||||
|         ZipEntry entry = zipStream.getNextEntry(); | ||||
| 		if (entry == null) { | ||||
| 			throw new IOException("No entries found in ZipInputStream"); | ||||
| 		} | ||||
|         do { | ||||
|             handleZipEntry(entry, zipStream, metadata, context, handler); | ||||
|             entry = zipStream.getNextEntry(); | ||||
|         } while (entry != null); | ||||
|     /** | ||||
|      * Switches macro extraction on or off. The flag starts out {@code false}; | ||||
|      * only when it is {@code true} are archive entries whose name contains | ||||
|      * {@code Basic/} passed on to the macro handling code. | ||||
|      * | ||||
|      * @param extractMacros {@code true} to process entries under {@code Basic/} | ||||
|      */ | ||||
|     @Field | ||||
|     public void setExtractMacros(boolean extractMacros) { | ||||
|         this.extractMacros = extractMacros; | ||||
|     } | ||||
|  | ||||
|     private void handleZipFile(ZipFile zipFile, Metadata metadata, | ||||
|                                ParseContext context, EndDocumentShieldingContentHandler handler) | ||||
|             throws IOException, TikaException, SAXException { | ||||
|     /** | ||||
|      * Processes every entry of a streamed package, in the order the stream yields them. | ||||
|      * <p> | ||||
|      * A {@link SAXException} raised while handling one entry does not stop the walk: | ||||
|      * it is remembered and the next entry is still processed. Two situations abort at | ||||
|      * once: anything thrown by {@code WriteLimitReachedException.throwIfWriteLimitReached}, | ||||
|      * and a failure whose cause is an {@link EncryptedDocumentException} (the cause | ||||
|      * itself is rethrown). After the last entry, the first remembered failure is thrown. | ||||
|      * | ||||
|      * @throws IOException if the stream contains no entries, or reading it fails | ||||
|      * @throws SAXException the first deferred per-entry failure, if there was one | ||||
|      */ | ||||
|     private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, | ||||
|                                  EndDocumentShieldingContentHandler handler, | ||||
|                                  EmbeddedDocumentUtil embeddedDocumentUtil) | ||||
|         throws IOException, TikaException, SAXException { | ||||
|         ZipEntry current = zipStream.getNextEntry(); | ||||
|         if (current == null) { | ||||
|             throw new IOException("No entries found in ZipInputStream"); | ||||
|         } | ||||
|  | ||||
|         List<SAXException> deferred = new ArrayList<>(); | ||||
|         for (; current != null; current = zipStream.getNextEntry()) { | ||||
|             try { | ||||
|                 handleZipEntry(current, zipStream, metadata, context, handler, | ||||
|                     embeddedDocumentUtil); | ||||
|             } catch (SAXException failure) { | ||||
|                 WriteLimitReachedException.throwIfWriteLimitReached(failure); | ||||
|                 Throwable cause = failure.getCause(); | ||||
|                 if (cause instanceof EncryptedDocumentException) { | ||||
|                     throw (EncryptedDocumentException) cause; | ||||
|                 } | ||||
|                 // keep going; the failure is reported once all entries were visited | ||||
|                 deferred.add(failure); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (!deferred.isEmpty()) { | ||||
|             throw deferred.get(0); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context, | ||||
|                                EndDocumentShieldingContentHandler handler, | ||||
|                                EmbeddedDocumentUtil embeddedDocumentUtil) | ||||
|         throws IOException, TikaException, SAXException { | ||||
|         // If we can, process the metadata first, then the | ||||
|         //  rest of the file afterwards (TIKA-1353) | ||||
|         // Only possible to guarantee that when opened from a file not a stream | ||||
|  | ||||
|         ZipEntry entry = zipFile.getEntry(META_NAME); | ||||
|         ZipEntry entry = zipFile.getEntry(MANIFEST_NAME); | ||||
|         if (entry != null) { | ||||
|             handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); | ||||
|             handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, | ||||
|                 handler, embeddedDocumentUtil); | ||||
|         } | ||||
|  | ||||
|         entry = zipFile.getEntry(META_NAME); | ||||
|         if (entry != null) { | ||||
|             handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, | ||||
|                 handler, embeddedDocumentUtil); | ||||
|         } | ||||
|  | ||||
|         Enumeration<? extends ZipEntry> entries = zipFile.entries(); | ||||
|         while (entries.hasMoreElements()) { | ||||
|             entry = entries.nextElement(); | ||||
|             if (!META_NAME.equals(entry.getName())) { | ||||
|                 handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); | ||||
|                 handleZipEntry(entry, zipFile.getInputStream(entry), metadata, | ||||
|                     context, handler, embeddedDocumentUtil); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, | ||||
|                                 ParseContext context, EndDocumentShieldingContentHandler handler) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|         if (entry == null) return; | ||||
|  | ||||
|         if (entry.getName().equals("mimetype")) { | ||||
|     private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, | ||||
|                                 ParseContext context, ContentHandler handler, | ||||
|                                 EmbeddedDocumentUtil embeddedDocumentUtil) | ||||
|         throws IOException, SAXException, TikaException { | ||||
|  | ||||
|  | ||||
|         if (entry.getName().contains("manifest.xml")) { | ||||
|             checkForEncryption(zip, context); | ||||
|         } else if (entry.getName().equals("mimetype")) { | ||||
|             String type = IOUtils.toString(zip, UTF_8); | ||||
|             metadata.set(Metadata.CONTENT_TYPE, type); | ||||
|         } else if (entry.getName().equals(META_NAME)) { | ||||
|             meta.parse(zip, new DefaultHandler(), metadata, context); | ||||
|         } else if (entry.getName().endsWith("content.xml")) { | ||||
|             if (content instanceof OpenDocumentContentParser) { | ||||
|                 ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); | ||||
|                 ((OpenDocumentContentParser) content) | ||||
|                     .parseInternal(zip, handler, metadata, context); | ||||
|             } else { | ||||
|                 // Foreign content parser was set: | ||||
|                 content.parse(zip, handler, metadata, context); | ||||
|             } | ||||
|         } else if (entry.getName().endsWith("styles.xml")) { | ||||
|             if (content instanceof OpenDocumentContentParser) { | ||||
|                 ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); | ||||
|                 ((OpenDocumentContentParser) content) | ||||
|                     .parseInternal(zip, handler, metadata, context); | ||||
|             } else { | ||||
|                 // Foreign content parser was set: | ||||
|                 content.parse(zip, handler, metadata, context); | ||||
| @@ -231,26 +283,87 @@ public class OpenDocumentParser extends AbstractParser { | ||||
|         } else { | ||||
|             String embeddedName = entry.getName(); | ||||
|             //scrape everything under Thumbnails/ and Pictures/ | ||||
|             if (embeddedName.contains("Thumbnails/") || | ||||
|                     embeddedName.contains("Pictures/")) { | ||||
|                 EmbeddedDocumentExtractor embeddedDocumentExtractor = | ||||
|                         EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); | ||||
|             if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) { | ||||
|  | ||||
|                 Metadata embeddedMetadata = new Metadata(); | ||||
|                 embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName()); | ||||
|                 /* if (embeddedName.startsWith("Thumbnails/")) { | ||||
|                 TikaInputStream stream = TikaInputStream.get(zip); | ||||
|  | ||||
|                 embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName()); | ||||
|                 if (embeddedName.startsWith("Thumbnails/")) { | ||||
|                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | ||||
|                             TikaCoreProperties.EmbeddedResourceType.THUMBNAIL); | ||||
|                 }*/ | ||||
|                         TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); | ||||
|                 } | ||||
|  | ||||
|                 if (embeddedName.contains("Pictures/")) { | ||||
|                     embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, | ||||
|                             TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); | ||||
|                     embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, | ||||
|                         TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); | ||||
|  | ||||
|                     MediaType embeddedMimeType = | ||||
|                         embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata); | ||||
|                     if (embeddedMimeType != null) { | ||||
|                         embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString()); | ||||
|                     } | ||||
|                     stream.reset(); | ||||
|                 } | ||||
|                 if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { | ||||
|                     embeddedDocumentExtractor.parseEmbedded(zip, | ||||
|                             new EmbeddedContentHandler(handler), embeddedMetadata, false); | ||||
|  | ||||
|                 if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) { | ||||
|                     embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), | ||||
|                         embeddedMetadata, false); | ||||
|                 } | ||||
|             } else if (extractMacros && embeddedName.contains("Basic/")) { | ||||
|                 //process all files under Basic/; let maybeHandleMacro figure | ||||
|                 //out if it is a macro or not | ||||
|                 maybeHandleMacro(zip, embeddedName, handler, context); | ||||
|             } | ||||
|  | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Parses one archive entry as a macro script unless {@link #ignoreScriptFile} | ||||
|      * rejects its name. The entry is read as XML and its SAX events are routed through | ||||
|      * an {@code OpenDocumentMacroHandler} that wraps the supplied handler. | ||||
|      * | ||||
|      * @param is           stream positioned at the entry's content; it is not closed here | ||||
|      * @param embeddedName name of the entry inside the archive | ||||
|      * @param handler      handler that the macro handler is built around | ||||
|      * @param context      parse context handed to the XML reader and the macro handler | ||||
|      */ | ||||
|     private void maybeHandleMacro(InputStream is, String embeddedName, ContentHandler handler, | ||||
|                                   ParseContext context) | ||||
|         throws TikaException, IOException, SAXException { | ||||
|         //should probably run XMLRootExtractor on the inputstream | ||||
|         //or read the macro manifest for the names of the macros | ||||
|         //rather than relying on the script file name | ||||
|         if (ignoreScriptFile(embeddedName)) { | ||||
|             return; | ||||
|         } | ||||
|         // NOTE(review): a Metadata object tagged EmbeddedResourceType.MACRO used to be | ||||
|         // built here but was never passed to anything, so it was dead code. Confirm that | ||||
|         // OpenDocumentMacroHandler records the resource type on its own. | ||||
|         ContentHandler macroHandler = new OpenDocumentMacroHandler(handler, context); | ||||
|         // The close shield keeps the XML reader from closing the caller's stream. | ||||
|         XMLReaderUtils.parseSAX(new CloseShieldInputStream(is), | ||||
|             new OfflineContentHandler(new EmbeddedContentHandler(macroHandler)), context); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Runs the manifest through an {@code OpenDocumentManifestHandler}. | ||||
|      * <p> | ||||
|      * If parsing fails with a {@link SAXException} whose cause is an | ||||
|      * {@link EncryptedDocumentException}, that cause is rethrown. Every other | ||||
|      * {@link SAXException} is deliberately ignored, so an unreadable manifest does | ||||
|      * not abort the parse. The stream is shielded and therefore not closed here. | ||||
|      * | ||||
|      * @param stream  content of the manifest entry | ||||
|      * @param context parse context handed to the XML reader | ||||
|      * @throws EncryptedDocumentException if the manifest handler reported encryption | ||||
|      */ | ||||
|     private void checkForEncryption(InputStream stream, ParseContext context) | ||||
|         throws SAXException, TikaException, IOException { | ||||
|         try { | ||||
|             InputStream shielded = new CloseShieldInputStream(stream); | ||||
|             ContentHandler manifestHandler = new OfflineContentHandler( | ||||
|                 new EmbeddedContentHandler(new OpenDocumentManifestHandler())); | ||||
|             XMLReaderUtils.parseSAX(shielded, manifestHandler, context); | ||||
|         } catch (SAXException parseFailure) { | ||||
|             Throwable cause = parseFailure.getCause(); | ||||
|             // instanceof is false for null, so a missing cause falls through as well | ||||
|             if (cause instanceof EncryptedDocumentException) { | ||||
|                 throw (EncryptedDocumentException) cause; | ||||
|             } | ||||
|             //otherwise...swallow | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Tells whether an archive entry must be skipped by macro handling. | ||||
|      * | ||||
|      * @param embeddedName name of the entry inside the archive | ||||
|      * @return {@code true} if the name does not contain {@code Basic/}, or if it | ||||
|      *         contains {@code script-lb.xml} or {@code script-lc.xml}; | ||||
|      *         {@code false} for every other name under {@code Basic/} | ||||
|      */ | ||||
|     private boolean ignoreScriptFile(String embeddedName) { | ||||
|         if (!embeddedName.contains("Basic/")) { | ||||
|             //shouldn't ever get here, but if it isn't under Basic/, ignore it | ||||
|             return true; | ||||
|         } | ||||
|         return embeddedName.contains("script-lb.xml") | ||||
|             || embeddedName.contains("script-lc.xml"); | ||||
|     } | ||||
|  | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -16,13 +16,14 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import java.util.Arrays; | ||||
| import java.util.List; | ||||
|  | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
|  | ||||
| /** | ||||
|  * Base class for SAX handlers that map SAX events into document metadata. | ||||
|  * | ||||
| @@ -39,11 +40,12 @@ class AbstractMetadataHandler extends DefaultHandler { | ||||
|         this.property = null; | ||||
|         this.name = name; | ||||
|     } | ||||
|  | ||||
|     protected AbstractMetadataHandler(Metadata metadata, Property property) { | ||||
|        this.metadata = metadata; | ||||
|        this.property = property; | ||||
|        this.name = property.getName(); | ||||
|    } | ||||
|         this.metadata = metadata; | ||||
|         this.property = property; | ||||
|         this.name = property.getName(); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Adds the given metadata value. The value is ignored if it is | ||||
| @@ -59,9 +61,9 @@ class AbstractMetadataHandler extends DefaultHandler { | ||||
|                 List<String> previous = Arrays.asList(metadata.getValues(name)); | ||||
|                 if (!previous.contains(value)) { | ||||
|                     if (property != null) { | ||||
|                        metadata.add(property, value); | ||||
|                         metadata.add(property, value); | ||||
|                     } else { | ||||
|                        metadata.add(name, value); | ||||
|                         metadata.add(name, value); | ||||
|                     } | ||||
|                 } | ||||
|             } else { | ||||
| @@ -69,23 +71,23 @@ class AbstractMetadataHandler extends DefaultHandler { | ||||
|                 String previous = metadata.get(name); | ||||
|                 if (previous != null && previous.length() > 0) { | ||||
|                     if (!previous.equals(value)) { | ||||
|                        if (property != null) { | ||||
|                           if (property.isMultiValuePermitted()) { | ||||
|                               metadata.add(property, value); | ||||
|                           } else { | ||||
|                               // Replace the existing value if isMultiValuePermitted is false | ||||
|                               metadata.set(property, value); | ||||
|                           } | ||||
|                        } else { | ||||
|                           metadata.add(name, value); | ||||
|                        } | ||||
|                         if (property != null) { | ||||
|                             if (property.isMultiValuePermitted()) { | ||||
|                                 metadata.add(property, value); | ||||
|                             } else { | ||||
|                                 // Replace the existing value if isMultiValuePermitted is false | ||||
|                                 metadata.set(property, value); | ||||
|                             } | ||||
|                         } else { | ||||
|                             metadata.add(name, value); | ||||
|                         } | ||||
|                     } | ||||
|                 } else { | ||||
|                    if (property != null) { | ||||
|                       metadata.set(property, value); | ||||
|                    } else { | ||||
|                       metadata.set(name, value); | ||||
|                    } | ||||
|                     if (property != null) { | ||||
|                         metadata.set(property, value); | ||||
|                     } else { | ||||
|                         metadata.set(name, value); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|   | ||||
| @@ -16,15 +16,16 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
|  | ||||
| /** | ||||
|  * This adds a Metadata entry for a given node. | ||||
|  * The textual content of the node is used as the | ||||
|  *  value, and the Metadata name is taken from | ||||
|  *  an attribute, with a prefix if required.  | ||||
|  * value, and the Metadata name is taken from | ||||
|  * an attribute, with a prefix if required. | ||||
|  */ | ||||
| public class AttributeDependantMetadataHandler extends DefaultHandler { | ||||
|  | ||||
| @@ -32,20 +33,20 @@ public class AttributeDependantMetadataHandler extends DefaultHandler { | ||||
|  | ||||
|     private final String nameHoldingAttribute; | ||||
|     private final String namePrefix; | ||||
|     private final StringBuilder buffer = new StringBuilder(); | ||||
|     private String name; | ||||
|  | ||||
|     private final StringBuilder buffer = new StringBuilder(); | ||||
|  | ||||
|     public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) { | ||||
|     public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, | ||||
|                                              String namePrefix) { | ||||
|         this.metadata = metadata; | ||||
|         this.nameHoldingAttribute = nameHoldingAttribute; | ||||
|         this.namePrefix = namePrefix; | ||||
|     } | ||||
|  | ||||
|     public void addMetadata(String value) { | ||||
|         if(name == null || name.length() == 0) { | ||||
|            // We didn't find the attribute which holds the name | ||||
|            return; | ||||
|         if (name == null || name.length() == 0) { | ||||
|             // We didn't find the attribute which holds the name | ||||
|             return; | ||||
|         } | ||||
|         if (value.length() > 0) { | ||||
|             String previous = metadata.get(name); | ||||
| @@ -61,20 +62,19 @@ public class AttributeDependantMetadataHandler extends DefaultHandler { | ||||
|         buffer.setLength(0); | ||||
|     } | ||||
|  | ||||
|     public void startElement( | ||||
|             String uri, String localName, String name, Attributes attributes) { | ||||
|     public void startElement(String uri, String localName, String name, Attributes attributes) { | ||||
|         String rawName = attributes.getValue(nameHoldingAttribute); | ||||
|         if (rawName != null) { | ||||
|            if (namePrefix == null) { | ||||
|               this.name = rawName; | ||||
|            } else { | ||||
|               this.name = namePrefix + rawName; | ||||
|            } | ||||
|             if (namePrefix == null) { | ||||
|                 this.name = rawName; | ||||
|             } else { | ||||
|                 this.name = namePrefix + rawName; | ||||
|             } | ||||
|         } | ||||
|         // All other attributes are ignored | ||||
|     } | ||||
|  | ||||
|      | ||||
|  | ||||
|     public void characters(char[] ch, int start, int length) { | ||||
|         buffer.append(ch, start, length); | ||||
|     } | ||||
|   | ||||
| @@ -16,11 +16,12 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
|  | ||||
| /** | ||||
|  * SAX event handler that maps the contents of an XML attribute into | ||||
|  * a metadata field. | ||||
| @@ -33,26 +34,25 @@ public class AttributeMetadataHandler extends AbstractMetadataHandler { | ||||
|  | ||||
|     private final String localName; | ||||
|  | ||||
|     public AttributeMetadataHandler( | ||||
|             String uri, String localName, Metadata metadata, String name) { | ||||
|     public AttributeMetadataHandler(String uri, String localName, Metadata metadata, String name) { | ||||
|         super(metadata, name); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
|     } | ||||
|     public AttributeMetadataHandler( | ||||
|           String uri, String localName, Metadata metadata, Property property) { | ||||
|       super(metadata, property); | ||||
|       this.uri = uri; | ||||
|       this.localName = localName; | ||||
|   } | ||||
|  | ||||
|     /** | ||||
|      * Creates a handler that stores the value of one XML attribute under a | ||||
|      * metadata {@link Property}. | ||||
|      * | ||||
|      * @param uri       namespace URI the attribute must have | ||||
|      * @param localName local name the attribute must have | ||||
|      * @param metadata  metadata object passed on to the superclass | ||||
|      * @param property  property passed on to the superclass | ||||
|      */ | ||||
|     public AttributeMetadataHandler(String uri, String localName, Metadata metadata, | ||||
|                                     Property property) { | ||||
|         super(metadata, property); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void startElement( | ||||
|             String uri, String localName, String qName, Attributes attributes) | ||||
|             throws SAXException { | ||||
|     public void startElement(String uri, String localName, String qName, Attributes attributes) | ||||
|         throws SAXException { | ||||
|         for (int i = 0; i < attributes.getLength(); i++) { | ||||
|             if (attributes.getURI(i).equals(this.uri) | ||||
|                     && attributes.getLocalName(i).equals(this.localName)) { | ||||
|             if (attributes.getURI(i).equals(this.uri) && | ||||
|                 attributes.getLocalName(i).equals(this.localName)) { | ||||
|                 addMetadata(attributes.getValue(i).trim()); | ||||
|             } | ||||
|         } | ||||
|   | ||||
| @@ -16,45 +16,45 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.xml.sax.ContentHandler; | ||||
|  | ||||
| import org.apache.tika.metadata.DublinCore; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import org.apache.tika.metadata.TikaCoreProperties; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.sax.TeeContentHandler; | ||||
| import org.xml.sax.ContentHandler; | ||||
|  | ||||
| /** | ||||
|  * Dublin Core metadata parser | ||||
|  */ | ||||
| public class DcXMLParser extends XMLParser { | ||||
|  | ||||
|     /** Serial version UID */ | ||||
|     /** | ||||
|      * Serial version UID | ||||
|      */ | ||||
|     private static final long serialVersionUID = 4905318835463880819L; | ||||
|  | ||||
|     private static ContentHandler getDublinCoreHandler( | ||||
|             Metadata metadata, Property property, String element) { | ||||
|         return new ElementMetadataHandler( | ||||
|                 DublinCore.NAMESPACE_URI_DC, element, | ||||
|                 metadata, property); | ||||
|     private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property, | ||||
|                                                        String element) { | ||||
|         return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property); | ||||
|     } | ||||
|  | ||||
|     protected ContentHandler getContentHandler( | ||||
|             ContentHandler handler, Metadata metadata, ParseContext context) { | ||||
|         return new TeeContentHandler( | ||||
|                 super.getContentHandler(handler, metadata, context), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"), | ||||
|                 getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights")); | ||||
|     protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, | ||||
|                                                ParseContext context) { | ||||
|         return new TeeContentHandler(super.getContentHandler(handler, metadata, context), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.SUBJECT, "subject"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"), | ||||
|             getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights")); | ||||
|     } | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -16,13 +16,14 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import java.util.Arrays; | ||||
|  | ||||
| import org.slf4j.Logger; | ||||
| import org.slf4j.LoggerFactory; | ||||
| import org.xml.sax.Attributes; | ||||
|  | ||||
| import java.util.Arrays; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
|  | ||||
| /** | ||||
|  * SAX event handler that maps the contents of an XML element into | ||||
| @@ -44,21 +45,17 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|     private final Metadata metadata; | ||||
|  | ||||
|     private final String name; | ||||
|     private Property targetProperty; | ||||
|  | ||||
|     private final boolean allowDuplicateValues; | ||||
|     private final boolean allowEmptyValues; | ||||
|  | ||||
|     /** | ||||
|      * The buffer used to capture characters when inside a bag li element. | ||||
|      */ | ||||
|     private final StringBuilder bufferBagged = new StringBuilder(); | ||||
|  | ||||
|     /** | ||||
|      * The buffer used to capture characters inside standard elements. | ||||
|      */ | ||||
|     private final StringBuilder bufferBagless = new StringBuilder(); | ||||
|  | ||||
|     private Property targetProperty; | ||||
|     /** | ||||
|      * Whether or not the value was found in a standard element structure or inside a bag. | ||||
|      */ | ||||
| @@ -70,13 +67,12 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|     /** | ||||
|      * Constructor for string metadata keys. | ||||
|      * | ||||
|      * @param uri the uri of the namespace of the element | ||||
|      * @param uri       the uri of the namespace of the element | ||||
|      * @param localName the local name of the element | ||||
|      * @param metadata the Tika metadata object to populate | ||||
|      * @param name the Tika metadata field key | ||||
|      * @param metadata  the Tika metadata object to populate | ||||
|      * @param name      the Tika metadata field key | ||||
|      */ | ||||
|     public ElementMetadataHandler( | ||||
|             String uri, String localName, Metadata metadata, String name) { | ||||
|     public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name) { | ||||
|         super(metadata, name); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
| @@ -91,15 +87,15 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|      * Constructor for string metadata keys which allows change of behavior | ||||
|      * for duplicate and empty entry values. | ||||
|      * | ||||
|      * @param uri the uri of the namespace of the element | ||||
|      * @param localName the local name of the element | ||||
|      * @param metadata the Tika metadata object to populate | ||||
|      * @param name the Tika metadata field key | ||||
|      * @param uri                  the uri of the namespace of the element | ||||
|      * @param localName            the local name of the element | ||||
|      * @param metadata             the Tika metadata object to populate | ||||
|      * @param name                 the Tika metadata field key | ||||
|      * @param allowDuplicateValues add duplicate values to the Tika metadata | ||||
|      * @param allowEmptyValues add empty values to the Tika metadata | ||||
|      * @param allowEmptyValues     add empty values to the Tika metadata | ||||
|      */ | ||||
|     public ElementMetadataHandler( | ||||
|             String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) { | ||||
|     public ElementMetadataHandler(String uri, String localName, Metadata metadata, String name, | ||||
|                                   boolean allowDuplicateValues, boolean allowEmptyValues) { | ||||
|         super(metadata, name); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
| @@ -113,13 +109,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|     /** | ||||
|      * Constructor for Property metadata keys. | ||||
|      * | ||||
|      * @param uri the uri of the namespace of the element | ||||
|      * @param localName the local name of the element | ||||
|      * @param metadata the Tika metadata object to populate | ||||
|      * @param uri            the uri of the namespace of the element | ||||
|      * @param localName      the local name of the element | ||||
|      * @param metadata       the Tika metadata object to populate | ||||
|      * @param targetProperty the Tika metadata Property key | ||||
|      */ | ||||
|     public ElementMetadataHandler( | ||||
|             String uri, String localName, Metadata metadata, Property targetProperty) { | ||||
|     public ElementMetadataHandler(String uri, String localName, Metadata metadata, | ||||
|                                   Property targetProperty) { | ||||
|         super(metadata, targetProperty); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
| @@ -135,15 +131,16 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|      * Constructor for Property metadata keys which allows change of behavior | ||||
|      * for duplicate and empty entry values. | ||||
|      * | ||||
|      * @param uri the uri of the namespace of the element | ||||
|      * @param localName the local name of the element | ||||
|      * @param metadata the Tika metadata object to populate | ||||
|      * @param targetProperty the Tika metadata Property key | ||||
|      * @param uri                  the uri of the namespace of the element | ||||
|      * @param localName            the local name of the element | ||||
|      * @param metadata             the Tika metadata object to populate | ||||
|      * @param targetProperty       the Tika metadata Property key | ||||
|      * @param allowDuplicateValues add duplicate values to the Tika metadata | ||||
|      * @param allowEmptyValues add empty values to the Tika metadata | ||||
|      * @param allowEmptyValues     add empty values to the Tika metadata | ||||
|      */ | ||||
|     public ElementMetadataHandler( | ||||
|             String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) { | ||||
|     public ElementMetadataHandler(String uri, String localName, Metadata metadata, | ||||
|                                   Property targetProperty, boolean allowDuplicateValues, | ||||
|                                   boolean allowEmptyValues) { | ||||
|         super(metadata, targetProperty); | ||||
|         this.uri = uri; | ||||
|         this.localName = localName; | ||||
| @@ -162,16 +159,13 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|     protected boolean isMatchingElement(String uri, String localName) { | ||||
|         // match if we're inside the parent element or within some bag element | ||||
|         return (uri.equals(this.uri) && localName.equals(this.localName)) || | ||||
|                 (parentMatchLevel > 0 && | ||||
|                         ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) || | ||||
|                         (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI)) | ||||
|                 ) | ||||
|         ); | ||||
|             (parentMatchLevel > 0 && | ||||
|                 ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) || | ||||
|                     (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI)))); | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     public void startElement( | ||||
|             String uri, String localName, String name, Attributes attributes) { | ||||
|     public void startElement(String uri, String localName, String name, Attributes attributes) { | ||||
|         if (isMatchingElement(uri, localName)) { | ||||
|             matchLevel++; | ||||
|         } | ||||
| @@ -230,7 +224,8 @@ public class ElementMetadataHandler extends AbstractMetadataHandler { | ||||
|                     value = ""; | ||||
|                 } | ||||
|                 String[] previous = metadata.getValues(name); | ||||
|                 if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) { | ||||
|                 if (previous == null || !Arrays.asList(previous).contains(value) || | ||||
|                     allowDuplicateValues) { | ||||
|                     metadata.add(targetProperty, value); | ||||
|                 } | ||||
|             } | ||||
|   | ||||
| @@ -16,64 +16,68 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.commons.codec.binary.Base64; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentUtil; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.TikaMetadataKeys; | ||||
| import org.apache.tika.mime.MediaType; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import java.io.ByteArrayInputStream; | ||||
| import java.io.IOException; | ||||
| import java.util.Collections; | ||||
| import java.util.Set; | ||||
|  | ||||
| import org.apache.commons.codec.binary.Base64; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; | ||||
| import org.apache.tika.extractor.EmbeddedDocumentUtil; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.TikaCoreProperties; | ||||
| import org.apache.tika.mime.MediaType; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
|  | ||||
| public class FictionBookParser extends XMLParser { | ||||
|     private static final long serialVersionUID = 4195954546491524374L; | ||||
|  | ||||
|     private static final Set<MediaType> SUPPORTED_TYPES = | ||||
|             Collections.singleton(MediaType.application("x-fictionbook+xml")); | ||||
|         Collections.singleton(MediaType.application("x-fictionbook+xml")); | ||||
|  | ||||
|     /** | ||||
|      * Returns the media types handled by this parser. | ||||
|      * | ||||
|      * @param context the parse context (not consulted here) | ||||
|      * @return the constant {@code SUPPORTED_TYPES} set | ||||
|      */ | ||||
|     @Override | ||||
|     public Set<MediaType> getSupportedTypes(ParseContext context) { | ||||
|         return SUPPORTED_TYPES; | ||||
|     } | ||||
|  | ||||
|     @Override | ||||
|     protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { | ||||
|     protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, | ||||
|                                                ParseContext context) { | ||||
|         return new BinaryElementsDataHandler( | ||||
|                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler); | ||||
|             EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler); | ||||
|     } | ||||
|  | ||||
|     private static class BinaryElementsDataHandler extends DefaultHandler { | ||||
|         private static final String ELEMENT_BINARY = "binary"; | ||||
|  | ||||
|         private boolean binaryMode = false; | ||||
|         private static final String ATTRIBUTE_ID = "id"; | ||||
|  | ||||
|         private static final String ATTRIBUTE_CONTENT_TYPE = "content-type"; | ||||
|         private final EmbeddedDocumentExtractor partExtractor; | ||||
|         private final ContentHandler handler; | ||||
|         private final StringBuilder binaryData = new StringBuilder(); | ||||
|         private boolean binaryMode = false; | ||||
|         private Metadata metadata; | ||||
|         private static final String ATTRIBUTE_CONTENT_TYPE = "content-type"; | ||||
|  | ||||
|         private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) { | ||||
|         private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, | ||||
|                                           ContentHandler handler) { | ||||
|             this.partExtractor = partExtractor; | ||||
|             this.handler = handler; | ||||
|         } | ||||
|  | ||||
|         @Override | ||||
|         public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { | ||||
|         public void startElement(String uri, String localName, String qName, Attributes attributes) | ||||
|             throws SAXException { | ||||
|             binaryMode = ELEMENT_BINARY.equals(localName); | ||||
|             if (binaryMode) { | ||||
|                 binaryData.setLength(0); | ||||
|                 metadata = new Metadata(); | ||||
|  | ||||
|                 metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID)); | ||||
|                 metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, | ||||
|                     attributes.getValue(ATTRIBUTE_ID)); | ||||
|                 metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE)); | ||||
|             } | ||||
|         } | ||||
| @@ -83,11 +87,8 @@ public class FictionBookParser extends XMLParser { | ||||
|             if (binaryMode) { | ||||
|                 try { | ||||
|                     partExtractor.parseEmbedded( | ||||
|                             new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())), | ||||
|                             handler, | ||||
|                             metadata, | ||||
|                             true | ||||
|                     ); | ||||
|                         new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())), | ||||
|                         handler, metadata, true); | ||||
|                 } catch (IOException e) { | ||||
|                     throw new SAXException("IOException in parseEmbedded", e); | ||||
|                 } | ||||
|   | ||||
| @@ -16,19 +16,20 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
| import org.xml.sax.Attributes; | ||||
| import org.xml.sax.helpers.DefaultHandler; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.metadata.Property; | ||||
|  | ||||
| /** | ||||
|  * This adds Metadata entries with a specified name for | ||||
|  *  the textual content of a node (if present), and  | ||||
|  *  all attribute values passed through the matcher | ||||
|  *  (but not their names).  | ||||
|  * the textual content of a node (if present), and | ||||
|  * all attribute values passed through the matcher | ||||
|  * (but not their names). | ||||
|  * | ||||
|  * @deprecated Use the {@link AttributeMetadataHandler} and | ||||
|  *             {@link ElementMetadataHandler} classes instead | ||||
|  * {@link ElementMetadataHandler} classes instead | ||||
|  */ | ||||
| public class MetadataHandler extends DefaultHandler { | ||||
|  | ||||
| @@ -44,11 +45,12 @@ public class MetadataHandler extends DefaultHandler { | ||||
|         this.property = null; | ||||
|         this.name = name; | ||||
|     } | ||||
|  | ||||
|     public MetadataHandler(Metadata metadata, Property property) { | ||||
|        this.metadata = metadata; | ||||
|        this.property = property; | ||||
|        this.name = property.getName(); | ||||
|    } | ||||
|         this.metadata = metadata; | ||||
|         this.property = property; | ||||
|         this.name = property.getName(); | ||||
|     } | ||||
|  | ||||
|     public void addMetadata(String value) { | ||||
|         if (value.length() > 0) { | ||||
| @@ -56,11 +58,11 @@ public class MetadataHandler extends DefaultHandler { | ||||
|             if (previous != null && previous.length() > 0) { | ||||
|                 value = previous + ", " + value; | ||||
|             } | ||||
|              | ||||
|  | ||||
|             if (this.property != null) { | ||||
|                metadata.set(property, value); | ||||
|                 metadata.set(property, value); | ||||
|             } else { | ||||
|                metadata.set(name, value); | ||||
|                 metadata.set(name, value); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -70,14 +72,13 @@ public class MetadataHandler extends DefaultHandler { | ||||
|         buffer.setLength(0); | ||||
|     } | ||||
|  | ||||
|     public void startElement( | ||||
|             String uri, String localName, String name, Attributes attributes) { | ||||
|     public void startElement(String uri, String localName, String name, Attributes attributes) { | ||||
|         for (int i = 0; i < attributes.getLength(); i++) { | ||||
|             addMetadata(attributes.getValue(i)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|      | ||||
|  | ||||
|     /** | ||||
|      * Appends the reported run of characters to this handler's {@code buffer}. | ||||
|      * | ||||
|      * @param ch     array containing the characters | ||||
|      * @param start  offset of the first character to append | ||||
|      * @param length number of characters to append | ||||
|      */ | ||||
|     public void characters(char[] ch, int start, int length) { | ||||
|         buffer.append(ch, start, length); | ||||
|     } | ||||
|   | ||||
| @@ -0,0 +1,34 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import org.xml.sax.ContentHandler; | ||||
|  | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.parser.ParseContext; | ||||
| import org.apache.tika.sax.TextAndAttributeContentHandler; | ||||
|  | ||||
| /** | ||||
|  * {@link XMLParser} variant that routes SAX events through a | ||||
|  * {@link TextAndAttributeContentHandler} instead of the handler chosen by the parent class. | ||||
|  */ | ||||
| public class TextAndAttributeXMLParser extends XMLParser { | ||||
|  | ||||
|     private static final long serialVersionUID = 7796914007312429473L; | ||||
|  | ||||
|     /** | ||||
|      * Wraps the supplied handler in a {@link TextAndAttributeContentHandler}. | ||||
|      * | ||||
|      * @param handler  the downstream handler to wrap | ||||
|      * @param metadata unused by this implementation | ||||
|      * @param context  unused by this implementation | ||||
|      * @return the wrapping content handler | ||||
|      */ | ||||
|     @Override | ||||
|     protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, | ||||
|                                                ParseContext context) { | ||||
|         final ContentHandler wrapped = new TextAndAttributeContentHandler(handler, true); | ||||
|         return wrapped; | ||||
|     } | ||||
| } | ||||
| @@ -16,7 +16,17 @@ | ||||
|  */ | ||||
| package org.apache.tika.parser.xml; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.Arrays; | ||||
| import java.util.Collections; | ||||
| import java.util.HashSet; | ||||
| import java.util.Set; | ||||
|  | ||||
| import org.apache.commons.io.input.CloseShieldInputStream; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import org.apache.tika.exception.TikaException; | ||||
| import org.apache.tika.metadata.Metadata; | ||||
| import org.apache.tika.mime.MediaType; | ||||
| @@ -28,52 +38,41 @@ import org.apache.tika.sax.TaggedContentHandler; | ||||
| import org.apache.tika.sax.TextContentHandler; | ||||
| import org.apache.tika.sax.XHTMLContentHandler; | ||||
| import org.apache.tika.utils.XMLReaderUtils; | ||||
| import org.xml.sax.ContentHandler; | ||||
| import org.xml.sax.SAXException; | ||||
|  | ||||
| import java.io.IOException; | ||||
| import java.io.InputStream; | ||||
| import java.util.Arrays; | ||||
| import java.util.Collections; | ||||
| import java.util.HashSet; | ||||
| import java.util.Set; | ||||
|  | ||||
| /** | ||||
|  * XML parser. | ||||
|  */ | ||||
| public class XMLParser extends AbstractParser { | ||||
|  | ||||
|     /** Serial version UID */ | ||||
|     /** | ||||
|      * Serial version UID | ||||
|      */ | ||||
|     private static final long serialVersionUID = -6028836725280212837L; | ||||
|  | ||||
|     private static final Set<MediaType> SUPPORTED_TYPES = | ||||
|         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( | ||||
|                 MediaType.application("xml"), | ||||
|                 MediaType.image("svg+xml")))); | ||||
|     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( | ||||
|         new HashSet<>( | ||||
|             Arrays.asList(MediaType.application("xml"), MediaType.image("svg+xml")))); | ||||
|  | ||||
|     /** | ||||
|      * Returns the media types handled by this parser. | ||||
|      * | ||||
|      * @param context the parse context (not consulted here) | ||||
|      * @return the unmodifiable {@code SUPPORTED_TYPES} set | ||||
|      */ | ||||
|     @Override | ||||
|     public Set<MediaType> getSupportedTypes(ParseContext context) { | ||||
|         return SUPPORTED_TYPES; | ||||
|     } | ||||
|  | ||||
|     public void parse( | ||||
|             InputStream stream, ContentHandler handler, | ||||
|             Metadata metadata, ParseContext context) | ||||
|             throws IOException, SAXException, TikaException { | ||||
|     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, | ||||
|                       ParseContext context) throws IOException, SAXException, TikaException { | ||||
|         if (metadata.get(Metadata.CONTENT_TYPE) == null) { | ||||
|             metadata.set(Metadata.CONTENT_TYPE, "application/xml"); | ||||
|         } | ||||
|  | ||||
|         final XHTMLContentHandler xhtml = | ||||
|             new XHTMLContentHandler(handler, metadata); | ||||
|         final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); | ||||
|         xhtml.startDocument(); | ||||
|         xhtml.startElement("p"); | ||||
|  | ||||
|         TaggedContentHandler tagged = new TaggedContentHandler(handler); | ||||
|         try { | ||||
|             XMLReaderUtils.parseSAX( | ||||
|                     new CloseShieldInputStream(stream), | ||||
|                     new OfflineContentHandler(new EmbeddedContentHandler( | ||||
|                             getContentHandler(tagged, metadata, context))), context); | ||||
|             XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler( | ||||
|                     new EmbeddedContentHandler( | ||||
|                         getContentHandler(tagged, metadata, context))), | ||||
|                 context); | ||||
|         } catch (SAXException e) { | ||||
|             tagged.throwIfCauseOf(e); | ||||
|             throw new TikaException("XML parse error", e); | ||||
| @@ -83,8 +82,8 @@ public class XMLParser extends AbstractParser { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     protected ContentHandler getContentHandler( | ||||
|             ContentHandler handler, Metadata metadata, ParseContext context) { | ||||
|     protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, | ||||
|                                                ParseContext context) { | ||||
|         return new TextContentHandler(handler, true); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -0,0 +1,206 @@ | ||||
| /* | ||||
|  * Licensed to the Apache Software Foundation (ASF) under one or more | ||||
|  * contributor license agreements.  See the NOTICE file distributed with | ||||
|  * this work for additional information regarding copyright ownership. | ||||
|  * The ASF licenses this file to You under the Apache License, Version 2.0 | ||||
|  * (the "License"); you may not use this file except in compliance with | ||||
|  * the License.  You may obtain a copy of the License at | ||||
|  * | ||||
|  *     http://www.apache.org/licenses/LICENSE-2.0 | ||||
|  * | ||||
|  * Unless required by applicable law or agreed to in writing, software | ||||
|  * distributed under the License is distributed on an "AS IS" BASIS, | ||||
|  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
|  * See the License for the specific language governing permissions and | ||||
|  * limitations under the License. | ||||
|  */ | ||||
| package org.apache.tika.utils; | ||||
|  | ||||
| public class StringUtils { | ||||
|  | ||||
|     /** | ||||
|      * The empty String {@code ""}. | ||||
|      * | ||||
|      * @since 2.0 | ||||
|      */ | ||||
|     public static final String EMPTY = ""; | ||||
|  | ||||
|     /** | ||||
|      * A String for a space character. | ||||
|      * | ||||
|      * @since 3.2 | ||||
|      */ | ||||
|     public static final String SPACE = " "; | ||||
|  | ||||
|     /** | ||||
|      * Largest pad count for which the {@code leftPad} methods use the single-character | ||||
|      * {@code repeat} path; larger pads are built by the String-based overload. | ||||
|      */ | ||||
|     static int PAD_LIMIT = 10000; | ||||
|  | ||||
|     /** | ||||
|      * Checks whether a character sequence is {@code null} or has zero length. | ||||
|      * | ||||
|      * @param cs the sequence to check, may be {@code null} | ||||
|      * @return {@code true} if {@code cs} is {@code null} or empty | ||||
|      */ | ||||
|     public static boolean isEmpty(final CharSequence cs) { | ||||
|         if (cs == null) { | ||||
|             return true; | ||||
|         } | ||||
|         return cs.length() == 0; | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * Checks whether a String is {@code null} or empty once trimmed. | ||||
|      * | ||||
|      * @param s the String to check, may be {@code null} | ||||
|      * @return {@code true} if {@code s} is {@code null} or {@code s.trim()} is empty | ||||
|      */ | ||||
|     public static boolean isBlank(final String s) { | ||||
|         if (s == null) { | ||||
|             return true; | ||||
|         } | ||||
|         final String trimmed = s.trim(); | ||||
|         return trimmed.isEmpty(); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * <p>Left pad a String with a specified String.</p> | ||||
|      * | ||||
|      * <p>Pad to a size of {@code size}. A {@code size} that does not exceed the length of | ||||
|      * {@code str} (including any negative value) returns {@code str} unchanged.</p> | ||||
|      * | ||||
|      * <pre> | ||||
|      * StringUtils.leftPad(null, *, *)      = null | ||||
|      * StringUtils.leftPad("", 3, "z")      = "zzz" | ||||
|      * StringUtils.leftPad("bat", 3, "yz")  = "bat" | ||||
|      * StringUtils.leftPad("bat", 5, "yz")  = "yzbat" | ||||
|      * StringUtils.leftPad("bat", 8, "yz")  = "yzyzybat" | ||||
|      * StringUtils.leftPad("bat", 1, "yz")  = "bat" | ||||
|      * StringUtils.leftPad("bat", -1, "yz") = "bat" | ||||
|      * StringUtils.leftPad("bat", 5, null)  = "  bat" | ||||
|      * StringUtils.leftPad("bat", 5, "")    = "  bat" | ||||
|      * </pre> | ||||
|      * | ||||
|      * @param str    the String to pad out, may be null | ||||
|      * @param size   the size to pad to | ||||
|      * @param padStr the String to pad with, null or empty treated as single space | ||||
|      * @return left padded String or original String if no padding is necessary, | ||||
|      * {@code null} if null String input | ||||
|      */ | ||||
|     public static String leftPad(final String str, final int size, String padStr) { | ||||
|         if (str == null) { | ||||
|             return null; | ||||
|         } | ||||
|         final int strLen = str.length(); | ||||
|         // Compare before subtracting: size - strLen wraps around to a large positive | ||||
|         // number for very negative sizes, which would request a gigantic padding array. | ||||
|         if (size <= strLen) { | ||||
|             return str; // returns original String when possible | ||||
|         } | ||||
|         if (isEmpty(padStr)) { | ||||
|             padStr = SPACE; | ||||
|         } | ||||
|         final int padLen = padStr.length(); | ||||
|         final int pads = size - strLen; // safe: 0 <= strLen < size | ||||
|         if (padLen == 1 && pads <= PAD_LIMIT) { | ||||
|             // short single-character pads are delegated to the char overload | ||||
|             return leftPad(str, size, padStr.charAt(0)); | ||||
|         } | ||||
|  | ||||
|         if (pads == padLen) { | ||||
|             return padStr.concat(str); | ||||
|         } else if (pads < padLen) { | ||||
|             return padStr.substring(0, pads).concat(str); | ||||
|         } else { | ||||
|             // repeat padStr cyclically until exactly 'pads' characters are produced | ||||
|             final char[] padding = new char[pads]; | ||||
|             final char[] padChars = padStr.toCharArray(); | ||||
|             for (int i = 0; i < pads; i++) { | ||||
|                 padding[i] = padChars[i % padLen]; | ||||
|             } | ||||
|             return new String(padding).concat(str); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|  | ||||
|     public static String leftPad(final String str, final int size, final char padChar) { | ||||
|         if (str == null) { | ||||
|             return null; | ||||
|         } | ||||
|         final int pads = size - str.length(); | ||||
|         if (pads <= 0) { | ||||
|             return str; // returns original String when possible | ||||
|         } | ||||
|         if (pads > PAD_LIMIT) { | ||||
|             return leftPad(str, size, String.valueOf(padChar)); | ||||
|         } | ||||
|         return repeat(padChar, pads).concat(str); | ||||
|     } | ||||
|  | ||||
|     /** | ||||
|      * <p>Returns padding using the specified delimiter repeated | ||||
|      * to a given length.</p> | ||||
|      * | ||||
|      * <pre> | ||||
|      * StringUtils.repeat('e', 0)  = "" | ||||
|      * StringUtils.repeat('e', 3)  = "eee" | ||||
|      * StringUtils.repeat('e', -2) = "" | ||||
|      * </pre> | ||||
|      * | ||||
|      * <p>Note: this method does not support padding with | ||||
|      * <a href="http://www.unicode.org/glossary/#supplementary_character">Unicode Supplementary Characters</a> | ||||
|      * as they require a pair of {@code char}s to be represented. | ||||
|      * If you are needing to support full I18N of your applications | ||||
|      * consider using {@link #repeat(String, int)} instead. | ||||
|      * </p> | ||||
|      * | ||||
|      * @param ch     character to repeat | ||||
|      * @param repeat number of times to repeat char, negative treated as zero | ||||
|      * @return String with repeated character | ||||
|      * @see #repeat(String, int) | ||||
|      */ | ||||
|     public static String repeat(final char ch, final int repeat) { | ||||
|         if (repeat <= 0) { | ||||
|             return EMPTY; | ||||
|         } | ||||
|         final char[] buf = new char[repeat]; | ||||
|         for (int i = repeat - 1; i >= 0; i--) { | ||||
|             buf[i] = ch; | ||||
|         } | ||||
|         return new String(buf); | ||||
|     } | ||||
|  | ||||
|     // Padding | ||||
|     //----------------------------------------------------------------------- | ||||
|  | ||||
|     /** | ||||
|      * <p>Repeat a String {@code repeat} times to form a | ||||
|      * new String.</p> | ||||
|      * | ||||
|      * <pre> | ||||
|      * StringUtils.repeat(null, 2) = null | ||||
|      * StringUtils.repeat("", 0)   = "" | ||||
|      * StringUtils.repeat("", 2)   = "" | ||||
|      * StringUtils.repeat("a", 3)  = "aaa" | ||||
|      * StringUtils.repeat("ab", 2) = "abab" | ||||
|      * StringUtils.repeat("a", -2) = "" | ||||
|      * </pre> | ||||
|      * | ||||
|      * @param str    the String to repeat, may be null | ||||
|      * @param repeat number of times to repeat str, negative treated as zero | ||||
|      * @return a new String consisting of the original String repeated, | ||||
|      * {@code null} if null String input | ||||
|      */ | ||||
|     public static String repeat(final String str, final int repeat) { | ||||
|         // Performance tuned for 2.0 (JDK1.4) | ||||
|  | ||||
|         if (str == null) { | ||||
|             return null; | ||||
|         } | ||||
|         if (repeat <= 0) { | ||||
|             return EMPTY; | ||||
|         } | ||||
|         final int inputLength = str.length(); | ||||
|         if (repeat == 1 || inputLength == 0) { | ||||
|             return str; | ||||
|         } | ||||
|         if (inputLength == 1 && repeat <= PAD_LIMIT) { | ||||
|             return repeat(str.charAt(0), repeat); | ||||
|         } | ||||
|  | ||||
|         final int outputLength = inputLength * repeat; | ||||
|         switch (inputLength) { | ||||
|             case 1: | ||||
|                 return repeat(str.charAt(0), repeat); | ||||
|             case 2: | ||||
|                 final char ch0 = str.charAt(0); | ||||
|                 final char ch1 = str.charAt(1); | ||||
|                 final char[] output2 = new char[outputLength]; | ||||
|                 for (int i = repeat * 2 - 2; i >= 0; i--, i--) { | ||||
|                     output2[i] = ch0; | ||||
|                     output2[i + 1] = ch1; | ||||
|                 } | ||||
|                 return new String(output2); | ||||
|             default: | ||||
|                 final StringBuilder buf = new StringBuilder(outputLength); | ||||
|                 for (int i = 0; i < repeat; i++) { | ||||
|                     buf.append(str); | ||||
|                 } | ||||
|                 return buf.toString(); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -16,7 +16,7 @@ import munit._ | ||||
| class OdfExtractTest extends FunSuite { | ||||
|  | ||||
|   val files = List( | ||||
|     ExampleFiles.examples_sample_odt -> 6372, | ||||
|     ExampleFiles.examples_sample_odt -> 6367, | ||||
|     ExampleFiles.examples_sample_ods -> 717 | ||||
|   ) | ||||
|  | ||||
|   | ||||
| @@ -20,7 +20,7 @@ import fs2.Stream | ||||
| import docspell.common._ | ||||
|  | ||||
| import org.apache.tika.config.TikaConfig | ||||
| import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} | ||||
| import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaCoreProperties} | ||||
| import org.apache.tika.mime.MediaType | ||||
| import org.apache.tika.parser.txt.Icu4jEncodingDetector | ||||
|  | ||||
| @@ -40,7 +40,7 @@ object TikaMimetype { | ||||
|  | ||||
|   private def makeMetadata(hint: MimeTypeHint): Metadata = { | ||||
|     val md = new Metadata | ||||
|     hint.filename.foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _)) | ||||
|     hint.filename.foreach(md.set(TikaCoreProperties.RESOURCE_NAME_KEY, _)) | ||||
|     hint.advertised.foreach(md.set(HttpHeaders.CONTENT_TYPE, _)) | ||||
|     md | ||||
|   } | ||||
|   | ||||
| @@ -38,7 +38,7 @@ object Dependencies { | ||||
|   val ScalaJavaTimeVersion    = "2.3.0" | ||||
|   val Slf4jVersion            = "1.7.31" | ||||
|   val StanfordNlpVersion      = "4.2.2" | ||||
|   val TikaVersion             = "1.27" | ||||
|   val TikaVersion             = "2.0.0" | ||||
|   val YamuscaVersion          = "0.8.1" | ||||
|   val SwaggerUIVersion        = "3.51.1" | ||||
|   val TwelveMonkeysVersion    = "3.7.0" | ||||
|   | ||||
		Reference in New Issue
	
	Block a user