Adding extraction primitives

Eike Kettner
2020-02-16 21:37:26 +01:00
parent 851ee7ef0f
commit 8143a4edcc
46 changed files with 2731 additions and 83 deletions

View File

@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
/**
* Content handler decorator that:<ul>
 * <li>Maps old OpenOffice 1.0 namespaces to the OpenDocument ones</li>
 * <li>Returns a fake DTD when the parser requests the OpenOffice DTD</li>
* </ul>
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
"http://openoffice.org/2000/";
private static final String NEW_NS =
"urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
"-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
}
private String mapOldNS(String ns) {
if (ns != null && ns.startsWith(OLD_NS)) {
return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
} else {
return ns;
}
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes atts) throws SAXException {
AttributesImpl natts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
natts.addAttribute(
mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i), atts.getType(i), atts.getValue(i));
}
// forward the namespace-mapped attributes (natts) built above, not the originals
super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
}
@Override
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
super.endElement(mapOldNS(namespaceURI), localName, qName);
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
super.startPrefixMapping(prefix, mapOldNS(uri));
}
/**
 * Do not load any DTDs (these may be requested by the parser). Fake the DTD by
 * returning an empty string as the InputSource.
*/
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
return super.resolveEntity(publicId, systemId);
}
}
}
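
For illustration only, not part of this commit: a minimal sketch of wrapping a downstream SAX handler with the decorator above. NsMappingSketch and the literal namespace URI are assumptions for the example; the old OpenOffice URI is rewritten to urn:oasis:names:tc:opendocument:xmlns:text:1.0 before the wrapped handler sees it.

import org.apache.tika.parser.odf.NSNormalizerContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

public class NsMappingSketch {
    public static void main(String[] args) throws Exception {
        // any SAX ContentHandler can be wrapped; an empty DefaultHandler stands in here
        ContentHandler downstream = new DefaultHandler();
        NSNormalizerContentHandler normalizer = new NSNormalizerContentHandler(downstream);
        // "http://openoffice.org/2000/text" arrives downstream as the OpenDocument text namespace
        normalizer.startPrefixMapping("text", "http://openoffice.org/2000/text");
        normalizer.endPrefixMapping("text");
    }
}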

View File

@@ -0,0 +1,606 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
private interface Style {
}
private static class TextStyle implements Style {
public boolean italic;
public boolean bold;
public boolean underlined;
@Override
public String toString() {
return "TextStyle{" +
"italic=" + italic +
", bold=" + bold +
", underlined=" + underlined +
'}';
}
}
private static class ListStyle implements Style {
public boolean ordered;
public String getTag() {
return ordered ? "ol" : "ul";
}
}
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
private static final char[] SPACE = new char[]{ ' '};
private static final String CLASS = "class";
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
private static Attributes buildAttributes(String key, String value) {
AttributesImpl attrs = new AttributesImpl();
attrs.addAttribute("", key, key, "CDATA", value);
return attrs;
}
private final ContentHandler handler;
private final BitSet textNodeStack = new BitSet();
private int nodeDepth = 0;
private int completelyFiltered = 0;
private Stack<String> headingStack = new Stack<String>();
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
private String currParagraphStyleName; //paragraph style name
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName;
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
private ListStyle listStyle;
// True if we are currently in the named style:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
//have we written the start style tags
//yet for the current text style
boolean hasWrittenStartStyleTags = false;
private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
//we need to track our pDepth and only output <p> if we're at the main level
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
super(handler, mappings);
this.handler = handler;
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
// only forward content of tags from text:-namespace
if (completelyFiltered == 0 && nodeDepth > 0
&& textNodeStack.get(nodeDepth - 1)) {
if (!hasWrittenStartStyleTags) {
updateStyleTags();
hasWrittenStartStyleTags = true;
}
super.characters(ch, start, length);
}
}
// helper for checking tags which need complete filtering
// (with sub-tags)
private boolean needsCompleteFiltering(
String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI)) {
return localName.endsWith("-template")
|| localName.endsWith("-style");
}
return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
}
// map the heading level to <hX> HTML tags
private String getXHTMLHeaderTagName(Attributes atts) {
String depthStr = atts.getValue(TEXT_NS, "outline-level");
if (depthStr == null) {
return "h1";
}
int depth = Integer.parseInt(depthStr);
if (depth >= 6) {
return "h6";
} else if (depth <= 1) {
return "h1";
} else {
return "h" + depth;
}
}
/**
* Check if a node is a text node
*/
private boolean isTextNode(String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
return true;
}
if (SVG_NS.equals(namespaceURI)) {
return "title".equals(localName) ||
"desc".equals(localName);
}
return false;
}
private void startList(String name) throws SAXException {
String elementName = "ul";
if (name != null) {
ListStyle style = listStyleMap.get(name);
elementName = style != null ? style.getTag() : "ul";
listStyleStack.push(style);
}
handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
private void endList() throws SAXException {
String elementName = "ul";
if (!listStyleStack.isEmpty()) {
ListStyle style = listStyleStack.pop();
elementName = style != null ? style.getTag() : "ul";
}
handler.endElement(XHTML, elementName, elementName);
}
private void startSpan(String name) throws SAXException {
if (name == null) {
return;
}
currTextStyle = textStyleMap.get(name);
hasWrittenStartStyleTags = false;
}
private void startParagraph(String styleName) throws SAXException {
if (pDepth == 0) {
handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
if (styleName != null) {
currTextStyle = paragraphTextStyleMap.get(styleName);
}
hasWrittenStartStyleTags = false;
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth++;
}
private void endParagraph() throws SAXException {
closeStyleTags();
if (pDepth == 1) {
handler.endElement(XHTML, "p", "p");
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth--;
}
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close u and i tags first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close u tag first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
private void endSpan() throws SAXException {
updateStyleTags();
}
private void closeStyleTags() throws SAXException {
// Close any still open style tags
if (curUnderlined) {
handler.endElement(XHTML,"u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML,"i", "i");
curItalic = false;
}
if (curBold) {
handler.endElement(XHTML,"b", "b");
curBold = false;
}
currTextStyle = null;
hasWrittenStartStyleTags = false;
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
// Keep track of the current node type. If it is a text node,
// a bit at the current depth is set in textNodeStack.
// characters() checks the top bit to determine whether the
// current node is a text node whose content should be printed.
// nodeDepth contains the depth of the current node and also
// marks the top of the stack.
assert nodeDepth >= 0;
// Set styles
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
String family = attrs.getValue(STYLE_NS, "family");
if ("text".equals(family)) {
currTextStyle = new TextStyle();
currTextStyleName = attrs.getValue(STYLE_NS, "name");
} else if ("paragraph".equals(family)) {
currTextStyle = new TextStyle();
currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = new ListStyle();
String name = attrs.getValue(STYLE_NS, "name");
listStyleMap.put(name, listStyle);
} else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
&& "text-properties".equals(localName)) {
String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
currTextStyle.italic = true;
}
String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
|| (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
&& Integer.valueOf(fontWeight) > 500)) {
currTextStyle.bold = true;
}
String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
if (underlineStyle != null && !underlineStyle.equals("none")) {
currTextStyle.underlined = true;
}
} else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
if ("list-level-style-bullet".equals(localName)) {
listStyle.ordered = false;
} else if ("list-level-style-number".equals(localName)) {
listStyle.ordered = true;
}
}
textNodeStack.set(nodeDepth++,
isTextNode(namespaceURI, localName));
// filter *all* content of some tags
assert completelyFiltered >= 0;
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered++;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h elements, which are passed directly
// to the downstream handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
startList(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
startSpan(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
startParagraph(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
handler.characters(SPACE, 0, 1);
} else if ("annotation".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
} else if ("note".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
} else if ("notes".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
}
}
@Override
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
if (currTextStyle != null && currTextStyleName != null) {
textStyleMap.put(currTextStyleName, currTextStyle);
currTextStyleName = null;
currTextStyle = null;
} else if (currTextStyle != null && currParagraphStyleName != null) {
paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
currParagraphStyleName = null;
currTextStyle = null;
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = null;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h elements, which are passed directly
// to the downstream handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
handler.endElement(XHTMLContentHandler.XHTML, el, el);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
endList();
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
currTextStyle = null;
hasWrittenStartStyleTags = false;
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
endParagraph();
} else if ("annotation".equals(localName) || "note".equals(localName) ||
"notes".equals(localName)) {
closeStyleTags();
handler.endElement("", localName, localName);
} else {
super.endElement(namespaceURI, localName, qName);
}
// special handling of tabulators
if (TEXT_NS.equals(namespaceURI)
&& ("tab-stop".equals(localName)
|| "tab".equals(localName))) {
this.characters(TAB, 0, TAB.length);
}
}
// revert filter for *all* content of some tags
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered--;
}
assert completelyFiltered >= 0;
// reduce current node depth
nodeDepth--;
assert nodeDepth >= 0;
}
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
}
public static final String TEXT_NS =
"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
protected static final char[] TAB = new char[]{'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
* Mappings between ODF tag names and XHTML tag names
 * (including attributes). All other tag names/attributes are ignored
 * and left out of the event stream.
*/
private static final HashMap<QName, TargetElement> MAPPINGS =
new HashMap<QName, TargetElement>();
static {
// general mappings of text:-tags
MAPPINGS.put(
new QName(TEXT_NS, "p"),
new TargetElement(XHTML, "p"));
// text:h-tags are mapped specifically in startElement/endElement
MAPPINGS.put(
new QName(TEXT_NS, "line-break"),
new TargetElement(XHTML, "br"));
MAPPINGS.put(
new QName(TEXT_NS, "list-item"),
new TargetElement(XHTML, "li"));
MAPPINGS.put(
new QName(TEXT_NS, "note"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(OFFICE_NS, "annotation"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(PRESENTATION_NS, "notes"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(DRAW_NS, "object"),
new TargetElement(XHTML, "object"));
MAPPINGS.put(
new QName(DRAW_NS, "text-box"),
new TargetElement(XHTML, "div"));
MAPPINGS.put(
new QName(SVG_NS, "title"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(SVG_NS, "desc"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(TEXT_NS, "span"),
new TargetElement(XHTML, "span"));
final HashMap<QName, QName> aAttsMapping =
new HashMap<QName, QName>();
aAttsMapping.put(
new QName(XLINK_NS, "href"),
new QName("href"));
aAttsMapping.put(
new QName(XLINK_NS, "title"),
new QName("title"));
MAPPINGS.put(
new QName(TEXT_NS, "a"),
new TargetElement(XHTML, "a", aAttsMapping));
// create HTML tables from table:-tags
MAPPINGS.put(
new QName(TABLE_NS, "table"),
new TargetElement(XHTML, "table"));
// repeating of rows is ignored; for columns, see below!
MAPPINGS.put(
new QName(TABLE_NS, "table-row"),
new TargetElement(XHTML, "tr"));
// special mapping for rowspan/colspan attributes
final HashMap<QName, QName> tableCellAttsMapping =
new HashMap<QName, QName>();
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-spanned"),
new QName("colspan"));
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-rows-spanned"),
new QName("rowspan"));
/* TODO: The following is not correct, the cell should be repeated not spanned!
* Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
* Problems may occur when both spanning and repeating is given, which is not allowed by spec.
* Cell spanning instead of repeating is not a problem, because OpenOffice uses it
* only for empty cells.
*/
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-repeated"),
new QName("colspan"));
MAPPINGS.put(
new QName(TABLE_NS, "table-cell"),
new TargetElement(XHTML, "td", tableCellAttsMapping));
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
new XHTMLContentHandler(handler, metadata),
metadata, context);
}
void parseInternal(
InputStream stream, final ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
new NSNormalizerContentHandler(dh)),
context);
}
}
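
A hedged usage sketch, not part of this commit: feeding a standalone content.xml through OpenDocumentContentParser and printing the XHTML it emits, so the p/b/i/u and list mappings above can be inspected. The class name OdfContentSketch and the file path are assumptions.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentContentParser;
import org.apache.tika.sax.ToXMLContentHandler;

public class OdfContentSketch {
    public static void main(String[] args) throws Exception {
        ToXMLContentHandler xhtml = new ToXMLContentHandler();
        try (InputStream in = Files.newInputStream(Paths.get("content.xml"))) {
            // parse() wraps the handler in an XHTMLContentHandler before delegating to parseInternal()
            new OpenDocumentContentParser().parse(in, xhtml, new Metadata(), new ParseContext());
        }
        System.out.println(xhtml.toString());
    }
}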

View File

@@ -0,0 +1,199 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.apache.tika.parser.xml.AttributeMetadataHandler;
import org.apache.tika.parser.xml.ElementMetadataHandler;
import org.apache.tika.parser.xml.MetadataHandler;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
*/
public class OpenDocumentMetaParser extends XMLParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -8739250869531737584L;
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
/**
* @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
Property.composite(Office.INITIAL_AUTHOR,
new Property[]{Property.externalText("initial-creator")});
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getUserDefined(
ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, Property property, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
// Process the Dublin Core Attributes
ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
ch = getUserDefined(ch, md);
// Process the OO Statistics Attributes
ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
ch = getStatistic(ch, md, "nbPara", "paragraph-count");
ch = getStatistic(ch, md, "nbWord", "word-count");
ch = getStatistic(ch, md, "nbCharacter", "character-count");
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
@Override
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
}
}
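
A hedged sketch, not part of this commit: running a meta.xml stream through OpenDocumentMetaParser and reading back two of the properties it maps. OdfMetaSketch and the file path are assumptions.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentMetaParser;
import org.xml.sax.helpers.DefaultHandler;

public class OdfMetaSketch {
    public static void main(String[] args) throws Exception {
        Metadata metadata = new Metadata();
        try (InputStream in = Files.newInputStream(Paths.get("meta.xml"))) {
            // meta.xml carries no body text, so a throwaway SAX handler is enough
            new OpenDocumentMetaParser().parse(in, new DefaultHandler(), metadata, new ParseContext());
        }
        System.out.println(metadata.get(TikaCoreProperties.TITLE));
        System.out.println(metadata.get(TikaCoreProperties.CREATOR));
    }
}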

View File

@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* OpenOffice parser
*/
public class OpenDocumentParser extends AbstractParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
MediaType.application("vnd.oasis.opendocument.chart"),
MediaType.application("vnd.oasis.opendocument.image"),
MediaType.application("vnd.oasis.opendocument.formula"),
MediaType.application("vnd.oasis.opendocument.text-master"),
MediaType.application("vnd.oasis.opendocument.text-web"),
MediaType.application("vnd.oasis.opendocument.text-template"),
MediaType.application("vnd.oasis.opendocument.graphics-template"),
MediaType.application("vnd.oasis.opendocument.presentation-template"),
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("vnd.oasis.opendocument.chart-template"),
MediaType.application("vnd.oasis.opendocument.image-template"),
MediaType.application("vnd.oasis.opendocument.formula-template"),
MediaType.application("x-vnd.oasis.opendocument.text"),
MediaType.application("x-vnd.oasis.opendocument.graphics"),
MediaType.application("x-vnd.oasis.opendocument.presentation"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
MediaType.application("x-vnd.oasis.opendocument.chart"),
MediaType.application("x-vnd.oasis.opendocument.image"),
MediaType.application("x-vnd.oasis.opendocument.formula"),
MediaType.application("x-vnd.oasis.opendocument.text-master"),
MediaType.application("x-vnd.oasis.opendocument.text-web"),
MediaType.application("x-vnd.oasis.opendocument.text-template"),
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
public Parser getMetaParser() {
return meta;
}
public void setMetaParser(Parser meta) {
this.meta = meta;
}
public Parser getContentParser() {
return content;
}
public void setContentParser(Parser content) {
this.content = content;
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
ZipInputStream zipStream = null;
if (stream instanceof TikaInputStream) {
TikaInputStream tis = (TikaInputStream) stream;
Object container = ((TikaInputStream) stream).getOpenContainer();
if (container instanceof ZipFile) {
zipFile = (ZipFile) container;
} else if (tis.hasFile()) {
zipFile = new ZipFile(tis.getFile());
} else {
zipStream = new ZipInputStream(stream);
}
} else {
zipStream = new ZipInputStream(stream);
}
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
}
}
// Only now call the end document
if (handler.getEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
} while (entry != null);
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, TikaException, SAXException {
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
ZipEntry entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
/* if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
}*/
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
}
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
}
}
}
}
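
A hedged end-to-end sketch, not part of this commit: parsing a full .odt package with OpenDocumentParser, which routes meta.xml, content.xml and styles.xml to the parsers above. OdtExtractSketch and sample.odt are assumptions.

import java.io.File;
import java.io.InputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.sax.BodyContentHandler;

public class OdtExtractSketch {
    public static void main(String[] args) throws Exception {
        OpenDocumentParser parser = new OpenDocumentParser();
        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
        // a TikaInputStream backed by a file lets the parser open the package as a ZipFile (TIKA-1353)
        try (InputStream in = TikaInputStream.get(new File("sample.odt"))) {
            parser.parse(in, handler, metadata, new ParseContext());
        }
        System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        System.out.println(handler.toString());
    }
}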

View File

@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
import java.util.Arrays;
import java.util.List;
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
* @since Apache Tika 0.10
*/
class AbstractMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
protected AbstractMetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
protected AbstractMetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
/**
* Adds the given metadata value. The value is ignored if it is
 * <code>null</code> or empty. If the metadata entry already exists, the
 * value is added to it as an additional value; duplicates are skipped and
 * single-valued properties are overwritten instead.
*
* @param value metadata value
*/
protected void addMetadata(String value) {
if (value != null && value.length() > 0) {
if (metadata.isMultiValued(name)) {
// Add the value, assuming it's not already there
List<String> previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
if (property != null) {
metadata.add(property, value);
} else {
metadata.add(name, value);
}
}
} else {
// Set the value, assuming it's not already there
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
if (property != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
// Replace the existing value if isMultiValuePermitted is false
metadata.set(property, value);
}
} else {
metadata.add(name, value);
}
}
} else {
if (property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
}
}
}

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
* value, and the Metadata name is taken from
* an attribute, with a prefix if required.
*/
public class AttributeDependantMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final String nameHoldingAttribute;
private final String namePrefix;
private String name;
private final StringBuilder buffer = new StringBuilder();
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
}
public void addMetadata(String value) {
if(name == null || name.length() == 0) {
// We didn't find the attribute which holds the name
return;
}
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
metadata.set(name, value);
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
if (namePrefix == null) {
this.name = rawName;
} else {
this.name = namePrefix + rawName;
}
}
// All other attributes are ignored
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}
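
A hedged sketch, not part of this commit, of the behaviour this handler exists for: a user-defined ODF meta entry parsed with a plain JAXP SAX parser ends up under the custom: prefix. UserDefinedMetaSketch and the sample values are assumptions.

import java.io.StringReader;
import javax.xml.parsers.SAXParserFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.xml.sax.InputSource;

public class UserDefinedMetaSketch {
    public static void main(String[] args) throws Exception {
        String xml = "<meta:user-defined xmlns:meta='urn:oasis:names:tc:opendocument:xmlns:meta:1.0'"
                + " meta:name='Info1'>Text1</meta:user-defined>";
        Metadata metadata = new Metadata();
        AttributeDependantMetadataHandler handler = new AttributeDependantMetadataHandler(
                metadata, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX);
        // the name-holding attribute is looked up by qualified name, so the default parser settings suffice
        SAXParserFactory.newInstance().newSAXParser()
                .parse(new InputSource(new StringReader(xml)), handler);
        System.out.println(metadata.get(Metadata.USER_DEFINED_METADATA_NAME_PREFIX + "Info1")); // Text1
    }
}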

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String uri;
private final String localName;
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getURI(i).equals(this.uri)
&& attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}
}
}

View File

@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
/** Serial version UID */
private static final long serialVersionUID = 4905318835463880819L;
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TeeContentHandler(
super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
}
}

View File

@@ -0,0 +1,241 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import java.util.Arrays;
/**
* SAX event handler that maps the contents of an XML element into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class ElementMetadataHandler extends AbstractMetadataHandler {
private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);
private static final String LOCAL_NAME_RDF_BAG = "Bag";
private static final String LOCAL_NAME_RDF_LI = "li";
private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private final String uri;
private final String localName;
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
private boolean isBagless = true;
private int matchLevel = 0;
private int parentMatchLevel = 0;
/**
* Constructor for string metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for Property metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created property handler for {}", this.name);
}
/**
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created property handler for {}", this.name);
}
protected boolean isMatchingParentElement(String uri, String localName) {
return (uri.equals(this.uri) && localName.equals(this.localName));
}
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel++;
}
}
@Override
public void endElement(String uri, String localName, String name) {
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel--;
}
if (isMatchingElement(uri, localName)) {
matchLevel--;
if (matchLevel == 2) {
// we're inside a bag li element, add the bagged buffer
addMetadata(bufferBagged.toString().trim());
bufferBagged.setLength(0);
isBagless = false;
}
if (matchLevel == 0 && isBagless) {
String valueBagless = bufferBagless.toString();
if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
// we're in a standard element, add the bagless buffer
addMetadata(valueBagless.trim());
bufferBagless.setLength(0);
}
isBagless = true;
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
// We need to append to both buffers since we don't know whether we're inside a bag until we're done
if (parentMatchLevel > 0 && matchLevel > 2) {
bufferBagged.append(ch, start, length);
}
if (parentMatchLevel > 0 && matchLevel > 0) {
bufferBagless.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) {
characters(ch, start, length);
}
@Override
protected void addMetadata(String value) {
LOG.trace("adding {}={}", name, value);
if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
if ((value != null && value.length() > 0) || allowEmptyValues) {
if (value == null || value.length() == 0 && allowEmptyValues) {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
} else {
super.addMetadata(value);
}
}
}
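
A minimal construction sketch (in Scala, matching the rest of this commit) showing the two boolean flags described in the constructor javadoc above; the Dublin Core title element and TikaCoreProperties.TITLE target are illustrative choices, not something this commit wires up.

import org.apache.tika.metadata.{Metadata, TikaCoreProperties}
import org.apache.tika.parser.xml.ElementMetadataHandler

object ElementMetadataHandlerSketch {
  // collect dc:title values into TikaCoreProperties.TITLE,
  // dropping duplicates and empty strings (both flags false)
  def titleHandler(metadata: Metadata): ElementMetadataHandler =
    new ElementMetadataHandler(
      "http://purl.org/dc/elements/1.1/",
      "title",
      metadata,
      TikaCoreProperties.TITLE,
      false, // allowDuplicateValues
      false  // allowEmptyValues
    )
}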

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-fictionbook+xml"));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
return new BinaryElementsDataHandler(
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
private Metadata metadata;
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (binaryMode) {
try {
partExtractor.parseEmbedded(
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler,
metadata,
true
);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}
binaryMode = false;
binaryData.setLength(0);
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (!binaryMode) {
handler.characters(ch, start, length);
} else {
binaryData.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
handler.ignorableWhitespace(ch, start, length);
}
}
}

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds Metadata entries with a specified name for
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
*
* @deprecated Use the {@link AttributeMetadataHandler} and
* {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
private final StringBuilder buffer = new StringBuilder();
public MetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
public MetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
public void addMetadata(String value) {
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
if (this.property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}

View File

@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = -6028836725280212837L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("xml"),
MediaType.image("svg+xml"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))), context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
} finally {
xhtml.endElement("p");
xhtml.endDocument();
}
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TextContentHandler(handler, true);
}
}

View File

@ -0,0 +1,29 @@
package docspell.extract
import docspell.common.MimeType
import scala.util.Try
sealed trait ExtractResult {
def textOption: Option[String]
}
object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None
}
case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None
}
case class Success(text: String) extends ExtractResult {
val textOption = Some(text)
}
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
}
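
A small consumption sketch for this ADT; the extractSafe helper and the supported-type check are hypothetical and only illustrate how fromTry, UnsupportedFormat, and textOption fit together.

import docspell.common.MimeType
import docspell.extract.ExtractResult
import scala.util.Try

object ExtractResultSketch {
  // hypothetical helper: run a throwing extraction and capture the outcome,
  // short-circuiting to UnsupportedFormat for unknown MIME types
  def extractSafe(mime: MimeType, supported: Set[MimeType])(run: => String): ExtractResult =
    if (supported.contains(mime)) ExtractResult.fromTry(Try(run))
    else ExtractResult.UnsupportedFormat(mime)

  // downstream code only cares whether any text came out
  def textOrEmpty(result: ExtractResult): String =
    result.textOption.getOrElse("")
}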

View File

@ -0,0 +1,30 @@
package docspell.extract.odf
import cats.effect._
import cats.implicits._
import fs2.Stream
import java.io.{ByteArrayInputStream, InputStream}
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler
import scala.util.Try
object OdfExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
  def get(is: InputStream): Either[Throwable, String] = Try {
val handler = new BodyContentHandler()
val pctx = new ParseContext()
val meta = new Metadata()
val ooparser = new OpenDocumentParser()
ooparser.parse(is, handler, meta, pctx)
handler.toString.trim
}.toEither
}
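
A usage sketch for the blocking InputStream variant; the file name is made up, any ODF document would do.

import java.io.FileInputStream
import docspell.extract.odf.OdfExtract

object OdfExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("sample.odt") // hypothetical input file
    try {
      OdfExtract.get(in) match {
        case Right(text) => println(s"extracted ${text.length} characters")
        case Left(err)   => println(s"extraction failed: ${err.getMessage}")
      }
    } finally in.close()
  }
}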

View File

@ -0,0 +1,34 @@
package docspell.extract.pdfbox
import java.io.InputStream
import java.nio.file.Path
import cats.implicits._
import cats.effect.Sync
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import scala.util.{Try, Using}
import fs2.Stream
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map { bytes =>
Using(PDDocument.load(bytes))(readText).toEither.flatten
}
def get(is: InputStream): Either[Throwable, String] =
Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, String] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, String] =
Try {
val stripper = new PDFTextStripper()
stripper.setAddMoreFormatting(true)
stripper.setLineSeparator("\n")
stripper.getText(doc).trim // trim here already
}.toEither
}
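
A usage sketch for the Path variant; the path is illustrative. As the test further below shows, a purely scanned PDF yields an empty string rather than an error.

import java.nio.file.Paths
import docspell.extract.pdfbox.PdfboxExtract

object PdfboxExtractSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical path to a text-based PDF
    PdfboxExtract.get(Paths.get("letter.pdf")) match {
      case Right(text) if text.nonEmpty => println(text.take(200))
      case Right(_)                     => println("no text layer found")
      case Left(err)                    => println(s"failed: ${err.getMessage}")
    }
  }
}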

View File

@ -0,0 +1,85 @@
package docspell.extract.poi
import java.io.{ByteArrayInputStream, InputStream}
import cats.data.EitherT
import cats.implicits._
import cats.effect.Sync
import org.apache.poi.hssf.extractor.ExcelExtractor
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.hwpf.extractor.WordExtractor
import org.apache.poi.xssf.extractor.XSSFExcelExtractor
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import fs2.Stream
import scala.util.Try
import docspell.common._
import docspell.files.TikaMimetype
object PoiExtract {
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
TikaMimetype.detect(data, hint).flatMap {
case PoiTypes.doc =>
getDoc(data)
case PoiTypes.xls =>
getXls(data)
case PoiTypes.xlsx =>
getXlsx(data)
case PoiTypes.docx =>
getDocx(data)
case PoiTypes.msoffice =>
EitherT(getDoc[F](data))
.recoverWith({
case _ => EitherT(getXls[F](data))
})
.value
case PoiTypes.ooxml =>
EitherT(getDocx[F](data))
.recoverWith({
case _ => EitherT(getXlsx[F](data))
})
.value
case mt =>
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
}
def getDocx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XWPFWordExtractor(new XWPFDocument(is))
xt.getText.trim
}.toEither
def getDoc(is: InputStream): Either[Throwable, String] =
Try {
val xt = new WordExtractor(is)
xt.getText.trim
}.toEither
def getXlsx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
xt.getText.trim
}.toEither
def getXls(is: InputStream): Either[Throwable, String] =
Try {
val xt = new ExcelExtractor(new HSSFWorkbook(is))
xt.getText.trim
}.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
}
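
A usage sketch for one of the format-specific entry points; the stream-based get above additionally runs MIME detection and, for the generic msoffice/ooxml container types, falls back from Word to Excel extraction. The file name is illustrative.

import java.io.FileInputStream
import docspell.extract.poi.PoiExtract

object PoiExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("report.xlsx") // hypothetical spreadsheet
    try {
      PoiExtract.getXlsx(in) match {
        case Right(text) => println(s"extracted ${text.length} characters")
        case Left(err)   => println(s"failed: ${err.getMessage}")
      }
    } finally in.close()
  }
}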

View File

@ -0,0 +1,16 @@
package docspell.extract.poi
import docspell.common.MimeType
object PoiTypes {
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
}

View File

@ -0,0 +1,24 @@
package docspell.extract.rtf
import java.io.{ByteArrayInputStream, InputStream}
import cats.implicits._
import cats.effect.Sync
import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit
import scala.util.Try
object RtfExtract {
def get(is: InputStream): Either[Throwable, String] =
Try {
val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument()
kit.read(is, doc, 0)
doc.getText(0, doc.getLength).trim
}.toEither
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
}
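
The same pattern works for RTF, which needs no extra dependency since RTFEditorKit ships with the JDK's Swing text package; the file name is again hypothetical.

import java.io.FileInputStream
import docspell.extract.rtf.RtfExtract

object RtfExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("sample.rtf") // hypothetical input file
    try {
      println(RtfExtract.get(in).fold(err => s"failed: ${err.getMessage}", identity))
    } finally in.close()
  }
}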

View File

@ -1,14 +0,0 @@
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<logger name="docspell" level="debug" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@ -1,30 +0,0 @@
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
val letterSourceDE: Stream[IO, Byte] =
ExampleFiles.letter_de_pdf
.readURL[IO](16 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
ExampleFiles.letter_en_pdf
.readURL[IO](16 * 1024, blocker)
lazy val letterDEText =
ExampleFiles.letter_de_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
lazy val letterENText =
ExampleFiles.letter_en_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
}

View File

@ -1,9 +1,7 @@
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import docspell.files.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {
@ -30,13 +28,4 @@ object TextExtractionSuite extends SimpleTestSuite {
assertEquals(extract.trim, expect.trim)
}
test("find mimetypes") {
ExampleFiles.
all.foreach { url =>
TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
map(mt => println(url.asString + ": " + mt.asString)).
unsafeRunSync
}
}
}

View File

@ -0,0 +1,28 @@
package docspell.extract.odf
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object OdfExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val files = List(
ExampleFiles.examples_sample_odt -> 6372,
ExampleFiles.examples_sample_ods -> 717
)
test("test extract from odt") {
files.foreach { case (file, len) =>
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str1 = OdfExtract.get(is).fold(throw _, identity)
assertEquals(str1.length, len)
val data = file.readURL[IO](8192, blocker)
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
assertEquals(str2, str1)
}
}
}

View File

@ -0,0 +1,48 @@
package docspell.extract.pdfbox
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PdfboxExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val textPDFs = List(
ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
ExampleFiles.letter_en_pdf -> TestFiles.letterENText
)
test("extract text from text PDFs by inputstream") {
textPDFs.foreach {
case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from text PDFs via Stream") {
textPDFs.foreach {
case (file, txt) =>
val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
assertEquals(str, "")
}
private def removeFormatting(str: String): String =
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
}

View File

@ -0,0 +1,39 @@
package docspell.extract.poi
import cats.effect._
import docspell.common.MimeTypeHint
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PoiExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val officeFiles = List(
ExampleFiles.examples_sample_doc -> 6241,
ExampleFiles.examples_sample_docx -> 6179,
ExampleFiles.examples_sample_xlsx -> 660,
ExampleFiles.examples_sample_xls -> 660
)
test("extract text from ms office files") {
officeFiles.foreach {
case (file, len) =>
val str1 = PoiExtract
.get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
.unsafeRunSync()
.fold(throw _, identity)
val str2 = PoiExtract
.get[IO](
file.readURL[IO](8192, blocker),
MimeTypeHint(Some(file.path.segments.last), None)
)
.unsafeRunSync()
.fold(throw _, identity)
assertEquals(str1, str2)
assertEquals(str1.length, len)
}
}
}

View File

@ -0,0 +1,14 @@
package docspell.extract.rtf
import docspell.files.ExampleFiles
import minitest.SimpleTestSuite
object RtfExtractTest extends SimpleTestSuite {
test("extract text from rtf using java input-stream") {
val file = ExampleFiles.examples_sample_rtf
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str = RtfExtract.get(is).fold(throw _, identity)
assertEquals(str.length, 7342)
}
}