+ |</style>
+ |</head>
+ |<body>
+ |$body
+ |</body>
+ |</html>
+ |""".stripMargin
+ }
+
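+ // Hypothetical usage sketch (illustration only): parse markdown and render
+ // the HTML body that gets wrapped by the template above, e.g.
+ //   val doc  = createParser().parse("# Hello *markdown*")
+ //   val body = createRenderer().render(doc)
+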
+ private def createParser(): Parser = {
+ val opts = new MutableDataSet()
+ opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
+ util.Arrays.asList(TablesExtension.create(),
+ StrikethroughExtension.create()))
+
+ Parser.builder(opts).build()
+ }
+
+ private def createRenderer(): HtmlRenderer = {
+ val opts = new MutableDataSet()
+ HtmlRenderer.builder(opts).build()
+ }
+}
diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala
new file mode 100644
index 00000000..3d0a5ab3
--- /dev/null
+++ b/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala
@@ -0,0 +1,3 @@
+package docspell.convert.flexmark
+
+case class MarkdownConfig(internalCss: String)
diff --git a/modules/extract/NOTICE b/modules/extract/NOTICE
new file mode 100644
index 00000000..05ccbbcc
--- /dev/null
+++ b/modules/extract/NOTICE
@@ -0,0 +1,11 @@
+The Java source files in docspell-extract are unmodified copies of
+those found in the Apache Tika parser project. It follows the
+NOTICE.txt file from Apache Tika parsers:
+
+Apache Tika parsers
+Copyright 2007-2019 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
new file mode 100644
index 00000000..80b2301c
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+ private static final String OLD_NS =
+ "http://openoffice.org/2000/";
+
+ private static final String NEW_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:";
+
+ private static final String DTD_PUBLIC_ID =
+ "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+ public NSNormalizerContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
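+ // e.g. "http://openoffice.org/2000/office" becomes
+ // "urn:oasis:names:tc:opendocument:xmlns:office:1.0"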
+ private String mapOldNS(String ns) {
+ if (ns != null && ns.startsWith(OLD_NS)) {
+ return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+ } else {
+ return ns;
+ }
+ }
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes atts) throws SAXException {
+ AttributesImpl natts = new AttributesImpl();
+ for (int i = 0; i < atts.getLength(); i++) {
+ natts.addAttribute(
+ mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+ atts.getQName(i), atts.getType(i), atts.getValue(i));
+ }
+ super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName)
+ throws SAXException {
+ super.endElement(mapOldNS(namespaceURI), localName, qName);
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ super.startPrefixMapping(prefix, mapOldNS(uri));
+ }
+
+ /**
+ * Do not load any DTDs (they may be requested by the parser). Fake the DTD
+ * by returning an empty string as the InputSource.
+ */
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId)
+ throws IOException, SAXException {
+ if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+ || DTD_PUBLIC_ID.equals(publicId)) {
+ return new InputSource(new StringReader(""));
+ } else {
+ return super.resolveEntity(publicId, systemId);
+ }
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
new file mode 100644
index 00000000..066f3e95
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -0,0 +1,606 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+/**
+ * Parser for ODF content.xml files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+ private interface Style {
+ }
+
+ private static class TextStyle implements Style {
+ public boolean italic;
+ public boolean bold;
+ public boolean underlined;
+
+ @Override
+ public String toString() {
+ return "TextStyle{" +
+ "italic=" + italic +
+ ", bold=" + bold +
+ ", underlined=" + underlined +
+ '}';
+ }
+ }
+
+ private static class ListStyle implements Style {
+ public boolean ordered;
+
+ public String getTag() {
+ return ordered ? "ol" : "ul";
+ }
+ }
+
+ private static final class OpenDocumentElementMappingContentHandler extends
+ ElementMappingContentHandler {
+ private static final char[] SPACE = new char[]{ ' '};
+ private static final String CLASS = "class";
+ private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
+ private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
+ private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
+
+ private static Attributes buildAttributes(String key, String value) {
+ AttributesImpl attrs = new AttributesImpl();
+ attrs.addAttribute("", key, key, "CDATA", value);
+ return attrs;
+ }
+
+ private final ContentHandler handler;
+ private final BitSet textNodeStack = new BitSet();
+ private int nodeDepth = 0;
+ private int completelyFiltered = 0;
+ private Stack<String> headingStack = new Stack<String>();
+ private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
+ private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+ private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+ private String currParagraphStyleName; //paragraph style name
+ private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
+ private String currTextStyleName;
+
+ private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+ private ListStyle listStyle;
+
+ // True if we are currently in the named style:
+ private boolean curUnderlined;
+ private boolean curBold;
+ private boolean curItalic;
+
+ //have we written the start style tags
+ //yet for the current text style
+ boolean hasWrittenStartStyleTags = false;
+
+ private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
+ //we need to track our pDepth and only output <p> if we're at the main level
+
+
+ private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+ Map<QName, TargetElement> mappings) {
+ super(handler, mappings);
+ this.handler = handler;
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ // only forward content of tags from text:-namespace
+ if (completelyFiltered == 0 && nodeDepth > 0
+ && textNodeStack.get(nodeDepth - 1)) {
+ if (!hasWrittenStartStyleTags) {
+ updateStyleTags();
+ hasWrittenStartStyleTags = true;
+ }
+ super.characters(ch, start, length);
+ }
+ }
+
+ // helper for checking tags which need complete filtering
+ // (with sub-tags)
+ private boolean needsCompleteFiltering(
+ String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI)) {
+ return localName.endsWith("-template")
+ || localName.endsWith("-style");
+ }
+ return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+ }
+
+ // map the heading level to HTML tags
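+ // e.g. text:outline-level="3" -> "h3"; levels of 6 or more clamp to "h6",
+ // and a missing outline-level defaults to "h1"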
+ private String getXHTMLHeaderTagName(Attributes atts) {
+ String depthStr = atts.getValue(TEXT_NS, "outline-level");
+ if (depthStr == null) {
+ return "h1";
+ }
+
+ int depth = Integer.parseInt(depthStr);
+ if (depth >= 6) {
+ return "h6";
+ } else if (depth <= 1) {
+ return "h1";
+ } else {
+ return "h" + depth;
+ }
+ }
+
+ /**
+ * Check if a node is a text node
+ */
+ private boolean isTextNode(String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+ return true;
+ }
+ if (SVG_NS.equals(namespaceURI)) {
+ return "title".equals(localName) ||
+ "desc".equals(localName);
+ }
+ return false;
+ }
+
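+ // text:list maps to <ul> or <ol> depending on the registered list style,
+ // e.g. a list-level-style-number style yields an ordered <ol> list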
+ private void startList(String name) throws SAXException {
+ String elementName = "ul";
+ if (name != null) {
+ ListStyle style = listStyleMap.get(name);
+ elementName = style != null ? style.getTag() : "ul";
+ listStyleStack.push(style);
+ }
+ handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+ }
+
+ private void endList() throws SAXException {
+ String elementName = "ul";
+ if (!listStyleStack.isEmpty()) {
+ ListStyle style = listStyleStack.pop();
+ elementName = style != null ? style.getTag() : "ul";
+ }
+ handler.endElement(XHTML, elementName, elementName);
+ }
+
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
+ currTextStyle = textStyleMap.get(name);
+ hasWrittenStartStyleTags = false;
+ }
+
+ private void startParagraph(String styleName) throws SAXException {
+ if (pDepth == 0) {
+ handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
+ if (styleName != null) {
+ currTextStyle = paragraphTextStyleMap.get(styleName);
+ }
+ hasWrittenStartStyleTags = false;
+ } else {
+ handler.characters(SPACE, 0, SPACE.length);
+ }
+ pDepth++;
+ }
+
+ private void endParagraph() throws SAXException {
+ closeStyleTags();
+ if (pDepth == 1) {
+ handler.endElement(XHTML, "p", "p");
+ } else {
+ handler.characters(SPACE, 0, SPACE.length);
+ }
+ pDepth--;
+
+ }
+
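+ // Keeps <b>/<i>/<u> properly nested: e.g. when the style changes from
+ // bold+italic to bold only, the open <u> and <i> tags are closed before
+ // toggling <b>, so the emitted XHTML never interleaves inline tags.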
+ private void updateStyleTags() throws SAXException {
+
+ if (currTextStyle == null) {
+ closeStyleTags();
+ return;
+ }
+ if (currTextStyle.bold != curBold) {
+ // Enforce nesting -- must close s and i tags
+ if (curUnderlined) {
+ handler.endElement(XHTML, "u", "u");
+ curUnderlined = false;
+ }
+ if (curItalic) {
+ handler.endElement(XHTML, "i", "i");
+ curItalic = false;
+ }
+ if (currTextStyle.bold) {
+ handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+ } else {
+ handler.endElement(XHTML, "b", "b");
+ }
+ curBold = currTextStyle.bold;
+ }
+
+ if (currTextStyle.italic != curItalic) {
+ // Enforce nesting -- must close s tag
+ if (curUnderlined) {
+ handler.endElement(XHTML, "u", "u");
+ curUnderlined = false;
+ }
+ if (currTextStyle.italic) {
+ handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+ } else {
+ handler.endElement(XHTML, "i", "i");
+ }
+ curItalic = currTextStyle.italic;
+ }
+
+ if (currTextStyle.underlined != curUnderlined) {
+ if (currTextStyle.underlined) {
+ handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+ } else {
+ handler.endElement(XHTML, "u", "u");
+ }
+ curUnderlined = currTextStyle.underlined;
+ }
+ }
+
+ private void endSpan() throws SAXException {
+ updateStyleTags();
+ }
+
+ private void closeStyleTags() throws SAXException {
+ // Close any still open style tags
+ if (curUnderlined) {
+ handler.endElement(XHTML,"u", "u");
+ curUnderlined = false;
+ }
+ if (curItalic) {
+ handler.endElement(XHTML,"i", "i");
+ curItalic = false;
+ }
+ if (curBold) {
+ handler.endElement(XHTML,"b", "b");
+ curBold = false;
+ }
+ currTextStyle = null;
+ hasWrittenStartStyleTags = false;
+ }
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ // keep track of the current node type. If it is a text node,
+ // a bit at the current depth is set in textNodeStack.
+ // characters() checks the top bit to determine if the
+ // actual node is a text node to print out. nodeDepth contains
+ // the depth of the current node and also marks the top of the stack.
+ assert nodeDepth >= 0;
+
+ // Set styles
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ String family = attrs.getValue(STYLE_NS, "family");
+ if ("text".equals(family)) {
+ currTextStyle = new TextStyle();
+ currTextStyleName = attrs.getValue(STYLE_NS, "name");
+ } else if ("paragraph".equals(family)) {
+ currTextStyle = new TextStyle();
+ currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
+ }
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = new ListStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ listStyleMap.put(name, listStyle);
+ } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
+ && "text-properties".equals(localName)) {
+ String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+ if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+ currTextStyle.italic = true;
+ }
+ String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+ if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+ || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
+ && Integer.valueOf(fontWeight) > 500)) {
+ currTextStyle.bold = true;
+ }
+ String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+ if (underlineStyle != null && !underlineStyle.equals("none")) {
+ currTextStyle.underlined = true;
+ }
+ } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+ if ("list-level-style-bullet".equals(localName)) {
+ listStyle.ordered = false;
+ } else if ("list-level-style-number".equals(localName)) {
+ listStyle.ordered = true;
+ }
+ }
+
+ textNodeStack.set(nodeDepth++,
+ isTextNode(namespaceURI, localName));
+ // filter *all* content of some tags
+ assert completelyFiltered >= 0;
+
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered++;
+ }
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+ handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ startList(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ startSpan(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+ startParagraph(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+ handler.characters(SPACE, 0, 1);
+ } else if ("annotation".equals(localName)) {
+ closeStyleTags();
+ handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
+ } else if ("note".equals(localName)) {
+ closeStyleTags();
+ handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
+ } else if ("notes".equals(localName)) {
+ closeStyleTags();
+ handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
+ } else {
+ super.startElement(namespaceURI, localName, qName, attrs);
+ }
+ }
+ }
+
+ @Override
+ public void endElement(
+ String namespaceURI, String localName, String qName)
+ throws SAXException {
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ if (currTextStyle != null && currTextStyleName != null) {
+ textStyleMap.put(currTextStyleName, currTextStyle);
+ currTextStyleName = null;
+ currTextStyle = null;
+ } else if (currTextStyle != null && currParagraphStyleName != null) {
+ paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
+ currParagraphStyleName = null;
+ currTextStyle = null;
+ }
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = null;
+ }
+
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.pop();
+ handler.endElement(XHTMLContentHandler.XHTML, el, el);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ endList();
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ currTextStyle = null;
+ hasWrittenStartStyleTags = false;
+ } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+ endParagraph();
+ } else if ("annotation".equals(localName) || "note".equals(localName) ||
+ "notes".equals(localName)) {
+ closeStyleTags();
+ handler.endElement("", localName, localName);
+ } else {
+ super.endElement(namespaceURI, localName, qName);
+ }
+
+ // special handling of tabulators
+ if (TEXT_NS.equals(namespaceURI)
+ && ("tab-stop".equals(localName)
+ || "tab".equals(localName))) {
+ this.characters(TAB, 0, TAB.length);
+ }
+ }
+
+ // revert filter for *all* content of some tags
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered--;
+ }
+ assert completelyFiltered >= 0;
+
+ // reduce current node depth
+ nodeDepth--;
+ assert nodeDepth >= 0;
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+ }
+
+ public static final String TEXT_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+ public static final String TABLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+ public static final String STYLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+ public static final String FORMATTING_OBJECTS_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+ public static final String OFFICE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+ public static final String SVG_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+ public static final String PRESENTATION_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+ public static final String DRAW_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+ public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+ protected static final char[] TAB = new char[]{'\t'};
+
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+ /**
+ * Mappings between ODF tag names and XHTML tag names
+ * (including attributes). All other tag names/attributes are ignored
+ * and left out from event stream.
+ */
+ private static final HashMap<QName, TargetElement> MAPPINGS =
+ new HashMap<QName, TargetElement>();
+
+ static {
+ // general mappings of text:-tags
+ MAPPINGS.put(
+ new QName(TEXT_NS, "p"),
+ new TargetElement(XHTML, "p"));
+ // text:h-tags are mapped specifically in startElement/endElement
+ MAPPINGS.put(
+ new QName(TEXT_NS, "line-break"),
+ new TargetElement(XHTML, "br"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "list-item"),
+ new TargetElement(XHTML, "li"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "note"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(OFFICE_NS, "annotation"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(PRESENTATION_NS, "notes"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "object"),
+ new TargetElement(XHTML, "object"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "text-box"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "title"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "desc"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "span"),
+ new TargetElement(XHTML, "span"));
+
+ final HashMap<QName, QName> aAttsMapping =
+ new HashMap<QName, QName>();
+ aAttsMapping.put(
+ new QName(XLINK_NS, "href"),
+ new QName("href"));
+ aAttsMapping.put(
+ new QName(XLINK_NS, "title"),
+ new QName("title"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "a"),
+ new TargetElement(XHTML, "a", aAttsMapping));
+
+ // create HTML tables from table:-tags
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table"),
+ new TargetElement(XHTML, "table"));
+ // repeating of rows is ignored; for columns, see below!
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-row"),
+ new TargetElement(XHTML, "tr"));
+ // special mapping for rowspan/colspan attributes
+ final HashMap<QName, QName> tableCellAttsMapping =
+ new HashMap<QName, QName>();
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-spanned"),
+ new QName("colspan"));
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-rows-spanned"),
+ new QName("rowspan"));
+ /* TODO: The following is not correct; the cell should be repeated, not spanned!
+ * The code generates an HTML cell spanning all repeated columns, to make the cell look correct.
+ * Problems may occur when both spanning and repeating are given, which is not allowed by the spec.
+ * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
+ * only for empty cells.
+ */
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-repeated"),
+ new QName("colspan"));
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-cell"),
+ new TargetElement(XHTML, "td", tableCellAttsMapping));
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.emptySet(); // not a top-level parser
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ parseInternal(stream,
+ new XHTMLContentHandler(handler, metadata),
+ metadata, context);
+ }
+
+ void parseInternal(
+ InputStream stream, final ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+
+ XMLReaderUtils.parseSAX(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ new NSNormalizerContentHandler(dh)),
+ context);
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
new file mode 100644
index 00000000..11922d7d
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Parser for OpenDocument meta.xml files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -8739250869531737584L;
+
+ private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+ private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+ /**
+ * @see OfficeOpenXMLCore#SUBJECT
+ * @deprecated use OfficeOpenXMLCore#SUBJECT
+ */
+ @Deprecated
+ private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+ Property.composite(Office.INITIAL_AUTHOR,
+ new Property[]{Property.externalText("initial-creator")});
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
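+ // Tees in a branch that matches both the element and its text content,
+ // e.g. //meta:creation-date and //meta:creation-date//text(), so the value
+ // of <meta:creation-date> lands in the given metadata property.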
+ private static ContentHandler getMeta(
+ ContentHandler ch, Metadata md, Property property, String element) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:" + element),
+ META_XPATH.parse("//meta:" + element + "//text()"));
+ ContentHandler branch =
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getUserDefined(
+ ContentHandler ch, Metadata md) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:user-defined/@meta:name"),
+ META_XPATH.parse("//meta:user-defined//text()"));
+ // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+ matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
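+ // e.g. <meta:document-statistic meta:page-count="3"/> is matched via the
+ // XPath //meta:document-statistic/@meta:page-count and stored under the
+ // given metadata name (deprecated) or typed property (below).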
+ @Deprecated
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, String name, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, Property property, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+ // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+ // Process the Dublin Core Attributes
+ ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+ getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+ // Process the OO Meta Attributes
+ ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+ // ODF uses dc:date for modified
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "date",
+ md, TikaCoreProperties.MODIFIED));
+
+ // ODF uses dc:subject for description
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "subject",
+ md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+ ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+ ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+ ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+ // Process the user defined Meta Attributes
+ ch = getUserDefined(ch, md);
+
+ // Process the OO Statistics Attributes
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+ // Legacy, Tika-1.0 style attributes
+ // TODO Remove these in Tika 2.0
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+ // Legacy Statistics Attributes, replaced with real keys above
+ // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+ ch = getStatistic(ch, md, "nbPage", "page-count");
+ ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+ ch = getStatistic(ch, md, "nbWord", "word-count");
+ ch = getStatistic(ch, md, "nbCharacter", "character-count");
+ ch = getStatistic(ch, md, "nbTab", "table-count");
+ ch = getStatistic(ch, md, "nbObject", "object-count");
+ ch = getStatistic(ch, md, "nbImg", "image-count");
+
+ // Normalise the rest
+ ch = new NSNormalizerContentHandler(ch);
+ return ch;
+ }
+
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ super.parse(stream, handler, metadata, context);
+ // Copy subject to description for OO2
+ String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+ if (odfSubject != null && !odfSubject.equals("") &&
+ (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+ }
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
new file mode 100644
index 00000000..6ba5281f
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -6410276875438618287L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+ MediaType.application("vnd.oasis.opendocument.presentation"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+ MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+ MediaType.application("vnd.oasis.opendocument.text-template"),
+ MediaType.application("vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("vnd.oasis.opendocument.chart-template"),
+ MediaType.application("vnd.oasis.opendocument.image-template"),
+ MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+ MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+ MediaType.application("x-vnd.oasis.opendocument.text-template"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+ MediaType.application("x-vnd.oasis.opendocument.image-template"),
+ MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+ private static final String META_NAME = "meta.xml";
+
+ private Parser meta = new OpenDocumentMetaParser();
+
+ private Parser content = new OpenDocumentContentParser();
+
+ public Parser getMetaParser() {
+ return meta;
+ }
+
+ public void setMetaParser(Parser meta) {
+ this.meta = meta;
+ }
+
+ public Parser getContentParser() {
+ return content;
+ }
+
+ public void setContentParser(Parser content) {
+ this.content = content;
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler baseHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // Prepare to handle the content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+ // As we don't know which of the metadata or the content
+ // we'll hit first, catch the endDocument call initially
+ EndDocumentShieldingContentHandler handler =
+ new EndDocumentShieldingContentHandler(xhtml);
+
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ zipStream.close();
+ }
+ }
+
+ // Only now call the end document
+ if (handler.getEndDocumentWasCalled()) {
+ handler.reallyEndDocument();
+ }
+ }
+
+ private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+ ZipEntry entry = zipStream.getNextEntry();
+ if (entry == null) {
+ throw new IOException("No entries found in ZipInputStream");
+ }
+ do {
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ entry = zipStream.getNextEntry();
+ } while (entry != null);
+ }
+
+ private void handleZipFile(ZipFile zipFile, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, TikaException, SAXException {
+ // If we can, process the metadata first, then the
+ // rest of the file afterwards (TIKA-1353)
+ // Only possible to guarantee that when opened from a file not a stream
+
+ ZipEntry entry = zipFile.getEntry(META_NAME);
+ if (entry != null) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ while (entries.hasMoreElements()) {
+ entry = entries.nextElement();
+ if (!META_NAME.equals(entry.getName())) {
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+ }
+ }
+ }
+
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ if (entry == null) return;
+
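+ // An ODF package is a zip archive with well-known entries: "mimetype",
+ // "meta.xml", "content.xml" and "styles.xml", plus embedded resources
+ // under Pictures/ and Thumbnails/.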
+ if (entry.getName().equals("mimetype")) {
+ String type = IOUtils.toString(zip, UTF_8);
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ } else if (entry.getName().equals(META_NAME)) {
+ meta.parse(zip, new DefaultHandler(), metadata, context);
+ } else if (entry.getName().endsWith("content.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else if (entry.getName().endsWith("styles.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else {
+ String embeddedName = entry.getName();
+ //scrape everything under Thumbnails/ and Pictures/
+ if (embeddedName.contains("Thumbnails/") ||
+ embeddedName.contains("Pictures/")) {
+ EmbeddedDocumentExtractor embeddedDocumentExtractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+ /* if (embeddedName.startsWith("Thumbnails/")) {
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
+ }*/
+ if (embeddedName.contains("Pictures/")) {
+ embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ }
+ if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+ embeddedDocumentExtractor.parseEmbedded(zip,
+ new EmbeddedContentHandler(handler), embeddedMetadata, false);
+ }
+ }
+
+ }
+ }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
new file mode 100644
index 00000000..cbff35e7
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+ private final Property property;
+ private final String name;
+
+ protected AbstractMetadataHandler(Metadata metadata, String name) {
+ this.metadata = metadata;
+ this.property = null;
+ this.name = name;
+ }
+ protected AbstractMetadataHandler(Metadata metadata, Property property) {
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
+
+ /**
+ * Adds the given metadata value. The value is ignored if it is
+ * null or empty. If the metadata entry already exists,
+ * then the given value is appended to it with a comma as the separator.
+ *
+ * @param value metadata value
+ */
+ protected void addMetadata(String value) {
+ if (value != null && value.length() > 0) {
+ if (metadata.isMultiValued(name)) {
+ // Add the value, assuming it's not already there
+ List<String> previous = Arrays.asList(metadata.getValues(name));
+ if (!previous.contains(value)) {
+ if (property != null) {
+ metadata.add(property, value);
+ } else {
+ metadata.add(name, value);
+ }
+ }
+ } else {
+ // Set the value, assuming it's not already there
+ String previous = metadata.get(name);
+ if (previous != null && previous.length() > 0) {
+ if (!previous.equals(value)) {
+ if (property != null) {
+ if (property.isMultiValuePermitted()) {
+ metadata.add(property, value);
+ } else {
+ // Replace the existing value if isMultiValuePermitted is false
+ metadata.set(property, value);
+ }
+ } else {
+ metadata.add(name, value);
+ }
+ }
+ } else {
+ if (property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
new file mode 100644
index 00000000..c1795fad
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds a Metadata entry for a given node.
+ * The textual content of the node is used as the
+ * value, and the Metadata name is taken from
+ * an attribute, with a prefix if required.
+ */
+public class AttributeDependantMetadataHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+
+ private final String nameHoldingAttribute;
+ private final String namePrefix;
+ private String name;
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
+ this.metadata = metadata;
+ this.nameHoldingAttribute = nameHoldingAttribute;
+ this.namePrefix = namePrefix;
+ }
+
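+ // If the derived name already holds a value, the new value is appended
+ // with ", " as separator, e.g. "alpha" then "beta" becomes "alpha, beta".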
+ public void addMetadata(String value) {
+ if(name == null || name.length() == 0) {
+ // We didn't find the attribute which holds the name
+ return;
+ }
+ if (value.length() > 0) {
+ String previous = metadata.get(name);
+ if (previous != null && previous.length() > 0) {
+ value = previous + ", " + value;
+ }
+ metadata.set(name, value);
+ }
+ }
+
+ public void endElement(String uri, String localName, String name) {
+ addMetadata(buffer.toString());
+ buffer.setLength(0);
+ }
+
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ String rawName = attributes.getValue(nameHoldingAttribute);
+ if (rawName != null) {
+ if (namePrefix == null) {
+ this.name = rawName;
+ } else {
+ this.name = namePrefix + rawName;
+ }
+ }
+ // All other attributes are ignored
+ }
+
+
+ public void characters(char[] ch, int start, int length) {
+ buffer.append(ch, start, length);
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
new file mode 100644
index 00000000..dba5e4cb
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * SAX event handler that maps the contents of an XML attribute into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class AttributeMetadataHandler extends AbstractMetadataHandler {
+
+ private final String uri;
+
+ private final String localName;
+
+ public AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ }
+ public AttributeMetadataHandler(
+ String uri, String localName, Metadata metadata, Property property) {
+ super(metadata, property);
+ this.uri = uri;
+ this.localName = localName;
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ for (int i = 0; i < attributes.getLength(); i++) {
+ if (attributes.getURI(i).equals(this.uri)
+ && attributes.getLocalName(i).equals(this.localName)) {
+ addMetadata(attributes.getValue(i).trim());
+ }
+ }
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
new file mode 100644
index 00000000..5999773e
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Dublin Core metadata parser
+ */
+public class DcXMLParser extends XMLParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 4905318835463880819L;
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
new file mode 100644
index 00000000..d7a81dc4
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+
+import java.util.Arrays;
+
+/**
+ * SAX event handler that maps the contents of an XML element into
+ * a metadata field.
+ *
+ * @since Apache Tika 0.10
+ */
+public class ElementMetadataHandler extends AbstractMetadataHandler {
+ private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);
+
+ private static final String LOCAL_NAME_RDF_BAG = "Bag";
+ private static final String LOCAL_NAME_RDF_LI = "li";
+ private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+ private final String uri;
+
+ private final String localName;
+
+ private final Metadata metadata;
+
+ private final String name;
+ private Property targetProperty;
+
+ private final boolean allowDuplicateValues;
+ private final boolean allowEmptyValues;
+
+ /**
+ * The buffer used to capture characters when inside a bag li element.
+ */
+ private final StringBuilder bufferBagged = new StringBuilder();
+
+ /**
+ * The buffer used to capture characters inside standard elements.
+ */
+ private final StringBuilder bufferBagless = new StringBuilder();
+
+ /**
+ * Whether or not the value was found in a standard element structure or inside a bag.
+ */
+ private boolean isBagless = true;
+
+ private int matchLevel = 0;
+ private int parentMatchLevel = 0;
+
+ /**
+ * Constructor for string metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ LOG.trace("created simple handler for {}", this.name);
+ }
+
+ /**
+ * Constructor for string metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param name the Tika metadata field key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, name);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.name = name;
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ LOG.trace("created simple handler for {}", this.name);
+ }
+
+ /**
+ * Constructor for Property metadata keys.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = false;
+ this.allowEmptyValues = false;
+ LOG.trace("created property handler for {}", this.name);
+ }
+
+ /**
+ * Constructor for Property metadata keys which allows change of behavior
+ * for duplicate and empty entry values.
+ *
+ * @param uri the uri of the namespace of the element
+ * @param localName the local name of the element
+ * @param metadata the Tika metadata object to populate
+ * @param targetProperty the Tika metadata Property key
+ * @param allowDuplicateValues add duplicate values to the Tika metadata
+ * @param allowEmptyValues add empty values to the Tika metadata
+ */
+ public ElementMetadataHandler(
+ String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
+ super(metadata, targetProperty);
+ this.uri = uri;
+ this.localName = localName;
+ this.metadata = metadata;
+ this.targetProperty = targetProperty;
+ this.name = targetProperty.getName();
+ this.allowDuplicateValues = allowDuplicateValues;
+ this.allowEmptyValues = allowEmptyValues;
+ LOG.trace("created property handler for {}", this.name);
+ }
+
+ protected boolean isMatchingParentElement(String uri, String localName) {
+ return (uri.equals(this.uri) && localName.equals(this.localName));
+ }
+
+ protected boolean isMatchingElement(String uri, String localName) {
+ // match if we're inside the parent element or within some bag element
+ return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+ (parentMatchLevel > 0 &&
+ ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+ (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
+ )
+ );
+ }
+
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ if (isMatchingElement(uri, localName)) {
+ matchLevel++;
+ }
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel++;
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String name) {
+ if (isMatchingParentElement(uri, localName)) {
+ parentMatchLevel--;
+ }
+ if (isMatchingElement(uri, localName)) {
+ matchLevel--;
+ if (matchLevel == 2) {
+ // we're inside a bag li element, add the bagged buffer
+ addMetadata(bufferBagged.toString().trim());
+ bufferBagged.setLength(0);
+ isBagless = false;
+ }
+ if (matchLevel == 0 && isBagless) {
+ String valueBagless = bufferBagless.toString();
+ if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+ // we're in a standard element, add the bagless buffer
+ addMetadata(valueBagless.trim());
+ bufferBagless.setLength(0);
+ }
+ isBagless = true;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ // We need to append to both buffers since we don't know if we're inside a bag until we're done
+ if (parentMatchLevel > 0 && matchLevel > 2) {
+ bufferBagged.append(ch, start, length);
+ }
+ if (parentMatchLevel > 0 && matchLevel > 0) {
+ bufferBagless.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) {
+ characters(ch, start, length);
+ }
+
+ @Override
+ protected void addMetadata(String value) {
+ LOG.trace("adding {}={}", name, value);
+ if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
+ if ((value != null && value.length() > 0) || allowEmptyValues) {
+ if (value == null || value.length() == 0 && allowEmptyValues) {
+ value = "";
+ }
+ String[] previous = metadata.getValues(name);
+ if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+ metadata.add(targetProperty, value);
+ }
+ }
+ } else {
+ super.addMetadata(value);
+ }
+ }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
new file mode 100644
index 00000000..1f396901
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+public class FictionBookParser extends XMLParser {
+ private static final long serialVersionUID = 4195954546491524374L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-fictionbook+xml"));
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new BinaryElementsDataHandler(
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
+ }
+
+ private static class BinaryElementsDataHandler extends DefaultHandler {
+ private static final String ELEMENT_BINARY = "binary";
+
+ private boolean binaryMode = false;
+ private static final String ATTRIBUTE_ID = "id";
+
+ private final EmbeddedDocumentExtractor partExtractor;
+ private final ContentHandler handler;
+ private final StringBuilder binaryData = new StringBuilder();
+ private Metadata metadata;
+ private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
+
+ private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
+ this.partExtractor = partExtractor;
+ this.handler = handler;
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ binaryMode = ELEMENT_BINARY.equals(localName);
+ if (binaryMode) {
+ binaryData.setLength(0);
+ metadata = new Metadata();
+
+ metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
+ metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (binaryMode) {
+ try {
+ partExtractor.parseEmbedded(
+ new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
+ handler,
+ metadata,
+ true
+ );
+ } catch (IOException e) {
+ throw new SAXException("IOException in parseEmbedded", e);
+ }
+
+ binaryMode = false;
+ binaryData.setLength(0);
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (!binaryMode) {
+ handler.characters(ch, start, length);
+ } else {
+ binaryData.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ handler.ignorableWhitespace(ch, start, length);
+ }
+ }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
new file mode 100644
index 00000000..3fee00a3
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.Attributes;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This adds Metadata entries with a specified name for
+ * the textual content of a node (if present), and
+ * all attribute values passed through the matcher
+ * (but not their names).
+ *
+ * @deprecated Use the {@link AttributeMetadataHandler} and
+ * {@link ElementMetadataHandler} classes instead
+ */
+public class MetadataHandler extends DefaultHandler {
+
+ private final Metadata metadata;
+
+ private final Property property;
+ private final String name;
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ public MetadataHandler(Metadata metadata, String name) {
+ this.metadata = metadata;
+ this.property = null;
+ this.name = name;
+ }
+ public MetadataHandler(Metadata metadata, Property property) {
+ this.metadata = metadata;
+ this.property = property;
+ this.name = property.getName();
+ }
+
+ public void addMetadata(String value) {
+ if (value.length() > 0) {
+ String previous = metadata.get(name);
+ if (previous != null && previous.length() > 0) {
+ value = previous + ", " + value;
+ }
+
+ if (this.property != null) {
+ metadata.set(property, value);
+ } else {
+ metadata.set(name, value);
+ }
+ }
+ }
+
+ public void endElement(String uri, String localName, String name) {
+ addMetadata(buffer.toString());
+ buffer.setLength(0);
+ }
+
+ public void startElement(
+ String uri, String localName, String name, Attributes attributes) {
+ for (int i = 0; i < attributes.getLength(); i++) {
+ addMetadata(attributes.getValue(i));
+ }
+ }
+
+
+ public void characters(char[] ch, int start, int length) {
+ buffer.append(ch, start, length);
+ }
+
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java
new file mode 100644
index 00000000..e247a6c4
--- /dev/null
+++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * XML parser.
+ */
+public class XMLParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -6028836725280212837L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("xml"),
+ MediaType.image("svg+xml"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+ }
+
+ final XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+
+ TaggedContentHandler tagged = new TaggedContentHandler(handler);
+ try {
+ XMLReaderUtils.parseSAX(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(new EmbeddedContentHandler(
+ getContentHandler(tagged, metadata, context))), context);
+ } catch (SAXException e) {
+ tagged.throwIfCauseOf(e);
+ throw new TikaException("XML parse error", e);
+ } finally {
+ xhtml.endElement("p");
+ xhtml.endDocument();
+ }
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TextContentHandler(handler, true);
+ }
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
new file mode 100644
index 00000000..6c05d56a
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
@@ -0,0 +1,29 @@
+package docspell.extract
+
+import docspell.common.MimeType
+
+import scala.util.Try
+
+sealed trait ExtractResult {
+
+ def textOption: Option[String]
+
+}
+
+object ExtractResult {
+
+ case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
+ val textOption = None
+ }
+ case class Failure(ex: Throwable) extends ExtractResult {
+ val textOption = None
+ }
+ case class Success(text: String) extends ExtractResult {
+ val textOption = Some(text)
+ }
+
+ def fromTry(r: Try[String]): ExtractResult =
+ r.fold(Failure.apply, Success.apply)
+
+
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala
new file mode 100644
index 00000000..ae3ac66d
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala
@@ -0,0 +1,30 @@
+package docspell.extract.odf
+
+import cats.effect._
+import cats.implicits._
+import fs2.Stream
+import java.io.{ByteArrayInputStream, InputStream}
+
+import org.apache.tika.metadata.Metadata
+import org.apache.tika.parser.ParseContext
+import org.apache.tika.parser.odf.OpenDocumentParser
+import org.apache.tika.sax.BodyContentHandler
+
+import scala.util.Try
+
+object OdfExtract {
+
+ def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
+
+
+ def get(is: InputStream): Either[Throwable, String] = Try {
+ val handler = new BodyContentHandler()
+ val pctx = new ParseContext()
+ val meta = new Metadata()
+ val ooparser = new OpenDocumentParser()
+ ooparser.parse(is, handler, meta, pctx)
+ handler.toString.trim
+ }.toEither
+
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
new file mode 100644
index 00000000..c935100c
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala
@@ -0,0 +1,34 @@
+package docspell.extract.pdfbox
+
+import java.io.InputStream
+import java.nio.file.Path
+
+import cats.implicits._
+import cats.effect.Sync
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.text.PDFTextStripper
+
+import scala.util.{Try, Using}
+import fs2.Stream
+
+object PdfboxExtract {
+
+ def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map { bytes =>
+ Using(PDDocument.load(bytes))(readText).toEither.flatten
+ }
+
+ def get(is: InputStream): Either[Throwable, String] =
+ Using(PDDocument.load(is))(readText).toEither.flatten
+
+ def get(inFile: Path): Either[Throwable, String] =
+ Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
+
+ private def readText(doc: PDDocument): Either[Throwable, String] =
+ Try {
+ val stripper = new PDFTextStripper()
+ stripper.setAddMoreFormatting(true)
+ stripper.setLineSeparator("\n")
+ stripper.getText(doc).trim // trim here already
+ }.toEither
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
new file mode 100644
index 00000000..68e1de18
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala
@@ -0,0 +1,85 @@
+package docspell.extract.poi
+
+import java.io.{ByteArrayInputStream, InputStream}
+
+import cats.data.EitherT
+import cats.implicits._
+import cats.effect.Sync
+import org.apache.poi.hssf.extractor.ExcelExtractor
+import org.apache.poi.hssf.usermodel.HSSFWorkbook
+import org.apache.poi.hwpf.extractor.WordExtractor
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor
+import org.apache.poi.xssf.usermodel.XSSFWorkbook
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor
+import org.apache.poi.xwpf.usermodel.XWPFDocument
+import fs2.Stream
+
+import scala.util.Try
+import docspell.common._
+import docspell.files.TikaMimetype
+
+object PoiExtract {
+
+ def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
+ TikaMimetype.detect(data, hint).flatMap {
+ case PoiTypes.doc =>
+ getDoc(data)
+ case PoiTypes.xls =>
+ getXls(data)
+ case PoiTypes.xlsx =>
+ getXlsx(data)
+ case PoiTypes.docx =>
+ getDocx(data)
+ case PoiTypes.msoffice =>
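+ // generic msoffice type: try the Word extractor first, fall back to Excel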
+ EitherT(getDoc[F](data))
+ .recoverWith({
+ case _ => EitherT(getXls[F](data))
+ })
+ .value
+ case PoiTypes.ooxml =>
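+ // generic ooxml type: try the docx extractor first, fall back to xlsx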
+ EitherT(getDocx[F](data))
+ .recoverWith({
+ case _ => EitherT(getXlsx[F](data))
+ })
+ .value
+ case mt =>
+ Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
+ }
+
+ def getDocx(is: InputStream): Either[Throwable, String] =
+ Try {
+ val xt = new XWPFWordExtractor(new XWPFDocument(is))
+ xt.getText.trim
+ }.toEither
+
+ def getDoc(is: InputStream): Either[Throwable, String] =
+ Try {
+ val xt = new WordExtractor(is)
+ xt.getText.trim
+ }.toEither
+
+ def getXlsx(is: InputStream): Either[Throwable, String] =
+ Try {
+ val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
+ xt.getText.trim
+ }.toEither
+
+ def getXls(is: InputStream): Either[Throwable, String] =
+ Try {
+ val xt = new ExcelExtractor(new HSSFWorkbook(is))
+ xt.getText.trim
+ }.toEither
+
+ def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
+
+ def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
+
+ def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
+
+ def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
+
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala
new file mode 100644
index 00000000..f3795fc5
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiTypes.scala
@@ -0,0 +1,16 @@
+package docspell.extract.poi
+
+import docspell.common.MimeType
+
+object PoiTypes {
+
+ val msoffice = MimeType.application("x-tika-msoffice")
+ val ooxml = MimeType.application("x-tika-ooxml")
+ val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
+ val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+ val xls = MimeType.application("vnd.ms-excel")
+ val doc = MimeType.application("msword")
+
+ val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
+
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala
new file mode 100644
index 00000000..e2b5757b
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala
@@ -0,0 +1,24 @@
+package docspell.extract.rtf
+
+import java.io.{ByteArrayInputStream, InputStream}
+
+import cats.implicits._
+import cats.effect.Sync
+import fs2.Stream
+import javax.swing.text.rtf.RTFEditorKit
+
+import scala.util.Try
+
+object RtfExtract {
+
+ def get(is: InputStream): Either[Throwable, String] =
+ Try {
+ val kit = new RTFEditorKit()
+ val doc = kit.createDefaultDocument()
+ kit.read(is, doc, 0)
+ doc.getText(0, doc.getLength).trim
+ }.toEither
+
+ def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
+ data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
index 0f400a13..8033200a 100644
--- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
+++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
@@ -1,9 +1,7 @@
package docspell.extract.ocr
import cats.effect.IO
-import docspell.common._
-import docspell.files._
-import docspell.extract.TestFiles
+import docspell.files.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {
@@ -30,13 +28,4 @@ object TextExtractionSuite extends SimpleTestSuite {
assertEquals(extract.trim, expect.trim)
}
-
- test("find mimetypes") {
- ExampleFiles.
- all.foreach { url =>
- TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
- map(mt => println(url.asString + ": " + mt.asString)).
- unsafeRunSync
- }
- }
}
diff --git a/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
new file mode 100644
index 00000000..00189e10
--- /dev/null
+++ b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala
@@ -0,0 +1,28 @@
+package docspell.extract.odf
+
+import cats.effect._
+import docspell.files.{ExampleFiles, TestFiles}
+import minitest.SimpleTestSuite
+
+object OdfExtractTest extends SimpleTestSuite {
+ val blocker = TestFiles.blocker
+ implicit val CS = TestFiles.CS
+
+ val files = List(
+ ExampleFiles.examples_sample_odt -> 6372,
+ ExampleFiles.examples_sample_ods -> 717
+ )
+
+ test("test extract from odt") {
+ files.foreach { case (file, len) =>
+ val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
+ val str1 = OdfExtract.get(is).fold(throw _, identity)
+ assertEquals(str1.length, len)
+
+ val data = file.readURL[IO](8192, blocker)
+ val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
+ assertEquals(str2, str1)
+ }
+ }
+
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
new file mode 100644
index 00000000..4d06be76
--- /dev/null
+++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala
@@ -0,0 +1,48 @@
+package docspell.extract.pdfbox
+
+import cats.effect._
+import docspell.files.{ExampleFiles, TestFiles}
+import minitest.SimpleTestSuite
+
+object PdfboxExtractTest extends SimpleTestSuite {
+ val blocker = TestFiles.blocker
+ implicit val CS = TestFiles.CS
+
+ val textPDFs = List(
+ ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
+ ExampleFiles.letter_en_pdf -> TestFiles.letterENText
+ )
+
+ test("extract text from text PDFs by inputstream") {
+ textPDFs.foreach {
+ case (file, txt) =>
+ val url = file.toJavaUrl.fold(sys.error, identity)
+ val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
+ val received = removeFormatting(str)
+ val expect = removeFormatting(txt)
+ assertEquals(received, expect)
+ }
+ }
+
+ test("extract text from text PDFs via Stream") {
+ textPDFs.foreach {
+ case (file, txt) =>
+ val data = file.readURL[IO](8192, blocker)
+ val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
+ val received = removeFormatting(str)
+ val expect = removeFormatting(txt)
+ assertEquals(received, expect)
+ }
+ }
+
+ test("extract text from image PDFs") {
+ val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
+
+ val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
+
+ assertEquals(str, "")
+ }
+
+ private def removeFormatting(str: String): String =
+ str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala
new file mode 100644
index 00000000..002755bc
--- /dev/null
+++ b/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala
@@ -0,0 +1,39 @@
+package docspell.extract.poi
+
+import cats.effect._
+import docspell.common.MimeTypeHint
+import docspell.files.{ExampleFiles, TestFiles}
+import minitest.SimpleTestSuite
+
+object PoiExtractTest extends SimpleTestSuite {
+ val blocker = TestFiles.blocker
+ implicit val CS = TestFiles.CS
+
+ val officeFiles = List(
+ ExampleFiles.examples_sample_doc -> 6241,
+ ExampleFiles.examples_sample_docx -> 6179,
+ ExampleFiles.examples_sample_xlsx -> 660,
+ ExampleFiles.examples_sample_xls -> 660
+ )
+
+ test("extract text from ms office files") {
+ officeFiles.foreach {
+ case (file, len) =>
+ val str1 = PoiExtract
+ .get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
+ .unsafeRunSync()
+ .fold(throw _, identity)
+
+ val str2 = PoiExtract
+ .get[IO](
+ file.readURL[IO](8192, blocker),
+ MimeTypeHint(Some(file.path.segments.last), None)
+ )
+ .unsafeRunSync()
+ .fold(throw _, identity)
+
+ assertEquals(str1, str2)
+ assertEquals(str1.length, len)
+ }
+ }
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala
new file mode 100644
index 00000000..699af486
--- /dev/null
+++ b/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala
@@ -0,0 +1,14 @@
+package docspell.extract.rtf
+
+import docspell.files.ExampleFiles
+import minitest.SimpleTestSuite
+
+object RtfExtractTest extends SimpleTestSuite {
+
+ test("extract text from rtf using java input-stream") {
+ val file = ExampleFiles.examples_sample_rtf
+ val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
+ val str = RtfExtract.get(is).fold(throw _, identity)
+ assertEquals(str.length, 7342)
+ }
+}
diff --git a/modules/files/src/main/scala/docspell/files/Dimension.scala b/modules/files/src/main/scala/docspell/files/Dimension.scala
new file mode 100644
index 00000000..2d1a1f4b
--- /dev/null
+++ b/modules/files/src/main/scala/docspell/files/Dimension.scala
@@ -0,0 +1,7 @@
+package docspell.files
+
+case class Dimension(width: Int, height: Int) {
+
+ def toAwtDimension: java.awt.Dimension =
+ new java.awt.Dimension(width, height)
+}
diff --git a/modules/files/src/main/scala/docspell/files/ImageSize.scala b/modules/files/src/main/scala/docspell/files/ImageSize.scala
new file mode 100644
index 00000000..21cd0180
--- /dev/null
+++ b/modules/files/src/main/scala/docspell/files/ImageSize.scala
@@ -0,0 +1,61 @@
+package docspell.files
+
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.file.Path
+
+import cats.implicits._
+import cats.effect._
+import fs2.Stream
+import javax.imageio.stream.{FileImageInputStream, ImageInputStream}
+import javax.imageio.{ImageIO, ImageReader}
+
+import scala.jdk.CollectionConverters._
+import scala.util.{Try, Using}
+
+object ImageSize {
+
+ /** Return the image size from its header without reading
+ * the whole image into memory.
+ */
+ def get(file: Path): Option[Dimension] =
+ Using(new FileImageInputStream(file.toFile))(getDimension).toOption.flatten
+
+ /** Return the image size from its header without reading
+ * the whole image into memory.
+ */
+ def get(in: InputStream): Option[Dimension] =
+ Option(ImageIO.createImageInputStream(in)).flatMap(getDimension)
+
+ /** Return the image size from its header without reading
+ * the whole image into memory.
+ */
+ def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = {
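+ // taking the first 768 bytes is assumed to be enough to read the size header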
+ data.take(768).compile.to(Array).map(ar => {
+ val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar))
+ if (iis == null) sys.error("no reader given for the array")
+ else getDimension(iis)
+ })
+ }
+
+ private def getDimension(in: ImageInputStream): Option[Dimension] =
+ ImageIO
+ .getImageReaders(in)
+ .asScala
+ .to(LazyList)
+ .collectFirst(Function.unlift { reader =>
+ val dim = getDimension(in, reader).toOption
+ reader.dispose()
+ dim
+ })
+
+ private def getDimension(
+ in: ImageInputStream,
+ reader: ImageReader
+ ): Either[Throwable, Dimension] =
+ Try {
+ reader.setInput(in)
+ val width = reader.getWidth(reader.getMinIndex)
+ val height = reader.getHeight(reader.getMinIndex)
+ Dimension(width, height)
+ }.toEither
+}
diff --git a/modules/files/src/test/resources/bombs/20K-gray.jpeg b/modules/files/src/test/resources/bombs/20K-gray.jpeg
new file mode 100644
index 00000000..4804bb10
Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-gray.jpeg differ
diff --git a/modules/files/src/test/resources/bombs/20K-gray.png b/modules/files/src/test/resources/bombs/20K-gray.png
new file mode 100644
index 00000000..66d8b0a4
Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-gray.png differ
diff --git a/modules/files/src/test/resources/bombs/20K-rgb.jpeg b/modules/files/src/test/resources/bombs/20K-rgb.jpeg
new file mode 100644
index 00000000..a4ef7bf6
Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-rgb.jpeg differ
diff --git a/modules/files/src/test/resources/bombs/20K-rgb.png b/modules/files/src/test/resources/bombs/20K-rgb.png
new file mode 100644
index 00000000..cf332e53
Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-rgb.png differ
diff --git a/modules/files/src/test/resources/letter-en.txt b/modules/files/src/test/resources/letter-en.txt
index 79bcca36..b7051bc4 100644
--- a/modules/files/src/test/resources/letter-en.txt
+++ b/modules/files/src/test/resources/letter-en.txt
@@ -2,18 +2,18 @@ Derek Jeter
123 Elm Ave.
-Treesville, ON MI1N 2P3
+Treesville, ON M1N 2P3
November 7, 2016
Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016
-Mr. M. Leat
+Mr. M. Leaf
Chief of Syrup Production
Old Sticky Pancake Company
456 Maple Lane
-Forest, ON 7TW8 9Y0
+Forest, ON 7W8 9Y0
Hemptown, September 3, 2019
Dear Mr. Leaf,
diff --git a/modules/extract/src/test/resources/logback.xml b/modules/files/src/test/resources/logback-test.xml
similarity index 71%
rename from modules/extract/src/test/resources/logback.xml
rename to modules/files/src/test/resources/logback-test.xml
index 5b0b6a44..fdc4bdf7 100644
--- a/modules/extract/src/test/resources/logback.xml
+++ b/modules/files/src/test/resources/logback-test.xml
@@ -3,12 +3,12 @@
<withJansi>true</withJansi>
- <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
+ <pattern>%highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
-
+
diff --git a/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala b/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala
new file mode 100644
index 00000000..ac3bce6b
--- /dev/null
+++ b/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala
@@ -0,0 +1,46 @@
+package docspell.files
+
+import cats.implicits._
+import cats.effect.{Blocker, IO}
+import minitest.SimpleTestSuite
+
+import scala.concurrent.ExecutionContext
+import scala.util.Using
+
+object ImageSizeTest extends SimpleTestSuite {
+ val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
+ implicit val CS = IO.contextShift(ExecutionContext.global)
+
+ //tiff files are not supported on the jdk by default
+ //requires an external library
+ val files = List(
+ ExampleFiles.camera_letter_en_jpg -> Dimension(1695, 2378),
+ ExampleFiles.camera_letter_en_png -> Dimension(1695, 2378),
+// ExampleFiles.camera_letter_en_tiff -> Dimension(1695, 2378),
+ ExampleFiles.scanner_jfif_jpg -> Dimension(2480, 3514),
+ ExampleFiles.bombs_20K_gray_jpeg -> Dimension(20000, 20000),
+ ExampleFiles.bombs_20K_gray_png -> Dimension(20000, 20000),
+ ExampleFiles.bombs_20K_rgb_jpeg -> Dimension(20000, 20000),
+ ExampleFiles.bombs_20K_rgb_png -> Dimension(20000, 20000)
+ )
+
+ test("get sizes from input-stream") {
+ files.foreach {
+ case (uri, expect) =>
+ val url = uri.toJavaUrl.fold(sys.error, identity)
+ Using.resource(url.openStream()) { in =>
+ val dim = ImageSize.get(in)
+ assertEquals(dim, expect.some)
+ }
+ }
+ }
+
+ test("get sizes from stream") {
+ files.foreach {
+ case (uri, expect) =>
+ val stream = uri.readURL[IO](8192, blocker)
+ val dim = ImageSize.get(stream).unsafeRunSync()
+ assertEquals(dim, expect.some)
+ }
+ }
+}
diff --git a/modules/extract/src/test/scala/docspell/extract/TestFiles.scala b/modules/files/src/test/scala/docspell/files/TestFiles.scala
similarity index 72%
rename from modules/extract/src/test/scala/docspell/extract/TestFiles.scala
rename to modules/files/src/test/scala/docspell/files/TestFiles.scala
index 9c5637e3..1ee01c9a 100644
--- a/modules/extract/src/test/scala/docspell/extract/TestFiles.scala
+++ b/modules/files/src/test/scala/docspell/files/TestFiles.scala
@@ -1,8 +1,7 @@
-package docspell.extract
+package docspell.files
-import fs2.Stream
import cats.effect.{Blocker, IO}
-import docspell.files._
+import fs2.Stream
import scala.concurrent.ExecutionContext
@@ -12,19 +11,19 @@ object TestFiles {
val letterSourceDE: Stream[IO, Byte] =
ExampleFiles.letter_de_pdf
- .readURL[IO](16 * 1024, blocker)
+ .readURL[IO](8 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
ExampleFiles.letter_en_pdf
- .readURL[IO](16 * 1024, blocker)
+ .readURL[IO](8 * 1024, blocker)
lazy val letterDEText =
ExampleFiles.letter_de_txt
- .readText[IO](16 * 1024, blocker)
+ .readText[IO](8 * 1024, blocker)
.unsafeRunSync
lazy val letterENText =
ExampleFiles.letter_en_txt
- .readText[IO](16 * 1024, blocker)
+ .readText[IO](8 * 1024, blocker)
.unsafeRunSync
}
diff --git a/modules/microsite/docs/dev/adr.md b/modules/microsite/docs/dev/adr.md
index 43840acb..285571da 100644
--- a/modules/microsite/docs/dev/adr.md
+++ b/modules/microsite/docs/dev/adr.md
@@ -11,3 +11,8 @@ title: ADRs
- [0004 ISO8601 vs Unix](adr/0004_iso8601vsEpoch)
- [0005 Job Executor](adr/0005_job-executor)
- [0006 More File Types](adr/0006_more-file-types)
+ - [0007 Convert HTML](adr/0007_convert_html_files)
+ - [0008 Convert Text](adr/0008_convert_plain_text)
+ - [0009 Convert Office Files](adr/0009_convert_office_docs)
+ - [0010 Convert Image Files](adr/0010_convert_image_files)
+ - [0011 Extract Text](adr/0011_extract_text)
diff --git a/modules/microsite/docs/dev/adr/0006_more-file-types.md b/modules/microsite/docs/dev/adr/0006_more-file-types.md
index 6c433051..08a7104b 100644
--- a/modules/microsite/docs/dev/adr/0006_more-file-types.md
+++ b/modules/microsite/docs/dev/adr/0006_more-file-types.md
@@ -112,7 +112,7 @@ If conversion is not supported for the input file, it is skipped. If
conversion fails, the error is propagated to let the retry mechanism
take care.
-### What types?
+#### What types?
Which file types should be supported? At a first step, all major
office documents, common images, plain text (i.e. markdown) and html
@@ -123,6 +123,12 @@ There is always the preference to use jvm internal libraries in order
to be more platform independent and to reduce external dependencies.
But this is not always possible (like doing OCR).
+
+
+
+
+#### Conversion
+
- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
unoconv (see [ADR 9](0009_convert_office_docs))
- HTML (`html`): wkhtmltopdf (see [ADR 7](0007_convert_html_files))
@@ -130,9 +136,19 @@ But this is not always possible (like doing OCR).
- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
10](0010_convert_image_files))
+#### Text Extraction
+
+- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache Poi
+- Office documents (`odt`, `ods`): Apache Tika (including the sources)
+- HTML: not supported, extract text from converted PDF
+- Images (`jpg`, `png`, `tif`): Tesseract
+- Text/Markdown: n.a.
+- PDF: Apache PDFBox or Tesseract (see the sketch below)
+
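+A rough sketch of this mapping in code (`extractorFor` is
+illustrative only; the real modules expose one extractor object per
+format, e.g. `PoiExtract` and `PdfboxExtract`):
+
+```scala
+import docspell.common.MimeType
+import docspell.extract.poi.PoiTypes
+
+// hypothetical dispatch on the detected mime type
+def extractorFor(mt: MimeType): String =
+  if (PoiTypes.all.contains(mt)) "poi"
+  else if (mt == MimeType.application("pdf")) "pdfbox or tesseract"
+  else "tesseract (ocr)"
+```
+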
## Links
* [Convert HTML Files](0007_convert_html_files)
* [Convert Plain Text](0008_convert_plain_text)
* [Convert Office Documents](0009_convert_office_docs)
* [Convert Image Files](0010_convert_image_files)
+* [Extract Text from Files](0011_extract_text)
diff --git a/modules/microsite/docs/dev/adr/0011_extract_text.md b/modules/microsite/docs/dev/adr/0011_extract_text.md
new file mode 100644
index 00000000..c90736b6
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0011_extract_text.md
@@ -0,0 +1,77 @@
+---
+layout: docs
+title: Extract Text from Files
+---
+
+# Extract Text from Files
+
+## Context and Problem Statement
+
+With support for more file types there must be a way to extract text
+from all of them. It is better to extract text from the source files
+than from the converted PDF file.
+
+There are multiple options and multiple file types. Again, the
+priority is to use a java/scala library in order to reduce external
+dependencies.
+
+## Considered Options
+
+### MS Office Documents
+
+There is only one library I know of: [Apache
+POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
+However, it doesn't support the OpenDocument format (`odt` and `ods`).
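+
+A minimal sketch of POI-based extraction (mirroring what the
+`PoiExtract` code in this change does; error handling omitted):
+
+```scala
+import java.io.InputStream
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor
+import org.apache.poi.xwpf.usermodel.XWPFDocument
+
+def docxText(is: InputStream): String =
+  new XWPFWordExtractor(new XWPFDocument(is)).getText.trim
+```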
+
+### OpenDocument Format
+
+There are two libraries:
+
+- [Apache Tika Parser](https://tika.apache.org/)
+- [ODFToolkit](https://github.com/tdf/odftoolkit)
+
+*Tika:* The tika-parsers package contains an opendocument parser for
+extracting text. But it has a huge dependency tree, since it is a
+super-package containing a parser for almost every common file type.
+
+*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
+and also pulls in quite a few dependencies (though not as many as
+tika-parsers). That is not too bad, since it is a library for
+manipulating opendocument files. But all I need is to extract text. I
+created tests that extracted text from my odt/ods files. It worked at
+first, but running the tests in a loop resulted in strange
+NullPointerExceptions (only the first run succeeded).
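+
+With Tika's opendocument parser sources copied into this module,
+extraction is a few lines (this mirrors the `OdfExtract` code in this
+change):
+
+```scala
+import java.io.InputStream
+import org.apache.tika.metadata.Metadata
+import org.apache.tika.parser.ParseContext
+import org.apache.tika.parser.odf.OpenDocumentParser
+import org.apache.tika.sax.BodyContentHandler
+
+def odfText(is: InputStream): String = {
+  val handler = new BodyContentHandler()
+  new OpenDocumentParser().parse(is, handler, new Metadata(), new ParseContext())
+  handler.toString.trim
+}
+```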
+
+### Richtext
+
+Richtext is supported by the JDK (using `RTFEditorKit` from
+Swing).
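+
+A minimal sketch using only the JDK (as the `RtfExtract` code in this
+change does):
+
+```scala
+import java.io.InputStream
+import javax.swing.text.rtf.RTFEditorKit
+
+def rtfText(is: InputStream): String = {
+  val kit = new RTFEditorKit()
+  val doc = kit.createDefaultDocument()
+  kit.read(is, doc, 0)
+  doc.getText(0, doc.getLength).trim
+}
+```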
+
+### PDF
+
+For "image" pdf files, tesseract is used. For "text" PDF files, the
+library [Apache PDFBox](https://pdfbox.apache.org) can be used.
+
+There also is [iText](https://github.com/itext/itext7) with a AGPL
+license.
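+
+Text extraction with PDFBox is equally short (mirroring the
+`PdfboxExtract` code in this change):
+
+```scala
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.text.PDFTextStripper
+
+def pdfText(doc: PDDocument): String = {
+  val stripper = new PDFTextStripper()
+  stripper.setAddMoreFormatting(true)
+  stripper.setLineSeparator("\n")
+  stripper.getText(doc).trim
+}
+```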
+
+### Images
+
+For images and "image" PDF files, there is already tesseract in place.
+
+### HTML
+
+HTML must be converted into a PDF file before text can be extracted.
+
+### Text/Markdown
+
+These files can be used as-is, obviously.
+
+
+## Decision Outcome
+
+- MS Office files: POI library
+- Open Document files: Tika, but only by integrating the few source
+  files that make up its open document parser. Due to its huge
+  dependency tree, the library itself is not added as a dependency.
+- PDF: Apache PDFBox. I know this library better than iText.
diff --git a/modules/microsite/docs/dev/adr/img/process-files.png b/modules/microsite/docs/dev/adr/img/process-files.png
new file mode 100644
index 00000000..455b1a13
Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/process-files.png differ
diff --git a/modules/microsite/docs/dev/adr/process-files.puml b/modules/microsite/docs/dev/adr/process-files.puml
new file mode 100644
index 00000000..2c5330cd
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/process-files.puml
@@ -0,0 +1,43 @@
+@startuml
+scale 1200 width
+title: Processing Files
+skinparam monochrome true
+skinparam backgroundColor white
+skinparam rectangle {
+ roundCorner<> 25
+ roundCorner<