Adding extraction primitives

Eike Kettner
2020-02-16 21:37:26 +01:00
parent 851ee7ef0f
commit 8143a4edcc
46 changed files with 2731 additions and 83 deletions

View File

@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
/**
* Content handler decorator that:<ul>
 * <li>Maps old OpenOffice 1.0 namespaces to the OpenDocument ones</li>
 * <li>Returns a fake DTD when the parser requests the OpenOffice DTD</li>
* </ul>
*/
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
"http://openoffice.org/2000/";
private static final String NEW_NS =
"urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
"-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
}
private String mapOldNS(String ns) {
if (ns != null && ns.startsWith(OLD_NS)) {
return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
} else {
return ns;
}
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes atts) throws SAXException {
AttributesImpl natts = new AttributesImpl();
for (int i = 0; i < atts.getLength(); i++) {
natts.addAttribute(
mapOldNS(atts.getURI(i)), atts.getLocalName(i),
atts.getQName(i), atts.getType(i), atts.getValue(i));
}
// forward the namespace-mapped attributes (natts) built above, not the originals
super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
}
@Override
public void endElement(String namespaceURI, String localName, String qName)
throws SAXException {
super.endElement(mapOldNS(namespaceURI), localName, qName);
}
@Override
public void startPrefixMapping(String prefix, String uri)
throws SAXException {
super.startPrefixMapping(prefix, mapOldNS(uri));
}
/**
 * Do not load any DTDs (these may be requested by the parser). Fake the DTD by
 * returning an empty string as the InputSource.
*/
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws IOException, SAXException {
if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
|| DTD_PUBLIC_ID.equals(publicId)) {
return new InputSource(new StringReader(""));
} else {
return super.resolveEntity(publicId, systemId);
}
}
}
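
For illustration only, not part of this commit: a minimal sketch of wrapping a downstream SAX handler with the decorator above. NsMappingSketch and the literal namespace URI are assumptions for the example; the old OpenOffice URI is rewritten to urn:oasis:names:tc:opendocument:xmlns:text:1.0 before the wrapped handler sees it.

import org.apache.tika.parser.odf.NSNormalizerContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

public class NsMappingSketch {
    public static void main(String[] args) throws Exception {
        // any SAX ContentHandler can be wrapped; an empty DefaultHandler stands in here
        ContentHandler downstream = new DefaultHandler();
        NSNormalizerContentHandler normalizer = new NSNormalizerContentHandler(downstream);
        // "http://openoffice.org/2000/text" arrives downstream as the OpenDocument text namespace
        normalizer.startPrefixMapping("text", "http://openoffice.org/2000/text");
        normalizer.endPrefixMapping("text");
    }
}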

View File

@@ -0,0 +1,606 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
/**
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
private interface Style {
}
private static class TextStyle implements Style {
public boolean italic;
public boolean bold;
public boolean underlined;
@Override
public String toString() {
return "TextStyle{" +
"italic=" + italic +
", bold=" + bold +
", underlined=" + underlined +
'}';
}
}
private static class ListStyle implements Style {
public boolean ordered;
public String getTag() {
return ordered ? "ol" : "ul";
}
}
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
private static final char[] SPACE = new char[]{ ' '};
private static final String CLASS = "class";
private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
private static Attributes buildAttributes(String key, String value) {
AttributesImpl attrs = new AttributesImpl();
attrs.addAttribute("", key, key, "CDATA", value);
return attrs;
}
private final ContentHandler handler;
private final BitSet textNodeStack = new BitSet();
private int nodeDepth = 0;
private int completelyFiltered = 0;
private Stack<String> headingStack = new Stack<String>();
private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
private String currParagraphStyleName; //paragraph style name
private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
private String currTextStyleName;
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
private ListStyle listStyle;
// True if we are currently in the named style:
private boolean curUnderlined;
private boolean curBold;
private boolean curItalic;
//have we written the start style tags
//yet for the current text style
boolean hasWrittenStartStyleTags = false;
private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
//we need to track our pDepth and only output <p> if we're at the main level
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
super(handler, mappings);
this.handler = handler;
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
// only forward content of tags from text:-namespace
if (completelyFiltered == 0 && nodeDepth > 0
&& textNodeStack.get(nodeDepth - 1)) {
if (!hasWrittenStartStyleTags) {
updateStyleTags();
hasWrittenStartStyleTags = true;
}
super.characters(ch, start, length);
}
}
// helper for checking tags which need complete filtering
// (with sub-tags)
private boolean needsCompleteFiltering(
String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI)) {
return localName.endsWith("-template")
|| localName.endsWith("-style");
}
return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
}
// map the heading level to <hX> HTML tags
private String getXHTMLHeaderTagName(Attributes atts) {
String depthStr = atts.getValue(TEXT_NS, "outline-level");
if (depthStr == null) {
return "h1";
}
int depth = Integer.parseInt(depthStr);
if (depth >= 6) {
return "h6";
} else if (depth <= 1) {
return "h1";
} else {
return "h" + depth;
}
}
/**
* Check if a node is a text node
*/
private boolean isTextNode(String namespaceURI, String localName) {
if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
return true;
}
if (SVG_NS.equals(namespaceURI)) {
return "title".equals(localName) ||
"desc".equals(localName);
}
return false;
}
private void startList(String name) throws SAXException {
String elementName = "ul";
if (name != null) {
ListStyle style = listStyleMap.get(name);
elementName = style != null ? style.getTag() : "ul";
listStyleStack.push(style);
}
handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
}
private void endList() throws SAXException {
String elementName = "ul";
if (!listStyleStack.isEmpty()) {
ListStyle style = listStyleStack.pop();
elementName = style != null ? style.getTag() : "ul";
}
handler.endElement(XHTML, elementName, elementName);
}
private void startSpan(String name) throws SAXException {
if (name == null) {
return;
}
currTextStyle = textStyleMap.get(name);
hasWrittenStartStyleTags = false;
}
private void startParagraph(String styleName) throws SAXException {
if (pDepth == 0) {
handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
if (styleName != null) {
currTextStyle = paragraphTextStyleMap.get(styleName);
}
hasWrittenStartStyleTags = false;
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth++;
}
private void endParagraph() throws SAXException {
closeStyleTags();
if (pDepth == 1) {
handler.endElement(XHTML, "p", "p");
} else {
handler.characters(SPACE, 0, SPACE.length);
}
pDepth--;
}
private void updateStyleTags() throws SAXException {
if (currTextStyle == null) {
closeStyleTags();
return;
}
if (currTextStyle.bold != curBold) {
// Enforce nesting -- must close u and i tags first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML, "i", "i");
curItalic = false;
}
if (currTextStyle.bold) {
handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "b", "b");
}
curBold = currTextStyle.bold;
}
if (currTextStyle.italic != curItalic) {
// Enforce nesting -- must close u tag first
if (curUnderlined) {
handler.endElement(XHTML, "u", "u");
curUnderlined = false;
}
if (currTextStyle.italic) {
handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "i", "i");
}
curItalic = currTextStyle.italic;
}
if (currTextStyle.underlined != curUnderlined) {
if (currTextStyle.underlined) {
handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
} else {
handler.endElement(XHTML, "u", "u");
}
curUnderlined = currTextStyle.underlined;
}
}
private void endSpan() throws SAXException {
updateStyleTags();
}
private void closeStyleTags() throws SAXException {
// Close any still open style tags
if (curUnderlined) {
handler.endElement(XHTML,"u", "u");
curUnderlined = false;
}
if (curItalic) {
handler.endElement(XHTML,"i", "i");
curItalic = false;
}
if (curBold) {
handler.endElement(XHTML,"b", "b");
curBold = false;
}
currTextStyle = null;
hasWrittenStartStyleTags = false;
}
@Override
public void startElement(
String namespaceURI, String localName, String qName,
Attributes attrs) throws SAXException {
// Keep track of the current node type. If it is a text node,
// a bit at the current depth is set in textNodeStack.
// characters() checks the top bit to determine whether the
// current node is a text node whose content should be printed.
// nodeDepth contains the depth of the current node and also
// marks the top of the stack.
assert nodeDepth >= 0;
// Set styles
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
String family = attrs.getValue(STYLE_NS, "family");
if ("text".equals(family)) {
currTextStyle = new TextStyle();
currTextStyleName = attrs.getValue(STYLE_NS, "name");
} else if ("paragraph".equals(family)) {
currTextStyle = new TextStyle();
currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = new ListStyle();
String name = attrs.getValue(STYLE_NS, "name");
listStyleMap.put(name, listStyle);
} else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
&& "text-properties".equals(localName)) {
String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
currTextStyle.italic = true;
}
String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
|| (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
&& Integer.valueOf(fontWeight) > 500)) {
currTextStyle.bold = true;
}
String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
if (underlineStyle != null && !underlineStyle.equals("none")) {
currTextStyle.underlined = true;
}
} else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
if ("list-level-style-bullet".equals(localName)) {
listStyle.ordered = false;
} else if ("list-level-style-number".equals(localName)) {
listStyle.ordered = true;
}
}
textNodeStack.set(nodeDepth++,
isTextNode(namespaceURI, localName));
// filter *all* content of some tags
assert completelyFiltered >= 0;
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered++;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h elements, which are passed directly
// to the downstream handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
startList(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
startSpan(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
startParagraph(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
handler.characters(SPACE, 0, 1);
} else if ("annotation".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
} else if ("note".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
} else if ("notes".equals(localName)) {
closeStyleTags();
handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
}
}
@Override
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
if (currTextStyle != null && currTextStyleName != null) {
textStyleMap.put(currTextStyleName, currTextStyle);
currTextStyleName = null;
currTextStyle = null;
} else if (currTextStyle != null && currParagraphStyleName != null) {
paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
currParagraphStyleName = null;
currTextStyle = null;
}
} else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
listStyle = null;
}
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h elements, which are passed directly
// to the downstream handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
handler.endElement(XHTMLContentHandler.XHTML, el, el);
} else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
endList();
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
currTextStyle = null;
hasWrittenStartStyleTags = false;
} else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
endParagraph();
} else if ("annotation".equals(localName) || "note".equals(localName) ||
"notes".equals(localName)) {
closeStyleTags();
handler.endElement("", localName, localName);
} else {
super.endElement(namespaceURI, localName, qName);
}
// special handling of tabulators
if (TEXT_NS.equals(namespaceURI)
&& ("tab-stop".equals(localName)
|| "tab".equals(localName))) {
this.characters(TAB, 0, TAB.length);
}
}
// revert filter for *all* content of some tags
if (needsCompleteFiltering(namespaceURI, localName)) {
completelyFiltered--;
}
assert completelyFiltered >= 0;
// reduce current node depth
nodeDepth--;
assert nodeDepth >= 0;
}
@Override
public void startPrefixMapping(String prefix, String uri) {
// remove prefix mappings as they should not occur in XHTML
}
@Override
public void endPrefixMapping(String prefix) {
// remove prefix mappings as they should not occur in XHTML
}
}
public static final String TEXT_NS =
"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
"urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
"urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
"urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
"urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
protected static final char[] TAB = new char[]{'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
* Mappings between ODF tag names and XHTML tag names
 * (including attributes). All other tag names/attributes are ignored
 * and left out of the event stream.
*/
private static final HashMap<QName, TargetElement> MAPPINGS =
new HashMap<QName, TargetElement>();
static {
// general mappings of text:-tags
MAPPINGS.put(
new QName(TEXT_NS, "p"),
new TargetElement(XHTML, "p"));
// text:h-tags are mapped specifically in startElement/endElement
MAPPINGS.put(
new QName(TEXT_NS, "line-break"),
new TargetElement(XHTML, "br"));
MAPPINGS.put(
new QName(TEXT_NS, "list-item"),
new TargetElement(XHTML, "li"));
MAPPINGS.put(
new QName(TEXT_NS, "note"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(OFFICE_NS, "annotation"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(PRESENTATION_NS, "notes"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(DRAW_NS, "object"),
new TargetElement(XHTML, "object"));
MAPPINGS.put(
new QName(DRAW_NS, "text-box"),
new TargetElement(XHTML, "div"));
MAPPINGS.put(
new QName(SVG_NS, "title"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(SVG_NS, "desc"),
new TargetElement(XHTML, "span"));
MAPPINGS.put(
new QName(TEXT_NS, "span"),
new TargetElement(XHTML, "span"));
final HashMap<QName, QName> aAttsMapping =
new HashMap<QName, QName>();
aAttsMapping.put(
new QName(XLINK_NS, "href"),
new QName("href"));
aAttsMapping.put(
new QName(XLINK_NS, "title"),
new QName("title"));
MAPPINGS.put(
new QName(TEXT_NS, "a"),
new TargetElement(XHTML, "a", aAttsMapping));
// create HTML tables from table:-tags
MAPPINGS.put(
new QName(TABLE_NS, "table"),
new TargetElement(XHTML, "table"));
// repeating of rows is ignored; for columns, see below!
MAPPINGS.put(
new QName(TABLE_NS, "table-row"),
new TargetElement(XHTML, "tr"));
// special mapping for rowspan/colspan attributes
final HashMap<QName, QName> tableCellAttsMapping =
new HashMap<QName, QName>();
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-spanned"),
new QName("colspan"));
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-rows-spanned"),
new QName("rowspan"));
/* TODO: The following is not correct, the cell should be repeated not spanned!
* Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
* Problems may occur when both spanning and repeating is given, which is not allowed by spec.
* Cell spanning instead of repeating is not a problem, because OpenOffice uses it
* only for empty cells.
*/
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-repeated"),
new QName("colspan"));
MAPPINGS.put(
new QName(TABLE_NS, "table-cell"),
new TargetElement(XHTML, "td", tableCellAttsMapping));
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet(); // not a top-level parser
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
new XHTMLContentHandler(handler, metadata),
metadata, context);
}
void parseInternal(
InputStream stream, final ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
new NSNormalizerContentHandler(dh)),
context);
}
}
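
A hedged usage sketch, not part of this commit: feeding a standalone content.xml through OpenDocumentContentParser and printing the XHTML it emits, so the p/b/i/u and list mappings above can be inspected. The class name OdfContentSketch and the file path are assumptions.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentContentParser;
import org.apache.tika.sax.ToXMLContentHandler;

public class OdfContentSketch {
    public static void main(String[] args) throws Exception {
        ToXMLContentHandler xhtml = new ToXMLContentHandler();
        try (InputStream in = Files.newInputStream(Paths.get("content.xml"))) {
            // parse() wraps the handler in an XHTMLContentHandler before delegating to parseInternal()
            new OpenDocumentContentParser().parse(in, xhtml, new Metadata(), new ParseContext());
        }
        System.out.println(xhtml.toString());
    }
}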

View File

@@ -0,0 +1,199 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.apache.tika.parser.xml.AttributeMetadataHandler;
import org.apache.tika.parser.xml.ElementMetadataHandler;
import org.apache.tika.parser.xml.MetadataHandler;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.xpath.CompositeMatcher;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
*/
public class OpenDocumentMetaParser extends XMLParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -8739250869531737584L;
private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
/**
* @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
Property.composite(Office.INITIAL_AUTHOR,
new Property[]{Property.externalText("initial-creator")});
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
new MatchingContentHandler(new MetadataHandler(md, property), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getUserDefined(
ContentHandler ch, Metadata md) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:user-defined/@meta:name"),
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
matcher);
return new TeeContentHandler(ch, branch);
}
@Deprecated
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
return new TeeContentHandler(ch, branch);
}
private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, Property property, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
ContentHandler branch = new MatchingContentHandler(
new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
return new TeeContentHandler(ch, branch);
}
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
// Process the Dublin Core Attributes
ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
// Process the user defined Meta Attributes
ch = getUserDefined(ch, md);
// Process the OO Statistics Attributes
ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
ch = getStatistic(ch, md, "nbPara", "paragraph-count");
ch = getStatistic(ch, md, "nbWord", "word-count");
ch = getStatistic(ch, md, "nbCharacter", "character-count");
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
@Override
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
}
}
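
A hedged sketch, not part of this commit: running a meta.xml stream through OpenDocumentMetaParser and reading back two of the properties it maps. OdfMetaSketch and the file path are assumptions.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentMetaParser;
import org.xml.sax.helpers.DefaultHandler;

public class OdfMetaSketch {
    public static void main(String[] args) throws Exception {
        Metadata metadata = new Metadata();
        try (InputStream in = Files.newInputStream(Paths.get("meta.xml"))) {
            // meta.xml carries no body text, so a throwaway SAX handler is enough
            new OpenDocumentMetaParser().parse(in, new DefaultHandler(), metadata, new ParseContext());
        }
        System.out.println(metadata.get(TikaCoreProperties.TITLE));
        System.out.println(metadata.get(TikaCoreProperties.CREATOR));
    }
}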

View File

@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.EndDocumentShieldingContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* OpenOffice parser
*/
public class OpenDocumentParser extends AbstractParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("vnd.sun.xml.writer"),
MediaType.application("vnd.oasis.opendocument.text"),
MediaType.application("vnd.oasis.opendocument.graphics"),
MediaType.application("vnd.oasis.opendocument.presentation"),
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
MediaType.application("vnd.oasis.opendocument.chart"),
MediaType.application("vnd.oasis.opendocument.image"),
MediaType.application("vnd.oasis.opendocument.formula"),
MediaType.application("vnd.oasis.opendocument.text-master"),
MediaType.application("vnd.oasis.opendocument.text-web"),
MediaType.application("vnd.oasis.opendocument.text-template"),
MediaType.application("vnd.oasis.opendocument.graphics-template"),
MediaType.application("vnd.oasis.opendocument.presentation-template"),
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("vnd.oasis.opendocument.chart-template"),
MediaType.application("vnd.oasis.opendocument.image-template"),
MediaType.application("vnd.oasis.opendocument.formula-template"),
MediaType.application("x-vnd.oasis.opendocument.text"),
MediaType.application("x-vnd.oasis.opendocument.graphics"),
MediaType.application("x-vnd.oasis.opendocument.presentation"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
MediaType.application("x-vnd.oasis.opendocument.chart"),
MediaType.application("x-vnd.oasis.opendocument.image"),
MediaType.application("x-vnd.oasis.opendocument.formula"),
MediaType.application("x-vnd.oasis.opendocument.text-master"),
MediaType.application("x-vnd.oasis.opendocument.text-web"),
MediaType.application("x-vnd.oasis.opendocument.text-template"),
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
public Parser getMetaParser() {
return meta;
}
public void setMetaParser(Parser meta) {
this.meta = meta;
}
public Parser getContentParser() {
return content;
}
public void setContentParser(Parser content) {
this.content = content;
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
ZipInputStream zipStream = null;
if (stream instanceof TikaInputStream) {
TikaInputStream tis = (TikaInputStream) stream;
Object container = ((TikaInputStream) stream).getOpenContainer();
if (container instanceof ZipFile) {
zipFile = (ZipFile) container;
} else if (tis.hasFile()) {
zipFile = new ZipFile(tis.getFile());
} else {
zipStream = new ZipInputStream(stream);
}
} else {
zipStream = new ZipInputStream(stream);
}
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
if (zipFile != null) {
try {
handleZipFile(zipFile, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipFile.close();
}
} else {
try {
handleZipStream(zipStream, metadata, context, handler);
} finally {
//Do we want to close silently == catch an exception here?
zipStream.close();
}
}
// Only now call the end document
if (handler.getEndDocumentWasCalled()) {
handler.reallyEndDocument();
}
}
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
if (entry == null) {
throw new IOException("No entries found in ZipInputStream");
}
do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
} while (entry != null);
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, TikaException, SAXException {
// If we can, process the metadata first, then the
// rest of the file afterwards (TIKA-1353)
// Only possible to guarantee that when opened from a file not a stream
ZipEntry entry = zipFile.getEntry(META_NAME);
if (entry != null) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
}
}
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
ParseContext context, EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, UTF_8);
metadata.set(Metadata.CONTENT_TYPE, type);
} else if (entry.getName().equals(META_NAME)) {
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith("content.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else if (entry.getName().endsWith("styles.xml")) {
if (content instanceof OpenDocumentContentParser) {
((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
} else {
// Foreign content parser was set:
content.parse(zip, handler, metadata, context);
}
} else {
String embeddedName = entry.getName();
//scrape everything under Thumbnails/ and Pictures/
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
EmbeddedDocumentExtractor embeddedDocumentExtractor =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
/* if (embeddedName.startsWith("Thumbnails/")) {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
}*/
if (embeddedName.contains("Pictures/")) {
embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
}
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor.parseEmbedded(zip,
new EmbeddedContentHandler(handler), embeddedMetadata, false);
}
}
}
}
}
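
A hedged end-to-end sketch, not part of this commit: parsing a full .odt package with OpenDocumentParser, which routes meta.xml, content.xml and styles.xml to the parsers above. OdtExtractSketch and sample.odt are assumptions.

import java.io.File;
import java.io.InputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.sax.BodyContentHandler;

public class OdtExtractSketch {
    public static void main(String[] args) throws Exception {
        OpenDocumentParser parser = new OpenDocumentParser();
        Metadata metadata = new Metadata();
        BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
        // a TikaInputStream backed by a file lets the parser open the package as a ZipFile (TIKA-1353)
        try (InputStream in = TikaInputStream.get(new File("sample.odt"))) {
            parser.parse(in, handler, metadata, new ParseContext());
        }
        System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        System.out.println(handler.toString());
    }
}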

View File

@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.helpers.DefaultHandler;
import java.util.Arrays;
import java.util.List;
/**
* Base class for SAX handlers that map SAX events into document metadata.
*
* @since Apache Tika 0.10
*/
class AbstractMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
protected AbstractMetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
protected AbstractMetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
/**
* Adds the given metadata value. The value is ignored if it is
 * <code>null</code> or empty. If the metadata entry already exists, the
 * value is added to it as an additional value; duplicates are skipped and
 * single-valued properties are overwritten instead.
*
* @param value metadata value
*/
protected void addMetadata(String value) {
if (value != null && value.length() > 0) {
if (metadata.isMultiValued(name)) {
// Add the value, assuming it's not already there
List<String> previous = Arrays.asList(metadata.getValues(name));
if (!previous.contains(value)) {
if (property != null) {
metadata.add(property, value);
} else {
metadata.add(name, value);
}
}
} else {
// Set the value, assuming it's not already there
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
if (!previous.equals(value)) {
if (property != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
// Replace the existing value if isMultiValuePermitted is false
metadata.set(property, value);
}
} else {
metadata.add(name, value);
}
}
} else {
if (property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
}
}
}

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds a Metadata entry for a given node.
* The textual content of the node is used as the
* value, and the Metadata name is taken from
* an attribute, with a prefix if required.
*/
public class AttributeDependantMetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final String nameHoldingAttribute;
private final String namePrefix;
private String name;
private final StringBuilder buffer = new StringBuilder();
public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
this.metadata = metadata;
this.nameHoldingAttribute = nameHoldingAttribute;
this.namePrefix = namePrefix;
}
public void addMetadata(String value) {
if(name == null || name.length() == 0) {
// We didn't find the attribute which holds the name
return;
}
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
metadata.set(name, value);
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
String rawName = attributes.getValue(nameHoldingAttribute);
if (rawName != null) {
if (namePrefix == null) {
this.name = rawName;
} else {
this.name = namePrefix + rawName;
}
}
// All other attributes are ignored
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}
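
A hedged sketch, not part of this commit, of the behaviour this handler exists for: a user-defined ODF meta entry parsed with a plain JAXP SAX parser ends up under the custom: prefix. UserDefinedMetaSketch and the sample values are assumptions.

import java.io.StringReader;
import javax.xml.parsers.SAXParserFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
import org.xml.sax.InputSource;

public class UserDefinedMetaSketch {
    public static void main(String[] args) throws Exception {
        String xml = "<meta:user-defined xmlns:meta='urn:oasis:names:tc:opendocument:xmlns:meta:1.0'"
                + " meta:name='Info1'>Text1</meta:user-defined>";
        Metadata metadata = new Metadata();
        AttributeDependantMetadataHandler handler = new AttributeDependantMetadataHandler(
                metadata, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX);
        // the name-holding attribute is looked up by qualified name, so the default parser settings suffice
        SAXParserFactory.newInstance().newSAXParser()
                .parse(new InputSource(new StringReader(xml)), handler);
        System.out.println(metadata.get(Metadata.USER_DEFINED_METADATA_NAME_PREFIX + "Info1")); // Text1
    }
}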

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
* SAX event handler that maps the contents of an XML attribute into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class AttributeMetadataHandler extends AbstractMetadataHandler {
private final String uri;
private final String localName;
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
}
public AttributeMetadataHandler(
String uri, String localName, Metadata metadata, Property property) {
super(metadata, property);
this.uri = uri;
this.localName = localName;
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
for (int i = 0; i < attributes.getLength(); i++) {
if (attributes.getURI(i).equals(this.uri)
&& attributes.getLocalName(i).equals(this.localName)) {
addMetadata(attributes.getValue(i).trim());
}
}
}
}

View File

@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
/**
* Dublin Core metadata parser
*/
public class DcXMLParser extends XMLParser {
/** Serial version UID */
private static final long serialVersionUID = 4905318835463880819L;
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TeeContentHandler(
super.getContentHandler(handler, metadata, context),
getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
}
}

View File

@@ -0,0 +1,241 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import java.util.Arrays;
/**
* SAX event handler that maps the contents of an XML element into
* a metadata field.
*
* @since Apache Tika 0.10
*/
public class ElementMetadataHandler extends AbstractMetadataHandler {
private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);
private static final String LOCAL_NAME_RDF_BAG = "Bag";
private static final String LOCAL_NAME_RDF_LI = "li";
private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private final String uri;
private final String localName;
private final Metadata metadata;
private final String name;
private Property targetProperty;
private final boolean allowDuplicateValues;
private final boolean allowEmptyValues;
/**
* The buffer used to capture characters when inside a bag li element.
*/
private final StringBuilder bufferBagged = new StringBuilder();
/**
* The buffer used to capture characters inside standard elements.
*/
private final StringBuilder bufferBagless = new StringBuilder();
/**
* Whether or not the value was found in a standard element structure or inside a bag.
*/
private boolean isBagless = true;
private int matchLevel = 0;
private int parentMatchLevel = 0;
/**
* Constructor for string metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for string metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param name the Tika metadata field key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, name);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.name = name;
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created simple handler for {}", this.name);
}
/**
* Constructor for Property metadata keys.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = false;
this.allowEmptyValues = false;
LOG.trace("created property handler for {}", this.name);
}
/**
* Constructor for Property metadata keys which allows change of behavior
* for duplicate and empty entry values.
*
* @param uri the uri of the namespace of the element
* @param localName the local name of the element
* @param metadata the Tika metadata object to populate
* @param targetProperty the Tika metadata Property key
* @param allowDuplicateValues add duplicate values to the Tika metadata
* @param allowEmptyValues add empty values to the Tika metadata
*/
public ElementMetadataHandler(
String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
super(metadata, targetProperty);
this.uri = uri;
this.localName = localName;
this.metadata = metadata;
this.targetProperty = targetProperty;
this.name = targetProperty.getName();
this.allowDuplicateValues = allowDuplicateValues;
this.allowEmptyValues = allowEmptyValues;
LOG.trace("created property handler for {}", this.name);
}
protected boolean isMatchingParentElement(String uri, String localName) {
return (uri.equals(this.uri) && localName.equals(this.localName));
}
protected boolean isMatchingElement(String uri, String localName) {
// match if we're inside the parent element or within some bag element
return (uri.equals(this.uri) && localName.equals(this.localName)) ||
(parentMatchLevel > 0 &&
((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
(uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
)
);
}
@Override
public void startElement(
String uri, String localName, String name, Attributes attributes) {
if (isMatchingElement(uri, localName)) {
matchLevel++;
}
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel++;
}
}
@Override
public void endElement(String uri, String localName, String name) {
if (isMatchingParentElement(uri, localName)) {
parentMatchLevel--;
}
if (isMatchingElement(uri, localName)) {
matchLevel--;
if (matchLevel == 2) {
// we're inside a bag li element, add the bagged buffer
addMetadata(bufferBagged.toString().trim());
bufferBagged.setLength(0);
isBagless = false;
}
if (matchLevel == 0 && isBagless) {
String valueBagless = bufferBagless.toString();
if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
// we're in a standard element, add the bagless buffer
addMetadata(valueBagless.trim());
bufferBagless.setLength(0);
}
isBagless = true;
}
}
}
@Override
public void characters(char[] ch, int start, int length) {
// We need to append to both buffers since we don't know whether we're inside a bag until we're done
if (parentMatchLevel > 0 && matchLevel > 2) {
bufferBagged.append(ch, start, length);
}
if (parentMatchLevel > 0 && matchLevel > 0) {
bufferBagless.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) {
characters(ch, start, length);
}
@Override
protected void addMetadata(String value) {
LOG.trace("adding {}={}", name, value);
if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
if ((value != null && value.length() > 0) || allowEmptyValues) {
if (value == null || value.length() == 0 && allowEmptyValues) {
value = "";
}
String[] previous = metadata.getValues(name);
if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
metadata.add(targetProperty, value);
}
}
} else {
super.addMetadata(value);
}
}
}
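
A minimal construction sketch (in Scala, matching the rest of this commit) showing the two boolean flags described in the constructor javadoc above; the Dublin Core title element and TikaCoreProperties.TITLE target are illustrative choices, not something this commit wires up.

import org.apache.tika.metadata.{Metadata, TikaCoreProperties}
import org.apache.tika.parser.xml.ElementMetadataHandler

object ElementMetadataHandlerSketch {
  // collect dc:title values into TikaCoreProperties.TITLE,
  // dropping duplicates and empty strings (both flags false)
  def titleHandler(metadata: Metadata): ElementMetadataHandler =
    new ElementMetadataHandler(
      "http://purl.org/dc/elements/1.1/",
      "title",
      metadata,
      TikaCoreProperties.TITLE,
      false, // allowDuplicateValues
      false  // allowEmptyValues
    )
}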

View File

@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;
public class FictionBookParser extends XMLParser {
private static final long serialVersionUID = 4195954546491524374L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-fictionbook+xml"));
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
return new BinaryElementsDataHandler(
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
}
private static class BinaryElementsDataHandler extends DefaultHandler {
private static final String ELEMENT_BINARY = "binary";
private boolean binaryMode = false;
private static final String ATTRIBUTE_ID = "id";
private final EmbeddedDocumentExtractor partExtractor;
private final ContentHandler handler;
private final StringBuilder binaryData = new StringBuilder();
private Metadata metadata;
private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";
private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
this.partExtractor = partExtractor;
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
binaryMode = ELEMENT_BINARY.equals(localName);
if (binaryMode) {
binaryData.setLength(0);
metadata = new Metadata();
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (binaryMode) {
try {
partExtractor.parseEmbedded(
new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
handler,
metadata,
true
);
} catch (IOException e) {
throw new SAXException("IOException in parseEmbedded", e);
}
binaryMode = false;
binaryData.setLength(0);
}
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (!binaryMode) {
handler.characters(ch, start, length);
} else {
binaryData.append(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
handler.ignorableWhitespace(ch, start, length);
}
}
}

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
/**
* This adds Metadata entries with a specified name for
* the textual content of a node (if present), and
* all attribute values passed through the matcher
* (but not their names).
*
* @deprecated Use the {@link AttributeMetadataHandler} and
* {@link ElementMetadataHandler} classes instead
*/
public class MetadataHandler extends DefaultHandler {
private final Metadata metadata;
private final Property property;
private final String name;
private final StringBuilder buffer = new StringBuilder();
public MetadataHandler(Metadata metadata, String name) {
this.metadata = metadata;
this.property = null;
this.name = name;
}
public MetadataHandler(Metadata metadata, Property property) {
this.metadata = metadata;
this.property = property;
this.name = property.getName();
}
public void addMetadata(String value) {
if (value.length() > 0) {
String previous = metadata.get(name);
if (previous != null && previous.length() > 0) {
value = previous + ", " + value;
}
if (this.property != null) {
metadata.set(property, value);
} else {
metadata.set(name, value);
}
}
}
public void endElement(String uri, String localName, String name) {
addMetadata(buffer.toString());
buffer.setLength(0);
}
public void startElement(
String uri, String localName, String name, Attributes attributes) {
for (int i = 0; i < attributes.getLength(); i++) {
addMetadata(attributes.getValue(i));
}
}
public void characters(char[] ch, int start, int length) {
buffer.append(ch, start, length);
}
}

View File

@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xml;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* XML parser.
*/
public class XMLParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = -6028836725280212837L;
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.application("xml"),
MediaType.image("svg+xml"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
if (metadata.get(Metadata.CONTENT_TYPE) == null) {
metadata.set(Metadata.CONTENT_TYPE, "application/xml");
}
final XHTMLContentHandler xhtml =
new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.startElement("p");
TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))), context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("XML parse error", e);
} finally {
xhtml.endElement("p");
xhtml.endDocument();
}
}
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
return new TextContentHandler(handler, true);
}
}

View File

@ -0,0 +1,29 @@
package docspell.extract
import docspell.common.MimeType
import scala.util.Try
sealed trait ExtractResult {
def textOption: Option[String]
}
object ExtractResult {
case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
val textOption = None
}
case class Failure(ex: Throwable) extends ExtractResult {
val textOption = None
}
case class Success(text: String) extends ExtractResult {
val textOption = Some(text)
}
def fromTry(r: Try[String]): ExtractResult =
r.fold(Failure.apply, Success.apply)
}
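
A small consumption sketch for this ADT; the extractSafe helper and the supported-type check are hypothetical and only illustrate how fromTry, UnsupportedFormat, and textOption fit together.

import docspell.common.MimeType
import docspell.extract.ExtractResult
import scala.util.Try

object ExtractResultSketch {
  // hypothetical helper: run a throwing extraction and capture the outcome,
  // short-circuiting to UnsupportedFormat for unknown MIME types
  def extractSafe(mime: MimeType, supported: Set[MimeType])(run: => String): ExtractResult =
    if (supported.contains(mime)) ExtractResult.fromTry(Try(run))
    else ExtractResult.UnsupportedFormat(mime)

  // downstream code only cares whether any text came out
  def textOrEmpty(result: ExtractResult): String =
    result.textOption.getOrElse("")
}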

View File

@ -0,0 +1,30 @@
package docspell.extract.odf
import cats.effect._
import cats.implicits._
import fs2.Stream
import java.io.{ByteArrayInputStream, InputStream}
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler
import scala.util.Try
object OdfExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
  def get(is: InputStream): Either[Throwable, String] = Try {
val handler = new BodyContentHandler()
val pctx = new ParseContext()
val meta = new Metadata()
val ooparser = new OpenDocumentParser()
ooparser.parse(is, handler, meta, pctx)
handler.toString.trim
}.toEither
}
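
A usage sketch for the blocking InputStream variant; the file name is made up, any ODF document would do.

import java.io.FileInputStream
import docspell.extract.odf.OdfExtract

object OdfExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("sample.odt") // hypothetical input file
    try {
      OdfExtract.get(in) match {
        case Right(text) => println(s"extracted ${text.length} characters")
        case Left(err)   => println(s"extraction failed: ${err.getMessage}")
      }
    } finally in.close()
  }
}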

View File

@ -0,0 +1,34 @@
package docspell.extract.pdfbox
import java.io.InputStream
import java.nio.file.Path
import cats.implicits._
import cats.effect.Sync
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import scala.util.{Try, Using}
import fs2.Stream
object PdfboxExtract {
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map { bytes =>
Using(PDDocument.load(bytes))(readText).toEither.flatten
}
def get(is: InputStream): Either[Throwable, String] =
Using(PDDocument.load(is))(readText).toEither.flatten
def get(inFile: Path): Either[Throwable, String] =
Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten
private def readText(doc: PDDocument): Either[Throwable, String] =
Try {
val stripper = new PDFTextStripper()
stripper.setAddMoreFormatting(true)
stripper.setLineSeparator("\n")
stripper.getText(doc).trim // trim here already
}.toEither
}
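
A usage sketch for the Path variant; the path is illustrative. As the test further below shows, a purely scanned PDF yields an empty string rather than an error.

import java.nio.file.Paths
import docspell.extract.pdfbox.PdfboxExtract

object PdfboxExtractSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical path to a text-based PDF
    PdfboxExtract.get(Paths.get("letter.pdf")) match {
      case Right(text) if text.nonEmpty => println(text.take(200))
      case Right(_)                     => println("no text layer found")
      case Left(err)                    => println(s"failed: ${err.getMessage}")
    }
  }
}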

View File

@ -0,0 +1,85 @@
package docspell.extract.poi
import java.io.{ByteArrayInputStream, InputStream}
import cats.data.EitherT
import cats.implicits._
import cats.effect.Sync
import org.apache.poi.hssf.extractor.ExcelExtractor
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.hwpf.extractor.WordExtractor
import org.apache.poi.xssf.extractor.XSSFExcelExtractor
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import fs2.Stream
import scala.util.Try
import docspell.common._
import docspell.files.TikaMimetype
object PoiExtract {
def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
TikaMimetype.detect(data, hint).flatMap {
case PoiTypes.doc =>
getDoc(data)
case PoiTypes.xls =>
getXls(data)
case PoiTypes.xlsx =>
getXlsx(data)
case PoiTypes.docx =>
getDocx(data)
case PoiTypes.msoffice =>
EitherT(getDoc[F](data))
.recoverWith({
case _ => EitherT(getXls[F](data))
})
.value
case PoiTypes.ooxml =>
EitherT(getDocx[F](data))
.recoverWith({
case _ => EitherT(getXlsx[F](data))
})
.value
case mt =>
Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
}
def getDocx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XWPFWordExtractor(new XWPFDocument(is))
xt.getText.trim
}.toEither
def getDoc(is: InputStream): Either[Throwable, String] =
Try {
val xt = new WordExtractor(is)
xt.getText.trim
}.toEither
def getXlsx(is: InputStream): Either[Throwable, String] =
Try {
val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
xt.getText.trim
}.toEither
def getXls(is: InputStream): Either[Throwable, String] =
Try {
val xt = new ExcelExtractor(new HSSFWorkbook(is))
xt.getText.trim
}.toEither
def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)
def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)
def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)
def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)
}
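
A usage sketch for one of the format-specific entry points; the stream-based get above additionally runs MIME detection and, for the generic msoffice/ooxml container types, falls back from Word to Excel extraction. The file name is illustrative.

import java.io.FileInputStream
import docspell.extract.poi.PoiExtract

object PoiExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("report.xlsx") // hypothetical spreadsheet
    try {
      PoiExtract.getXlsx(in) match {
        case Right(text) => println(s"extracted ${text.length} characters")
        case Left(err)   => println(s"failed: ${err.getMessage}")
      }
    } finally in.close()
  }
}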

View File

@ -0,0 +1,16 @@
package docspell.extract.poi
import docspell.common.MimeType
object PoiTypes {
val msoffice = MimeType.application("x-tika-msoffice")
val ooxml = MimeType.application("x-tika-ooxml")
val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
val xls = MimeType.application("vnd.ms-excel")
val doc = MimeType.application("msword")
val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)
}

View File

@ -0,0 +1,24 @@
package docspell.extract.rtf
import java.io.{ByteArrayInputStream, InputStream}
import cats.implicits._
import cats.effect.Sync
import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit
import scala.util.Try
object RtfExtract {
def get(is: InputStream): Either[Throwable, String] =
Try {
val kit = new RTFEditorKit()
val doc = kit.createDefaultDocument()
kit.read(is, doc, 0)
doc.getText(0, doc.getLength).trim
}.toEither
def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
}
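
The same pattern works for RTF, which needs no extra dependency since RTFEditorKit ships with the JDK's Swing text package; the file name is again hypothetical.

import java.io.FileInputStream
import docspell.extract.rtf.RtfExtract

object RtfExtractSketch {
  def main(args: Array[String]): Unit = {
    val in = new FileInputStream("sample.rtf") // hypothetical input file
    try {
      println(RtfExtract.get(in).fold(err => s"failed: ${err.getMessage}", identity))
    } finally in.close()
  }
}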

View File

@ -1,14 +0,0 @@
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<logger name="docspell" level="debug" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@ -1,30 +0,0 @@
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
val letterSourceDE: Stream[IO, Byte] =
ExampleFiles.letter_de_pdf
.readURL[IO](16 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
ExampleFiles.letter_en_pdf
.readURL[IO](16 * 1024, blocker)
lazy val letterDEText =
ExampleFiles.letter_de_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
lazy val letterENText =
ExampleFiles.letter_en_txt
.readText[IO](16 * 1024, blocker)
.unsafeRunSync
}

View File

@ -1,9 +1,7 @@
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import docspell.files.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {
@ -30,13 +28,4 @@ object TextExtractionSuite extends SimpleTestSuite {
assertEquals(extract.trim, expect.trim)
}
test("find mimetypes") {
ExampleFiles.
all.foreach { url =>
TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
map(mt => println(url.asString + ": " + mt.asString)).
unsafeRunSync
}
}
}

View File

@ -0,0 +1,28 @@
package docspell.extract.odf
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object OdfExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val files = List(
ExampleFiles.examples_sample_odt -> 6372,
ExampleFiles.examples_sample_ods -> 717
)
test("test extract from odt") {
files.foreach { case (file, len) =>
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str1 = OdfExtract.get(is).fold(throw _, identity)
assertEquals(str1.length, len)
val data = file.readURL[IO](8192, blocker)
val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
assertEquals(str2, str1)
}
}
}

View File

@ -0,0 +1,48 @@
package docspell.extract.pdfbox
import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PdfboxExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val textPDFs = List(
ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
ExampleFiles.letter_en_pdf -> TestFiles.letterENText
)
test("extract text from text PDFs by inputstream") {
textPDFs.foreach {
case (file, txt) =>
val url = file.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from text PDFs via Stream") {
textPDFs.foreach {
case (file, txt) =>
val data = file.readURL[IO](8192, blocker)
val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
val received = removeFormatting(str)
val expect = removeFormatting(txt)
assertEquals(received, expect)
}
}
test("extract text from image PDFs") {
val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)
val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
assertEquals(str, "")
}
private def removeFormatting(str: String): String =
str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
}

View File

@ -0,0 +1,39 @@
package docspell.extract.poi
import cats.effect._
import docspell.common.MimeTypeHint
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite
object PoiExtractTest extends SimpleTestSuite {
val blocker = TestFiles.blocker
implicit val CS = TestFiles.CS
val officeFiles = List(
ExampleFiles.examples_sample_doc -> 6241,
ExampleFiles.examples_sample_docx -> 6179,
ExampleFiles.examples_sample_xlsx -> 660,
ExampleFiles.examples_sample_xls -> 660
)
test("extract text from ms office files") {
officeFiles.foreach {
case (file, len) =>
val str1 = PoiExtract
.get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
.unsafeRunSync()
.fold(throw _, identity)
val str2 = PoiExtract
.get[IO](
file.readURL[IO](8192, blocker),
MimeTypeHint(Some(file.path.segments.last), None)
)
.unsafeRunSync()
.fold(throw _, identity)
assertEquals(str1, str2)
assertEquals(str1.length, len)
}
}
}

View File

@ -0,0 +1,14 @@
package docspell.extract.rtf
import docspell.files.ExampleFiles
import minitest.SimpleTestSuite
object RtfExtractTest extends SimpleTestSuite {
test("extract text from rtf using java input-stream") {
val file = ExampleFiles.examples_sample_rtf
val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
val str = RtfExtract.get(is).fold(throw _, identity)
assertEquals(str.length, 7342)
}
}