Adding extraction primitives

commit 8143a4edcc
parent 851ee7ef0f
@@ -205,7 +205,9 @@ val extract = project.in(file("modules/extract")).
     libraryDependencies ++=
       Dependencies.fs2 ++
       Dependencies.pdfbox ++
-      Dependencies.poi
+      Dependencies.poi ++
+      Dependencies.commonsIO ++
+      Dependencies.julOverSlf4j
   ).dependsOn(common, files % "compile->compile;test->test")
 
 val convert = project.in(file("modules/convert")).
@@ -1,21 +0,0 @@
-package docspell.analysis
-
-import cats.effect.{Blocker, IO}
-import docspell.files._
-
-import scala.concurrent.ExecutionContext
-
-object TestFiles {
-  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
-  implicit val CS = IO.contextShift(ExecutionContext.global)
-
-  lazy val letterDEText =
-    ExampleFiles.letter_de_txt
-      .readText[IO](16 * 1024, blocker)
-      .unsafeRunSync
-
-  lazy val letterENText =
-    ExampleFiles.letter_en_txt
-      .readText[IO](16 * 1024, blocker)
-      .unsafeRunSync
-}
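Note: the hunks below suggest this TestFiles helper moves into the docspell-files module rather than being dropped; the extract module's new files dependency with test->test scope (build.sbt hunk above) and the import swap from docspell.analysis.TestFiles to docspell.files.TestFiles point the same way.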
@@ -1,6 +1,6 @@
 package docspell.analysis.date
 
-import docspell.analysis.TestFiles
+import docspell.files.TestFiles
 import minitest.SimpleTestSuite
 import docspell.common.Language
 
@@ -1,7 +1,7 @@
 package docspell.analysis.nlp
 
 import minitest.SimpleTestSuite
-import docspell.analysis.TestFiles
+import docspell.files.TestFiles
 import docspell.common._
 
 object TextAnalyserSuite extends SimpleTestSuite {
@@ -12,25 +12,23 @@ object TextAnalyserSuite extends SimpleTestSuite {
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
       NerLabel("Treesville", NerTag.Person, 27, 37),
-      NerLabel("Derek", NerTag.Person, 69, 74),
-      NerLabel("Jeter", NerTag.Person, 75, 80),
-      NerLabel("Treesville", NerTag.Location, 96, 106),
-      NerLabel("M.", NerTag.Person, 142, 144),
-      NerLabel("Leat", NerTag.Person, 145, 149),
-      NerLabel("Syrup", NerTag.Organization, 160, 165),
-      NerLabel("Production", NerTag.Organization, 166, 176),
-      NerLabel("Old", NerTag.Organization, 177, 180),
-      NerLabel("Sticky", NerTag.Organization, 181, 187),
-      NerLabel("Pancake", NerTag.Organization, 188, 195),
-      NerLabel("Company", NerTag.Organization, 196, 203),
-      NerLabel("Maple", NerTag.Location, 208, 213),
-      NerLabel("Lane", NerTag.Location, 214, 218),
-      NerLabel("Forest", NerTag.Location, 220, 226),
-      NerLabel("Hemptown", NerTag.Location, 241, 249),
-      NerLabel("Little", NerTag.Organization, 349, 355),
-      NerLabel("League", NerTag.Organization, 356, 362),
-      NerLabel("Derek", NerTag.Person, 1119, 1124),
-      NerLabel("Jeter", NerTag.Person, 1125, 1130)
+      NerLabel("Derek", NerTag.Person, 68, 73),
+      NerLabel("Jeter", NerTag.Person, 74, 79),
+      NerLabel("Treesville", NerTag.Location, 95, 105),
+      NerLabel("Syrup", NerTag.Organization, 159, 164),
+      NerLabel("Production", NerTag.Organization, 165, 175),
+      NerLabel("Old", NerTag.Organization, 176, 179),
+      NerLabel("Sticky", NerTag.Organization, 180, 186),
+      NerLabel("Pancake", NerTag.Organization, 187, 194),
+      NerLabel("Company", NerTag.Organization, 195, 202),
+      NerLabel("Maple", NerTag.Location, 207, 212),
+      NerLabel("Lane", NerTag.Location, 213, 217),
+      NerLabel("Forest", NerTag.Location, 219, 225),
+      NerLabel("Hemptown", NerTag.Location, 239, 247),
+      NerLabel("Little", NerTag.Organization, 347, 353),
+      NerLabel("League", NerTag.Organization, 354, 360),
+      NerLabel("Derek", NerTag.Person, 1117, 1122),
+      NerLabel("Jeter", NerTag.Person, 1123, 1128)
     )
     assertEquals(labels, expect)
   }
@@ -0,0 +1,69 @@
+package docspell.convert.flexmark
+
+import java.io.{InputStream, InputStreamReader}
+import java.nio.charset.StandardCharsets
+import java.util
+
+import cats.effect.Sync
+import cats.implicits._
+import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension
+import com.vladsch.flexmark.ext.tables.TablesExtension
+import com.vladsch.flexmark.html.HtmlRenderer
+import com.vladsch.flexmark.parser.Parser
+import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet}
+import fs2.Stream
+
+import scala.util.Try
+
+object Markdown {
+
+  def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = {
+    val p = createParser()
+    val r = createRenderer()
+    Try {
+      val reader = new InputStreamReader(is, StandardCharsets.UTF_8)
+      val doc = p.parseReader(reader)
+      wrapHtml(r.render(doc), cfg)
+    }.toEither
+  }
+
+
+  def toHtml(md: String, cfg: MarkdownConfig): String = {
+    val p = createParser()
+    val r = createRenderer()
+    val doc = p.parse(md)
+    wrapHtml(r.render(doc), cfg)
+  }
+
+  def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] =
+    data.through(fs2.text.utf8Decode).compile.foldMonoid.
+      map(str => toHtml(str, cfg))
+
+  private def wrapHtml(body: String, cfg: MarkdownConfig): String = {
+    s"""<html>
+       |<head>
+       |<style>
+       |${cfg.internalCss}
+       |</style>
+       |</head>
+       |<body>
+       |$body
+       |</body>
+       |</html>
+       |""".stripMargin
+  }
+
+  private def createParser(): Parser = {
+    val opts = new MutableDataSet()
+    opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
+      util.Arrays.asList(TablesExtension.create(),
+        StrikethroughExtension.create()));
+
+    Parser.builder(opts).build()
+  }
+
+  private def createRenderer(): HtmlRenderer = {
+    val opts = new MutableDataSet()
+    HtmlRenderer.builder(opts).build()
+  }
+}
@@ -0,0 +1,3 @@
+package docspell.convert.flexmark
+
+case class MarkdownConfig(internalCss: String)
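For orientation, a minimal usage sketch of the new Markdown helper (not part of the commit; the CSS and sample input are made up):

    import cats.effect.IO
    import docspell.convert.flexmark.{Markdown, MarkdownConfig}

    object MarkdownDemo extends App {
      // hypothetical stylesheet; any CSS string is accepted
      val cfg = MarkdownConfig(internalCss = "body { font-family: sans-serif; }")

      // String variant: renders markdown into a complete HTML page
      println(Markdown.toHtml("# Hello\n\nSome *markdown*.", cfg))

      // Stream variant: decodes UTF-8 bytes first, then renders
      val bytes = fs2.Stream.emits("## Streamed".getBytes("UTF-8").toSeq).covary[IO]
      println(Markdown.toHtml[IO](bytes, cfg).unsafeRunSync())
    }

The stream variant folds the decoded chunks with the String monoid (hence cats.implicits._ in the file above) before delegating to the pure String overload.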
modules/extract/NOTICE (new file, 11 lines)
@@ -0,0 +1,11 @@
+The Java source files in docspell-extract are unmodified copies of
+those found in the Apache Tika parser project. It follows the
+NOTICE.txt file from Apache Tika parsers:
+
+Apache Tika parsers
+Copyright 2007-2019 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    private static final String OLD_NS =
+            "http://openoffice.org/2000/";
+
+    private static final String NEW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:";
+
+    private static final String DTD_PUBLIC_ID =
+            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        } else {
+            return ns;
+        }
+    }
+
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        super.startElement(mapOldNS(namespaceURI), localName, qName, atts);
+    }
+
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * do not load any DTDs (may be requested by parser). Fake the DTD by
+     * returning a empty string as InputSource
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        } else {
+            return super.resolveEntity(publicId, systemId);
+        }
+    }
+
+}
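A tiny sketch of the namespace mapping this decorator performs (assumed wiring, not from the commit):

    import org.apache.tika.parser.odf.NSNormalizerContentHandler
    import org.xml.sax.helpers.DefaultHandler

    object NSNormalizerDemo extends App {
      // print whatever namespace URI reaches the wrapped handler
      val inner = new DefaultHandler {
        override def startPrefixMapping(prefix: String, uri: String): Unit =
          println(s"$prefix -> $uri")
      }
      val handler = new NSNormalizerContentHandler(inner)
      // the old OpenOffice 1.0 namespace is rewritten on the way through
      handler.startPrefixMapping("text", "http://openoffice.org/2000/text")
      // prints: text -> urn:oasis:names:tc:opendocument:xmlns:text:1.0
    }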
@@ -0,0 +1,606 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+    private interface Style {
+    }
+
+    private static class TextStyle implements Style {
+        public boolean italic;
+        public boolean bold;
+        public boolean underlined;
+
+        @Override
+        public String toString() {
+            return "TextStyle{" +
+                    "italic=" + italic +
+                    ", bold=" + bold +
+                    ", underlined=" + underlined +
+                    '}';
+        }
+    }
+
+    private static class ListStyle implements Style {
+        public boolean ordered;
+
+        public String getTag() {
+            return ordered ? "ol" : "ul";
+        }
+    }
+
+    private static final class OpenDocumentElementMappingContentHandler extends
+            ElementMappingContentHandler {
+        private static final char[] SPACE = new char[]{ ' '};
+        private static final String CLASS = "class";
+        private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation");
+        private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note");
+        private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes");
+
+        private static Attributes buildAttributes(String key, String value) {
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", key, key, "CDATA", value);
+            return attrs;
+        }
+
+        private final ContentHandler handler;
+        private final BitSet textNodeStack = new BitSet();
+        private int nodeDepth = 0;
+        private int completelyFiltered = 0;
+        private Stack<String> headingStack = new Stack<String>();
+        private Map<String, TextStyle> paragraphTextStyleMap = new HashMap<String, TextStyle>();
+        private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+        private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+        private String currParagraphStyleName; //paragraph style name
+        private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs
+        private String currTextStyleName;
+
+        private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+        private ListStyle listStyle;
+
+        // True if we are currently in the named style:
+        private boolean curUnderlined;
+        private boolean curBold;
+        private boolean curItalic;
+
+        //have we written the start style tags
+        //yet for the current text style
+        boolean hasWrittenStartStyleTags = false;
+
+        private int pDepth = 0; //<p> can appear inside comments and other things that are already inside <p>
+        //we need to track our pDepth and only output <p> if we're at the main level
+
+
+        private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+                                                         Map<QName, TargetElement> mappings) {
+            super(handler, mappings);
+            this.handler = handler;
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            // only forward content of tags from text:-namespace
+            if (completelyFiltered == 0 && nodeDepth > 0
+                    && textNodeStack.get(nodeDepth - 1)) {
+                if (!hasWrittenStartStyleTags) {
+                    updateStyleTags();
+                    hasWrittenStartStyleTags = true;
+                }
+                super.characters(ch, start, length);
+            }
+        }
+
+        // helper for checking tags which need complete filtering
+        // (with sub-tags)
+        private boolean needsCompleteFiltering(
+                String namespaceURI, String localName) {
+            if (TEXT_NS.equals(namespaceURI)) {
+                return localName.endsWith("-template")
+                        || localName.endsWith("-style");
+            }
+            return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+        }
+
+        // map the heading level to <hX> HTML tags
+        private String getXHTMLHeaderTagName(Attributes atts) {
+            String depthStr = atts.getValue(TEXT_NS, "outline-level");
+            if (depthStr == null) {
+                return "h1";
+            }
+
+            int depth = Integer.parseInt(depthStr);
+            if (depth >= 6) {
+                return "h6";
+            } else if (depth <= 1) {
+                return "h1";
+            } else {
+                return "h" + depth;
+            }
+        }
+
+        /**
+         * Check if a node is a text node
+         */
+        private boolean isTextNode(String namespaceURI, String localName) {
+            if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+                return true;
+            }
+            if (SVG_NS.equals(namespaceURI)) {
+                return "title".equals(localName) ||
+                        "desc".equals(localName);
+            }
+            return false;
+        }
+
+        private void startList(String name) throws SAXException {
+            String elementName = "ul";
+            if (name != null) {
+                ListStyle style = listStyleMap.get(name);
+                elementName = style != null ? style.getTag() : "ul";
+                listStyleStack.push(style);
+            }
+            handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+        }
+
+        private void endList() throws SAXException {
+            String elementName = "ul";
+            if (!listStyleStack.isEmpty()) {
+                ListStyle style = listStyleStack.pop();
+                elementName = style != null ? style.getTag() : "ul";
+            }
+            handler.endElement(XHTML, elementName, elementName);
+        }
+
+        private void startSpan(String name) throws SAXException {
+            if (name == null) {
+                return;
+            }
+            currTextStyle = textStyleMap.get(name);
+            hasWrittenStartStyleTags = false;
+        }
+
+        private void startParagraph(String styleName) throws SAXException {
+            if (pDepth == 0) {
+                handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES);
+                if (styleName != null) {
+                    currTextStyle = paragraphTextStyleMap.get(styleName);
+                }
+                hasWrittenStartStyleTags = false;
+            } else {
+                handler.characters(SPACE, 0, SPACE.length);
+            }
+            pDepth++;
+        }
+
+        private void endParagraph() throws SAXException {
+            closeStyleTags();
+            if (pDepth == 1) {
+                handler.endElement(XHTML, "p", "p");
+            } else {
+                handler.characters(SPACE, 0, SPACE.length);
+            }
+            pDepth--;
+
+        }
+
+        private void updateStyleTags() throws SAXException {
+
+            if (currTextStyle == null) {
+                closeStyleTags();
+                return;
+            }
+            if (currTextStyle.bold != curBold) {
+                // Enforce nesting -- must close s and i tags
+                if (curUnderlined) {
+                    handler.endElement(XHTML, "u", "u");
+                    curUnderlined = false;
+                }
+                if (curItalic) {
+                    handler.endElement(XHTML, "i", "i");
+                    curItalic = false;
+                }
+                if (currTextStyle.bold) {
+                    handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+                } else {
+                    handler.endElement(XHTML, "b", "b");
+                }
+                curBold = currTextStyle.bold;
+            }
+
+            if (currTextStyle.italic != curItalic) {
+                // Enforce nesting -- must close s tag
+                if (curUnderlined) {
+                    handler.endElement(XHTML, "u", "u");
+                    curUnderlined = false;
+                }
+                if (currTextStyle.italic) {
+                    handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+                } else {
+                    handler.endElement(XHTML, "i", "i");
+                }
+                curItalic = currTextStyle.italic;
+            }
+
+            if (currTextStyle.underlined != curUnderlined) {
+                if (currTextStyle.underlined) {
+                    handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+                } else {
+                    handler.endElement(XHTML, "u", "u");
+                }
+                curUnderlined = currTextStyle.underlined;
+            }
+        }
+
+        private void endSpan() throws SAXException {
+            updateStyleTags();
+        }
+
+        private void closeStyleTags() throws SAXException {
+            // Close any still open style tags
+            if (curUnderlined) {
+                handler.endElement(XHTML,"u", "u");
+                curUnderlined = false;
+            }
+            if (curItalic) {
+                handler.endElement(XHTML,"i", "i");
+                curItalic = false;
+            }
+            if (curBold) {
+                handler.endElement(XHTML,"b", "b");
+                curBold = false;
+            }
+            currTextStyle = null;
+            hasWrittenStartStyleTags = false;
+        }
+
+        @Override
+        public void startElement(
+                String namespaceURI, String localName, String qName,
+                Attributes attrs) throws SAXException {
+            // keep track of current node type. If it is a text node,
+            // a bit at the current depth its set in textNodeStack.
+            // characters() checks the top bit to determine, if the
+            // actual node is a text node to print out nodeDepth contains
+            // the depth of the current node and also marks top of stack.
+            assert nodeDepth >= 0;
+
+            // Set styles
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                String family = attrs.getValue(STYLE_NS, "family");
+                if ("text".equals(family)) {
+                    currTextStyle = new TextStyle();
+                    currTextStyleName = attrs.getValue(STYLE_NS, "name");
+                } else if ("paragraph".equals(family)) {
+                    currTextStyle = new TextStyle();
+                    currParagraphStyleName = attrs.getValue(STYLE_NS, "name");
+                }
+            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+                listStyle = new ListStyle();
+                String name = attrs.getValue(STYLE_NS, "name");
+                listStyleMap.put(name, listStyle);
+            } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI)
+                    && "text-properties".equals(localName)) {
+                String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+                if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+                    currTextStyle.italic = true;
+                }
+                String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+                if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+                        || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
+                        && Integer.valueOf(fontWeight) > 500)) {
+                    currTextStyle.bold = true;
+                }
+                String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+                if (underlineStyle != null && !underlineStyle.equals("none")) {
+                    currTextStyle.underlined = true;
+                }
+            } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+                if ("list-level-style-bullet".equals(localName)) {
+                    listStyle.ordered = false;
+                } else if ("list-level-style-number".equals(localName)) {
+                    listStyle.ordered = true;
+                }
+            }
+
+            textNodeStack.set(nodeDepth++,
+                    isTextNode(namespaceURI, localName));
+            // filter *all* content of some tags
+            assert completelyFiltered >= 0;
+
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered++;
+            }
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                // special handling of text:h, that are directly passed
+                // to incoming handler
+                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                    final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+                    handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                    startList(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    startSpan(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+                    startParagraph(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+                    handler.characters(SPACE, 0, 1);
+                } else if ("annotation".equals(localName)) {
+                    closeStyleTags();
+                    handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES);
+                } else if ("note".equals(localName)) {
+                    closeStyleTags();
+                    handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES);
+                } else if ("notes".equals(localName)) {
+                    closeStyleTags();
+                    handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES);
+                } else {
+                    super.startElement(namespaceURI, localName, qName, attrs);
+                }
+            }
+        }
+
+        @Override
+        public void endElement(
+                String namespaceURI, String localName, String qName)
+                throws SAXException {
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                if (currTextStyle != null && currTextStyleName != null) {
+                    textStyleMap.put(currTextStyleName, currTextStyle);
+                    currTextStyleName = null;
+                    currTextStyle = null;
+                } else if (currTextStyle != null && currParagraphStyleName != null) {
+                    paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle);
+                    currParagraphStyleName = null;
+                    currTextStyle = null;
+                }
+            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+                listStyle = null;
+            }
+
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                // special handling of text:h, that are directly passed
+                // to incoming handler
+                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                    final String el = headingStack.pop();
+                    handler.endElement(XHTMLContentHandler.XHTML, el, el);
+                } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+                    endList();
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    currTextStyle = null;
+                    hasWrittenStartStyleTags = false;
+                } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+                    endParagraph();
+                } else if ("annotation".equals(localName) || "note".equals(localName) ||
+                        "notes".equals(localName)) {
+                    closeStyleTags();
+                    handler.endElement("", localName, localName);
+                } else {
+                    super.endElement(namespaceURI, localName, qName);
+                }
+
+                // special handling of tabulators
+                if (TEXT_NS.equals(namespaceURI)
+                        && ("tab-stop".equals(localName)
+                        || "tab".equals(localName))) {
+                    this.characters(TAB, 0, TAB.length);
+                }
+            }
+
+            // revert filter for *all* content of some tags
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered--;
+            }
+            assert completelyFiltered >= 0;
+
+            // reduce current node depth
+            nodeDepth--;
+            assert nodeDepth >= 0;
+        }
+
+        @Override
+        public void startPrefixMapping(String prefix, String uri) {
+            // remove prefix mappings as they should not occur in XHTML
+        }
+
+        @Override
+        public void endPrefixMapping(String prefix) {
+            // remove prefix mappings as they should not occur in XHTML
+        }
+    }
+
+    public static final String TEXT_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+    public static final String TABLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+    public static final String STYLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+    public static final String FORMATTING_OBJECTS_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+    public static final String OFFICE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+    public static final String SVG_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+    public static final String PRESENTATION_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+    public static final String DRAW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+    protected static final char[] TAB = new char[]{'\t'};
+
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    /**
+     * Mappings between ODF tag names and XHTML tag names
+     * (including attributes). All other tag names/attributes are ignored
+     * and left out from event stream.
+     */
+    private static final HashMap<QName, TargetElement> MAPPINGS =
+            new HashMap<QName, TargetElement>();
+
+    static {
+        // general mappings of text:-tags
+        MAPPINGS.put(
+                new QName(TEXT_NS, "p"),
+                new TargetElement(XHTML, "p"));
+        // text:h-tags are mapped specifically in startElement/endElement
+        MAPPINGS.put(
+                new QName(TEXT_NS, "line-break"),
+                new TargetElement(XHTML, "br"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "list-item"),
+                new TargetElement(XHTML, "li"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "note"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(OFFICE_NS, "annotation"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(PRESENTATION_NS, "notes"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "object"),
+                new TargetElement(XHTML, "object"));
+        MAPPINGS.put(
+                new QName(DRAW_NS, "text-box"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(
+                new QName(SVG_NS, "title"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(SVG_NS, "desc"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "span"),
+                new TargetElement(XHTML, "span"));
+
+        final HashMap<QName, QName> aAttsMapping =
+                new HashMap<QName, QName>();
+        aAttsMapping.put(
+                new QName(XLINK_NS, "href"),
+                new QName("href"));
+        aAttsMapping.put(
+                new QName(XLINK_NS, "title"),
+                new QName("title"));
+        MAPPINGS.put(
+                new QName(TEXT_NS, "a"),
+                new TargetElement(XHTML, "a", aAttsMapping));
+
+        // create HTML tables from table:-tags
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table"),
+                new TargetElement(XHTML, "table"));
+        // repeating of rows is ignored; for columns, see below!
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table-row"),
+                new TargetElement(XHTML, "tr"));
+        // special mapping for rowspan/colspan attributes
+        final HashMap<QName, QName> tableCellAttsMapping =
+                new HashMap<QName, QName>();
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-columns-spanned"),
+                new QName("colspan"));
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-rows-spanned"),
+                new QName("rowspan"));
+        /* TODO: The following is not correct, the cell should be repeated not spanned!
+         * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
+         * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
+         * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
+         * only for empty cells.
+         */
+        tableCellAttsMapping.put(
+                new QName(TABLE_NS, "number-columns-repeated"),
+                new QName("colspan"));
+        MAPPINGS.put(
+                new QName(TABLE_NS, "table-cell"),
+                new TargetElement(XHTML, "td", tableCellAttsMapping));
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet(); // not a top-level parser
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        parseInternal(stream,
+                new XHTMLContentHandler(handler, metadata),
+                metadata, context);
+    }
+
+    void parseInternal(
+            InputStream stream, final ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(
+                        new NSNormalizerContentHandler(dh)),
+                context);
+    }
+
+}
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -8739250869531737584L;
+
+    private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+    private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+    /**
+     * @see OfficeOpenXMLCore#SUBJECT
+     * @deprecated use OfficeOpenXMLCore#SUBJECT
+     */
+    @Deprecated
+    private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+            Property.composite(Office.INITIAL_AUTHOR,
+                    new Property[]{Property.externalText("initial-creator")});
+
+    private static ContentHandler getDublinCoreHandler(
+            Metadata metadata, Property property, String element) {
+        return new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, element,
+                metadata, property);
+    }
+
+    private static ContentHandler getMeta(
+            ContentHandler ch, Metadata md, Property property, String element) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:" + element),
+                META_XPATH.parse("//meta:" + element + "//text()"));
+        ContentHandler branch =
+                new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getUserDefined(
+            ContentHandler ch, Metadata md) {
+        Matcher matcher = new CompositeMatcher(
+                META_XPATH.parse("//meta:user-defined/@meta:name"),
+                META_XPATH.parse("//meta:user-defined//text()"));
+        // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+                matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    @Deprecated
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, String name, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    private static ContentHandler getStatistic(
+            ContentHandler ch, Metadata md, Property property, String attribute) {
+        Matcher matcher =
+                META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
+    protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+        // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+        // Process the Dublin Core Attributes
+        ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+                getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+                getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+                getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+                getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+                getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+                getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+                getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+                getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+                getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+                getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+        // Process the OO Meta Attributes
+        ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+        // ODF uses dc:date for modified
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "date",
+                md, TikaCoreProperties.MODIFIED));
+
+        // ODF uses dc:subject for description
+        ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+                DublinCore.NAMESPACE_URI_DC, "subject",
+                md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+        ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+        ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+        ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+        ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+        ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+        // Process the user defined Meta Attributes
+        ch = getUserDefined(ch, md);
+
+        // Process the OO Statistics Attributes
+        ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+        ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+        // Legacy, Tika-1.0 style attributes
+        // TODO Remove these in Tika 2.0
+        ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+        ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+        ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+        ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+        ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+        ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+        ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+        // Legacy Statistics Attributes, replaced with real keys above
+        // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+        ch = getStatistic(ch, md, "nbPage", "page-count");
+        ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+        ch = getStatistic(ch, md, "nbWord", "word-count");
+        ch = getStatistic(ch, md, "nbCharacter", "character-count");
+        ch = getStatistic(ch, md, "nbTab", "table-count");
+        ch = getStatistic(ch, md, "nbObject", "object-count");
+        ch = getStatistic(ch, md, "nbImg", "image-count");
+
+        // Normalise the rest
+        ch = new NSNormalizerContentHandler(ch);
+        return ch;
+    }
+
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        super.parse(stream, handler, metadata, context);
+        // Copy subject to description for OO2
+        String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+        if (odfSubject != null && !odfSubject.equals("") &&
+                (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+        }
+    }
+
+}
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -6410276875438618287L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.sun.xml.writer"),
+                    MediaType.application("vnd.oasis.opendocument.text"),
+                    MediaType.application("vnd.oasis.opendocument.graphics"),
+                    MediaType.application("vnd.oasis.opendocument.presentation"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("vnd.oasis.opendocument.chart"),
+                    MediaType.application("vnd.oasis.opendocument.image"),
+                    MediaType.application("vnd.oasis.opendocument.formula"),
+                    MediaType.application("vnd.oasis.opendocument.text-master"),
+                    MediaType.application("vnd.oasis.opendocument.text-web"),
+                    MediaType.application("vnd.oasis.opendocument.text-template"),
+                    MediaType.application("vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("vnd.oasis.opendocument.image-template"),
+                    MediaType.application("vnd.oasis.opendocument.formula-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.text"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart"),
+                    MediaType.application("x-vnd.oasis.opendocument.image"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-master"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-web"),
+                    MediaType.application("x-vnd.oasis.opendocument.text-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.image-template"),
+                    MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+    private static final String META_NAME = "meta.xml";
+
+    private Parser meta = new OpenDocumentMetaParser();
+
+    private Parser content = new OpenDocumentContentParser();
+
+    public Parser getMetaParser() {
+        return meta;
+    }
+
+    public void setMetaParser(Parser meta) {
+        this.meta = meta;
+    }
+
+    public Parser getContentParser() {
+        return content;
+    }
+
+    public void setContentParser(Parser content) {
+        this.content = content;
+    }
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler baseHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Open the Zip stream
+        // Use a File if we can, and an already open zip is even better
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = ((TikaInputStream) stream).getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            } else {
+                zipStream = new ZipInputStream(stream);
+            }
+        } else {
+            zipStream = new ZipInputStream(stream);
+        }
+
+        // Prepare to handle the content
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+        // As we don't know which of the metadata or the content
+        // we'll hit first, catch the endDocument call initially
+        EndDocumentShieldingContentHandler handler =
+                new EndDocumentShieldingContentHandler(xhtml);
+
+        if (zipFile != null) {
+            try {
+                handleZipFile(zipFile, metadata, context, handler);
+            } finally {
+                //Do we want to close silently == catch an exception here?
+                zipFile.close();
+            }
+        } else {
+            try {
+                handleZipStream(zipStream, metadata, context, handler);
+            } finally {
+                //Do we want to close silently == catch an exception here?
+                zipStream.close();
+            }
+        }
+
+        // Only now call the end document
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        if (entry == null) {
+            throw new IOException("No entries found in ZipInputStream");
+        }
+        do {
+            handleZipEntry(entry, zipStream, metadata, context, handler);
+            entry = zipStream.getNextEntry();
+        } while (entry != null);
+    }
+
+    private void handleZipFile(ZipFile zipFile, Metadata metadata,
+                               ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, TikaException, SAXException {
+        // If we can, process the metadata first, then the
+        // rest of the file afterwards (TIKA-1353)
+        // Only possible to guarantee that when opened from a file not a stream
+
+        ZipEntry entry = zipFile.getEntry(META_NAME);
+        if (entry != null) {
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+        }
+
+        Enumeration<? extends ZipEntry> entries = zipFile.entries();
+        while (entries.hasMoreElements()) {
+            entry = entries.nextElement();
+            if (!META_NAME.equals(entry.getName())) {
+                handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
+            }
+        }
+    }
+
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+                                ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        if (entry == null) return;
+
+        if (entry.getName().equals("mimetype")) {
+            String type = IOUtils.toString(zip, UTF_8);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (entry.getName().equals(META_NAME)) {
+            meta.parse(zip, new DefaultHandler(), metadata, context);
+        } else if (entry.getName().endsWith("content.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        } else if (entry.getName().endsWith("styles.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        } else {
+            String embeddedName = entry.getName();
+            //scrape everything under Thumbnails/ and Pictures/
+            if (embeddedName.contains("Thumbnails/") ||
+                    embeddedName.contains("Pictures/")) {
+                EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+                Metadata embeddedMetadata = new Metadata();
+                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
+                /* if (embeddedName.startsWith("Thumbnails/")) {
+                    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                            TikaCoreProperties.EmbeddedResourceType.THUMBNAIL);
+                }*/
+                if (embeddedName.contains("Pictures/")) {
+                    embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE,
+                            TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+                }
+                if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
+                    embeddedDocumentExtractor.parseEmbedded(zip,
+                            new EmbeddedContentHandler(handler), embeddedMetadata, false);
+                }
+            }
+
+        }
+    }
+}
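Taken together, OpenDocumentParser drives the meta and content parsers over the entries of the ODF zip container. A minimal extraction sketch (assumed file name; BodyContentHandler is standard Tika, not part of this commit):

    import java.io.FileInputStream
    import org.apache.tika.metadata.Metadata
    import org.apache.tika.parser.ParseContext
    import org.apache.tika.parser.odf.OpenDocumentParser
    import org.apache.tika.sax.BodyContentHandler

    object OdfExtractDemo extends App {
      val in = new FileInputStream("letter.odt") // placeholder path
      try {
        val handler  = new BodyContentHandler()
        val metadata = new Metadata()
        new OpenDocumentParser().parse(in, handler, metadata, new ParseContext())
        println(metadata.get(Metadata.CONTENT_TYPE)) // from the zip's mimetype entry
        println(handler.toString)                    // extracted plain text
      } finally in.close()
    }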
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Base class for SAX handlers that map SAX events into document metadata.
+ *
+ * @since Apache Tika 0.10
+ */
+class AbstractMetadataHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+    private final Property property;
+    private final String name;
+
+    protected AbstractMetadataHandler(Metadata metadata, String name) {
+        this.metadata = metadata;
+        this.property = null;
+        this.name = name;
+    }
+    protected AbstractMetadataHandler(Metadata metadata, Property property) {
+        this.metadata = metadata;
+        this.property = property;
+        this.name = property.getName();
+    }
+
+    /**
+     * Adds the given metadata value. The value is ignored if it is
+     * <code>null</code> or empty. If the metadata entry already exists,
+     * then the given value is appended to it with a comma as the separator.
+     *
+     * @param value metadata value
+     */
+    protected void addMetadata(String value) {
+        if (value != null && value.length() > 0) {
+            if (metadata.isMultiValued(name)) {
+                // Add the value, assuming it's not already there
+                List<String> previous = Arrays.asList(metadata.getValues(name));
+                if (!previous.contains(value)) {
+                    if (property != null) {
+                        metadata.add(property, value);
+                    } else {
+                        metadata.add(name, value);
+                    }
+                }
+            } else {
+                // Set the value, assuming it's not already there
+                String previous = metadata.get(name);
+                if (previous != null && previous.length() > 0) {
+                    if (!previous.equals(value)) {
+                        if (property != null) {
+                            if (property.isMultiValuePermitted()) {
+                                metadata.add(property, value);
+                            } else {
+                                // Replace the existing value if isMultiValuePermitted is false
+                                metadata.set(property, value);
+                            }
+                        } else {
+                            metadata.add(name, value);
+                        }
+                    }
+                } else {
+                    if (property != null) {
+                        metadata.set(property, value);
+                    } else {
+                        metadata.set(name, value);
+                    }
+                }
+            }
+        }
+    }
+}
@ -0,0 +1,82 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

/**
 * This adds a Metadata entry for a given node.
 * The textual content of the node is used as the
 * value, and the Metadata name is taken from
 * an attribute, with a prefix if required.
 */
public class AttributeDependantMetadataHandler extends DefaultHandler {

    private final Metadata metadata;

    private final String nameHoldingAttribute;
    private final String namePrefix;
    private String name;

    private final StringBuilder buffer = new StringBuilder();

    public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) {
        this.metadata = metadata;
        this.nameHoldingAttribute = nameHoldingAttribute;
        this.namePrefix = namePrefix;
    }

    public void addMetadata(String value) {
        if (name == null || name.length() == 0) {
            // We didn't find the attribute which holds the name
            return;
        }
        if (value.length() > 0) {
            String previous = metadata.get(name);
            if (previous != null && previous.length() > 0) {
                value = previous + ", " + value;
            }
            metadata.set(name, value);
        }
    }

    public void endElement(String uri, String localName, String name) {
        addMetadata(buffer.toString());
        buffer.setLength(0);
    }

    public void startElement(
            String uri, String localName, String name, Attributes attributes) {
        String rawName = attributes.getValue(nameHoldingAttribute);
        if (rawName != null) {
            if (namePrefix == null) {
                this.name = rawName;
            } else {
                this.name = namePrefix + rawName;
            }
        }
        // All other attributes are ignored
    }

    public void characters(char[] ch, int start, int length) {
        buffer.append(ch, start, length);
    }

}
@ -0,0 +1,61 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * SAX event handler that maps the contents of an XML attribute into
 * a metadata field.
 *
 * @since Apache Tika 0.10
 */
public class AttributeMetadataHandler extends AbstractMetadataHandler {

    private final String uri;

    private final String localName;

    public AttributeMetadataHandler(
            String uri, String localName, Metadata metadata, String name) {
        super(metadata, name);
        this.uri = uri;
        this.localName = localName;
    }

    public AttributeMetadataHandler(
            String uri, String localName, Metadata metadata, Property property) {
        super(metadata, property);
        this.uri = uri;
        this.localName = localName;
    }

    @Override
    public void startElement(
            String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        for (int i = 0; i < attributes.getLength(); i++) {
            if (attributes.getURI(i).equals(this.uri)
                    && attributes.getLocalName(i).equals(this.localName)) {
                addMetadata(attributes.getValue(i).trim());
            }
        }
    }

}
@ -0,0 +1,60 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Dublin Core metadata parser
 */
public class DcXMLParser extends XMLParser {

    /** Serial version UID */
    private static final long serialVersionUID = 4905318835463880819L;

    private static ContentHandler getDublinCoreHandler(
            Metadata metadata, Property property, String element) {
        return new ElementMetadataHandler(
                DublinCore.NAMESPACE_URI_DC, element,
                metadata, property);
    }

    protected ContentHandler getContentHandler(
            ContentHandler handler, Metadata metadata, ParseContext context) {
        return new TeeContentHandler(
                super.getContentHandler(handler, metadata, context),
                getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"),
                getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"),
                getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"),
                getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"),
                getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"),
                getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"),
                getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"),
                getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"),
                getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"),
                getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"),
                getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"),
                getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights"));
    }

}
@ -0,0 +1,241 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;

import java.util.Arrays;

/**
 * SAX event handler that maps the contents of an XML element into
 * a metadata field.
 *
 * @since Apache Tika 0.10
 */
public class ElementMetadataHandler extends AbstractMetadataHandler {
    private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class);

    private static final String LOCAL_NAME_RDF_BAG = "Bag";
    private static final String LOCAL_NAME_RDF_LI = "li";
    private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

    private final String uri;

    private final String localName;

    private final Metadata metadata;

    private final String name;
    private Property targetProperty;

    private final boolean allowDuplicateValues;
    private final boolean allowEmptyValues;

    /**
     * The buffer used to capture characters when inside a bag li element.
     */
    private final StringBuilder bufferBagged = new StringBuilder();

    /**
     * The buffer used to capture characters inside standard elements.
     */
    private final StringBuilder bufferBagless = new StringBuilder();

    /**
     * Whether or not the value was found in a standard element structure or inside a bag.
     */
    private boolean isBagless = true;

    private int matchLevel = 0;
    private int parentMatchLevel = 0;

    /**
     * Constructor for string metadata keys.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param name the Tika metadata field key
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, String name) {
        super(metadata, name);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.name = name;
        this.allowDuplicateValues = false;
        this.allowEmptyValues = false;
        LOG.trace("created simple handler for {}", this.name);
    }

    /**
     * Constructor for string metadata keys which allows change of behavior
     * for duplicate and empty entry values.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param name the Tika metadata field key
     * @param allowDuplicateValues add duplicate values to the Tika metadata
     * @param allowEmptyValues add empty values to the Tika metadata
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) {
        super(metadata, name);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.name = name;
        this.allowDuplicateValues = allowDuplicateValues;
        this.allowEmptyValues = allowEmptyValues;
        LOG.trace("created simple handler for {}", this.name);
    }

    /**
     * Constructor for Property metadata keys.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param targetProperty the Tika metadata Property key
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, Property targetProperty) {
        super(metadata, targetProperty);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.targetProperty = targetProperty;
        this.name = targetProperty.getName();
        this.allowDuplicateValues = false;
        this.allowEmptyValues = false;
        LOG.trace("created property handler for {}", this.name);
    }

    /**
     * Constructor for Property metadata keys which allows change of behavior
     * for duplicate and empty entry values.
     *
     * @param uri the uri of the namespace of the element
     * @param localName the local name of the element
     * @param metadata the Tika metadata object to populate
     * @param targetProperty the Tika metadata Property key
     * @param allowDuplicateValues add duplicate values to the Tika metadata
     * @param allowEmptyValues add empty values to the Tika metadata
     */
    public ElementMetadataHandler(
            String uri, String localName, Metadata metadata, Property targetProperty, boolean allowDuplicateValues, boolean allowEmptyValues) {
        super(metadata, targetProperty);
        this.uri = uri;
        this.localName = localName;
        this.metadata = metadata;
        this.targetProperty = targetProperty;
        this.name = targetProperty.getName();
        this.allowDuplicateValues = allowDuplicateValues;
        this.allowEmptyValues = allowEmptyValues;
        LOG.trace("created property handler for {}", this.name);
    }

    protected boolean isMatchingParentElement(String uri, String localName) {
        return (uri.equals(this.uri) && localName.equals(this.localName));
    }

    protected boolean isMatchingElement(String uri, String localName) {
        // match if we're inside the parent element or within some bag element
        return (uri.equals(this.uri) && localName.equals(this.localName)) ||
                (parentMatchLevel > 0 &&
                        ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
                                (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
                        )
                );
    }

    @Override
    public void startElement(
            String uri, String localName, String name, Attributes attributes) {
        if (isMatchingElement(uri, localName)) {
            matchLevel++;
        }
        if (isMatchingParentElement(uri, localName)) {
            parentMatchLevel++;
        }
    }

    @Override
    public void endElement(String uri, String localName, String name) {
        if (isMatchingParentElement(uri, localName)) {
            parentMatchLevel--;
        }
        if (isMatchingElement(uri, localName)) {
            matchLevel--;
            if (matchLevel == 2) {
                // we're inside a bag li element, add the bagged buffer
                addMetadata(bufferBagged.toString().trim());
                bufferBagged.setLength(0);
                isBagless = false;
            }
            if (matchLevel == 0 && isBagless) {
                String valueBagless = bufferBagless.toString();
                if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
                    // we're in a standard element, add the bagless buffer
                    addMetadata(valueBagless.trim());
                    bufferBagless.setLength(0);
                }
                isBagless = true;
            }
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        // We need to append to both buffers since we don't know if we're inside a bag until we're done
        if (parentMatchLevel > 0 && matchLevel > 2) {
            bufferBagged.append(ch, start, length);
        }
        if (parentMatchLevel > 0 && matchLevel > 0) {
            bufferBagless.append(ch, start, length);
        }
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) {
        characters(ch, start, length);
    }

    @Override
    protected void addMetadata(String value) {
        LOG.trace("adding {}={}", name, value);
        if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
            if ((value != null && value.length() > 0) || allowEmptyValues) {
                if (value == null || (value.length() == 0 && allowEmptyValues)) {
                    value = "";
                }
                String[] previous = metadata.getValues(name);
                if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
                    metadata.add(targetProperty, value);
                }
            }
        } else {
            super.addMetadata(value);
        }
    }
}
@ -0,0 +1,114 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Set;

public class FictionBookParser extends XMLParser {
    private static final long serialVersionUID = 4195954546491524374L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-fictionbook+xml"));

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
        return new BinaryElementsDataHandler(
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler);
    }

    private static class BinaryElementsDataHandler extends DefaultHandler {
        private static final String ELEMENT_BINARY = "binary";

        private boolean binaryMode = false;
        private static final String ATTRIBUTE_ID = "id";

        private final EmbeddedDocumentExtractor partExtractor;
        private final ContentHandler handler;
        private final StringBuilder binaryData = new StringBuilder();
        private Metadata metadata;
        private static final String ATTRIBUTE_CONTENT_TYPE = "content-type";

        private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) {
            this.partExtractor = partExtractor;
            this.handler = handler;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            binaryMode = ELEMENT_BINARY.equals(localName);
            if (binaryMode) {
                binaryData.setLength(0);
                metadata = new Metadata();

                metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID));
                metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE));
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (binaryMode) {
                try {
                    partExtractor.parseEmbedded(
                            new ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())),
                            handler,
                            metadata,
                            true
                    );
                } catch (IOException e) {
                    throw new SAXException("IOException in parseEmbedded", e);
                }

                binaryMode = false;
                binaryData.setLength(0);
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (!binaryMode) {
                handler.characters(ch, start, length);
            } else {
                binaryData.append(ch, start, length);
            }
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            handler.ignorableWhitespace(ch, start, length);
        }
    }
}
@ -0,0 +1,85 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

/**
 * This adds Metadata entries with a specified name for
 * the textual content of a node (if present), and
 * all attribute values passed through the matcher
 * (but not their names).
 *
 * @deprecated Use the {@link AttributeMetadataHandler} and
 *             {@link ElementMetadataHandler} classes instead
 */
public class MetadataHandler extends DefaultHandler {

    private final Metadata metadata;

    private final Property property;
    private final String name;

    private final StringBuilder buffer = new StringBuilder();

    public MetadataHandler(Metadata metadata, String name) {
        this.metadata = metadata;
        this.property = null;
        this.name = name;
    }

    public MetadataHandler(Metadata metadata, Property property) {
        this.metadata = metadata;
        this.property = property;
        this.name = property.getName();
    }

    public void addMetadata(String value) {
        if (value.length() > 0) {
            String previous = metadata.get(name);
            if (previous != null && previous.length() > 0) {
                value = previous + ", " + value;
            }

            if (this.property != null) {
                metadata.set(property, value);
            } else {
                metadata.set(name, value);
            }
        }
    }

    public void endElement(String uri, String localName, String name) {
        addMetadata(buffer.toString());
        buffer.setLength(0);
    }

    public void startElement(
            String uri, String localName, String name, Attributes attributes) {
        for (int i = 0; i < attributes.getLength(); i++) {
            addMetadata(attributes.getValue(i));
        }
    }

    public void characters(char[] ch, int start, int length) {
        buffer.append(ch, start, length);
    }

}
@ -0,0 +1,90 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xml;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * XML parser.
 */
public class XMLParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = -6028836725280212837L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.application("xml"),
                    MediaType.image("svg+xml"))));

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
        }

        final XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        xhtml.startElement("p");

        TaggedContentHandler tagged = new TaggedContentHandler(handler);
        try {
            XMLReaderUtils.parseSAX(
                    new CloseShieldInputStream(stream),
                    new OfflineContentHandler(new EmbeddedContentHandler(
                            getContentHandler(tagged, metadata, context))), context);
        } catch (SAXException e) {
            tagged.throwIfCauseOf(e);
            throw new TikaException("XML parse error", e);
        } finally {
            xhtml.endElement("p");
            xhtml.endDocument();
        }
    }

    protected ContentHandler getContentHandler(
            ContentHandler handler, Metadata metadata, ParseContext context) {
        return new TextContentHandler(handler, true);
    }
}
@ -0,0 +1,29 @@
package docspell.extract

import docspell.common.MimeType

import scala.util.Try

sealed trait ExtractResult {

  def textOption: Option[String]

}

object ExtractResult {

  case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
    val textOption = None
  }

  case class Failure(ex: Throwable) extends ExtractResult {
    val textOption = None
  }

  case class Success(text: String) extends ExtractResult {
    val textOption = Some(text)
  }

  def fromTry(r: Try[String]): ExtractResult =
    r.fold(Failure.apply, Success.apply)

}
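The `ExtractResult` ADT above is consumed by pattern matching on the three cases. A minimal, hypothetical usage sketch (the sample value and the printlns are illustrative only, not part of this commit):

```scala
import docspell.extract.ExtractResult
import scala.util.Try

// Wrap a Try-based extractor result and branch on the outcome.
val result: ExtractResult =
  ExtractResult.fromTry(Try(sys.error("no text")))

result match {
  case ExtractResult.Success(text)         => println(s"got ${text.length} chars")
  case ExtractResult.Failure(ex)           => println(s"extraction failed: ${ex.getMessage}")
  case ExtractResult.UnsupportedFormat(mt) => println(s"cannot handle ${mt.asString}")
}

// textOption flattens all non-success cases to None.
val maybeText: Option[String] = result.textOption
```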
@ -0,0 +1,30 @@
package docspell.extract.odf

import cats.effect._
import cats.implicits._
import fs2.Stream
import java.io.{ByteArrayInputStream, InputStream}

import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler

import scala.util.Try

object OdfExtract {

  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)

  def get(is: InputStream) = Try {
    val handler  = new BodyContentHandler()
    val pctx     = new ParseContext()
    val meta     = new Metadata()
    val ooparser = new OpenDocumentParser()
    ooparser.parse(is, handler, meta, pctx)
    handler.toString.trim
  }.toEither

}
@ -0,0 +1,34 @@
package docspell.extract.pdfbox

import java.io.InputStream
import java.nio.file.Path

import cats.implicits._
import cats.effect.Sync
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper

import scala.util.{Try, Using}
import fs2.Stream

object PdfboxExtract {

  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map { bytes =>
      Using(PDDocument.load(bytes))(readText).toEither.flatten
    }

  def get(is: InputStream): Either[Throwable, String] =
    Using(PDDocument.load(is))(readText).toEither.flatten

  def get(inFile: Path): Either[Throwable, String] =
    Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten

  private def readText(doc: PDDocument): Either[Throwable, String] =
    Try {
      val stripper = new PDFTextStripper()
      stripper.setAddMoreFormatting(true)
      stripper.setLineSeparator("\n")
      stripper.getText(doc).trim // trim here already
    }.toEither
}
@ -0,0 +1,85 @@
package docspell.extract.poi

import java.io.{ByteArrayInputStream, InputStream}

import cats.data.EitherT
import cats.implicits._
import cats.effect.Sync
import org.apache.poi.hssf.extractor.ExcelExtractor
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.hwpf.extractor.WordExtractor
import org.apache.poi.xssf.extractor.XSSFExcelExtractor
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import fs2.Stream

import scala.util.Try
import docspell.common._
import docspell.files.TikaMimetype

object PoiExtract {

  def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] =
    TikaMimetype.detect(data, hint).flatMap {
      case PoiTypes.doc =>
        getDoc(data)
      case PoiTypes.xls =>
        getXls(data)
      case PoiTypes.xlsx =>
        getXlsx(data)
      case PoiTypes.docx =>
        getDocx(data)
      case PoiTypes.msoffice =>
        EitherT(getDoc[F](data))
          .recoverWith({
            case _ => EitherT(getXls[F](data))
          })
          .value
      case PoiTypes.ooxml =>
        EitherT(getDocx[F](data))
          .recoverWith({
            case _ => EitherT(getXlsx[F](data))
          })
          .value
      case mt =>
        Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}")))
    }

  def getDocx(is: InputStream): Either[Throwable, String] =
    Try {
      val xt = new XWPFWordExtractor(new XWPFDocument(is))
      xt.getText.trim
    }.toEither

  def getDoc(is: InputStream): Either[Throwable, String] =
    Try {
      val xt = new WordExtractor(is)
      xt.getText.trim
    }.toEither

  def getXlsx(is: InputStream): Either[Throwable, String] =
    Try {
      val xt = new XSSFExcelExtractor(new XSSFWorkbook(is))
      xt.getText.trim
    }.toEither

  def getXls(is: InputStream): Either[Throwable, String] =
    Try {
      val xt = new ExcelExtractor(new HSSFWorkbook(is))
      xt.getText.trim
    }.toEither

  def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx)

  def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc)

  def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx)

  def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls)

}
@ -0,0 +1,16 @@
package docspell.extract.poi

import docspell.common.MimeType

object PoiTypes {

  val msoffice = MimeType.application("x-tika-msoffice")
  val ooxml    = MimeType.application("x-tika-ooxml")
  val docx     = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document")
  val xlsx     = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet")
  val xls      = MimeType.application("vnd.ms-excel")
  val doc      = MimeType.application("msword")

  val all = Set(msoffice, ooxml, docx, xlsx, xls, doc)

}
@ -0,0 +1,24 @@
package docspell.extract.rtf

import java.io.{ByteArrayInputStream, InputStream}

import cats.implicits._
import cats.effect.Sync
import fs2.Stream
import javax.swing.text.rtf.RTFEditorKit

import scala.util.Try

object RtfExtract {

  def get(is: InputStream): Either[Throwable, String] =
    Try {
      val kit = new RTFEditorKit()
      val doc = kit.createDefaultDocument()
      kit.read(is, doc, 0)
      doc.getText(0, doc.getLength).trim
    }.toEither

  def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] =
    data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get)
}
@ -1,9 +1,7 @@
package docspell.extract.ocr

import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.extract.TestFiles
import docspell.files.TestFiles
import minitest.SimpleTestSuite

object TextExtractionSuite extends SimpleTestSuite {
@ -30,13 +28,4 @@ object TextExtractionSuite extends SimpleTestSuite {

    assertEquals(extract.trim, expect.trim)
  }

  test("find mimetypes") {
    ExampleFiles.
      all.foreach { url =>
        TikaMimetype.detect(url.readURL[IO](8192, blocker), MimeTypeHint.none).
          map(mt => println(url.asString + ": " + mt.asString)).
          unsafeRunSync
      }
  }
}
@ -0,0 +1,28 @@
package docspell.extract.odf

import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite

object OdfExtractTest extends SimpleTestSuite {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  val files = List(
    ExampleFiles.examples_sample_odt -> 6372,
    ExampleFiles.examples_sample_ods -> 717
  )

  test("test extract from odt") {
    files.foreach { case (file, len) =>
      val is   = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
      val str1 = OdfExtract.get(is).fold(throw _, identity)
      assertEquals(str1.length, len)

      val data = file.readURL[IO](8192, blocker)
      val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity)
      assertEquals(str2, str1)
    }
  }

}
@ -0,0 +1,48 @@
package docspell.extract.pdfbox

import cats.effect._
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite

object PdfboxExtractTest extends SimpleTestSuite {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  val textPDFs = List(
    ExampleFiles.letter_de_pdf -> TestFiles.letterDEText,
    ExampleFiles.letter_en_pdf -> TestFiles.letterENText
  )

  test("extract text from text PDFs by inputstream") {
    textPDFs.foreach {
      case (file, txt) =>
        val url      = file.toJavaUrl.fold(sys.error, identity)
        val str      = PdfboxExtract.get(url.openStream()).fold(throw _, identity)
        val received = removeFormatting(str)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
    }
  }

  test("extract text from text PDFs via Stream") {
    textPDFs.foreach {
      case (file, txt) =>
        val data     = file.readURL[IO](8192, blocker)
        val str      = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity)
        val received = removeFormatting(str)
        val expect   = removeFormatting(txt)
        assertEquals(received, expect)
    }
  }

  test("extract text from image PDFs") {
    val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity)

    val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity)

    assertEquals(str, "")
  }

  private def removeFormatting(str: String): String =
    str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase
}
@ -0,0 +1,39 @@
package docspell.extract.poi

import cats.effect._
import docspell.common.MimeTypeHint
import docspell.files.{ExampleFiles, TestFiles}
import minitest.SimpleTestSuite

object PoiExtractTest extends SimpleTestSuite {
  val blocker     = TestFiles.blocker
  implicit val CS = TestFiles.CS

  val officeFiles = List(
    ExampleFiles.examples_sample_doc -> 6241,
    ExampleFiles.examples_sample_docx -> 6179,
    ExampleFiles.examples_sample_xlsx -> 660,
    ExampleFiles.examples_sample_xls -> 660
  )

  test("extract text from ms office files") {
    officeFiles.foreach {
      case (file, len) =>
        val str1 = PoiExtract
          .get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none)
          .unsafeRunSync()
          .fold(throw _, identity)

        val str2 = PoiExtract
          .get[IO](
            file.readURL[IO](8192, blocker),
            MimeTypeHint(Some(file.path.segments.last), None)
          )
          .unsafeRunSync()
          .fold(throw _, identity)

        assertEquals(str1, str2)
        assertEquals(str1.length, len)
    }
  }
}
@ -0,0 +1,14 @@
package docspell.extract.rtf

import docspell.files.ExampleFiles
import minitest.SimpleTestSuite

object RtfExtractTest extends SimpleTestSuite {

  test("extract text from rtf using java input-stream") {
    val file = ExampleFiles.examples_sample_rtf
    val is   = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity)
    val str  = RtfExtract.get(is).fold(throw _, identity)
    assertEquals(str.length, 7342)
  }
}
@ -0,0 +1,7 @@
package docspell.files

case class Dimension(width: Int, height: Int) {

  def toAwtDimension: java.awt.Dimension =
    new java.awt.Dimension(width, height)
}
61 modules/files/src/main/scala/docspell/files/ImageSize.scala Normal file
@ -0,0 +1,61 @@
package docspell.files

import java.io.{ByteArrayInputStream, InputStream}
import java.nio.file.Path

import cats.implicits._
import cats.effect._
import fs2.Stream
import javax.imageio.stream.{FileImageInputStream, ImageInputStream}
import javax.imageio.{ImageIO, ImageReader}

import scala.jdk.CollectionConverters._
import scala.util.{Try, Using}

object ImageSize {

  /** Return the image size from its header without reading
    * the whole image into memory.
    */
  def get(file: Path): Option[Dimension] =
    Using(new FileImageInputStream(file.toFile))(getDimension).toOption.flatten

  /** Return the image size from its header without reading
    * the whole image into memory.
    */
  def get(in: InputStream): Option[Dimension] =
    Option(ImageIO.createImageInputStream(in)).flatMap(getDimension)

  /** Return the image size from its header without reading
    * the whole image into memory.
    */
  def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = {
    data.take(768).compile.to(Array).map(ar => {
      val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar))
      if (iis == null) sys.error("no reader given for the array")
      else getDimension(iis)
    })
  }

  private def getDimension(in: ImageInputStream): Option[Dimension] =
    ImageIO
      .getImageReaders(in)
      .asScala
      .to(LazyList)
      .collectFirst(Function.unlift { reader =>
        val dim = getDimension(in, reader).toOption
        reader.dispose()
        dim
      })

  private def getDimension(
      in: ImageInputStream,
      reader: ImageReader
  ): Either[Throwable, Dimension] =
    Try {
      reader.setInput(in)
      val width  = reader.getWidth(reader.getMinIndex)
      val height = reader.getHeight(reader.getMinIndex)
      Dimension(width, height)
    }.toEither
}
BIN modules/files/src/test/resources/bombs/20K-gray.jpeg Normal file (binary, 1.5 MiB, not shown)
BIN modules/files/src/test/resources/bombs/20K-gray.png Normal file (binary, 48 KiB, not shown)
BIN modules/files/src/test/resources/bombs/20K-rgb.jpeg Normal file (binary, 2.2 MiB, not shown)
BIN modules/files/src/test/resources/bombs/20K-rgb.png Normal file (binary, 1.2 MiB, not shown)
@ -2,18 +2,18 @@ Derek Jeter

123 Elm Ave.

Treesville, ON MI1N 2P3
Treesville, ON M1N 2P3
November 7, 2016

Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016

Mr. M. Leat
Mr. M. Leaf

Chief of Syrup Production
Old Sticky Pancake Company
456 Maple Lane

Forest, ON 7TW8 9Y0
Forest, ON 7W8 9Y0

Hemptown, September 3, 2019
Dear Mr. Leaf,

@ -3,12 +3,12 @@
    <withJansi>true</withJansi>

    <encoder>
      <pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
      <pattern>%highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
    </encoder>
  </appender>

  <logger name="docspell" level="debug" />
  <root level="INFO">
  <root level="error">
    <appender-ref ref="STDOUT" />
  </root>
</configuration>
@ -0,0 +1,46 @@
package docspell.files

import cats.implicits._
import cats.effect.{Blocker, IO}
import minitest.SimpleTestSuite

import scala.concurrent.ExecutionContext
import scala.util.Using

object ImageSizeTest extends SimpleTestSuite {
  val blocker     = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  //tiff files are not supported on the jdk by default
  //requires an external library
  val files = List(
    ExampleFiles.camera_letter_en_jpg -> Dimension(1695, 2378),
    ExampleFiles.camera_letter_en_png -> Dimension(1695, 2378),
    // ExampleFiles.camera_letter_en_tiff -> Dimension(1695, 2378),
    ExampleFiles.scanner_jfif_jpg -> Dimension(2480, 3514),
    ExampleFiles.bombs_20K_gray_jpeg -> Dimension(20000, 20000),
    ExampleFiles.bombs_20K_gray_png -> Dimension(20000, 20000),
    ExampleFiles.bombs_20K_rgb_jpeg -> Dimension(20000, 20000),
    ExampleFiles.bombs_20K_rgb_png -> Dimension(20000, 20000)
  )

  test("get sizes from input-stream") {
    files.foreach {
      case (uri, expect) =>
        val url = uri.toJavaUrl.fold(sys.error, identity)
        Using.resource(url.openStream()) { in =>
          val dim = ImageSize.get(in)
          assertEquals(dim, expect.some)
        }
    }
  }

  test("get sizes from stream") {
    files.foreach {
      case (uri, expect) =>
        val stream = uri.readURL[IO](8192, blocker)
        val dim    = ImageSize.get(stream).unsafeRunSync()
        assertEquals(dim, expect.some)
    }
  }
}
@ -1,8 +1,7 @@
package docspell.extract
package docspell.files

import fs2.Stream
import cats.effect.{Blocker, IO}
import docspell.files._
import fs2.Stream

import scala.concurrent.ExecutionContext

@ -12,19 +11,19 @@ object TestFiles {

  val letterSourceDE: Stream[IO, Byte] =
    ExampleFiles.letter_de_pdf
      .readURL[IO](16 * 1024, blocker)
      .readURL[IO](8 * 1024, blocker)

  val letterSourceEN: Stream[IO, Byte] =
    ExampleFiles.letter_en_pdf
      .readURL[IO](16 * 1024, blocker)
      .readURL[IO](8 * 1024, blocker)

  lazy val letterDEText =
    ExampleFiles.letter_de_txt
      .readText[IO](16 * 1024, blocker)
      .readText[IO](8 * 1024, blocker)
      .unsafeRunSync

  lazy val letterENText =
    ExampleFiles.letter_en_txt
      .readText[IO](16 * 1024, blocker)
      .readText[IO](8 * 1024, blocker)
      .unsafeRunSync
}
@ -11,3 +11,8 @@ title: ADRs
- [0004 ISO8601 vs Unix](adr/0004_iso8601vsEpoch)
- [0005 Job Executor](adr/0005_job-executor)
- [0006 More File Types](adr/0006_more-file-types)
- [0007 Convert HTML](adr/0007_convert_html_files)
- [0008 Convert Text](adr/0008_convert_plain_text)
- [0009 Convert Office Files](adr/0009_convert_office_docs)
- [0010 Convert Image Files](adr/0010_convert_image_files)
- [0011 Extract Text](adr/0011_extract_text)
@ -112,7 +112,7 @@ If conversion is not supported for the input file, it is skipped. If
conversion fails, the error is propagated to let the retry mechanism
take care.

### What types?
#### What types?

Which file types should be supported? At a first step, all major
office documents, common images, plain text (e.g. markdown) and html
@ -123,6 +123,12 @@ There is always the preference to use jvm internal libraries in order
to be more platform independent and to reduce external dependencies.
But this is not always possible (like doing OCR).

<div class="thumbnail">
<img src="./img/process-files.png" title="Overview processing files">
</div>

#### Conversion

- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
  unoconv (see [ADR 9](0009_convert_office_docs))
- HTML (`html`): wkhtmltopdf (see [ADR 7](0007_convert_html_files))
@ -130,9 +136,19 @@ But this is not always possible (like doing OCR).
- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
  10](0010_convert_image_files))

#### Text Extraction

- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache Poi
- Office documents (`odt`, `ods`): Apache Tika (including the sources)
- HTML: not supported, extract text from converted PDF
- Images (`jpg`, `png`, `tif`): Tesseract
- Text/Markdown: n.a.
- PDF: Apache PDFBox or Tesseract

## Links

* [Convert HTML Files](0007_convert_html_files)
* [Convert Plain Text](0008_convert_plain_text)
* [Convert Office Documents](0009_convert_office_docs)
* [Convert Image Files](0010_convert_image_files)
* [Extract Text from Files](0011_extract_text)

77 modules/microsite/docs/dev/adr/0011_extract_text.md Normal file
@ -0,0 +1,77 @@
---
layout: docs
title: Extract Text from Files
---

# Extract Text from Files

## Context and Problem Statement

With support for more file types there must be a way to extract text
from all of them. It is better to extract text from the source files,
in contrast to extracting the text from the converted pdf file.

There are multiple options and multiple file types. Again, most
priority is to use a java/scala library to reduce external
dependencies.

## Considered Options

### MS Office Documents

There is only one library I know: [Apache
POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
However, it doesn't support open-document format (odt and ods).
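
As a rough sketch of what POI-based extraction looks like (this mirrors the `PoiExtract` primitives added in this commit; the file path is a made-up example):

```scala
import java.io.FileInputStream
import org.apache.poi.xwpf.extractor.XWPFWordExtractor
import org.apache.poi.xwpf.usermodel.XWPFDocument
import scala.util.Using

// Extract all text from a docx file; POI offers analogous
// extractors for doc (WordExtractor) and xls(x) (ExcelExtractor).
def docxText(path: String): Either[Throwable, String] =
  Using(new FileInputStream(path)) { is =>
    new XWPFWordExtractor(new XWPFDocument(is)).getText.trim
  }.toEither

// hypothetical sample file
println(docxText("/tmp/sample.docx"))
```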

### OpenDocument Format

There are two libraries:

- [Apache Tika Parser](https://tika.apache.org/)
- [ODFToolkit](https://github.com/tdf/odftoolkit)

*Tika:* The tika-parsers package contains an opendocument parser for
extracting text. But it has a huge dependency tree, since it is a
super-package containing a parser for almost every common file type.
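
Driving that parser directly is only a few lines. A sketch of the approach, essentially what `OdfExtract` in this commit does:

```scala
import java.io.InputStream
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.odf.OpenDocumentParser
import org.apache.tika.sax.BodyContentHandler
import scala.util.Try

// Parse an odt/ods stream and collect the body text.
def odfText(is: InputStream): Either[Throwable, String] =
  Try {
    val handler = new BodyContentHandler()
    new OpenDocumentParser().parse(is, handler, new Metadata(), new ParseContext())
    handler.toString.trim
  }.toEither
```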

*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
and also pulls in quite some dependencies (while not as much as
tika-parser). It is not too bad, since it is a library for
manipulating opendocument files. But all I need is to only extract
text. I created tests that extracted text from my odt/ods files. It
worked at first sight, but running the tests in a loop resulted in
strange nullpointer exceptions (it only worked the first run).

### Richtext

Richtext is supported by the jdk (using `RTFEditorKit` from
swing).
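
A sketch of this jdk-only approach, which is what `RtfExtract` in this commit does:

```scala
import java.io.InputStream
import javax.swing.text.rtf.RTFEditorKit
import scala.util.Try

// Read an RTF stream into a swing Document and pull out its plain text.
def rtfText(is: InputStream): Either[Throwable, String] =
  Try {
    val kit = new RTFEditorKit()
    val doc = kit.createDefaultDocument()
    kit.read(is, doc, 0)
    doc.getText(0, doc.getLength).trim
  }.toEither
```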

### PDF

For "image" pdf files, tesseract is used. For "text" PDF files, the
library [Apache PDFBox](https://pdfbox.apache.org) can be used.

There also is [iText](https://github.com/itext/itext7) with an AGPL
license.
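
Text extraction with PDFBox is likewise short. A sketch along the lines of `PdfboxExtract` in this commit:

```scala
import java.io.InputStream
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import scala.util.Using

// Load a (text) PDF and strip its text; "image" PDFs yield an
// empty string here and are handled by tesseract instead.
def pdfText(is: InputStream): Either[Throwable, String] =
  Using(PDDocument.load(is)) { doc =>
    val stripper = new PDFTextStripper()
    stripper.setLineSeparator("\n")
    stripper.getText(doc).trim
  }.toEither
```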

### Images

For images and "image" PDF files, there is already tesseract in place.

### HTML

HTML must be converted into a PDF file before text can be extracted.

### Text/Markdown

These files can be used as-is, obviously.

## Decision Outcome

- MS Office files: POI library
- Open Document files: Tika, but integrating the few source files that
  make up the open document parser. Due to its huge dependency tree,
  the library is not added.
- PDF: Apache PDFBox. I know this library better than itext.
BIN modules/microsite/docs/dev/adr/img/process-files.png Normal file (binary, 49 KiB, not shown)
43 modules/microsite/docs/dev/adr/process-files.puml Normal file
@ -0,0 +1,43 @@
@startuml
scale 1200 width
title: Processing Files
skinparam monochrome true
skinparam backgroundColor white
skinparam rectangle {
  roundCorner<<Input>> 25
  roundCorner<<Output>> 5
}
rectangle Input <<Input>> {
  file "html"
  file "plaintext"
  file "image"
  file "msoffice"
  file "rtf"
  file "odf"
  file "pdf"
}

node toBoth [
PDF + TXT
]
node toPdf [
PDF
]
node toTxt [
TXT
]

image --> toBoth:<tesseract>
html --> toPdf:<wkhtmltopdf>
toPdf --> toTxt:[pdfbox]
plaintext --> html:[flexmark]
msoffice --> toPdf:<unoconv>
msoffice --> toTxt:[poi]
rtf --> toTxt:[jdk]
rtf --> toPdf:<unoconv>
odf --> toTxt:[tika]
odf --> toPdf:<unoconv>
pdf --> toTxt:<tesseract>
pdf --> toTxt:[pdfbox]
plaintext -> toTxt:[identity]
@enduml
@ -36,15 +36,20 @@ object Dependencies {
  val ViewerJSVersion = "0.5.8"


  val jclOverSlf4j = Seq(
    "org.slf4j" % "jcl-over-slf4j" % Slf4jVersion
  )
  val julOverSlf4j = Seq(
    "org.slf4j" % "jul-to-slf4j" % Slf4jVersion
  )

  val poi = Seq(
    "org.apache.poi" % "poi" % PoiVersion,
    "org.apache.poi" % "poi-ooxml" % PoiVersion,
    "org.slf4j" % "slf4j-log4j12" % Slf4jVersion,
    "org.slf4j" % "slf4j-jcl" % Slf4jVersion
    "org.apache.poi" % "poi-scratchpad" % PoiVersion,
  ).map(_.excludeAll(
    ExclusionRule("commons-logging"),
    ExclusionRule("log4j")
  ))
    ExclusionRule("commons-logging")
  )) ++ jclOverSlf4j

  // https://github.com/vsch/flexmark-java
  // BSD 2-Clause
@ -57,18 +62,17 @@ object Dependencies {
    ExclusionRule("hamcrest-core")
  ))

  val twelvemonkeys = Seq(
    "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
    "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
  )
  // val twelvemonkeys = Seq(
  //   "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
  //   "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
  // )

  val pdfbox = Seq(
    "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll(
      ExclusionRule("commons-logging"),
      ExclusionRule("org.bouncycastle")
    ),
    "org.slf4j" % "slf4j-jcl" % Slf4jVersion
  )
    "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
      ExclusionRule("org.bouncycastle"),
      ExclusionRule("commons-logging")
    )
  ) ++ jclOverSlf4j

  val emil = Seq(
    "com.github.eikek" %% "emil-common" % EmilVersion,
@ -100,6 +104,12 @@ object Dependencies {
  val tika = Seq(
    "org.apache.tika" % "tika-core" % TikaVersion
  )
  val commonsIO = Seq(
    "commons-io" % "commons-io" % "2.6"
  )
  val tikaParser = Seq(
    "org.apache.tika" % "tika-parsers" % TikaVersion
  )

  val bcrypt = Seq(
    "org.mindrot" % "jbcrypt" % BcryptVersion