From 558007235bb85274fb0924f247da98c2c9ba80ee Mon Sep 17 00:00:00 2001 From: Scala Steward Date: Mon, 19 Jul 2021 14:25:26 +0200 Subject: [PATCH] Update tika-core to 2.0.0 Include new ODF parser from tika-2.0.0 --- .../exception/WriteLimitReachedException.java | 83 +++ .../odf/FlatOpenDocumentMacroHandler.java | 120 ++++ .../odf/NSNormalizerContentHandler.java | 73 ++- .../parser/odf/OpenDocumentBodyHandler.java | 564 +++++++++++++++++ .../parser/odf/OpenDocumentContentParser.java | 578 +----------------- .../parser/odf/OpenDocumentMacroHandler.java | 60 ++ .../odf/OpenDocumentManifestHandler.java | 45 ++ .../parser/odf/OpenDocumentMetaParser.java | 160 +++-- .../tika/parser/odf/OpenDocumentParser.java | 329 ++++++---- .../parser/xml/AbstractMetadataHandler.java | 52 +- .../AttributeDependantMetadataHandler.java | 34 +- .../parser/xml/AttributeMetadataHandler.java | 30 +- .../apache/tika/parser/xml/DcXMLParser.java | 46 +- .../parser/xml/ElementMetadataHandler.java | 75 ++- .../tika/parser/xml/FictionBookParser.java | 55 +- .../tika/parser/xml/MetadataHandler.java | 33 +- .../parser/xml/TextAndAttributeXMLParser.java | 34 ++ .../org/apache/tika/parser/xml/XMLParser.java | 51 +- .../org/apache/tika/utils/StringUtils.java | 206 +++++++ .../docspell/extract/odf/OdfExtractTest.scala | 2 +- .../scala/docspell/files/TikaMimetype.scala | 4 +- project/Dependencies.scala | 2 +- 22 files changed, 1653 insertions(+), 983 deletions(-) create mode 100644 modules/extract/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java create mode 100644 modules/extract/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java create mode 100644 modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java create mode 100644 modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java create mode 100644 modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentManifestHandler.java create mode 
/**
 * SAX exception signalling that a requested character write limit was reached
 * while text was being produced.
 *
 * <p>The static helpers search an exception's cause chain for an instance of
 * this class, so callers can recognise the condition even after it has been
 * wrapped by other exceptions.
 */
public class WriteLimitReachedException extends SAXException {

    // Upper bound on how many causes are followed, in case of a
    // (hopefully impossible) cyclic cause chain.
    private static final int MAX_DEPTH = 100;

    private final int writeLimit;

    /**
     * Creates the exception.
     *
     * @param writeLimit the limit that was reached; it is only used to build
     *                   the message returned by {@link #getMessage()}
     */
    public WriteLimitReachedException(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    /**
     * Returns a fixed explanatory message that includes the write limit.
     *
     * @return the human readable description of the condition
     */
    @Override
    public String getMessage() {
        return "Your document contained more than " + writeLimit
                + " characters, and so your requested limit has been"
                + " reached. To receive the full text of the document,"
                + " increase your limit. (Text up to the limit is"
                + " however available).";
    }

    /**
     * Checks whether the given throwable (or any of its causes) is a
     * {@code WriteLimitReachedException}.
     *
     * @param t throwable to inspect, may be {@code null}
     * @return {@code true} if the write limit was reached,
     *         {@code false} otherwise (including for {@code null})
     * @since Apache Tika 2.0
     */
    public static boolean isWriteLimitReached(Throwable t) {
        return findInCauseChain(t) != null;
    }

    /**
     * Rethrows the {@code WriteLimitReachedException} found in the given
     * exception or anywhere in its cause chain; returns normally if there is
     * none.
     *
     * <p>The nested causes are searched as well as the top-level exception, so
     * a wrapped write-limit signal is not silently dropped.
     *
     * @param ex exception to inspect, may be {@code null}
     * @throws SAXException the {@code WriteLimitReachedException} instance
     *                      found in the chain
     */
    public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
        WriteLimitReachedException found = findInCauseChain(ex);
        if (found != null) {
            throw found;
        }
    }

    /**
     * Walks the cause chain starting at {@code t} and returns the first
     * {@code WriteLimitReachedException} encountered, or {@code null}.
     * The walk stops once the depth exceeds {@link #MAX_DEPTH}.
     */
    private static WriteLimitReachedException findInCauseChain(Throwable t) {
        Throwable current = t;
        for (int depth = 0; current != null && depth <= MAX_DEPTH; depth++) {
            if (current instanceof WriteLimitReachedException) {
                return (WriteLimitReachedException) current;
            }
            current = current.getCause();
        }
        return null;
    }
}
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * Handler for macros in flat open documents + */ +class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { + + static String MODULE = "module"; + static String NAME = "name"; + private static String SOURCE_CODE = "source-code"; + private final ContentHandler contentHandler; + private final ParseContext parseContext; + private final StringBuilder macroBuffer = new StringBuilder(); + String macroName = null; + boolean inMacro = false; + private EmbeddedDocumentExtractor embeddedDocumentExtractor; + + FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) { + super(contentHandler); + this.contentHandler = contentHandler; + this.parseContext = parseContext; + } + + @Override + public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) + throws SAXException { + if (MODULE.equals(localName)) { + macroName = 
XMLReaderUtils.getAttrValue(NAME, attrs); + } else if (SOURCE_CODE.equals(localName)) { + inMacro = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inMacro) { + macroBuffer.append(ch, start, length); + } + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) + throws SAXException { + if (SOURCE_CODE.equals(localName)) { + try { + handleMacro(); + } catch (IOException e) { + throw new SAXException(e); + } finally { + resetMacroState(); + } + } + } + + protected void resetMacroState() { + macroBuffer.setLength(0); + macroName = null; + inMacro = false; + } + + protected void handleMacro() throws IOException, SAXException { + + byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8); + + if (embeddedDocumentExtractor == null) { + embeddedDocumentExtractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + } + Metadata embeddedMetadata = new Metadata(); + if (!isBlank(macroName)) { + embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName); + } + embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + try (InputStream is = TikaInputStream.get(bytes)) { + embeddedDocumentExtractor + .parseEmbedded(is, contentHandler, embeddedMetadata, false); + } + } + } + + private static boolean isBlank(String s) { + return s == null || s.trim().isEmpty(); + } +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java index 80b2301c..2193e83b 100644 --- a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java @@ -1,31 +1,32 @@ 
/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.odf; + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Locale; -import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import java.io.IOException; -import java.io.StringReader; -import java.util.Locale; +import org.apache.tika.sax.ContentHandlerDecorator; /** * Content handler decorator that: