All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.RecursiveParserWrapper Maven / Gradle / Ivy

package org.apache.tika.parser;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Set;

/**
 * This is a helper class that wraps a parser in a recursive handler.
 * It takes care of setting the embedded parser in the ParseContext 
 * and handling the embedded path calculations.
 * 

* After parsing a document, call getMetadata() to retrieve a list of * Metadata objects, one for each embedded resource. The first item * in the list will contain the Metadata for the outer container file. *

* Content can also be extracted and stored in the {@link #TIKA_CONTENT} field * of a Metadata object. Select the type of content to be stored * at initialization. *

* If a WriteLimitReachedException is encountered, the wrapper will stop * processing the current resource, and it will not process * any of the child resources for the given resource. However, it will try to * parse as much as it can. If a WLRE is reached in the parent document, * no child resources will be parsed. *

* The implementation is based on Jukka's RecursiveMetadataParser * and Nick's additions. See: * RecursiveMetadataParser. *

* Note that this wrapper holds all data in memory and is not appropriate * for files with content too large to be held in memory. *

* Note, too, that this wrapper is not thread safe because it stores state. * The client must initialize a new wrapper for each thread, and the client * is responsible for calling {@link #reset()} after each parse. *

* The unit tests for this class are in the tika-parsers module. *

*/ public class RecursiveParserWrapper extends ParserDecorator { /** * Generated serial version */ private static final long serialVersionUID = 9086536568120690938L; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#TIKA_CONTENT} */ @Deprecated public final static Property TIKA_CONTENT = AbstractRecursiveParserWrapperHandler.TIKA_CONTENT; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#PARSE_TIME_MILLIS} */ @Deprecated public final static Property PARSE_TIME_MILLIS = AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION} */ @Deprecated public final static Property WRITE_LIMIT_REACHED = AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_LIMIT_REACHED} */ @Deprecated public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_LIMIT_REACHED; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_EXCEPTION} */ @Deprecated public final static Property EMBEDDED_EXCEPTION = AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION; /** * @deprecated use {@link org.apache.tika.sax.RecursiveParserWrapperHandler#EMBEDDED_RESOURCE_PATH} */ @Deprecated public final static Property EMBEDDED_RESOURCE_PATH = AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH; /** * @deprecated this should be passed in via the {@link RecursiveParserWrapperHandler} */ @Deprecated private ContentHandlerFactory contentHandlerFactory = null; private final boolean catchEmbeddedExceptions; /** * set this on the RecursiveParserWrapperHandler instead * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20 */ @Deprecated private int maxEmbeddedResources = -1; /** * @deprecated this is here only for legacy behavior; it will be removed in 2.0 and/or 1.20 */ @Deprecated private ParserState lastParseState = null; /** * Initialize the wrapper with {@link #catchEmbeddedExceptions} set * to true as default. * * @param wrappedParser parser to use for the container documents and the embedded documents */ public RecursiveParserWrapper(Parser wrappedParser) { this(wrappedParser, true); } /** * * @param wrappedParser parser to wrap * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. * If set to false, embedded exceptions will be thrown and * the rest of the file will not be parsed. The following will not be ignored: * {@link CorruptedFileException}, {@link RuntimeException} */ public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) { super(wrappedParser); this.catchEmbeddedExceptions = catchEmbeddedExceptions; } /** * Initialize the wrapper with {@link #catchEmbeddedExceptions} set * to true as default. * * @param wrappedParser parser to use for the container documents and the embedded documents * @param contentHandlerFactory factory to use to generate a new content handler for * the container document and each embedded document * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser)} */ @Deprecated public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) { this(wrappedParser, contentHandlerFactory, true); } /** * Initialize the wrapper. * * @param wrappedParser parser to use for the container documents and the embedded documents * @param contentHandlerFactory factory to use to generate a new content handler for * the container document and each embedded document * @param catchEmbeddedExceptions whether or not to catch the embedded exceptions. * If set to true, the stack traces will be stored in * the metadata object with key: {@link #EMBEDDED_EXCEPTION}. * @deprecated use {@link RecursiveParserWrapper#RecursiveParserWrapper(Parser, boolean)} */ @Deprecated public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions) { super(wrappedParser); this.contentHandlerFactory = contentHandlerFactory; this.catchEmbeddedExceptions = catchEmbeddedExceptions; } @Override public Set getSupportedTypes(ParseContext context) { return getWrappedParser().getSupportedTypes(context); } /** * Acts like a regular parser except it ignores the ContentHandler * and it automatically sets/overwrites the embedded Parser in the * ParseContext object. *

* To retrieve the results of the parse, use {@link #getMetadata()}. *

* Make sure to call {@link #reset()} after each parse. */ @Override public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //this tracks the state of the parent parser, per call to #parse //in future versions, we can remove lastParseState, and this will be thread-safe ParserState parserState; if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { parserState = new ParserState((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler); } else { parserState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); lastParseState = parserState; } EmbeddedParserDecorator decorator = new EmbeddedParserDecorator(getWrappedParser(), "/", parserState); context.set(Parser.class, decorator); ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); long started = System.currentTimeMillis(); parserState.recursiveParserWrapperHandler.startDocument(); TemporaryResources tmp = new TemporaryResources(); int totalWriteLimit = -1; if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) { totalWriteLimit = ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getTotalWriteLimit(); } try { TikaInputStream tis = TikaInputStream.get(stream, tmp); RecursivelySecureContentHandler secureContentHandler = new RecursivelySecureContentHandler(localHandler, tis, totalWriteLimit); context.set(RecursivelySecureContentHandler.class, secureContentHandler); getWrappedParser().parse(tis, secureContentHandler, metadata, context); } catch (Throwable e) { //try our best to record the problem in the metadata object //then rethrow if (WriteLimitReachedException.isWriteLimitReached(e)) { metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true"); } else { String stackTrace = ExceptionUtils.getFilteredStackTrace(e); metadata.add(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION, stackTrace); throw e; } } finally { tmp.dispose(); long elapsedMillis = System.currentTimeMillis() - started; metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata); parserState.recursiveParserWrapperHandler.endDocument(); } } /** * * The first element in the returned list represents the * data from the outer container file. There is no guarantee * about the ordering of the list after that. * * @deprecated use a {@link RecursiveParserWrapperHandler} instead * * @return list of Metadata objects that were gathered during the parse * @throws IllegalStateException if you've used a {@link RecursiveParserWrapperHandler} in your last * call to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} */ @Deprecated public List getMetadata() { if (lastParseState != null) { return ((RecursiveParserWrapperHandler) lastParseState.recursiveParserWrapperHandler).getMetadataList(); } else { throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead"); } } /** * Set the maximum number of embedded resources to store. * If the max is hit during parsing, the {@link #EMBEDDED_RESOURCE_LIMIT_REACHED} * property will be added to the container document's Metadata. * *

* If this value is < 0 (the default), the wrapper will store all Metadata. * @deprecated set this on a {@link RecursiveParserWrapperHandler} * @param max maximum number of embedded resources to store */ @Deprecated public void setMaxEmbeddedResources(int max) { maxEmbeddedResources = max; } /** * This clears the last parser state (metadata list, unknown count, hit embeddedresource count) * * @deprecated use a {@link org.apache.tika.sax.RecursiveParserWrapperHandler} instead * @throws IllegalStateException if you used a {@link RecursiveParserWrapper} in your call * to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} */ @Deprecated public void reset() { if (lastParseState != null) { lastParseState = new ParserState(new RecursiveParserWrapperHandler(contentHandlerFactory, maxEmbeddedResources)); } else { throw new IllegalStateException("This is deprecated; please use a RecursiveParserWrapperHandler instead"); } } private String getResourceName(Metadata metadata, ParserState state) { String objectName = ""; if (metadata.get(Metadata.RESOURCE_NAME_KEY) != null) { objectName = metadata.get(Metadata.RESOURCE_NAME_KEY); } else if (metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID) != null) { objectName = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); } else { objectName = "embedded-" + (++state.unknownCount); } //make sure that there isn't any path info in the objectName //some parsers can return paths, not just file names objectName = FilenameUtils.getName(objectName); return objectName; } private class EmbeddedParserDecorator extends ParserDecorator { private static final long serialVersionUID = 207648200464263337L; private String location = null; private final ParserState parserState; private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) { super(parser); this.location = location; if (! this.location.endsWith("/")) { this.location += "/"; } this.parserState = parseState; } @Override public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //Test to see if we should avoid parsing if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) { return; } // Work out what this thing is String objectName = getResourceName(metadata, parserState); String objectLocation = this.location + objectName; metadata.add(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, objectLocation); //get a fresh handler ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata); Parser preContextParser = context.get(Parser.class); context.set(Parser.class, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState)); long started = System.currentTimeMillis(); RecursivelySecureContentHandler secureContentHandler = context.get(RecursivelySecureContentHandler.class); //store the handler that was used before this parse //so that you can return it back to its state at the end of this parse ContentHandler preContextHandler = secureContentHandler.handler; secureContentHandler.updateContentHandler(localHandler); try { super.parse(stream, secureContentHandler, metadata, context); } catch (SAXException e) { boolean wlr = WriteLimitReachedException.isWriteLimitReached(e); if (wlr == true) { metadata.add(WRITE_LIMIT_REACHED, "true"); throw e; } else { if (catchEmbeddedExceptions) { ParserUtils.recordParserFailure(this, e, metadata); } else { throw e; } } } catch(CorruptedFileException e) { throw e; } catch (TikaException e) { if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null && e instanceof ZeroByteFileException) { //do nothing } else if (catchEmbeddedExceptions) { ParserUtils.recordParserFailure(this, e, metadata); } else { throw e; } } finally { context.set(Parser.class, preContextParser); secureContentHandler.updateContentHandler(preContextHandler); long elapsedMillis = System.currentTimeMillis() - started; metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); parserState.recursiveParserWrapperHandler.endEmbeddedDocument(localHandler, metadata); } } } /** * This tracks the state of the parse of a single document. * In future versions, this will allow the RecursiveParserWrapper to be thread safe. */ private class ParserState { private int unknownCount = 0; private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler; private ParserState(AbstractRecursiveParserWrapperHandler handler) { this.recursiveParserWrapperHandler = handler; } } private class RecursivelySecureContentHandler extends SecureContentHandler { private ContentHandler handler; //total allowable chars across all handlers private final int totalWriteLimit; //total chars written to all handlers private int totalChars = 0; public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream, int totalWriteLimit) { super(handler, stream); this.handler = handler; this.totalWriteLimit = totalWriteLimit; } public void updateContentHandler(ContentHandler handler) { setContentHandler(handler); this.handler = handler; } /** * Bypass the SecureContentHandler... *

* This handler only looks at zip bomb via zip expansion. * Users should be protected within entries against nested * nested xml entities. We don't want to carry * those stats _across_ entries. * * @param uri * @param localName * @param name * @param atts * @throws SAXException */ @Override public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException { this.handler.startElement(uri, localName, name, atts); } @Override public void endElement(String uri, String localName, String name) throws SAXException { this.handler.endElement(uri, localName, name); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (totalWriteLimit < 0) { super.characters(ch, start, length); return; } int availableLength = Math.min(totalWriteLimit - totalChars, length); super.characters(ch, start, availableLength); totalChars += availableLength; if (availableLength < length) { throw new WriteLimitReached(totalWriteLimit); } } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (totalWriteLimit < 0) { super.ignorableWhitespace(ch, start, length); return; } int availableLength = Math.min(totalWriteLimit - totalChars, length); super.ignorableWhitespace(ch, start, availableLength); if (availableLength < length) { throw new WriteLimitReached(totalWriteLimit); } totalChars += availableLength; } } public static class WriteLimitReached extends SAXException { final int writeLimit; WriteLimitReached(int writeLimit) { this.writeLimit = writeLimit; } @Override public String getMessage() { return "Your document contained more than " + writeLimit + " characters, and so your requested limit has been" + " reached. To receive the full text of the document," + " increase your limit. (Text up to the limit is" + " however available)."; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy