// Source listing of org.apache.tika.parser.RecursiveParserWrapper from the tika-core artifact (Maven / Gradle / Ivy).
package org.apache.tika.parser;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * This is a helper class that wraps a parser in a recursive handler.
 * It takes care of setting the embedded parser in the ParseContext
 * and handling the embedded path calculations.
 * <p>
 * After parsing a document, call {@link #getMetadata()} to retrieve a list of
 * Metadata objects, one for each embedded resource. The first item
 * in the list will contain the Metadata for the outer container file.
 * <p>
 * Content can also be extracted and stored in the {@link #TIKA_CONTENT} field
 * of a Metadata object. Select the type of content to be stored
 * at initialization.
 * <p>
 * If a WriteLimitReachedException is encountered, the wrapper will stop
 * processing the current resource, and it will not process
 * any of the child resources for the given resource. However, it will try to
 * parse as much as it can. If a WLRE is reached in the parent document,
 * no child resources will be parsed.
 * <p>
 * The implementation is based on Jukka's RecursiveMetadataParser
 * and Nick's additions. See: RecursiveMetadataParser.
 * <p>
 * Note that this wrapper holds all data in memory and is not appropriate
 * for files with content too large to be held in memory.
 * <p>
 * Note, too, that this wrapper is not thread safe because it stores state.
 * The client must initialize a new wrapper for each thread, and the client
 * is responsible for calling {@link #reset()} after each parse.
 * <p>
 * The unit tests for this class are in the tika-parsers module.
 */
public class RecursiveParserWrapper implements Parser {

    /**
     * Generated serial version
     */
    private static final long serialVersionUID = 9086536568120690938L;

    //move this to TikaCoreProperties?
    public static final Property TIKA_CONTENT =
            Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "content");

    public static final Property PARSE_TIME_MILLIS =
            Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "parse_time_millis");

    public static final Property WRITE_LIMIT_REACHED =
            Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");

    public static final Property EMBEDDED_RESOURCE_LIMIT_REACHED =
            Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");

    public static final Property EMBEDDED_EXCEPTION =
            Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");

    //move this to TikaCoreProperties?
    public static final Property EMBEDDED_RESOURCE_PATH =
            Property.internalText(TikaCoreProperties.TIKA_META_PREFIX + "embedded_resource_path");

    /** Number of nanoseconds in one millisecond, used to report elapsed parse time. */
    private static final long NANOS_PER_MILLI = 1000000L;

    private final Parser wrappedParser;
    private final ContentHandlerFactory contentHandlerFactory;
    //embedded resources are appended as they finish; the container is inserted at index 0 last
    private final List<Metadata> metadatas = new LinkedList<>();
    private final boolean catchEmbeddedExceptions;

    //used in naming embedded resources that don't have a name.
    private int unknownCount = 0;
    //a negative value means "no limit"
    private int maxEmbeddedResources = -1;
    private boolean hitMaxEmbeddedResources = false;

    /**
     * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
     * to <code>true</code> as default.
     *
     * @param wrappedParser parser to use for the container documents and the embedded documents
     * @param contentHandlerFactory factory to use to generate a new content handler for
     *                              the container document and each embedded document
     */
    public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) {
        this(wrappedParser, contentHandlerFactory, true);
    }

    /**
     * Initialize the wrapper.
     *
     * @param wrappedParser parser to use for the container documents and the embedded documents
     * @param contentHandlerFactory factory to use to generate a new content handler for
     *                              the container document and each embedded document
     * @param catchEmbeddedExceptions whether or not to catch the embedded exceptions.
     *                                If set to <code>true</code>, the stack traces will be stored in
     *                                the metadata object with key: {@link #EMBEDDED_EXCEPTION}.
     */
    public RecursiveParserWrapper(Parser wrappedParser,
                                  ContentHandlerFactory contentHandlerFactory,
                                  boolean catchEmbeddedExceptions) {
        this.wrappedParser = wrappedParser;
        this.contentHandlerFactory = contentHandlerFactory;
        this.catchEmbeddedExceptions = catchEmbeddedExceptions;
    }

    /**
     * Delegates to the wrapped parser.
     *
     * @param context parse context passed through to the wrapped parser
     * @return the media types reported by the wrapped parser
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return wrappedParser.getSupportedTypes(context);
    }

    /**
     * Acts like a regular parser except it ignores the ContentHandler
     * and it automatically sets/overwrites the embedded Parser in the
     * ParseContext object.
     * <p>
     * To retrieve the results of the parse, use {@link #getMetadata()}.
     * <p>
     * Make sure to call {@link #reset()} after each parse.
     *
     * @param stream the container document to parse
     * @param ignore not used; a fresh handler is obtained from the content handler factory
     * @param metadata metadata of the container document; it is updated in place and a
     *                 copy of it is stored as the first element of {@link #getMetadata()}
     * @param context parse context; its {@code Parser.class} entry is overwritten
     * @throws IOException if the wrapped parser throws it
     * @throws SAXException if the wrapped parser throws one that is not a write-limit signal
     * @throws TikaException if the wrapped parser throws it
     */
    @Override
    public void parse(InputStream stream, ContentHandler ignore,
                      Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        context.set(Parser.class, new EmbeddedParserDecorator("/"));
        ContentHandler localHandler = contentHandlerFactory.getNewContentHandler();
        long startNanos = System.nanoTime();
        try {
            wrappedParser.parse(stream, localHandler, metadata, context);
        } catch (SAXException e) {
            if (!isWriteLimitReached(e)) {
                throw e;
            }
            //a write limit is not an error: keep what was extracted and flag it
            metadata.set(WRITE_LIMIT_REACHED, "true");
        } finally {
            //runs on success and on failure, so whatever was gathered is still recorded
            metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillisSince(startNanos)));
            addContent(localHandler, metadata);
            if (hitMaxEmbeddedResources) {
                metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
            }
            //the container always goes first in the list
            metadatas.add(0, deepCopy(metadata));
        }
    }

    /**
     * The first element in the returned list represents the
     * data from the outer container file. There is no guarantee
     * about the ordering of the list after that.
     * <p>
     * The returned list is the wrapper's own internal list, not a copy;
     * {@link #reset()} clears it.
     *
     * @return list of Metadata objects that were gathered during the parse
     */
    public List<Metadata> getMetadata() {
        return metadatas;
    }

    /**
     * Set the maximum number of embedded resources to store.
     * If the max is hit during parsing, the {@link #EMBEDDED_RESOURCE_LIMIT_REACHED}
     * property will be added to the container document's Metadata.
     * <p>
     * If this value is &lt; 0 (the default), the wrapper will store all Metadata.
     *
     * @param max maximum number of embedded resources to store
     */
    public void setMaxEmbeddedResources(int max) {
        maxEmbeddedResources = max;
    }

    /**
     * This clears the metadata list and resets {@link #unknownCount} and
     * {@link #hitMaxEmbeddedResources}
     */
    public void reset() {
        metadatas.clear();
        unknownCount = 0;
        hitMaxEmbeddedResources = false;
    }

    /**
     * Copied/modified from WriteOutContentHandler. Couldn't make that
     * static, and we need to have something that will work
     * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler
     *
     * @param t throwable whose message, and the messages of its causes, are inspected
     * @return true if t or any throwable in its cause chain has a message that
     *         starts with "Your document contained more than"
     */
    private boolean isWriteLimitReached(Throwable t) {
        for (Throwable cur = t; cur != null; cur = cur.getCause()) {
            String msg = cur.getMessage();
            if (msg != null && msg.startsWith("Your document contained more than")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Defensive copy: builds a new Metadata holding every name/value of the given one.
     *
     * @param m metadata to copy
     * @return a new Metadata object with the same names and values
     */
    private Metadata deepCopy(Metadata m) {
        Metadata clone = new Metadata();
        for (String n : m.names()) {
            if (!m.isMultiValued(n)) {
                clone.set(n, m.get(n));
            } else {
                for (String val : m.getValues(n)) {
                    clone.add(n, val);
                }
            }
        }
        return clone;
    }

    /**
     * Picks a name for an embedded resource: the resource name if present,
     * otherwise the embedded relationship id, otherwise a generated
     * "embedded-N" name (which increments {@link #unknownCount}).
     *
     * @param metadata metadata of the embedded resource
     * @return the chosen name after passing it through {@code FilenameUtils.getName}
     */
    private String getResourceName(Metadata metadata) {
        String objectName = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
        if (objectName == null) {
            objectName = metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID);
        }
        if (objectName == null) {
            objectName = "embedded-" + (++unknownCount);
        }
        //make sure that there isn't any path info in the objectName
        //some parsers can return paths, not just file names
        return FilenameUtils.getName(objectName);
    }

    /**
     * Stores the handler's string form under {@link #TIKA_CONTENT} unless the
     * handler is a plain DefaultHandler or the string is null/blank.
     *
     * @param handler handler that received the parse events
     * @param metadata metadata object to add the content to
     */
    private void addContent(ContentHandler handler, Metadata metadata) {
        //we can't rely on just testing for empty content because
        //DefaultHandler's toString() returns
        //e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
        if (handler.getClass().equals(DefaultHandler.class)) {
            return;
        }
        String content = handler.toString();
        if (content != null && content.trim().length() > 0) {
            metadata.add(TIKA_CONTENT, content);
        }
    }

    /**
     * @return true if a limit is configured (value &gt; -1) and at least that
     *         many Metadata objects have already been stored
     */
    private boolean isEmbeddedLimitReached() {
        return maxEmbeddedResources > -1
                && metadatas.size() >= maxEmbeddedResources;
    }

    /**
     * Stores the stack trace of the throwable under {@link #EMBEDDED_EXCEPTION}.
     *
     * @param metadata metadata of the embedded resource that failed
     * @param t the exception raised while parsing that resource
     */
    private void recordEmbeddedException(Metadata metadata, Throwable t) {
        metadata.set(EMBEDDED_EXCEPTION, ExceptionUtils.getStackTrace(t));
    }

    /**
     * @param startNanos a value previously obtained from {@code System.nanoTime()}
     * @return whole milliseconds elapsed since startNanos
     */
    private static long elapsedMillisSince(long startNanos) {
        //nanoTime is meant for measuring elapsed time; wall-clock changes do not affect it
        return (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
    }

    /**
     * Parser placed in the ParseContext so that embedded documents are routed
     * back through the wrapped parser while their path, content and metadata
     * are recorded in the enclosing wrapper.
     */
    private class EmbeddedParserDecorator extends ParserDecorator {

        private static final long serialVersionUID = 207648200464263337L;

        //path of the parent resource, always ending in "/"
        private final String location;

        private EmbeddedParserDecorator(String location) {
            super(wrappedParser);
            this.location = location.endsWith("/") ? location : location + "/";
        }

        /**
         * Parses one embedded resource and, unless the embedded resource limit
         * has been reached, stores a copy of its metadata in the wrapper's list.
         *
         * @param stream the embedded document
         * @param ignore not used; a fresh handler is obtained from the content handler factory
         * @param metadata metadata of the embedded document, updated in place
         * @param context parse context; its {@code Parser.class} entry is swapped for the
         *                duration of this parse and restored afterwards
         * @throws IOException if the underlying parse throws it
         * @throws SAXException if thrown by the underlying parse, is not a write-limit
         *                      signal, and embedded exceptions are not being caught
         * @throws TikaException if thrown by the underlying parse and embedded
         *                       exceptions are not being caught
         */
        @Override
        public void parse(InputStream stream, ContentHandler ignore,
                          Metadata metadata, ParseContext context) throws IOException,
                SAXException, TikaException {
            //Test to see if we should avoid parsing
            if (isEmbeddedLimitReached()) {
                hitMaxEmbeddedResources = true;
                return;
            }

            // Work out what this thing is
            String objectLocation = this.location + getResourceName(metadata);
            metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);

            //ignore the content handler that is passed in
            //and get a fresh handler
            ContentHandler localHandler = contentHandlerFactory.getNewContentHandler();

            //children of this resource must be rooted at this resource's path
            Parser preContextParser = context.get(Parser.class);
            context.set(Parser.class, new EmbeddedParserDecorator(objectLocation));
            long startNanos = System.nanoTime();
            try {
                super.parse(stream, localHandler, metadata, context);
            } catch (SAXException e) {
                if (isWriteLimitReached(e)) {
                    //set (not add) for consistency with the container-level parse()
                    metadata.set(WRITE_LIMIT_REACHED, "true");
                } else if (catchEmbeddedExceptions) {
                    recordEmbeddedException(metadata, e);
                } else {
                    throw e;
                }
            } catch (TikaException e) {
                if (!catchEmbeddedExceptions) {
                    throw e;
                }
                recordEmbeddedException(metadata, e);
            } finally {
                //restore the parser that was in the context before this resource
                context.set(Parser.class, preContextParser);
                metadata.set(PARSE_TIME_MILLIS, Long.toString(elapsedMillisSince(startNanos)));
            }

            //Because of recursion, we need
            //to re-test to make sure that we limit the
            //number of stored resources
            if (isEmbeddedLimitReached()) {
                hitMaxEmbeddedResources = true;
                return;
            }
            addContent(localHandler, metadata);
            metadatas.add(deepCopy(metadata));
        }
    }
}