All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.sax.SecureContentHandler Maven / Gradle / Ivy

Go to download

This is the core Apache Tika™ toolkit library from which all other modules inherit functionality. It also includes the core facades for the Tika API.

There is a newer version: 1.0.18
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.io.IOException;
import java.util.LinkedList;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Content handler decorator that attempts to prevent denial of service
 * attacks against Tika parsers.
 * 

* Currently this class simply compares the number of output characters * to to the number of input bytes and keeps track of the XML nesting levels. * An exception gets thrown if the output seems excessive compared to the * input document. This is a strong indication of a zip bomb. * * @since Apache Tika 0.4 * @see TIKA-216 */ public class SecureContentHandler extends ContentHandlerDecorator { /** * The input stream that Tika is parsing. */ private final TikaInputStream stream; /** * Number of output characters that Tika has produced so far. */ private long characterCount = 0; /** * The current XML element depth. */ private int currentDepth = 0; /** * Current number of nested <div class="package-entr"> elements. */ private LinkedList packageEntryDepths = new LinkedList(); /** * Output threshold. */ private long threshold = 1000000; /** * Maximum compression ratio. */ private long ratio = 100; /** * Maximum XML element nesting level. */ private int maxDepth = 100; /** * Maximum package entry nesting level. */ private int maxPackageEntryDepth = 10; /** * Decorates the given content handler with zip bomb prevention based * on the count of bytes read from the given counting input stream. * The resulting decorator can be passed to a Tika parser along with * the given counting input stream. * * @param handler the content handler to be decorated * @param stream the input stream to be parsed */ public SecureContentHandler( ContentHandler handler, TikaInputStream stream) { super(handler); this.stream = stream; } /** * Returns the configured output threshold. * * @return output threshold */ public long getOutputThreshold() { return threshold; } /** * Sets the threshold for output characters before the zip bomb prevention * is activated. This avoids false positives in cases where an otherwise * normal document for some reason starts with a highly compressible * sequence of bytes. 
* * @param threshold new output threshold */ public void setOutputThreshold(long threshold) { this.threshold = threshold; } /** * Returns the maximum compression ratio. * * @return maximum compression ratio */ public long getMaximumCompressionRatio() { return ratio; } /** * Sets the ratio between output characters and input bytes. If this * ratio is exceeded (after the output threshold has been reached) then * an exception gets thrown. * * @param ratio new maximum compression ratio */ public void setMaximumCompressionRatio(long ratio) { this.ratio = ratio; } /** * Returns the maximum XML element nesting level. * * @return maximum XML element nesting level */ public int getMaximumDepth() { return maxDepth; } /** * Sets the maximum package entry nesting level. If this depth level is * exceeded then an exception gets thrown. * * @param depth maximum package entry nesting level */ public void setMaximumPackageEntryDepth(int depth) { this.maxPackageEntryDepth = depth; } /** * Returns the maximum package entry nesting level. * * @return maximum package entry nesting level */ public int getMaximumPackageEntryDepth() { return maxPackageEntryDepth; } /** * Sets the maximum XML element nesting level. If this depth level is * exceeded then an exception gets thrown. * * @param depth maximum XML element nesting level */ public void setMaximumDepth(int depth) { this.maxDepth = depth; } /** * Converts the given {@link SAXException} to a corresponding * {@link TikaException} if it's caused by this instance detecting * a zip bomb. 
* * @param e SAX exception * @throws TikaException zip bomb exception */ public void throwIfCauseOf(SAXException e) throws TikaException { if (e instanceof SecureSAXException && ((SecureSAXException) e).isCausedBy(this)) { throw new TikaException("Zip bomb detected!", e); } } private long getByteCount() throws SAXException { try { if (stream.hasLength()) { return stream.getLength(); } else { return stream.getPosition(); } } catch (IOException e) { throw new SAXException("Unable to get stream length", e); } } /** * Records the given number of output characters (or more accurately * UTF-16 code units). Throws an exception if the recorded number of * characters highly exceeds the number of input bytes read. * * @param length number of new output characters produced * @throws SAXException if a zip bomb is detected */ private void advance(int length) throws SAXException { characterCount += length; long byteCount = getByteCount(); if (characterCount > threshold && characterCount > byteCount * ratio) { throw new SecureSAXException( "Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount + " output characters"); } } @Override public void startElement( String uri, String localName, String name, Attributes atts) throws SAXException { currentDepth++; if (currentDepth >= maxDepth) { throw new SecureSAXException( "Suspected zip bomb: " + currentDepth + " levels of XML element nesting"); } if ("div".equals(name) && "package-entry".equals(atts.getValue("class"))) { packageEntryDepths.addLast(currentDepth); if (packageEntryDepths.size() >= maxPackageEntryDepth) { throw new SecureSAXException( "Suspected zip bomb: " + packageEntryDepths.size() + " levels of package entry nesting"); } } super.startElement(uri, localName, name, atts); } @Override public void endElement( String uri, String localName, String name) throws SAXException { super.endElement(uri, localName, name); if (!packageEntryDepths.isEmpty() && packageEntryDepths.getLast() == currentDepth) { 
packageEntryDepths.removeLast(); } currentDepth--; } @Override public void characters(char[] ch, int start, int length) throws SAXException { advance(length); super.characters(ch, start, length); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { advance(length); super.ignorableWhitespace(ch, start, length); } /** * Private exception class used to indicate a suspected zip bomb. * * @see SecureContentHandler#throwIfCauseOf(SAXException) */ private class SecureSAXException extends SAXException { /** Serial version UID.*/ private static final long serialVersionUID = 2285245380321771445L; public SecureSAXException(String message) throws SAXException { super(message); } public boolean isCausedBy(SecureContentHandler handler) { return SecureContentHandler.this == handler; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy