// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.owasp.validator.html.scan.MagicSAXFilter Maven / Gradle / Ivy

/*
 * Copyright (c) 2007-2011, Arshan Dabirsiaghi, Jason Li
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 * 
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of OWASP nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.owasp.validator.html.scan;

import java.util.*;
import java.util.regex.Pattern;

import org.apache.xerces.util.AugmentationsImpl;
import org.apache.xerces.util.XMLAttributesImpl;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.DefaultFilter;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.css.ExternalCssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.InternalPolicy;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;

/**
 * Implementation of an HTML-filter that adheres to an AntiSamy policy. This
 * filter is SAX-based which means it is much more memory-efficient and also a
 * bit faster than the DOM-based implementation.
 */
public class MagicSAXFilter extends DefaultFilter implements XMLDocumentFilter {

    private static enum Ops {
        CSS, FILTER, REMOVE, TRUNCATE, KEEP
    }
	/**
	 * Stack of pending operations; {@code peekTop()} reads its top entry as an
	 * {@code Ops} value, so the element type must be declared here.
	 */
	private final Stack<Ops> operations = new Stack<Ops>();
	/** Error messages collected during a scan; cleared by {@code reset}. */
	private List<String> errorMessages = new ArrayList<String>();
	/** Buffer for the text content of a style element that is filtered later. */
	private StringBuffer cssContent = null;
	// NOTE(review): presumably the attributes of the style element currently
	// being buffered - confirm against startElement.
	private XMLAttributes cssAttributes = null;
	/** Cleared by {@code reset}. */
	private CssScanner cssScanner = null;
	/** Policy supplied to the most recent {@code reset} call. */
	private InternalPolicy policy;
	/** Resource bundle handed to the constructor. */
	private ResourceBundle messages;

	// Values copied from the policy in reset().
	private boolean isNofollowAnchors;
	private boolean isValidateParamAsEmbed;
	/** When set, character content is reported as a CDATA error before being passed on. */
	private boolean inCdata = false;
	/** When false, comments are dropped instead of being forwarded. */
	private boolean preserveComments;
	/** Size limit passed to the CSS scanner. */
	private int maxInputSize;
	/** Mirrors {@code policy.isEmbedStyleSheets()}. */
	private boolean externalCssScanner;

    /**
     * Creates a filter that keeps a reference to the given resource bundle.
     * {@link #reset(InternalPolicy)} must supply a policy before the filter
     * is used.
     *
     * @param messages resource bundle stored in the {@code messages} field
     */
    public MagicSAXFilter(ResourceBundle messages) {
        this.messages = messages;
    }

    /**
     * Binds this filter to a policy and discards all state left over from a
     * previous run.
     *
     * @param instance the policy whose settings are copied into this filter
     */
    public void reset(InternalPolicy instance){
        // Adopt the policy and cache the settings this filter reads.
        this.policy = instance;
        this.isNofollowAnchors = instance.isNofollowAnchors();
        this.isValidateParamAsEmbed = instance.isValidateParamAsEmbed();
        this.preserveComments = instance.isPreserveComments();
        this.maxInputSize = instance.getMaxInputSize();
        this.externalCssScanner = instance.isEmbedStyleSheets();

        // Forget everything accumulated so far.
        this.operations.clear();
        this.errorMessages.clear();
        this.cssContent = null;
        this.cssAttributes = null;
        this.cssScanner = null;
        this.inCdata = false;
    }

	/**
	 * Handles character content according to the operation on top of the
	 * operations stack: dropped for {@code REMOVE}, buffered for {@code CSS},
	 * and forwarded to the parent filter otherwise.
	 *
	 * @param text the character data
	 * @param augs augmentations forwarded unchanged to the parent filter
	 * @throws XNIException propagated from the parent filter
	 */
	public void characters(XMLString text, Augmentations augs) throws XNIException {
		final Ops current = peekTop();

		if (current == Ops.REMOVE) {
			// Content of a removed element is discarded entirely.
			return;
		}

		if (current == Ops.CSS) {
			// Collect the style element's text; it is filtered later.
			cssContent.append(text.ch, text.offset, text.length);
			return;
		}

		// Everything else passes through; CDATA content is reported first.
		if (inCdata) {
			final String encoded = HTMLEntityEncoder.htmlEntityEncode(text.toString());
			addError(ErrorMessageUtil.ERROR_CDATA_FOUND, new Object[]{encoded});
		}
		super.characters(text, augs);
	}

    /**
     * Matches IE conditional-comment directives such as {@code [if IE]>} and
     * {@code <![endif]} so they can be stripped from preserved comments.
     * The previous literal {@code "?"} is not a valid regular expression
     * (dangling quantifier) and made class initialisation fail.
     */
    private static final Pattern conditionalDirectives =
            Pattern.compile("<?!?\\[\\s*(?:end)?if[^]]*\\]>?");

    /**
     * Forwards a comment to the parent filter when comments are preserved,
     * after removing any text matched by {@code conditionalDirectives}.
     * Comments are dropped when {@code preserveComments} is false.
     *
     * @param text the comment text
     * @param augs augmentations forwarded unchanged to the parent filter
     * @throws XNIException propagated from the parent filter
     */
    public void comment(XMLString text, Augmentations augs) throws XNIException {
        if (!preserveComments) {
            return;
        }

        final String raw = text.toString();
        if (raw == null) {
            return;
        }

        // Conditional directives are always stripped from preserved comments.
        final String stripped = conditionalDirectives.matcher(raw).replaceAll("");
        final XMLString cleaned = new XMLString(stripped.toCharArray(), 0, stripped.length());
        super.comment(cleaned, augs);
    }

	/**
	 * Intentionally does nothing, so a doctype declaration in the input is
	 * never forwarded to the parent filter.
	 *
	 * @param root     unused
	 * @param publicId unused
	 * @param systemId unused
	 * @param augs     unused
	 */
	public void doctypeDecl(String root, String publicId, String systemId, Augmentations augs) throws XNIException {
		// Doctypes supplied by the user are dropped.
	}

	/**
	 * Treats an empty element as a start element immediately followed by the
	 * matching end element, so both handlers apply to it.
	 *
	 * @param element    the element name
	 * @param attributes the element's attributes
	 * @param augs       augmentations passed to both handlers
	 * @throws XNIException propagated from the start/end handlers
	 */
	public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs) throws XNIException {
		startElement(element, attributes, augs);
		endElement(element, augs);
	}

    /**
     * Returns the operation on top of the operations stack without removing it.
     *
     * @return the top operation, or {@code null} when the stack is empty
     */
    private Ops peekTop(){
        if (operations.empty()) {
            return null;
        }
        return operations.peek();
    }

	public void endElement(QName element, Augmentations augs) throws XNIException {
        Ops topOp = peekTop();
        if (Ops.REMOVE == topOp) {
			// content is removed altogether
			operations.pop();
		} else if (Ops.FILTER == topOp) {
			// content is removed, but child nodes not
			operations.pop();
		} else if (Ops.CSS == topOp) {
			operations.pop();
			// now scan the CSS.
			CssScanner cssScanner = makeCssScanner();
			try {
				CleanResults results = cssScanner.scanStyleSheet(cssContent.toString(), maxInputSize);
				// report all errors found
				errorMessages.addAll(results.getErrorMessages());
				/*
				 * If IE gets an empty style tag, i.e.