org.zaproxy.zap.spider.parser.SpiderSitemapXMLParser Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of zap

The Zed Attack Proxy (ZAP) is an easy to use integrated penetration testing tool for finding vulnerabilities in web applications. It is designed to be used by people with a wide range of security experience and as such is ideal for developers and functional testers who are new to penetration testing. ZAP provides automated scanners as well as a set of tools that allow you to find security vulnerabilities manually.

There is a newer version: 2.7.0

Show newest version

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 *
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.zaproxy.zap.spider.parser;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import net.htmlparser.jericho.Source;

import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpStatusCode;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.zaproxy.zap.spider.SpiderParam;
import org.zaproxy.zap.utils.XmlUtils;

/**
 * SitemapXMLParser is used for parsing URLs from a sitemap.xml file, which sometimes (very helpfully) resides in the web root.
 * @author 70pointer
 *
 */
public class SpiderSitemapXMLParser extends SpiderParser {
	
	/** a pattern to match the sitemap.xml file name*/
	private Pattern SITEMAP_XML_FILENAME_PATTERN = Pattern.compile("/sitemap\\.xml$");
	
	/** a pattern to match the sitemap.xml file.. hint: It's XML*/
	private static final Pattern xmlPattern = Pattern.compile ("^<\\?xml\\s+version\\s*=\\s*\"[0-9.]+\"\\s+encoding\\s*=\\s*\"[^\"]+\"\\s*\\?>");

	/** The Spider parameters. */
	private SpiderParam params;
	
	/** used to parse the XML based file format */ 
	private static DocumentBuilder dBuilder;
	
	/**
	 * an x path expression to match the "loc" tag in sitemap.xml  
	 */	
	private static  XPathExpression xpathLocationExpression;

	/** statically initialise the XML DocumentBuilderFactory and DocumentBuilder */
	static {		
		try {
			dBuilder = XmlUtils.newXxeDisabledDocumentBuilderFactory().newDocumentBuilder();
			XPath  xpath = XPathFactory.newInstance().newXPath();
			xpathLocationExpression = xpath.compile("/urlset/url/loc/text()");
		} catch (ParserConfigurationException | XPathExpressionException e) {
			log.error(e);
		}
	}

	/**
	 * Instantiates a new sitemap.xml parser.
	 * 
	 * @param params the params
	 * @throws IllegalArgumentException if {@code params} is null.
	 */
	public SpiderSitemapXMLParser(SpiderParam params) {
		super();
		if (params == null) {
			throw new IllegalArgumentException("Parameter params must not be null.");
		}
		this.params = params;
	}

	@Override
	public boolean parseResource(HttpMessage message, Source source, int depth) {
		
		if (log.isDebugEnabled()) log.debug("Parsing a sitemap.xml resource...");
		
		if (message == null || !params.isParseSitemapXml() || 
				!message.getResponseHeader().isXml() ||
				HttpStatusCode.isClientError(message.getResponseHeader().getStatusCode()) ||
				HttpStatusCode.isServerError(message.getResponseHeader().getStatusCode())) {
			return false;
		}		
		
		// Get the response content
		byte [] response = message.getResponseBody().getBytes();
		String baseURL = message.getRequestHeader().getURI().toString();
		Matcher xmlFormatMatcher = xmlPattern.matcher(new String (response));
		if (xmlFormatMatcher.find()) {
						
			if (log.isDebugEnabled()) log.debug("The format matches XML");
			
			try {
				Document xmldoc = dBuilder.parse(new InputSource(new ByteArrayInputStream(response)));
				NodeList locationNodes = (NodeList) xpathLocationExpression.evaluate(xmldoc, XPathConstants.NODESET);
			    for (int i = 0; i < locationNodes.getLength(); i++) {
			    	processURL(message, depth, locationNodes.item(i).getNodeValue(), baseURL); 
			    }
			} 
			catch (Exception e) {
				log.error("An error occurred trying to parse sitemap.xml", e);
				return false;
			}
			// We consider the message fully parsed, so it doesn't get parsed by 'fallback' parsers
			return true;
		} else {
			//the file name is right, but the content is not. Pass it to another parser. 
			if (log.isDebugEnabled()) log.debug("The content of the response from '"+ baseURL + "' does not match the expected content for a sitemap.xml file. Ignoring it.");
			return false;
		}

	}

	@Override
	public boolean canParseResource(HttpMessage message, String path, boolean wasAlreadyParsed) {
		if (log.isDebugEnabled()) log.debug("canParseResource called on '"+ path + "'");
		// matches the file name of files that should be parsed with the sitemap.xml file parser
		Matcher matcher = SITEMAP_XML_FILENAME_PATTERN.matcher(path);
		return matcher.find();
	}
}