/*
* Zed Attack Proxy (ZAP) and its related class files.
*
* ZAP is an HTTP/HTTPS proxy for assessing web application security.
*
* Copyright 2012 The ZAP Development Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.zaproxy.zap.spider;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.parosproxy.paros.model.Model;
import org.parosproxy.paros.network.ConnectionParam;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpRequestHeader;
import org.parosproxy.paros.network.HttpSender;
import org.zaproxy.zap.extension.spider.ExtensionSpider;
import org.zaproxy.zap.model.Context;
import org.zaproxy.zap.spider.filters.DefaultFetchFilter;
import org.zaproxy.zap.spider.filters.DefaultParseFilter;
import org.zaproxy.zap.spider.filters.FetchFilter;
import org.zaproxy.zap.spider.filters.FetchFilter.FetchStatus;
import org.zaproxy.zap.spider.filters.ParseFilter;
import org.zaproxy.zap.spider.parser.SpiderParser;
import org.zaproxy.zap.users.User;
/**
 * The Spider that performs the crawling: it manages the seeds, the worker threads, the filters and
 * the listeners.
 */
public class Spider {
/** The spider parameters. */
private SpiderParam spiderParam;
/** The connection parameters. */
private ConnectionParam connectionParam;
/** The model. */
private Model model;
/** The listeners for Spider related events. */
    private List<SpiderListener> listeners;
/** If the spider is currently paused. */
private volatile boolean paused;
    /** If the spider is currently stopped. */
private volatile boolean stopped;
/** The pause lock, used for locking access to the "paused" variable. */
private ReentrantLock pauseLock = new ReentrantLock();
/** The controller that manages the spidering process. */
private SpiderController controller;
/**
* The condition that is used for the threads in the pool to wait on, when the Spider crawling
* is paused. When the Spider is resumed, all the waiting threads are awakened.
*/
private Condition pausedCondition = pauseLock.newCondition();
/** The thread pool for spider workers. */
private ExecutorService threadPool;
/** The default fetch filter. */
private DefaultFetchFilter defaultFetchFilter;
/** The seed list. */
    private LinkedHashSet<URI> seedList;
/** The extension. */
private ExtensionSpider extension;
/** The Constant log. */
private static final Logger log = Logger.getLogger(Spider.class);
    /** The HTTP sender used to actually send the requests. */
private HttpSender httpSender;
/** The count of the tasks finished. */
private int tasksDoneCount;
/** The total count of all the submitted tasks. */
private int tasksTotalCount;
/** The scan context. If null, the scan is not performed in a context. */
private Context scanContext;
/** The scan user. */
private User scanUser;
    /** The time the scan was started. */
private long timeStarted;
/**
     * Marks whether the spidering process has completely started. It prevents the crawl from being
     * flagged as complete when the first task finishes before the remaining seeds are added.
*/
private boolean initialized;
/**
* we do not want to recurse into an SVN folder, or a subfolder of an SVN folder, if one was
* created from a previous Spider run
*/
private static final Pattern svnUrlPattern = Pattern.compile("\\.svn/"); // case sensitive
/**
* we do not want to recurse into a Git folder, or a subfolder of a Git folder, if one was
* created from a previous Spider run
*/
private static final Pattern gitUrlPattern = Pattern.compile("\\.git/"); // case sensitive
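    /** The ID of the spider, usually a unique integer. */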
private final String id;
/**
* Instantiates a new spider.
*
* @param extension the extension
* @param spiderParam the spider param
* @param connectionParam the connection param
* @param model the model
* @param scanContext if a scan context is set, only URIs within the context are fetched and
* processed
* @deprecated (2.6.0) Use {@link #Spider(String, ExtensionSpider, SpiderParam, ConnectionParam,
* Model, Context)} instead.
*/
@Deprecated
public Spider(
ExtensionSpider extension,
SpiderParam spiderParam,
ConnectionParam connectionParam,
Model model,
Context scanContext) {
this("?", extension, spiderParam, connectionParam, model, scanContext);
}
/**
* Constructs a {@code Spider} with the given data.
*
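     * <p>A minimal usage sketch (the extension, parameters, model and listener are assumed to be
     * available from the caller; the seed URI is illustrative and {@code URIException} handling is
     * omitted):
     *
     * <pre>{@code
     * Spider spider = new Spider("1", extension, spiderParam, connectionParam, model, null);
     * spider.addSpiderListener(listener);
     * spider.addSeed(new URI("http://example.com/", true));
     * spider.start();
     * }</pre>
     *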
* @param id the ID of the spider, usually a unique integer
* @param extension the extension
* @param spiderParam the spider param
* @param connectionParam the connection param
* @param model the model
* @param scanContext if a scan context is set, only URIs within the context are fetched and
* processed
* @since 2.6.0
*/
public Spider(
String id,
ExtensionSpider extension,
SpiderParam spiderParam,
ConnectionParam connectionParam,
Model model,
Context scanContext) {
super();
log.info("Spider initializing...");
this.id = id;
this.spiderParam = spiderParam;
this.connectionParam = connectionParam;
this.model = model;
this.extension = extension;
this.controller = new SpiderController(this, extension.getCustomParsers());
this.listeners = new LinkedList<>();
this.seedList = new LinkedHashSet<>();
this.scanContext = scanContext;
init();
}
/** Initialize the spider. */
private void init() {
this.paused = false;
this.stopped = true;
this.tasksDoneCount = 0;
this.tasksTotalCount = 0;
this.initialized = false;
// Add a default fetch filter and any custom ones
defaultFetchFilter = new DefaultFetchFilter();
this.addFetchFilter(defaultFetchFilter);
for (FetchFilter filter : extension.getCustomFetchFilters()) {
this.addFetchFilter(filter);
}
// Add a default parse filter and any custom ones
this.addParseFilter(new DefaultParseFilter(spiderParam, extension.getMessages()));
        for (ParseFilter filter : extension.getCustomParseFilters()) {
            this.addParseFilter(filter);
        }
// Add the scan context, if any
defaultFetchFilter.setScanContext(this.scanContext);
defaultFetchFilter.setDomainsAlwaysInScope(spiderParam.getDomainsAlwaysInScopeEnabled());
}
/* SPIDER Related */
/**
* Adds a new seed for the Spider.
*
     * @param msg the message used as seed; the request URI is taken from the request header
*/
public void addSeed(HttpMessage msg) {
URI uri = msg.getRequestHeader().getURI();
addSeed(uri);
}
/**
* Adds a new seed for the Spider.
*
     * @param uri the new seed URI; its host is added to the spider's scope
*/
public void addSeed(URI uri) {
// Update the scope of the spidering process
String host = null;
try {
host = uri.getHost();
defaultFetchFilter.addScopeRegex(host);
} catch (URIException e) {
log.error("There was an error while adding seed value: " + uri, e);
return;
}
// Add the seed to the list -- it will be added to the task list only when the spider is
// started
this.seedList.add(uri);
// Add the appropriate 'robots.txt' as a seed
if (getSpiderParam().isParseRobotsTxt()) {
addRootFileSeed(uri, "robots.txt");
}
// Add the appropriate 'sitemap.xml' as a seed
if (getSpiderParam().isParseSitemapXml()) {
addRootFileSeed(uri, "sitemap.xml");
}
// And add '.svn/entries' as a seed, for SVN based spidering
if (getSpiderParam().isParseSVNEntries()) {
addFileSeed(uri, ".svn/entries", svnUrlPattern);
addFileSeed(uri, ".svn/wc.db", svnUrlPattern);
}
// And add '.git/index' as a seed, for Git based spidering
if (getSpiderParam().isParseGit()) {
addFileSeed(uri, ".git/index", gitUrlPattern);
}
}
/**
* Adds a file seed, with the given file name, at the root of the base URI.
*
     * <p>For example, with a base URI of {@code http://example.com/some/path/file.html} and file
     * name {@code sitemap.xml}, the seed {@code http://example.com/sitemap.xml} is added.
*
* @param baseUri the base URI.
* @param fileName the file name.
*/
private void addRootFileSeed(URI baseUri, String fileName) {
String seed =
buildUri(
baseUri.getScheme(),
baseUri.getRawHost(),
baseUri.getPort(),
"/" + fileName);
try {
this.seedList.add(new URI(seed, true));
} catch (Exception e) {
log.warn("Error while creating [" + fileName + "] seed: " + seed, e);
}
}
/**
* Creates a URI (string) with the given scheme, host, port and path. The port is only added if
* not the default for the given scheme.
*
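     * <p>For example, {@code buildUri("https", "example.com".toCharArray(), 443, "/robots.txt")}
     * returns {@code https://example.com/robots.txt}, while a non-default port such as
     * {@code 8443} would be kept in the result ({@code https://example.com:8443/robots.txt}).
     *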
* @param scheme the scheme, {@code http} or {@code https}.
* @param host the name of the host.
* @param port the port.
* @param path the path, should start with {@code /}.
* @return the URI with the provided components.
*/
private static String buildUri(String scheme, char[] host, int port, String path) {
StringBuilder strBuilder = new StringBuilder(150);
strBuilder.append(scheme).append("://").append(host);
if (!isDefaultPort(scheme, port)) {
strBuilder.append(':').append(port);
}
strBuilder.append(path);
return strBuilder.toString();
}
/**
* Adds a file seed using the given base URI, file name and condition.
*
     * <p>The file is added as part of the path, replacing the existing file name. For example,
     * with a base URI of {@code http://example.com/some/path/file.html} and file name
     * {@code .git/index}, the seed {@code http://example.com/some/path/.git/index} is added.
     *
     * <p>If the given condition matches the base URI's path without the file name, the file seed is
     * not added (this prevents adding the seed once again).
*
* @param baseUri the base URI to construct the file seed.
* @param fileName the name of the file seed.
* @param condition the condition to add the file seed.
*/
private void addFileSeed(URI baseUri, String fileName, Pattern condition) {
String fullpath = baseUri.getEscapedPath();
if (fullpath == null) {
fullpath = "";
}
String name = baseUri.getEscapedName();
if (name == null) {
name = "";
}
String pathminusfilename = fullpath.substring(0, fullpath.lastIndexOf(name));
if (pathminusfilename.isEmpty()) {
pathminusfilename = "/";
}
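        // Do not add the seed if the path is already inside a matching folder (e.g. '.svn/' or
        // '.git/'), to avoid recursing into version control metadata folders.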
if (condition.matcher(pathminusfilename).find()) {
return;
}
String uri =
buildUri(
baseUri.getScheme(),
baseUri.getRawHost(),
baseUri.getPort(),
pathminusfilename + fileName);
try {
this.seedList.add(new URI(uri, true));
} catch (Exception e) {
log.warn(
"Error while creating a seed URI for file ["
+ fileName
+ "] from ["
+ baseUri
+ "] using ["
+ uri
+ "]:",
e);
}
}
/**
* Tells whether or not the given port is the default for the given scheme.
*
     * <p>Only intended to be used with HTTP/S schemes.
*
* @param scheme the scheme.
* @param port the port.
* @return {@code true} if the given port is the default for the given scheme, {@code false}
* otherwise.
*/
private static boolean isDefaultPort(String scheme, int port) {
if (port == -1) {
return true;
}
if ("http".equalsIgnoreCase(scheme)) {
return port == 80;
}
if ("https".equalsIgnoreCase(scheme)) {
return port == 443;
}
return false;
}
/**
     * Sets the exclude list, a list of regular expressions defining the URIs that should be
     * excluded.
*
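     * <p>For example, passing {@code Arrays.asList(".*logout.*", ".*\\.pdf")} would prevent the
     * spider from fetching logout pages and PDF resources; the entries are handed to the default
     * fetch filter as regular expressions.
     *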
* @param excludeList the new exclude list
*/
    public void setExcludeList(List<String> excludeList) {
log.debug("New Exclude list: " + excludeList);
defaultFetchFilter.setExcludeRegexes(excludeList);
}
/**
* Adds a new fetch filter to the spider.
*
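     * <p>A sketch of a custom filter that skips PDF resources ({@code USER_RULES} is assumed to be
     * an appropriate non-{@code VALID} status; any status other than {@code VALID} prevents the URI
     * from being fetched):
     *
     * <pre>{@code
     * spider.addFetchFilter(new FetchFilter() {
     *     public FetchStatus checkFilter(URI uri) {
     *         return uri.toString().endsWith(".pdf") ? FetchStatus.USER_RULES : FetchStatus.VALID;
     *     }
     * });
     * }</pre>
     *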
* @param filter the filter
*/
public void addFetchFilter(FetchFilter filter) {
controller.addFetchFilter(filter);
}
/**
* Adds a new parse filter to the spider.
*
* @param filter the filter
*/
public void addParseFilter(ParseFilter filter) {
controller.addParseFilter(filter);
}
/**
* Gets the http sender. Can be called from the SpiderTask.
*
* @return the http sender
*/
protected HttpSender getHttpSender() {
return httpSender;
}
/**
* Gets the spider parameters. Can be called from the SpiderTask.
*
* @return the spider parameters
*/
protected SpiderParam getSpiderParam() {
return spiderParam;
}
/**
* Gets the controller.
*
* @return the controller
*/
protected SpiderController getController() {
return controller;
}
/**
* Gets the model.
*
* @return the model
*/
protected Model getModel() {
return this.model;
}
/**
* Submit a new task to the spidering task pool.
*
* @param task the task
*/
protected synchronized void submitTask(SpiderTask task) {
if (isStopped()) {
log.debug("Submitting task skipped (" + task + ") as the Spider process is stopped.");
return;
}
if (isTerminated()) {
log.debug(
"Submitting task skipped (" + task + ") as the Spider process is terminated.");
return;
}
this.tasksTotalCount++;
try {
this.threadPool.execute(task);
} catch (RejectedExecutionException e) {
if (log.isDebugEnabled()) {
log.debug(
"Submitted task was rejected ("
+ task
+ "), spider state: [stopped="
+ isStopped()
+ ", terminated="
+ isTerminated()
+ "].");
}
}
}
/**
* Gets the extension.
*
* @return the extension
*/
protected ExtensionSpider getExtensionSpider() {
return this.extension;
}
/* SPIDER PROCESS maintenance - pause, resume, shutdown, etc. */
/** Starts the Spider crawling. */
public void start() {
log.info("Starting spider...");
this.timeStarted = System.currentTimeMillis();
fetchFilterSeeds();
// Check if seeds are available, otherwise the Spider will start, but will not have any
// seeds and will not stop.
if (seedList == null || seedList.isEmpty()) {
log.warn("No seeds available for the Spider. Cancelling scan...");
notifyListenersSpiderComplete(false);
notifyListenersSpiderProgress(100, 0, 0);
return;
}
        if (scanUser != null) {
            log.info(
                    "Scan will be performed from the point of view of User: "
                            + scanUser.getName());
        }
this.controller.init();
this.stopped = false;
this.paused = false;
this.initialized = false;
// Initialize the thread pool
this.threadPool =
Executors.newFixedThreadPool(
spiderParam.getThreadCount(),
new SpiderThreadFactory("ZAP-SpiderThreadPool-" + id + "-thread-"));
// Initialize the HTTP sender
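        // The boolean argument is assumed to control whether the global HTTP state is reused: it
        // is reused when the HTTP state is enabled, or when the spider does not accept cookies
        // itself.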
httpSender =
new HttpSender(
connectionParam,
connectionParam.isHttpStateEnabled()
? true
: !spiderParam.isAcceptCookies(),
HttpSender.SPIDER_INITIATOR);
        // Do not follow redirections automatically, because the request would not be updated; the
        // redirections are handled manually.
httpSender.setFollowRedirect(false);
// Add the seeds
for (URI uri : seedList) {
if (log.isDebugEnabled()) {
log.debug("Adding seed for spider: " + uri);
}
controller.addSeed(uri, HttpRequestHeader.GET);
}
// Mark the process as completely initialized
initialized = true;
}
/**
* Filters the seed list using the current fetch filters, preventing any non-valid seed from
* being accessed.
*
* @see #seedList
* @see FetchFilter
* @see SpiderController#getFetchFilters()
* @since 2.5.0
*/
private void fetchFilterSeeds() {
if (seedList == null || seedList.isEmpty()) {
return;
}
        for (Iterator<URI> it = seedList.iterator(); it.hasNext(); ) {
URI seed = it.next();
for (FetchFilter filter : controller.getFetchFilters()) {
FetchStatus filterReason = filter.checkFilter(seed);
if (filterReason != FetchStatus.VALID) {
if (log.isDebugEnabled()) {
log.debug("Seed: " + seed + " was filtered with reason: " + filterReason);
}
it.remove();
break;
}
}
}
}
/** Stops the Spider crawling. Must not be called from any of the threads in the thread pool. */
public void stop() {
if (stopped) {
return;
}
this.stopped = true;
log.info("Stopping spidering process by request.");
if (this.paused) {
// Have to resume first or we get a deadlock
this.resume();
}
// Issue the shutdown command
this.threadPool.shutdown();
try {
if (!this.threadPool.awaitTermination(2, TimeUnit.SECONDS)) {
                log.warn(
                        "Spider threads did not stop in the given time (2s), forcing shutdown...");
for (Runnable task : this.threadPool.shutdownNow()) {
((SpiderTask) task).cleanup();
}
}
} catch (InterruptedException ignore) {
log.warn("Interrupted while awaiting for all spider threads to stop...");
}
if (httpSender != null) {
this.getHttpSender().shutdown();
httpSender = null;
}
// Notify the controller to clean up memory
controller.reset();
this.threadPool = null;
        // Notify the listeners
notifyListenersSpiderComplete(false);
}
/** The Spidering process is complete. */
private void complete() {
if (stopped) {
return;
}
log.info("Spidering process is complete. Shutting down...");
this.stopped = true;
if (httpSender != null) {
this.getHttpSender().shutdown();
httpSender = null;
}
// Notify the controller to clean up memory
controller.reset();
// Issue the shutdown command on a separate thread, as the current thread is most likely one
// from the pool
new Thread(
new Runnable() {
@Override
public void run() {
if (threadPool != null) {
threadPool.shutdown();
}
                        // Notify the listeners
notifyListenersSpiderComplete(true);
controller.reset();
threadPool = null;
}
},
"ZAP-SpiderShutdownThread-" + id)
.start();
}
/** Pauses the Spider crawling. */
public void pause() {
pauseLock.lock();
try {
paused = true;
} finally {
pauseLock.unlock();
}
}
/** Resumes the Spider crawling. */
public void resume() {
pauseLock.lock();
try {
paused = false;
// Wake up all threads that are currently paused
pausedCondition.signalAll();
} finally {
pauseLock.unlock();
}
}
/**
* Sets the spider so it will scan from the point of view of a user.
*
* @param user the user to be scanned as
*/
public void setScanAsUser(User user) {
this.scanUser = user;
}
/**
* Gets the user that will be used in the scanning.
*
* @return the scan user
*/
protected User getScanUser() {
return this.scanUser;
}
/**
     * This method is run by each thread in the thread pool before the task execution. In
     * particular, it checks whether the spidering process is paused and, if it is, waits on the
     * corresponding condition until the process is resumed. Called from the SpiderTask.
*/
protected void preTaskExecution() {
checkPauseAndWait();
}
/**
* This method is run by Threads in the ThreadPool and checks if the scan is paused and, if it
* is, waits until it's unpaused.
*/
protected void checkPauseAndWait() {
pauseLock.lock();
try {
while (paused && !stopped) {
pausedCondition.await();
}
} catch (InterruptedException e) {
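            // Interruption just ends the wait; the lock is released in the finally block.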
} finally {
pauseLock.unlock();
}
}
/**
     * This method is run by each thread in the thread pool after the task execution. In
     * particular, it notifies the listeners of the progress and checks whether the scan is
     * complete. Called from the SpiderTask.
*/
protected synchronized void postTaskExecution() {
if (stopped) {
// Stopped, so don't count the task(s) as done.
// (worker threads call this method even if the task was not really executed.)
return;
}
tasksDoneCount++;
int percentageComplete = tasksDoneCount * 100 / tasksTotalCount;
// Compute the progress and notify the listeners
this.notifyListenersSpiderProgress(
percentageComplete, tasksDoneCount, tasksTotalCount - tasksDoneCount);
// Check for ending conditions
if (tasksDoneCount == tasksTotalCount && initialized) {
this.complete();
}
}
/**
     * Checks if the spider is paused.
     *
     * @return {@code true} if the spider is paused, {@code false} otherwise
*/
public boolean isPaused() {
return this.paused;
}
/**
     * Checks if the spider is stopped, i.e. a shutdown was issued or it is not running. As a side
     * effect, completes the spidering process if the maximum scan duration has been exceeded.
     *
     * @return {@code true} if the spider is stopped, {@code false} otherwise
*/
public boolean isStopped() {
if (!stopped && this.spiderParam.getMaxDuration() > 0) {
// Check to see if the scan has exceeded the specified maxDuration
if (TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - this.timeStarted)
> this.spiderParam.getMaxDuration()) {
log.info(
"Spidering process has exceeded maxDuration of "
+ this.spiderParam.getMaxDuration()
+ " minute(s)");
this.complete();
}
}
return stopped;
}
/**
     * Checks if the spider is terminated, i.e. the thread pool has been shut down and all its
     * tasks have completed.
     *
     * @return {@code true} if the spider is terminated, {@code false} otherwise
*/
public boolean isTerminated() {
return threadPool.isTerminated();
}
/* LISTENERS SECTION */
/**
* Adds a new spider listener.
*
* @param listener the listener
*/
public void addSpiderListener(SpiderListener listener) {
this.listeners.add(listener);
}
/**
* Removes a spider listener.
*
* @param listener the listener
*/
public void removeSpiderListener(SpiderListener listener) {
this.listeners.remove(listener);
}
/**
* Notifies all the listeners regarding the spider progress.
*
* @param percentageComplete the percentage complete
* @param numberCrawled the number of pages crawled
* @param numberToCrawl the number of pages left to crawl
*/
protected synchronized void notifyListenersSpiderProgress(
int percentageComplete, int numberCrawled, int numberToCrawl) {
for (SpiderListener l : listeners) {
l.spiderProgress(percentageComplete, numberCrawled, numberToCrawl);
}
}
/**
* Notifies the listeners regarding a found uri.
*
* @param uri the uri
* @param method the method used for fetching the resource
* @param status the {@link FetchStatus} stating if this uri will be processed, and, if not,
* stating the reason of the filtering
*/
protected synchronized void notifyListenersFoundURI(
String uri, String method, FetchStatus status) {
for (SpiderListener l : listeners) {
l.foundURI(uri, method, status);
}
}
/**
* Notifies the listeners of a {@link SpiderTask}'s result.
*
* @param result the result of a spider task.
*/
protected synchronized void notifyListenersSpiderTaskResult(SpiderTaskResult result) {
for (SpiderListener l : listeners) {
l.notifySpiderTaskResult(result);
}
}
/**
* Notifies the listeners that the spider is complete.
*
* @param successful {@code true} if the spider completed successfully (e.g. was not stopped),
* {@code false} otherwise
*/
protected synchronized void notifyListenersSpiderComplete(boolean successful) {
for (SpiderListener l : listeners) {
l.spiderComplete(successful);
}
}
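    /**
     * Adds a custom spider parser, used by the controller to extract new resources from fetched
     * content.
     *
     * @param sp the parser to add
     */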
public void addCustomParser(SpiderParser sp) {
this.controller.addSpiderParser(sp);
}
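    /** Thread factory that creates non-daemon, normal-priority threads with a common name prefix. */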
private static class SpiderThreadFactory implements ThreadFactory {
private final AtomicInteger threadNumber;
private final String namePrefix;
private final ThreadGroup group;
public SpiderThreadFactory(String namePrefix) {
threadNumber = new AtomicInteger(1);
this.namePrefix = namePrefix;
SecurityManager s = System.getSecurityManager();
group = (s != null) ? s.getThreadGroup() : Thread.currentThread().getThreadGroup();
}
@Override
public Thread newThread(Runnable r) {
Thread t = new Thread(group, r, namePrefix + threadNumber.getAndIncrement(), 0);
if (t.isDaemon()) {
t.setDaemon(false);
}
if (t.getPriority() != Thread.NORM_PRIORITY) {
t.setPriority(Thread.NORM_PRIORITY);
}
return t;
}
}
}