All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.zaproxy.zap.spider.Spider Maven / Gradle / Ivy

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 *
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 *
 * Copyright 2012 The ZAP Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.zaproxy.zap.spider;

import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.parosproxy.paros.model.Model;
import org.parosproxy.paros.network.ConnectionParam;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpRequestHeader;
import org.parosproxy.paros.network.HttpSender;
import org.zaproxy.zap.extension.spider.ExtensionSpider;
import org.zaproxy.zap.model.Context;
import org.zaproxy.zap.spider.filters.DefaultFetchFilter;
import org.zaproxy.zap.spider.filters.DefaultParseFilter;
import org.zaproxy.zap.spider.filters.FetchFilter;
import org.zaproxy.zap.spider.filters.FetchFilter.FetchStatus;
import org.zaproxy.zap.spider.filters.ParseFilter;
import org.zaproxy.zap.spider.parser.SpiderParser;
import org.zaproxy.zap.users.User;

/**
 * The Spider — crawls a web application starting from a set of seed URIs, driving a pool of
 * worker threads through a {@link SpiderController}.
 */
public class Spider {

    /** The spider parameters. */
    private SpiderParam spiderParam;

    /** The connection parameters. */
    private ConnectionParam connectionParam;

    /** The model. */
    private Model model;

    /** The listeners for Spider related events. */
    private List<SpiderListener> listeners;

    /** If the spider is currently paused. */
    private volatile boolean paused;

    /** If the spider is currently stopped. */
    private volatile boolean stopped;

    /** The pause lock, used for locking access to the "paused" variable. */
    private ReentrantLock pauseLock = new ReentrantLock();

    /** The controller that manages the spidering process. */
    private SpiderController controller;

    /**
     * The condition that is used for the threads in the pool to wait on, when the Spider crawling
     * is paused. When the Spider is resumed, all the waiting threads are awakened.
     */
    private Condition pausedCondition = pauseLock.newCondition();

    /** The thread pool for spider workers. */
    private ExecutorService threadPool;

    /** The default fetch filter. */
    private DefaultFetchFilter defaultFetchFilter;

    /** The seed list; seeds are turned into tasks only when the spider starts. */
    private LinkedHashSet<URI> seedList;

    /** The extension. */
    private ExtensionSpider extension;

    /** The Constant log. */
    private static final Logger log = Logger.getLogger(Spider.class);

    /** The HTTP sender used to effectively send the data. */
    private HttpSender httpSender;

    /** The count of the tasks finished. */
    private int tasksDoneCount;

    /** The total count of all the submitted tasks. */
    private int tasksTotalCount;

    /** The scan context. If null, the scan is not performed in a context. */
    private Context scanContext;

    /** The scan user. */
    private User scanUser;

    /** The time the scan was started */
    private long timeStarted;

    /**
     * The initialized marks if the spidering process is completely started. It solves the problem
     * when the first task is processed and the process is finished before the other seeds are
     * added.
     */
    private boolean initialized;

    /**
     * we do not want to recurse into an SVN folder, or a subfolder of an SVN folder, if one was
     * created from a previous Spider run
     */
    private static final Pattern svnUrlPattern = Pattern.compile("\\.svn/"); // case sensitive

    /**
     * we do not want to recurse into a Git folder, or a subfolder of a Git folder, if one was
     * created from a previous Spider run
     */
    private static final Pattern gitUrlPattern = Pattern.compile("\\.git/"); // case sensitive

    private final String id;

    /**
     * Creates a spider with a placeholder ID ({@code "?"}).
     *
     * @param extension the extension
     * @param spiderParam the spider param
     * @param connectionParam the connection param
     * @param model the model
     * @param scanContext if a scan context is set, only URIs within the context are fetched and
     *     processed
     * @deprecated (2.6.0) Use {@link #Spider(String, ExtensionSpider, SpiderParam, ConnectionParam,
     *     Model, Context)} instead.
     */
    @Deprecated
    public Spider(
            ExtensionSpider extension,
            SpiderParam spiderParam,
            ConnectionParam connectionParam,
            Model model,
            Context scanContext) {
        // Delegate to the ID-aware constructor with an unknown ID.
        this("?", extension, spiderParam, connectionParam, model, scanContext);
    }

    /**
     * Constructs a {@code Spider} with the given data.
     *
     * @param id the ID of the spider, usually a unique integer
     * @param extension the extension
     * @param spiderParam the spider param
     * @param connectionParam the connection param
     * @param model the model
     * @param scanContext if a scan context is set, only URIs within the context are fetched and
     *     processed
     * @since 2.6.0
     */
    public Spider(
            String id,
            ExtensionSpider extension,
            SpiderParam spiderParam,
            ConnectionParam connectionParam,
            Model model,
            Context scanContext) {
        log.info("Spider initializing...");

        // Keep the configuration/collaborator references.
        this.id = id;
        this.extension = extension;
        this.spiderParam = spiderParam;
        this.connectionParam = connectionParam;
        this.model = model;
        this.scanContext = scanContext;

        // Set up the crawling state: controller, listeners and pending seeds.
        this.controller = new SpiderController(this, extension.getCustomParsers());
        this.listeners = new LinkedList<>();
        this.seedList = new LinkedHashSet<>();

        init();
    }

    /**
     * Initialize the spider: reset the run state/counters and install the default and custom
     * fetch/parse filters. Called once from the constructor.
     */
    private void init() {
        this.paused = false;
        // The spider starts in the stopped state until start() is invoked.
        this.stopped = true;
        this.tasksDoneCount = 0;
        this.tasksTotalCount = 0;
        this.initialized = false;

        // Add a default fetch filter and any custom ones
        defaultFetchFilter = new DefaultFetchFilter();
        this.addFetchFilter(defaultFetchFilter);

        for (FetchFilter filter : extension.getCustomFetchFilters()) {
            this.addFetchFilter(filter);
        }

        // Add a default parse filter and any custom ones
        this.addParseFilter(new DefaultParseFilter(spiderParam, extension.getMessages()));
        for (ParseFilter filter : extension.getCustomParseFilters()) {
            this.addParseFilter(filter);
        }

        // Add the scan context, if any
        defaultFetchFilter.setScanContext(this.scanContext);
        defaultFetchFilter.setDomainsAlwaysInScope(spiderParam.getDomainsAlwaysInScopeEnabled());
    }

    /* SPIDER Related */
    /**
     * Adds a new seed for the Spider.
     *
     * @param msg the message used for seed. The request URI is used from the Request Header
     */
    public void addSeed(HttpMessage msg) {
        // Delegate to the URI-based variant using the request URI of the message.
        addSeed(msg.getRequestHeader().getURI());
    }

    /**
     * Adds a new seed for the Spider.
     *
     * @param uri the uri
     */
    public void addSeed(URI uri) {
        // Widen the spidering scope to cover the seed's host before queuing it.
        try {
            defaultFetchFilter.addScopeRegex(uri.getHost());
        } catch (URIException e) {
            log.error("There was an error while adding seed value: " + uri, e);
            return;
        }

        // Queued seeds become tasks only once the spider is started.
        seedList.add(uri);

        SpiderParam params = getSpiderParam();
        // Add the appropriate 'robots.txt' as a seed
        if (params.isParseRobotsTxt()) {
            addRootFileSeed(uri, "robots.txt");
        }
        // Add the appropriate 'sitemap.xml' as a seed
        if (params.isParseSitemapXml()) {
            addRootFileSeed(uri, "sitemap.xml");
        }
        // And add '.svn/entries' as a seed, for SVN based spidering
        if (params.isParseSVNEntries()) {
            addFileSeed(uri, ".svn/entries", svnUrlPattern);
            addFileSeed(uri, ".svn/wc.db", svnUrlPattern);
        }
        // And add '.git/index' as a seed, for Git based spidering
        if (params.isParseGit()) {
            addFileSeed(uri, ".git/index", gitUrlPattern);
        }
    }

    /**
     * Adds a file seed, with the given file name, at the root of the base URI.
     *
     * <p>For example, with base URI as {@code http://example.com/some/path/file.html} and file name
     * as {@code sitemap.xml} it's added the seed {@code http://example.com/sitemap.xml}.
     *
     * @param baseUri the base URI.
     * @param fileName the file name.
     */
    private void addRootFileSeed(URI baseUri, String fileName) {
        String seed =
                buildUri(
                        baseUri.getScheme(),
                        baseUri.getRawHost(),
                        baseUri.getPort(),
                        "/" + fileName);
        try {
            this.seedList.add(new URI(seed, true));
        } catch (Exception e) {
            log.warn("Error while creating [" + fileName + "] seed: " + seed, e);
        }
    }

    /**
     * Creates a URI (string) with the given scheme, host, port and path. The port is only added if
     * not the default for the given scheme.
     *
     * @param scheme the scheme, {@code http} or {@code https}.
     * @param host the name of the host.
     * @param port the port.
     * @param path the path, should start with {@code /}.
     * @return the URI with the provided components.
     */
    private static String buildUri(String scheme, char[] host, int port, String path) {
        StringBuilder strBuilder = new StringBuilder(150);
        strBuilder.append(scheme).append("://").append(host);
        if (!isDefaultPort(scheme, port)) {
            strBuilder.append(':').append(port);
        }
        strBuilder.append(path);
        return strBuilder.toString();
    }

    /**
     * Adds a file seed using the given base URI, file name and condition.
     *
     * <p>The file is added as part of the path, without existing file name. For example, with base
     * URI as {@code http://example.com/some/path/file.html} and file name as {@code .git/index}
     * it's added the seed {@code http://example.com/some/path/.git/index}.
     *
     * <p>If the given condition matches the base URI's path without the file name, the file seed is
     * not added (this prevents adding the seed once again).
     *
     * @param baseUri the base URI to construct the file seed.
     * @param fileName the name of the file seed.
     * @param condition the condition to add the file seed.
     */
    private void addFileSeed(URI baseUri, String fileName, Pattern condition) {
        String fullpath = baseUri.getEscapedPath();
        if (fullpath == null) {
            fullpath = "";
        }

        String name = baseUri.getEscapedName();
        if (name == null) {
            name = "";
        }

        // Strip the file name from the path; an empty result means the root path.
        String pathminusfilename = fullpath.substring(0, fullpath.lastIndexOf(name));
        if (pathminusfilename.isEmpty()) {
            pathminusfilename = "/";
        }

        // Already inside a matching folder (e.g. .git/ or .svn/) — do not recurse further.
        if (condition.matcher(pathminusfilename).find()) {
            return;
        }

        String uri =
                buildUri(
                        baseUri.getScheme(),
                        baseUri.getRawHost(),
                        baseUri.getPort(),
                        pathminusfilename + fileName);
        try {
            this.seedList.add(new URI(uri, true));
        } catch (Exception e) {
            log.warn(
                    "Error while creating a seed URI for file ["
                            + fileName
                            + "] from ["
                            + baseUri
                            + "] using ["
                            + uri
                            + "]:",
                    e);
        }
    }

    /**
     * Tells whether or not the given port is the default for the given scheme.
     *
     * <p>Only intended to be used with HTTP/S schemes.
     *
     * @param scheme the scheme.
     * @param port the port.
     * @return {@code true} if the given port is the default for the given scheme, {@code false}
     *     otherwise.
     */
    private static boolean isDefaultPort(String scheme, int port) {
        if (port == -1) {
            return true;
        }
        if ("http".equalsIgnoreCase(scheme)) {
            return port == 80;
        }
        if ("https".equalsIgnoreCase(scheme)) {
            return port == 443;
        }
        return false;
    }

    /**
     * Sets the exclude list which contains a List of strings, defining the uris that should be
     * excluded.
     *
     * @param excludeList the new exclude list
     */
    public void setExcludeList(List<String> excludeList) {
        log.debug("New Exclude list: " + excludeList);
        defaultFetchFilter.setExcludeRegexes(excludeList);
    }

    /**
     * Adds a new fetch filter to the spider.
     *
     * @param filter the filter
     */
    public void addFetchFilter(FetchFilter filter) {
        controller.addFetchFilter(filter);
    }

    /**
     * Adds a new parse filter to the spider.
     *
     * @param filter the filter
     */
    public void addParseFilter(ParseFilter filter) {
        controller.addParseFilter(filter);
    }

    /**
     * Gets the http sender. Can be called from the SpiderTask.
     *
     * @return the http sender
     */
    protected HttpSender getHttpSender() {
        return httpSender;
    }

    /**
     * Gets the spider parameters. Can be called from the SpiderTask.
     *
     * @return the spider parameters
     */
    protected SpiderParam getSpiderParam() {
        return spiderParam;
    }

    /**
     * Gets the controller.
     *
     * @return the controller
     */
    protected SpiderController getController() {
        return controller;
    }

    /**
     * Gets the model.
     *
     * @return the model
     */
    protected Model getModel() {
        return this.model;
    }

    /**
     * Submit a new task to the spidering task pool.
     *
     * @param task the task
     */
    protected synchronized void submitTask(SpiderTask task) {
        if (isStopped()) {
            log.debug("Submitting task skipped (" + task + ") as the Spider process is stopped.");
            return;
        }
        if (isTerminated()) {
            log.debug(
                    "Submitting task skipped (" + task + ") as the Spider process is terminated.");
            return;
        }
        this.tasksTotalCount++;
        try {
            this.threadPool.execute(task);
        } catch (RejectedExecutionException e) {
            // The pool may reject tasks while shutting down; just log and drop the task.
            if (log.isDebugEnabled()) {
                log.debug(
                        "Submitted task was rejected ("
                                + task
                                + "), spider state: [stopped="
                                + isStopped()
                                + ", terminated="
                                + isTerminated()
                                + "].");
            }
        }
    }

    /**
     * Gets the extension.
     *
     * @return the extension
     */
    protected ExtensionSpider getExtensionSpider() {
        return this.extension;
    }

    /* SPIDER PROCESS maintenance - pause, resume, shutdown, etc. */
    /** Starts the Spider crawling. */
    public void start() {
        log.info("Starting spider...");
        this.timeStarted = System.currentTimeMillis();
        fetchFilterSeeds();

        // Check if seeds are available, otherwise the Spider will start, but will not have any
        // seeds and will not stop.
        if (seedList == null || seedList.isEmpty()) {
            log.warn("No seeds available for the Spider. Cancelling scan...");
            notifyListenersSpiderComplete(false);
            notifyListenersSpiderProgress(100, 0, 0);
            return;
        }

        if (scanUser != null) {
            log.info(
                    "Scan will be performed from the point of view of User: "
                            + scanUser.getName());
        }

        this.controller.init();
        this.stopped = false;
        this.paused = false;
        this.initialized = false;

        // Initialize the thread pool
        this.threadPool =
                Executors.newFixedThreadPool(
                        spiderParam.getThreadCount(),
                        new SpiderThreadFactory("ZAP-SpiderThreadPool-" + id + "-thread-"));

        // Initialize the HTTP sender
        httpSender =
                new HttpSender(
                        connectionParam,
                        connectionParam.isHttpStateEnabled()
                                ? true
                                : !spiderParam.isAcceptCookies(),
                        HttpSender.SPIDER_INITIATOR);
        // Do not follow redirections because the request is not updated, the redirections will be
        // handled manually.
        httpSender.setFollowRedirect(false);

        // Add the seeds
        for (URI uri : seedList) {
            if (log.isDebugEnabled()) {
                log.debug("Adding seed for spider: " + uri);
            }
            controller.addSeed(uri, HttpRequestHeader.GET);
        }
        // Mark the process as completely initialized
        initialized = true;
    }

    /**
     * Filters the seed list using the current fetch filters, preventing any non-valid seed from
     * being accessed.
     *
     * @see #seedList
     * @see FetchFilter
     * @see SpiderController#getFetchFilters()
     * @since 2.5.0
     */
    private void fetchFilterSeeds() {
        if (seedList == null || seedList.isEmpty()) {
            return;
        }

        for (Iterator<URI> it = seedList.iterator(); it.hasNext(); ) {
            URI seed = it.next();
            for (FetchFilter filter : controller.getFetchFilters()) {
                FetchStatus filterReason = filter.checkFilter(seed);
                if (filterReason != FetchStatus.VALID) {
                    if (log.isDebugEnabled()) {
                        log.debug("Seed: " + seed + " was filtered with reason: " + filterReason);
                    }
                    it.remove();
                    break;
                }
            }
        }
    }

    /** Stops the Spider crawling. Must not be called from any of the threads in the thread pool. */
    public void stop() {
        if (stopped) {
            return;
        }
        this.stopped = true;
        log.info("Stopping spidering process by request.");

        if (this.paused) {
            // Have to resume first or we get a deadlock
            this.resume();
        }

        // Issue the shutdown command
        this.threadPool.shutdown();
        try {
            if (!this.threadPool.awaitTermination(2, TimeUnit.SECONDS)) {
                log.warn(
                        "Failed to await for all spider threads to stop in the given time (2s)...");
                for (Runnable task : this.threadPool.shutdownNow()) {
                    ((SpiderTask) task).cleanup();
                }
            }
        } catch (InterruptedException ignore) {
            log.warn("Interrupted while awaiting for all spider threads to stop...");
        }
        if (httpSender != null) {
            this.getHttpSender().shutdown();
            httpSender = null;
        }

        // Notify the controller to clean up memory
        controller.reset();
        this.threadPool = null;

        // Notify the listeners -- in the meanwhile
        notifyListenersSpiderComplete(false);
    }

    /** The Spidering process is complete. */
    private void complete() {
        if (stopped) {
            return;
        }

        log.info("Spidering process is complete. Shutting down...");
        this.stopped = true;
        if (httpSender != null) {
            this.getHttpSender().shutdown();
            httpSender = null;
        }

        // Notify the controller to clean up memory
        controller.reset();

        // Issue the shutdown command on a separate thread, as the current thread is most likely one
        // from the pool
        new Thread(
                        new Runnable() {
                            @Override
                            public void run() {
                                if (threadPool != null) {
                                    threadPool.shutdown();
                                }
                                // Notify the listeners -- in the meanwhile
                                notifyListenersSpiderComplete(true);
                                controller.reset();
                                threadPool = null;
                            }
                        },
                        "ZAP-SpiderShutdownThread-" + id)
                .start();
    }

    /** Pauses the Spider crawling. */
    public void pause() {
        pauseLock.lock();
        try {
            paused = true;
        } finally {
            pauseLock.unlock();
        }
    }

    /** Resumes the Spider crawling. */
    public void resume() {
        pauseLock.lock();
        try {
            paused = false;
            // Wake up all threads that are currently paused
            pausedCondition.signalAll();
        } finally {
            pauseLock.unlock();
        }
    }

    /**
     * Sets the spider so it will scan from the point of view of a user.
     *
     * @param user the user to be scanned as
     */
    public void setScanAsUser(User user) {
        this.scanUser = user;
    }

    /**
     * Gets the user that will be used in the scanning.
     *
     * @return the scan user
     */
    protected User getScanUser() {
        return this.scanUser;
    }

    /**
     * This method is run by each thread in the Thread Pool before the task execution. Particularly,
     * it checks if the Spidering process is paused and, if it is, it waits on the corresponding
     * condition for the process to be resumed. Called from the SpiderTask.
     */
    protected void preTaskExecution() {
        checkPauseAndWait();
    }

    /**
     * This method is run by Threads in the ThreadPool and checks if the scan is paused and, if it
     * is, waits until it's unpaused.
     */
    protected void checkPauseAndWait() {
        pauseLock.lock();
        try {
            while (paused && !stopped) {
                pausedCondition.await();
            }
        } catch (InterruptedException e) {
            // Treat interruption as a wake-up; the worker re-checks the spider state afterwards.
        } finally {
            pauseLock.unlock();
        }
    }

    /**
     * This method is run by each thread in the Thread Pool after the task execution. Particularly,
     * it notifies the listeners of the progress and checks if the scan is complete. Called from the
     * SpiderTask.
     */
    protected synchronized void postTaskExecution() {
        if (stopped) {
            // Stopped, so don't count the task(s) as done.
            // (worker threads call this method even if the task was not really executed.)
            return;
        }
        tasksDoneCount++;
        int percentageComplete = tasksDoneCount * 100 / tasksTotalCount;

        // Compute the progress and notify the listeners
        this.notifyListenersSpiderProgress(
                percentageComplete, tasksDoneCount, tasksTotalCount - tasksDoneCount);

        // Check for ending conditions
        if (tasksDoneCount == tasksTotalCount && initialized) {
            this.complete();
        }
    }

    /**
     * Checks if is paused.
     *
     * @return true, if is paused
     */
    public boolean isPaused() {
        return this.paused;
    }

    /**
     * Checks if is stopped, i.e. a shutdown was issued or it is not running.
     *
     * @return true, if is stopped
     */
    public boolean isStopped() {
        if (!stopped && this.spiderParam.getMaxDuration() > 0) {
            // Check to see if the scan has exceeded the specified maxDuration
            if (TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - this.timeStarted)
                    > this.spiderParam.getMaxDuration()) {
                log.info(
                        "Spidering process has exceeded maxDuration of "
                                + this.spiderParam.getMaxDuration()
                                + " minute(s)");
                this.complete();
            }
        }
        return stopped;
    }

    /**
     * Checks if is terminated.
     *
     * @return true, if is terminated
     */
    public boolean isTerminated() {
        return threadPool.isTerminated();
    }

    /* LISTENERS SECTION */

    /**
     * Adds a new spider listener.
     *
     * @param listener the listener
     */
    public void addSpiderListener(SpiderListener listener) {
        this.listeners.add(listener);
    }

    /**
     * Removes a spider listener.
     *
     * @param listener the listener
     */
    public void removeSpiderListener(SpiderListener listener) {
        this.listeners.remove(listener);
    }

    /**
     * Notifies all the listeners regarding the spider progress.
     *
     * @param percentageComplete the percentage complete
     * @param numberCrawled the number of pages crawled
     * @param numberToCrawl the number of pages left to crawl
     */
    protected synchronized void notifyListenersSpiderProgress(
            int percentageComplete, int numberCrawled, int numberToCrawl) {
        for (SpiderListener l : listeners) {
            l.spiderProgress(percentageComplete, numberCrawled, numberToCrawl);
        }
    }

    /**
     * Notifies the listeners regarding a found uri.
     *
     * @param uri the uri
     * @param method the method used for fetching the resource
     * @param status the {@link FetchStatus} stating if this uri will be processed, and, if not,
     *     stating the reason of the filtering
     */
    protected synchronized void notifyListenersFoundURI(
            String uri, String method, FetchStatus status) {
        for (SpiderListener l : listeners) {
            l.foundURI(uri, method, status);
        }
    }

    /**
     * Notifies the listeners of a {@link SpiderTask}'s result.
     *
     * @param result the result of a spider task.
     */
    protected synchronized void notifyListenersSpiderTaskResult(SpiderTaskResult result) {
        for (SpiderListener l : listeners) {
            l.notifySpiderTaskResult(result);
        }
    }

    /**
     * Notifies the listeners that the spider is complete.
     *
     * @param successful {@code true} if the spider completed successfully (e.g. was not stopped),
     *     {@code false} otherwise
     */
    protected synchronized void notifyListenersSpiderComplete(boolean successful) {
        for (SpiderListener l : listeners) {
            l.spiderComplete(successful);
        }
    }

    /**
     * Adds a custom spider parser.
     *
     * @param sp the parser to add
     */
    public void addCustomParser(SpiderParser sp) {
        this.controller.addSpiderParser(sp);
    }

    /** Thread factory that names and normalises (non-daemon, normal priority) spider threads. */
    private static class SpiderThreadFactory implements ThreadFactory {

        private final AtomicInteger threadNumber;
        private final String namePrefix;
        private final ThreadGroup group;

        public SpiderThreadFactory(String namePrefix) {
            threadNumber = new AtomicInteger(1);
            this.namePrefix = namePrefix;
            SecurityManager s = System.getSecurityManager();
            group = (s != null) ? s.getThreadGroup() : Thread.currentThread().getThreadGroup();
        }

        @Override
        public Thread newThread(Runnable r) {
            Thread t = new Thread(group, r, namePrefix + threadNumber.getAndIncrement(), 0);
            if (t.isDaemon()) {
                t.setDaemon(false);
            }
            if (t.getPriority() != Thread.NORM_PRIORITY) {
                t.setPriority(Thread.NORM_PRIORITY);
            }
            return t;
        }
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy