package com.goikosoft.crawler4j.robotstxt;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The UserAgentDirectives class stores the configuration for a single
 * user agent as defined in robots.txt. The user agent string used
 * depends on the most recent User-agent: definition in the robots.txt file.
 *
 * Modified by Dario Goikoetxea to add timeout capabilities to RegExp
 */
public class UserAgentDirectives {
    public static final Logger logger = LoggerFactory.getLogger(UserAgentDirectives.class);

    public Set<String> userAgents;
    private List<String> sitemap = null;
    private String preferredHost = null;
    private Double crawlDelay = null;
    private Set<PathRule> pathRules = new HashSet<>();
    private long timeout;
    private boolean matchOnTimeout;
    private int checkInterval;

    /**
     * Comparator used to order the list of matching path rules in such a way
     * that the most specific (= longest) match comes first.
     */
    static class PathComparator implements Comparator<PathRule> {
        /** The path to compare the path rules with */
        String path;

        /** Initialize with the path */
        PathComparator(String path) {
            this.path = path;
        }

        /**
         * Compare two paths.
         * If lhs matches and rhs does not, this will return -1.
         * If rhs matches and lhs does not, this will return 1.
         * If both match or both do not match, this will return the result of
         * a numeric comparison of the lengths of both patterns, where
         * the longest (= most specific) one comes first.
         */
        @Override
        public int compare(PathRule lhs, PathRule rhs) {
            boolean p1Match = lhs.matches(path);
            boolean p2Match = rhs.matches(path);

            // Matching patterns come first
            if (p1Match && !p2Match) {
                return -1;
            } else if (p2Match && !p1Match) {
                return 1;
            }

            // Most specific (longest) pattern first
            String p1 = lhs.pattern.toString();
            String p2 = rhs.pattern.toString();
            if (p1.length() != p2.length()) {
                return Integer.compare(p2.length(), p1.length());
            }

            // Order alphabetically if the patterns are of the same length
            return p1.compareTo(p2);
        }
    }
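
    /*
     * Illustrative sketch (not part of the original source): for the path
     * "/foo/bar/baz", a PathComparator orders rules so that a matching
     * "/foo/bar" pattern sorts before a matching "/foo" pattern (longer,
     * more specific patterns first), and non-matching patterns sort last:
     *
     *   Set<PathRule> rules = new TreeSet<>(new PathComparator("/foo/bar/baz"));
     *   rules.add(new PathRule(HostDirectives.ALLOWED, "/foo"));
     *   rules.add(new PathRule(HostDirectives.DISALLOWED, "/foo/bar"));
     *   // Iteration order: "/foo/bar" first, then "/foo".
     */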

    /**
     * Create a UserAgentDirectives clause
     *
     * @param userAgents The set of user agents for this rule
     */
    public UserAgentDirectives(Set<String> userAgents) {
        this.userAgents = userAgents;
        this.timeout = -1;
        this.matchOnTimeout = false;
        this.checkInterval = TimeoutablePathRule.defaultCheckInterval;
    }

    public UserAgentDirectives(Set<String> userAgents, long timeout, boolean matchOnTimeout) {
        this(userAgents, timeout, matchOnTimeout, null);
    }

    /**
     * Create a UserAgentDirectives clause
     *
     * @param userAgents The set of user agents for this rule
     * @param timeout milliseconds before regular expressions time out
     * @param matchOnTimeout if true, a timeout will count as a match
     * @param checkInterval number of characters read between timeout checks. Default: 30000000
     */
    public UserAgentDirectives(Set<String> userAgents, long timeout, boolean matchOnTimeout, Integer checkInterval) {
        this.userAgents = userAgents;
        this.timeout = timeout;
        this.matchOnTimeout = matchOnTimeout;
        if (checkInterval == null) {
            this.checkInterval = TimeoutablePathRule.defaultCheckInterval;
        } else {
            this.checkInterval = checkInterval;
        }
    }
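
    /*
     * Illustrative sketch (not part of the original source): building a
     * directive set whose path rules abort regex evaluation after 500 ms,
     * treating a timed-out evaluation as a non-match:
     *
     *   Set<String> agents = new HashSet<>();
     *   agents.add("mybot");
     *   UserAgentDirectives directives = new UserAgentDirectives(agents, 500, false);
     */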

    /**
     * Match the current user agent directive set with the given
     * user agent. The returned value will be the maximum match length
     * of any user agent.
     *
     * @param userAgent The user agent used by the crawler
     * @return The maximum length of a matching user agent in this set of directives
     */
    public int match(String userAgent) {
        userAgent = userAgent.toLowerCase();
        int maxLength = 0;
        for (String ua : userAgents) {
            if (ua.equals("*") || userAgent.contains(ua)) {
                maxLength = Math.max(maxLength, ua.length());
            }
        }
        return maxLength;
    }
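
    /*
     * Illustrative sketch (not part of the original source), assuming the
     * stored user agents are lower case: with userAgents = {"googlebot", "*"},
     * match("Googlebot-News") returns 9 ("googlebot".length()), while
     * match("unrelated") returns 1 because the wildcard "*" still matches
     * with its own length.
     */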

    public boolean isWildcard() {
        return userAgents.contains("*");
    }

    public boolean isEmpty() {
        return pathRules.isEmpty();
    }

    public int checkAccess(String path, String userAgent) {
        // If the user agent does not match, no verdict applies
        if (match(userAgent) == 0) {
            return HostDirectives.UNDEFINED;
        }

        // Order the rules based on how well they match the path
        Set<PathRule> rules = new TreeSet<>(new PathComparator(path));
        rules.addAll(pathRules);

        // Return the verdict of the best matching rule
        for (PathRule rule : rules) {
            if (rule.matches(path)) {
                return rule.type;
            }
        }
        return HostDirectives.UNDEFINED;
    }
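
    /*
     * Illustrative sketch (not part of the original source), assuming
     * "mybot" is among this directive set's user agents: after the rules
     * below are added, the most specific matching rule decides the verdict.
     *
     *   directives.add("disallow", "/private");
     *   directives.add("allow", "/private/public");
     *   directives.checkAccess("/private/public/page", "mybot");  // ALLOWED
     *   directives.checkAccess("/private/other", "mybot");        // DISALLOWED
     */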

    public static class UserAgentComparator implements Comparator<UserAgentDirectives> {
        String crawlUserAgent;

        UserAgentComparator(String myUA) {
            crawlUserAgent = myUA;
        }

        @Override
        public int compare(UserAgentDirectives lhs, UserAgentDirectives rhs) {
            int matchLhs = lhs.match(crawlUserAgent);
            int matchRhs = rhs.match(crawlUserAgent);
            if (matchLhs != matchRhs) {
                return Integer.compare(matchRhs, matchLhs); // Sort descending on match length
            }

            // When the match lengths are equal, the shorter list of user agents comes first
            if (lhs.userAgents.size() != rhs.userAgents.size()) {
                return Integer.compare(lhs.userAgents.size(), rhs.userAgents.size());
            }

            // Sort alphabetically when the lists are of equal size:
            // find the first non-equal user agent
            Iterator<String> i1 = lhs.userAgents.iterator();
            Iterator<String> i2 = rhs.userAgents.iterator();
            while (i1.hasNext()) {
                String ua1 = i1.next();
                String ua2 = i2.next();
                int order = ua1.compareTo(ua2);
                if (order != 0) {
                    return order;
                }
            }

            // The lists of user agents are also equal, so these directives are equal
            return 0;
        }
    }

    /**
     * Add a rule to the list of rules for this user agent.
     * Valid rules are: sitemap, crawl-delay, host, allow and disallow.
     * These are based on the Wikipedia article at:
     *
     * https://en.wikipedia.org/wiki/Robots_exclusion_standard
     *
     * and the Google documentation at:
     *
     * https://support.google.com/webmasters/answer/6062596
     *
     * @param rule The name of the rule
     * @param value The value of the rule
     */
    public void add(String rule, String value) {
        if (rule.equals("sitemap")) {
            if (this.sitemap == null) {
                this.sitemap = new ArrayList<>();
            }
            this.sitemap.add(value);
        } else if (rule.equals("crawl-delay")) {
            try {
                this.crawlDelay = Double.parseDouble(value);
            } catch (NumberFormatException e) {
                logger.warn("Invalid number format for crawl-delay in robots.txt: {}", value);
            }
        } else if (rule.equals("host")) {
            this.preferredHost = value;
        } else if (rule.equals("allow")) {
            if (timeout < 0) {
                this.pathRules.add(new PathRule(HostDirectives.ALLOWED, value));
            } else {
                this.pathRules.add(new TimeoutablePathRule(HostDirectives.ALLOWED, value, timeout, matchOnTimeout,
                        checkInterval));
            }
        } else if (rule.equals("disallow")) {
            if (timeout < 0) {
                this.pathRules.add(new PathRule(HostDirectives.DISALLOWED, value));
            } else {
                this.pathRules.add(new TimeoutablePathRule(HostDirectives.DISALLOWED, value, timeout, matchOnTimeout,
                        checkInterval));
            }
        } else {
            logger.error("Invalid key in robots.txt passed to UserAgentRules: {}", rule);
        }
    }
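
    /*
     * Illustrative sketch (not part of the original source): feeding parsed
     * robots.txt directives into this set. Rule names are expected in lower
     * case, as the equals() checks above suggest:
     *
     *   directives.add("crawl-delay", "2.5");
     *   directives.add("sitemap", "https://example.com/sitemap.xml");
     *   directives.add("disallow", "/tmp/");
     */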

    /**
     * Return the configured crawl delay in seconds
     *
     * @return The configured crawl delay, or null if none was specified
     */
    public Double getCrawlDelay() {
        return crawlDelay;
    }

    /**
     * Return the preferred host name specified in robots.txt.
     *
     * @return The specified hostname, or null if it was not specified
     */
    public String getPreferredHost() {
        return preferredHost;
    }

    /**
     * Return the listed sitemaps, or null if none were specified
     *
     * @return The list of sitemap links specified in robots.txt
     */
    public List<String> getSitemap() {
        return sitemap;
    }
}