crawlercommons.robots.BaseRobotRules

crawler-commons is a set of reusable Java components that implement functionality common to any web crawler.
/**
 * Copyright 2016 Crawler-Commons
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package crawlercommons.robots;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;

/**
 * Result from parsing a single robots.txt file – a set of allow/disallow rules
 * to check whether a given URL is allowed, and optionally a Crawl-delay and Sitemap URLs.
 */
@SuppressWarnings("serial")
public abstract class BaseRobotRules implements Serializable {

    public static final long UNSET_CRAWL_DELAY = Long.MIN_VALUE;

    /**
     * @param url URL to check against the rules
     * @return whether the given URL is allowed by the rules
     */
    public abstract boolean isAllowed(String url);

    /** @return whether these rules allow all URLs */
    public abstract boolean isAllowAll();

    /** @return whether these rules disallow all URLs */
    public abstract boolean isAllowNone();

    private long _crawlDelay = UNSET_CRAWL_DELAY;
    private boolean _deferVisits = false;
    private LinkedHashSet<String> _sitemaps;

    public BaseRobotRules() {
        _sitemaps = new LinkedHashSet<>();
    }

    /**
     * Get Crawl-delay (in milliseconds)
     * 
     * @return Crawl-delay defined in the robots.txt for the given agent name,
     *         or {@link #UNSET_CRAWL_DELAY} if not defined.
     */
    public long getCrawlDelay() {
        return _crawlDelay;
    }

    /**
     * Set the Crawl-delay.
     * 
     * @param crawlDelay
     *            Crawl-delay in milliseconds
     */
    public void setCrawlDelay(long crawlDelay) {
        _crawlDelay = crawlDelay;
    }

    /**
     * @return whether to defer visits to the server
     */
    public boolean isDeferVisits() {
        return _deferVisits;
    }

    /**
     * Indicate to defer visits to the server, e.g. to wait until the robots.txt
     * becomes available.
     * 
     * @param deferVisits
     *            true if visits to the server should be deferred
     */
    public void setDeferVisits(boolean deferVisits) {
        _deferVisits = deferVisits;
    }

    /** Add sitemap URL to rules if not a duplicate */
    public void addSitemap(String sitemap) {
        _sitemaps.add(sitemap);
    }

    /** Get URLs of sitemap links found in robots.txt */
    public List<String> getSitemaps() {
        return new ArrayList<>(_sitemaps);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (int) (_crawlDelay ^ (_crawlDelay >>> 32));
        result = prime * result + (_deferVisits ? 1231 : 1237);
        result = prime * result + ((_sitemaps == null) ? 0 : _sitemaps.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        BaseRobotRules other = (BaseRobotRules) obj;
        if (_crawlDelay != other._crawlDelay)
            return false;
        if (_deferVisits != other._deferVisits)
            return false;
        if (_sitemaps == null) {
            if (other._sitemaps != null)
                return false;
        } else if (!_sitemaps.equals(other._sitemaps))
            return false;
        return true;
    }

    /**
     * Returns a string with the crawl delay and the sitemap URLs found in the
     * robots.txt (at most the first 10 sitemap URLs are listed).
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass()).append(":\n");
        long delay = getCrawlDelay();
        if (delay == UNSET_CRAWL_DELAY) {
            sb.append(" - no crawl delay\n");
        } else {
            sb.append(" - crawl delay: ").append(delay).append('\n');
        }

        List<String> sitemaps = getSitemaps();
        int nSitemaps = sitemaps.size();
        if (nSitemaps == 0) {
            sb.append(" - no sitemap URLs\n");
        } else {
            sb.append(" - number of sitemap URLs: ").append(nSitemaps).append('\n');
            int numOfSitemapsToShow = Math.min(nSitemaps, 10);
            for (int i = 0; i < numOfSitemapsToShow; i++) {
                sb.append(sitemaps.get(i)).append('\n');
            }
        }

        return sb.toString();
    }
}
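
For orientation, here is a minimal usage sketch (not part of the class source above). It assumes the parseContent(String, byte[], String, String) method of SimpleRobotRulesParser from the same crawler-commons library; the URL, robots.txt content, and agent name "mycrawler" are made-up placeholders.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesExample {
    public static void main(String[] args) {
        // Made-up robots.txt content; a real crawler would fetch this from
        // http://example.com/robots.txt before parsing.
        String robotsTxt = "User-agent: *\n"
                        + "Disallow: /private/\n"
                        + "Crawl-delay: 5\n"
                        + "Sitemap: http://example.com/sitemap.xml\n";

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = parser.parseContent(
                "http://example.com/robots.txt",            // URL the content came from
                robotsTxt.getBytes(StandardCharsets.UTF_8), // raw robots.txt bytes
                "text/plain",                               // content type
                "mycrawler");                               // agent name to match

        // Per-URL checks against the parsed allow/disallow rules.
        System.out.println(rules.isAllowed("http://example.com/private/x.html")); // false
        System.out.println(rules.isAllowed("http://example.com/public/x.html"));  // true

        // Crawl-delay is stored in milliseconds (5000 here); an absent
        // directive yields BaseRobotRules.UNSET_CRAWL_DELAY.
        System.out.println(rules.getCrawlDelay());

        // Sitemap URLs in order of appearance, duplicates removed.
        System.out.println(rules.getSitemaps());
    }
}

Note that getSitemaps() copies the internal LinkedHashSet into a new ArrayList, so callers can modify the returned list without affecting the rules object.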