org.archive.modules.net.Robotstxt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.net;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.nio.CharBuffer;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.archive.bdb.AutoKryo;
import org.archive.io.ReadSource;
/**
 * Utility class for parsing and representing 'robots.txt' format
 * directives, into a list of named user-agents and map from user-agents
 * to RobotsDirectives.
 */
public class Robotstxt implements Serializable {
    static final long serialVersionUID = 7025386509301303890L;

    private static final Logger logger =
            Logger.getLogger(Robotstxt.class.getName());

    /** Maximum number of robots.txt characters processed; the rest is
     * ignored to bound memory use (see HER-1990). */
    protected static final int MAX_SIZE = 500 * 1024;

    private static final Pattern LINE_SEPARATOR = Pattern.compile("\r\n|\r|\n");
    // Directive-line patterns, compiled once instead of via String.matches()
    // (which would recompile the regex for every line of every robots.txt).
    // matches() anchors the whole string, so no explicit '^'/'$' is needed.
    private static final Pattern USER_AGENT_LINE =
            Pattern.compile("(?i)^User-agent:.*");
    private static final Pattern DISALLOW_LINE =
            Pattern.compile("(?i)Disallow:.*");
    private static final Pattern CRAWL_DELAY_LINE =
            Pattern.compile("(?i)Crawl-delay:.*");
    private static final Pattern ALLOW_LINE =
            Pattern.compile("(?i)Allow:.*");

    // all user agents contained in this robots.txt
    // in order of declaration
    // TODO: consider discarding irrelevant entries
    protected LinkedList<String> namedUserAgents =
            new LinkedList<String>();
    // map user-agents (lower-cased) to their directives
    protected Map<String, RobotsDirectives> agentsToDirectives =
            new HashMap<String, RobotsDirectives>();
    // directives declared under "User-agent: *", if any
    protected RobotsDirectives wildcardDirectives = null;

    // set when a Disallow/Allow/Crawl-delay appears before any User-agent
    protected boolean hasErrors = false;

    /** shared fallback when no directives apply at all */
    protected static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();
    /** empty, reusable instance for all sites providing no rules */
    public static Robotstxt NO_ROBOTS = new Robotstxt();

    public Robotstxt() {
    }

    /**
     * Parse robots.txt rules from the given reader. The reader is always
     * closed, even if parsing fails.
     *
     * @param reader robots.txt content; closed by this constructor
     * @throws IOException on read error
     */
    public Robotstxt(Reader reader) throws IOException {
        try {
            initializeFromReader(reader);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Parse robots.txt rules from a configured custom-robots source. Any
     * IOException is logged (rather than propagated) because a failure here
     * risks inadvertent overcrawling; the reader is always closed.
     *
     * @param customRobots source of robots.txt content
     */
    public Robotstxt(ReadSource customRobots) {
        Reader reader = customRobots.obtainReader();
        try {
            initializeFromReader(reader);
        } catch (IOException e) {
            logger.log(Level.SEVERE,
                    "robots ReadSource problem: potential for inadvertent overcrawling",
                    e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Read up to MAX_SIZE characters of robots.txt content and populate
     * namedUserAgents, agentsToDirectives and wildcardDirectives. Content
     * beyond MAX_SIZE is discarded, including any partial final line, so a
     * truncated path is never recorded as a rule.
     *
     * @param reader robots.txt content; NOT closed by this method
     * @throws IOException on read error
     */
    protected void initializeFromReader(Reader reader) throws IOException {
        CharBuffer buffer = CharBuffer.allocate(MAX_SIZE);
        while (buffer.hasRemaining() && reader.read(buffer) >= 0) ;
        buffer.flip();
        String[] lines = LINE_SEPARATOR.split(buffer);
        if (buffer.limit() == buffer.capacity()) {
            int processed = buffer.capacity();
            if (lines.length != 0) {
                // discard the partial line at the end so we don't process a truncated path
                int last = lines.length - 1;
                processed -= lines[last].length();
                lines[last] = "";
            }
            logger.warning("processed " + processed
                    + " characters, ignoring the rest (see HER-1990)");
        }

        // current is the disallowed paths for the preceding User-Agent(s)
        RobotsDirectives current = null;

        for (String read : lines) {
            read = read.trim();
            if (read.isEmpty() || read.startsWith("#")) {
                continue;
            }
            // remove any html markup
            read = read.replaceAll("<[^>]+>", "");
            int commentIndex = read.indexOf("#");
            if (commentIndex > -1) {
                // Strip trailing comment
                read = read.substring(0, commentIndex);
            }
            read = read.trim();
            if (USER_AGENT_LINE.matcher(read).matches()) {
                String ua = read.substring(11).trim().toLowerCase();
                RobotsDirectives preexisting;
                if (ua.equals("*")) {
                    preexisting = wildcardDirectives;
                } else {
                    preexisting = agentsToDirectives.get(ua);
                }
                if (preexisting != null && preexisting.hasDirectives) {
                    // later re-declaration of an agent that already has
                    // rules: append to its existing directives
                    current = preexisting;
                } else if (current == null || current.hasDirectives) {
                    // only create new rules-list if necessary
                    // otherwise share with previous user-agent
                    current = new RobotsDirectives();
                }
                if (ua.equals("*")) {
                    wildcardDirectives = current;
                } else {
                    namedUserAgents.addLast(ua);
                    agentsToDirectives.put(ua, current);
                }
                continue;
            }
            if (DISALLOW_LINE.matcher(read).matches()) {
                if (current == null) {
                    // buggy robots.txt: rule before any User-agent
                    hasErrors = true;
                    continue;
                }
                String path = read.substring(9).trim();
                // tolerate common error of ending path with '*' character
                // (not allowed by original spec; redundant but harmless with
                // Google's wildcarding extensions -- which we don't yet fully
                // support).
                if (path.endsWith("*")) {
                    path = path.substring(0, path.length() - 1);
                }
                current.addDisallow(path);
                continue;
            }
            if (CRAWL_DELAY_LINE.matcher(read).matches()) {
                if (current == null) {
                    // buggy robots.txt: rule before any User-agent
                    hasErrors = true;
                    continue;
                }
                // consider a crawl-delay as sufficient to end a grouping of
                // User-Agent lines
                String val = read.substring(12).trim();
                try {
                    // keep only the leading numeric run (e.g. "10s" -> "10")
                    val = val.split("[^\\d\\.]+")[0];
                    current.setCrawlDelay(Float.parseFloat(val));
                } catch (ArrayIndexOutOfBoundsException e) {
                    // ignore unparseable value
                } catch (NumberFormatException nfe) {
                    // ignore unparseable value
                }
                continue;
            }
            if (ALLOW_LINE.matcher(read).matches()) {
                if (current == null) {
                    // buggy robots.txt: rule before any User-agent
                    hasErrors = true;
                    continue;
                }
                String path = read.substring(6).trim();
                // tolerate common error of ending path with '*' character
                // (not allowed by original spec; redundant but harmless with
                // Google's wildcarding extensions -- which we don't yet fully
                // support).
                if (path.endsWith("*")) {
                    path = path.substring(0, path.length() - 1);
                }
                current.addAllow(path);
                continue;
            }
            // unknown line; do nothing for now
        }
    }

    /**
     * Does this policy effectively allow everything? (No
     * disallows or timing (crawl-delay) directives?)
     */
    public boolean allowsAll() {
        // TODO: refine so directives that are all empty are also
        // recognized as allowing all
        return agentsToDirectives.isEmpty();
    }

    /**
     * All user-agents named in this robots.txt, in order of declaration.
     * NOTE: returns the internal (mutable) list; callers should not modify.
     */
    public List<String> getNamedUserAgents() {
        return namedUserAgents;
    }

    /**
     * Return the RobotsDirectives, if any, appropriate for the given User-Agent
     * string. If useFallbacks is true, a wildcard ('*') directives or the default
     * of NO_DIRECTIVES will be returned, as appropriate, if there is no better
     * match. If useFallbacks is false, a null will be returned if no declared
     * directives targeted the given User-Agent.
     *
     * @param ua String User-Agent to lookup
     * @param useFallbacks if true, fall-back to wildcard directives or
     * default allow as needed
     * @return directives to use, or null if useFallbacks is false and no
     * non-wildcard directives match the supplied User-Agent
     */
    public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) {
        // first declared agent name that is a substring of ua wins
        for (String uaListed : namedUserAgents) {
            if (ua.contains(uaListed)) {
                return agentsToDirectives.get(uaListed);
            }
        }
        if (!useFallbacks) {
            return null;
        }
        if (wildcardDirectives != null) {
            return wildcardDirectives;
        }
        // no applicable user-agents, so empty directives
        return NO_DIRECTIVES;
    }

    /**
     * Return directives to use for the given User-Agent, resorting to wildcard
     * rules or the default no-directives if necessary.
     *
     * @param userAgent String User-Agent to lookup
     * @return directives to use
     */
    public RobotsDirectives getDirectivesFor(String userAgent) {
        return getDirectivesFor(userAgent, true);
    }

    // Kryo support
    public static void autoregisterTo(AutoKryo kryo) {
        kryo.register(Robotstxt.class);
        kryo.autoregister(HashMap.class);
        kryo.autoregister(LinkedList.class);
        kryo.autoregister(RobotsDirectives.class);
        kryo.setRegistrationOptional(true);
    }
}