/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.robots;

import java.io.Serializable;
import java.util.Collection;
/**
* Robots.txt parser definition: parses the content of a robots.txt file and
* returns {@link BaseRobotRules} telling a crawler which paths it may fetch.
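*
* A minimal usage sketch, assuming the concrete implementation
* {@code SimpleRobotRulesParser} that ships with crawler-commons (robot
* names and URLs below are illustrative):
*
* <pre>{@code
* BaseRobotsParser parser = new SimpleRobotRulesParser();
* byte[] content = "User-agent: *\nDisallow: /private/"
*         .getBytes(StandardCharsets.UTF_8);
* BaseRobotRules rules = parser.parseContent(
*         "https://example.com/robots.txt", content, "text/plain",
*         Collections.singleton("mycrawler"));
* rules.isAllowed("https://example.com/index.html");   // true
* rules.isAllowed("https://example.com/private/page"); // false
* }</pre>
*/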
@SuppressWarnings("serial")
public abstract class BaseRobotsParser implements Serializable {
/**
* Parse the robots.txt file in {@code content}, and return rules appropriate
* for processing paths by the given robot names. Note that multiple agent
* names may be provided as comma-separated values. How agent names are
* matched against user-agent lines in the robots.txt depends on the
* implementing class.
*
* Also note that names are lower-cased before comparison, and that any
* robot name you pass shouldn't contain commas or spaces; if the name has
* spaces, it is split into multiple names, each of which is compared
* against agent names in the robots.txt file. An agent name matches if it
* is a prefix of one of the provided (split and lower-cased) robot names.
* For example, passing "Mozilla Crawlerbot-super 1.0" matches the agent
* name "crawlerbot", because after splitting on spaces and lower-casing,
* "crawlerbot" is a prefix of the token "crawlerbot-super".
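*
* An illustrative sketch of that matching rule (hypothetical code, not the
* actual implementation, whose behavior depends on the implementing class):
*
* <pre>{@code
* String robotName = "Mozilla Crawlerbot-super 1.0";
* String agentName = "crawlerbot"; // from a "User-agent:" line
* boolean match = false;
* for (String token : robotName.toLowerCase(Locale.ROOT).split(" ")) {
*     // "crawlerbot-super" starts with "crawlerbot", so this matches
*     if (token.startsWith(agentName)) {
*         match = true;
*     }
* }
* }</pre>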
*
* @deprecated since 1.4 - replaced by
* {@link #parseContent(String, byte[], String, Collection)}. Passing a
* collection of robot names gives users more control over how user-agent
* and robot names are matched. Passing a list of names is also more
* efficient, as it avoids splitting the robot name string anew for every
* robots.txt file parsed.
*
* @param url
* URL that robots.txt content was fetched from. A complete and
* valid URL (e.g., https://example.com/robots.txt) is expected.
* Used to resolve relative sitemap URLs and for
* logging/reporting purposes.
* @param content
* raw bytes from the site's robots.txt file
* @param contentType
* content type (MIME type) from HTTP response header
* @param robotNames
* name(s) of crawler, to be used when processing file contents
* (just the name portion, w/o version or other details)
* @return robot rules.
*/
@Deprecated
public abstract BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotNames);
/**
* Parse the robots.txt file in {@code content}, and return rules appropriate
* for processing paths by the given robot names. Multiple agent names can
* be provided as a collection. How agent names are matched against
* user-agent lines in the robots.txt depends on the implementing class.
*
* @param url
* URL that robots.txt content was fetched from. A complete and
* valid URL (e.g., https://example.com/robots.txt) is expected.
* Used to resolve relative sitemap URLs and for
* logging/reporting purposes.
* @param content
* raw bytes from the site's robots.txt file
* @param contentType
* content type (MIME type) from HTTP response header
* @param robotNames
* name(s) of crawler, used to select rules from the robots.txt
* file by matching the names against the user-agent lines in the
* robots.txt file.
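* For example, {@code Arrays.asList("mybot", "mybot-news")}
* (illustrative names).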
* @return robot rules.
*/
public abstract BaseRobotRules parseContent(String url, byte[] content, String contentType, Collection<String> robotNames);
/**
* The fetch of robots.txt failed, so return rules appropriate for the given
* HTTP status code.
*
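* For example (an illustrative policy, not mandated by this class; the
* concrete behavior depends on the implementing class):
*
* <pre>{@code
* BaseRobotRules rules = parser.failedFetch(404);
* // an implementation would typically return "allow all" rules here,
* // while e.g. 5xx errors might yield "defer visits" or "allow none"
* }</pre>
*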
* @param httpStatusCode
* a failure status code (NOT 2xx)
* @return robot rules
*/
public abstract BaseRobotRules failedFetch(int httpStatusCode);
}