// com.panforge.robotstxt.RobotsTxtReader (Maven / Gradle / Ivy)
/*
* Copyright 2016 Piotr Andzel.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.panforge.robotstxt;
import static com.panforge.robotstxt.URLDecoder.decode;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
/**
 * Robots.txt reader.
 *
 * <p>Reads a robots.txt stream line by line as UTF-8 text, turns every
 * {@code key: value} line into an entry and dispatches on the (case-insensitive)
 * key to populate a {@link RobotsTxtImpl}.
 */
class RobotsTxtReader {
  /** Unicode byte order mark (U+FEFF) that may precede the first line of a UTF-8 file. */
  private static final int BYTE_ORDER_MARK = 0xFEFF;

  private final MatchingStrategy matchingStrategy;
  private final WinningStrategy winningStrategy;

  /**
   * Creates instance of the robots.txt reader with default strategies.
   */
  public RobotsTxtReader() {
    this(MatchingStrategy.DEFAULT, WinningStrategy.DEFAULT);
  }

  /**
   * Creates instance of the robots.txt reader.
   *
   * @param matchingStrategy matching strategy passed to every parsed robots.txt
   * @param winningStrategy winning strategy passed to every parsed robots.txt
   */
  public RobotsTxtReader(MatchingStrategy matchingStrategy, WinningStrategy winningStrategy) {
    this.matchingStrategy = matchingStrategy;
    this.winningStrategy = winningStrategy;
  }

  /**
   * Reads robots txt.
   *
   * <p>Consecutive {@code User-agent} lines are collected into the same group; the
   * first {@code User-agent} line that follows any other recognised or unrecognised
   * entry starts a new group. {@code Allow}, {@code Disallow} and {@code Crawl-delay}
   * entries that appear before any {@code User-agent} line are ignored.
   *
   * <p>The stream is not closed by this method.
   *
   * @param inputStream input stream with robots.txt content.
   * @return parsed robots.txt
   * @throws IOException if reading stream fails
   */
  public RobotsTxt readRobotsTxt(InputStream inputStream) throws IOException {
    // Deliberately not wrapped in try-with-resources: closing the reader would
    // also close the caller-owned stream.
    BufferedReader reader =
        new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
    skipByteOrderMark(reader);

    Group currentGroup = null;
    // True while the most recent entries were all User-agent lines, i.e. the
    // group header is still being assembled.
    boolean startGroup = false;
    RobotsTxtImpl robots = new RobotsTxtImpl(matchingStrategy, winningStrategy);

    for (Entry entry = readEntry(reader); entry != null; entry = readEntry(reader)) {
      // Locale.ROOT keeps the comparison independent of the default locale;
      // e.g. under a Turkish locale "disallow" would otherwise upper-case to
      // "DİSALLOW" and fall through to the default branch.
      switch (entry.getKey().toUpperCase(Locale.ROOT)) {
        case "USER-AGENT":
          if (!startGroup && currentGroup != null) {
            // Previous group already has a body: it is complete.
            robots.addGroup(currentGroup);
            currentGroup = null;
          }
          if (currentGroup == null) {
            currentGroup = new Group();
          }
          currentGroup.addUserAgent(entry.getValue());
          startGroup = true;
          break;

        case "DISALLOW":
          if (currentGroup != null) {
            // An empty Disallow value is recorded with the access flag set.
            boolean access = entry.getValue().isEmpty();
            currentGroup.addAccess(new Access(entry.getValue(), access));
            startGroup = false;
          }
          break;

        case "ALLOW":
          if (currentGroup != null) {
            // An empty Allow value is recorded with the access flag cleared.
            boolean access = !entry.getValue().isEmpty();
            currentGroup.addAccess(new Access(entry.getValue(), access));
            startGroup = false;
          }
          break;

        case "CRAWL-DELAY":
          if (currentGroup != null) {
            try {
              int crawlDelay = Integer.parseInt(entry.getValue());
              robots.setCrawlDelay(crawlDelay);
            } catch (NumberFormatException ignored) {
              // Best effort: a crawl delay that is not a plain integer is skipped
              // rather than failing the whole file.
            }
            startGroup = false;
          }
          break;

        case "HOST":
          robots.setHost(entry.getValue());
          startGroup = false;
          break;

        case "SITEMAP":
          robots.getSitemaps().add(entry.getValue());
          startGroup = false;
          break;

        default:
          // Unknown directive: ignored, but it still terminates a group header.
          startGroup = false;
          break;
      }
    }

    if (currentGroup != null) {
      robots.addGroup(currentGroup);
    }
    return robots;
  }

  /**
   * Consumes a leading byte order mark, if present.
   *
   * <p>Without this the mark would become part of the first key (it is not removed
   * by trimming), so a first line such as {@code User-agent: *} would not be
   * recognised.
   *
   * @param reader reader positioned at the very beginning of the content
   * @throws IOException if reading from stream fails
   */
  private void skipByteOrderMark(BufferedReader reader) throws IOException {
    reader.mark(1);
    if (reader.read() != BYTE_ORDER_MARK) {
      // Not a BOM (or end of stream): rewind so nothing is lost.
      reader.reset();
    }
  }

  /**
   * Reads next entry from the reader.
   *
   * <p>Lines that do not yield an entry are skipped.
   *
   * @param reader reader to take lines from
   * @return entry or {@code null} if no more data in the stream
   * @throws IOException if reading from stream fails
   */
  private Entry readEntry(BufferedReader reader) throws IOException {
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
      Entry entry = parseEntry(line);
      if (entry != null) {
        return entry;
      }
    }
    return null;
  }

  /**
   * Parses line into entry.
   *
   * <p>Skip empty lines. Skip comments. Skip invalid lines (no colon, or nothing
   * but whitespace before the colon). Anything after a {@code #} in the value part
   * is dropped, and the remaining value is trimmed and then passed through
   * {@code decode}.
   *
   * @param line line to parse
   * @return entry or {@code null} if skipped
   * @throws IOException parsing line fails
   */
  private Entry parseEntry(String line) throws IOException {
    line = StringUtils.trimToEmpty(line);
    if (line.startsWith("#")) {
      return null;
    }

    int colonIndex = line.indexOf(':');
    if (colonIndex < 0) {
      return null;
    }

    String key = StringUtils.trimToNull(line.substring(0, colonIndex));
    if (key == null) {
      return null;
    }

    String rest = line.substring(colonIndex + 1);
    int hashIndex = rest.indexOf('#');
    String value = StringUtils.trimToEmpty(hashIndex >= 0 ? rest.substring(0, hashIndex) : rest);
    value = decode(value);
    return new Entry(key, value);
  }

  /**
   * Immutable key/value pair for a single parsed robots.txt line.
   */
  private static final class Entry {
    private final String key;
    private final String value;

    public Entry(String key, String value) {
      this.key = key;
      this.value = value;
    }

    public String getKey() {
      return key;
    }

    public String getValue() {
      return value;
    }

    @Override
    public String toString() {
      return String.format("%s: %s", key, value != null ? value : "");
    }
  }
}