
org.dspace.statistics.util.ApacheLogRobotsProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of dspace-api Show documentation
Show all versions of dspace-api Show documentation
DSpace core data model and service APIs.
The newest version!
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.statistics.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
/**
* Command line utility to create a file of spider addresses from an Apache
* log file.
*
* @author Mark Diggory (mdiggory at atmire.com)
* @author kevinvandevelde at atmire.com
* @author ben at atmire.com
*/
public class ApacheLogRobotsProcessor {
    /**
     * Default constructor (private: this class only exposes a static entry point).
     */
    private ApacheLogRobotsProcessor() { }

    /**
     * Creates a file containing spiders based on an Apache log file
     * by analyzing users of the robots.txt file.
     * <p>
     * Recognized options:
     * <ul>
     * <li>{@code -l / --logfile}: log file to read; when absent or "-", standard input is read.</li>
     * <li>{@code -s / --spiderfile}: spider file to write; when absent or "-", standard output is written.
     * If the named file already exists, its current entries are loaded first and written back
     * together with the newly found addresses.</li>
     * </ul>
     *
     * @param args the command line arguments given
     * @throws Exception if error
     */
    public static void main(String[] args) throws Exception {
        // create an Options object and populate it
        CommandLineParser parser = new DefaultParser();
        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider IP file");
        CommandLine line = parser.parse(options, args);

        // Log source ("-" means standard input)
        String logFileLoc = line.hasOption("l") ? line.getOptionValue("l") : "-";

        // Spider IP list ("-" means standard output)
        String spiderIpPath = line.hasOption("s") ? line.getOptionValue("s") : "-";

        // Get the IPs already added in our file
        Set<String> logSpiders;
        if (!"-".equals(spiderIpPath) && new File(spiderIpPath).exists()) {
            logSpiders = SpiderDetector.readPatterns(new File(spiderIpPath));
        } else {
            logSpiders = new HashSet<>();
        }

        // Read the log line by line. The output file is deliberately NOT opened yet:
        // opening a FileWriter truncates the file, so a failure while opening or reading
        // the log would otherwise wipe out the existing spider list.
        try (BufferedReader in = openLog(logFileLoc)) {
            String logLine;
            while ((logLine = in.readLine()) != null) {
                // Currently only check if robots.txt is present in our line
                if (logLine.contains("robots.txt")) {
                    // We got a robots.txt so we got a bot
                    String ip = extractIp(logLine);
                    if (ip != null) {
                        // The set keeps each address only once
                        logSpiders.add(ip);
                    }
                }
            }
        }

        // Last but not least add the IPs to our file
        try (Writer output = openOutput(spiderIpPath)) {
            for (String ip : logSpiders) {
                System.err.println("Adding new ip: " + ip);
                // Write each IP on a separate line
                output.write(ip + "\n");
            }
            output.flush();
        }
    }

    /**
     * Opens the log source for reading.
     *
     * @param logFileLoc path of the log file, or "-" for standard input
     * @return a buffered reader over the chosen source
     * @throws java.io.IOException if the named file cannot be opened
     */
    private static BufferedReader openLog(String logFileLoc) throws java.io.IOException {
        if ("-".equals(logFileLoc)) {
            return new BufferedReader(new InputStreamReader(System.in));
        }
        return new BufferedReader(new FileReader(logFileLoc));
    }

    /**
     * Opens the destination for the spider list. Opening a file destination truncates it.
     *
     * @param spiderIpPath path of the spider file, or "-" for standard output
     * @return a buffered writer over the chosen destination
     * @throws java.io.IOException if the named file cannot be opened for writing
     */
    private static Writer openOutput(String spiderIpPath) throws java.io.IOException {
        if ("-".equals(spiderIpPath)) {
            return new BufferedWriter(new OutputStreamWriter(System.out));
        }
        return new BufferedWriter(new FileWriter(new File(spiderIpPath)));
    }

    /**
     * Extracts the client address from a log line: the trimmed text preceding the first '-'.
     *
     * @param logLine one line of the log
     * @return the address, or {@code null} when the line has no '-' or nothing precedes it
     */
    private static String extractIp(String logLine) {
        int dash = logLine.indexOf('-');
        if (dash < 0) {
            // No separator: substring(0, -1) would throw, so skip the malformed line instead
            return null;
        }
        String ip = logLine.substring(0, dash).trim();
        return ip.isEmpty() ? null : ip;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy