/**
 * The contents of this file are subject to the license and copyright
 * detailed in the LICENSE and NOTICE files at the root of the source
 * tree and available online at
 *
 * http://www.dspace.org/license/
 */
package org.dspace.statistics.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;

/**
 * Command line utility to create a file of spider addresses from an Apache
 * log file.
 *
 * @author Mark Diggory (mdiggory at atmire.com)
 * @author kevinvandevelde at atmire.com
 * @author ben at atmire.com
 */
public class ApacheLogRobotsProcessor {

    /**
     * Private constructor: this class is a command line utility and is not
     * meant to be instantiated.
     */
    private ApacheLogRobotsProcessor() { }
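
    /*
     * A typical invocation might look like the following (illustrative paths,
     * classpath setup omitted); both options fall back to standard input /
     * standard output when omitted or given as "-":
     *
     *   java org.dspace.statistics.util.ApacheLogRobotsProcessor \
     *       -l /var/log/apache2/access.log -s spiders.txt
     */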

    /**
     * Creates a file of spider IP addresses from an Apache log file by
     * collecting the addresses of clients that requested robots.txt.
     *
     * @param args the command line arguments given
     * @throws Exception if error
     */
    public static void main(String[] args) throws Exception {
        // Define the command line options and parse the arguments
        CommandLineParser parser = new DefaultParser();

        Options options = new Options();
        options.addOption("l", "logfile", true, "type: Input log file");
        options.addOption("s", "spiderfile", true, "type: Spider IP file");

        CommandLine line = parser.parse(options, args);

        // Log source
        String logFileLoc;
        if (line.hasOption("l")) {
            logFileLoc = line.getOptionValue("l");
        } else {
            logFileLoc = "-";
        }

        // Spider IP list
        String spiderIpPath;
        if (line.hasOption("s")) {
            spiderIpPath = line.getOptionValue("s");
        } else {
            spiderIpPath = "-";
        }

        // Collect IP addresses, starting from any already recorded in the spider file
        Set<String> logSpiders;
        Writer output;

        if ("-".equals(spiderIpPath)) {
            logSpiders = new HashSet<>();
            output = new BufferedWriter(new OutputStreamWriter(System.out));
        } else {
            File spiderIpFile = new File(spiderIpPath);

            if (spiderIpFile.exists()) {
                logSpiders = SpiderDetector.readPatterns(spiderIpFile);
            } else {
                logSpiders = new HashSet<>();
            }
            output = new BufferedWriter(new FileWriter(spiderIpFile));
        }

        // Read the log file (or standard input) line by line
        BufferedReader in;
        if ("-".equals(logFileLoc)) {
            in = new BufferedReader(new InputStreamReader(System.in));
        } else {
            in = new BufferedReader(new FileReader(logFileLoc));
        }

        String logLine;
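        // The parsing below assumes Apache common/combined log format, where each
        // line starts with the client IP address followed by " - ", for example
        // (illustrative address only):
        //   192.0.2.1 - - [10/Oct/2000:13:55:36 -0700] "GET /robots.txt HTTP/1.0" 200 2326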
        while ((logLine = in.readLine()) != null) {
            // A request for robots.txt is taken to indicate a robot
            if (logLine.contains("robots.txt")) {
                // The client IP address is everything before the first "-" separator
                String ip = logLine.substring(0, logLine.indexOf('-')).trim();
                // The Set ensures each IP address is recorded only once
                logSpiders.add(ip);
            }
        }
        in.close();

        // Finally, write the collected IP addresses to the output, one per line
        for (String ip : logSpiders) {
            System.err.println("Adding IP: " + ip);
            output.write(ip + "\n");
        }

        output.flush();
        output.close();
    }
}



