
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.statistics;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.lang3.StringUtils;
import org.dspace.core.Context;
import org.dspace.core.LogHelper;
import org.dspace.core.Utils;
import org.dspace.discovery.DiscoverQuery;
import org.dspace.discovery.SearchServiceException;
import org.dspace.discovery.SearchUtils;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
/**
* This class performs all the actual analysis of a given set of DSpace log
* files. Most input can be configured; use the -help flag for a full list
* of usage information.
*
* The output of this class is a plain text "aggregation" file which can
* then be used for display purposes by the related ReportGenerator class.
*
* @author Richard Jones
*/
public class LogAnalyser {
// set up our class globals
// FIXME: there are so many of these perhaps they should exist in a static
// object of their own
/////////////////
// aggregators
/////////////////
/**
* aggregator for all actions performed in the system
*/
private static Map<String, Integer> actionAggregator;
/**
* aggregator for all searches performed
*/
private static Map<String, Integer> searchAggregator;
/**
* aggregator for user logins
*/
private static Map<String, Integer> userAggregator;
/**
* aggregator for item views
*/
private static Map<String, Integer> itemAggregator;
/**
* aggregator for current archive state statistics
*/
private static Map<String, Integer> archiveStats;
/**
* warning counter
*/
private static int warnCount = 0;
/**
* exception counter
*/
private static int excCount = 0;
/**
* log line counter
*/
private static int lineCount = 0;
//////////////////
// config data
//////////////////
/**
* list of actions to be included in the general summary
*/
private static List<String> generalSummary;
/**
* list of words not to be aggregated
*/
private static List<String> excludeWords;
/**
* list of search types to be ignored, such as "author:"
*/
private static List<String> excludeTypes;
/**
* list of characters to be excluded
*/
private static List<String> excludeChars;
/**
* list of item types to be reported on in the current state
*/
private static List<String> itemTypes;
/**
* bottom limit to output for search word analysis
*/
private static int searchFloor;
/**
* bottom limit to output for item view analysis
*/
private static int itemFloor;
/**
* number of items from most popular to be looked up in the database
*/
private static int itemLookup;
/**
* mode to use for user email display
*/
private static String userEmail;
/**
* URL of the service being analysed
*/
private static String url;
/**
* Name of the service being analysed
*/
private static String name;
/**
* Hostname of the service being analysed
*/
private static String hostName;
/**
* the average number of views per item
*/
private static long views = 0;
///////////////////////
// regular expressions
///////////////////////
/**
* Exclude characters regular expression pattern
*/
private static Pattern excludeCharRX = null;
/**
* handle indicator string regular expression pattern
*/
private static Pattern handleRX = null;
/**
* item id indicator string regular expression pattern
*/
private static Pattern itemRX = null;
/**
* query string indicator regular expression pattern
*/
private static Pattern queryRX = null;
/**
* collection indicator regular expression pattern
*/
private static Pattern collectionRX = null;
/**
* community indicator regular expression pattern
*/
private static Pattern communityRX = null;
/**
* results indicator regular expression pattern
*/
private static Pattern resultsRX = null;
/**
* single character regular expression pattern
*/
private static Pattern singleRX = null;
/**
* a pattern to match a valid version 1.3 log file line
*/
private static Pattern valid13 = null;
/**
* basic log line
*/
private static Pattern validBase = null;
/**
* a pattern to match a valid version 1.4 log file line
*/
private static Pattern valid14 = null;
/**
* pattern to match valid log file names
*/
private static Pattern logRegex = null;
/**
* pattern to match commented out lines from the config file
*/
private static final Pattern COMMENT = Pattern.compile("^#");
/**
* pattern to match genuine lines from the config file
*/
private static final Pattern REAL = Pattern.compile("^(.+)=(.+)");
/**
* pattern to match all search types
*/
private static Pattern typeRX = null;
/**
* pattern to match all excluded words
*/
private static Pattern wordRX = null;
//////////////////////////
// Miscellaneous variables
//////////////////////////
/**
* process timing clock
*/
private static Instant startTime = null;
/////////////////////////
// command line options
////////////////////////
/**
* the log directory to be analysed
*/
private static String logDir;
/**
* the regex to describe the file name format
*/
private static String fileTemplate = "dspace\\.log.*";
private static final ConfigurationService configurationService =
DSpaceServicesFactory.getInstance().getConfigurationService();
/**
* the configuration file from which to configure the analyser
*/
private static String configFile = configurationService.getProperty("dspace.dir")
+ File.separator + "config" + File.separator + "dstat.cfg";
/**
* the output file to which to write aggregation data
*/
private static String outFile;
/**
* the starting date of the report
*/
private static LocalDate startDate = null;
/**
* the end date of the report
*/
private static LocalDate endDate = null;
/**
* the starting date of the report as obtained from the log files
*/
private static LocalDate logStartDate = null;
/**
* the end date of the report as obtained from the log files
*/
private static LocalDate logEndDate = null;
/**
* Default constructor
*/
private LogAnalyser() { }
/**
* main method to be run from command line. See usage information for
* details as to how to use the command line flags (-help)
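*
* An illustrative invocation via the DSpace script launcher (a sketch;
* the paths and dates shown are hypothetical):
* <pre>
* [dspace]/bin/dspace dsrun org.dspace.app.statistics.LogAnalyser \
*     -start 2024-01-01 -end 2024-01-31 -out /tmp/dstat.dat
* </pre>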
*
* @param argv the command line arguments given
* @throws Exception if error
* @throws SQLException if database error
*/
public static void main(String[] argv)
throws Exception, SQLException {
// first, start the processing clock
startTime = Instant.now();
// create context as super user
Context context = new Context();
context.turnOffAuthorisationSystem();
// set up our command line variables
String myLogDir = null;
String myFileTemplate = null;
String myConfigFile = null;
String myOutFile = null;
LocalDate myStartDate = null;
LocalDate myEndDate = null;
boolean myLookUp = false;
// Define command line options.
Options options = new Options();
Option option;
option = Option.builder().longOpt("log").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("file").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("cfg").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("out").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("help").build();
options.addOption(option);
option = Option.builder().longOpt("start").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("end").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("lookup").build();
options.addOption(option);
// Parse the command.
DefaultParser cmdParser = new DefaultParser();
CommandLine cmd = cmdParser.parse(options, argv);
// Analyze the command.
if (cmd.hasOption("help")) {
LogAnalyser.usage();
System.exit(0);
}
if (cmd.hasOption("log")) {
myLogDir = cmd.getOptionValue("log");
}
if (cmd.hasOption("file")) {
myFileTemplate = cmd.getOptionValue("file");
}
if (cmd.hasOption("cfg")) {
myConfigFile = cmd.getOptionValue("cfg");
}
if (cmd.hasOption("out")) {
myOutFile = cmd.getOptionValue("out");
}
if (cmd.hasOption("start")) {
myStartDate = parseDate(cmd.getOptionValue("start"));
}
if (cmd.hasOption("end")) {
myEndDate = parseDate(cmd.getOptionValue("end"));
}
myLookUp = cmd.hasOption("lookup");
// now call the method which actually processes the logs
processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile,
myStartDate, myEndDate, myLookUp);
}
/**
* using the pre-configuration information passed here, analyse the logs
* and produce the aggregation file
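*
* A minimal programmatic call (a sketch mirroring what main() does;
* passing null selects the configured default for each parameter, as
* described below):
* <pre>
* Context context = new Context();
* context.turnOffAuthorisationSystem();
* String summary = LogAnalyser.processLogs(context, null, null, null,
*                                          null, null, null, false);
* </pre>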
*
* @param context the DSpace context object this occurs under
* @param myLogDir the passed log directory. Uses default if null
* @param myFileTemplate the passed file name regex. Uses default if null
* @param myConfigFile the DStat config file. Uses default if null
* @param myOutFile the file to which to output aggregation data. Uses default if null
* @param myStartDate the desired start of the analysis. Starts from the beginning otherwise
* @param myEndDate the desired end of the analysis. Goes to the end otherwise
* @param myLookUp force a lookup of the database
* @return aggregate output
* @throws IOException if IO error
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static String processLogs(Context context, String myLogDir,
String myFileTemplate, String myConfigFile,
String myOutFile, LocalDate myStartDate,
LocalDate myEndDate, boolean myLookUp)
throws IOException, SQLException, SearchServiceException {
// FIXME: perhaps we should have all parameters and aggregators put
// together in a single aggregating object
// start (or restart) the processing clock
startTime = Instant.now();
//instantiate aggregators
actionAggregator = new HashMap<>();
searchAggregator = new HashMap<>();
userAggregator = new HashMap<>();
itemAggregator = new HashMap<>();
archiveStats = new HashMap<>();
//instantiate lists
generalSummary = new ArrayList<>();
excludeWords = new ArrayList<>();
excludeTypes = new ArrayList<>();
excludeChars = new ArrayList<>();
itemTypes = new ArrayList<>();
// set the parameters for this analysis
setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);
// prepare our standard file readers and buffered readers
FileReader fr = null;
BufferedReader br = null;
// read in the config information, throwing an error if we fail to open
// the given config file
readConfig(configFile);
// assemble the regular expressions for later use (requires the file
// template to build the regex to match it)
setRegex(fileTemplate);
// get the log files
File[] logFiles = getLogFiles(logDir);
// standard loop counter
int i = 0;
// for every log file do analysis
// FIXME: it is easy to implement not processing log files after the
// dates exceed the end boundary, but is there an easy way to do it
// for the start of the file? Note that we can assume that the contents
// of the log file are sequential, but can we assume the files are
// provided in a date sequence?
for (i = 0; i < logFiles.length; i++) {
// check to see if this file is a log file against the global regex
Matcher matchRegex = logRegex.matcher(logFiles[i].getName());
if (matchRegex.matches()) {
// if it is a log file, open it up and let's have a look at the
// contents.
try {
fr = new FileReader(logFiles[i].toString());
br = new BufferedReader(fr);
} catch (IOException e) {
System.out.println("Failed to read log file " + logFiles[i].toString());
System.exit(0);
}
// for each line in the file do the analysis
// FIXME: perhaps each section needs to be doled out to an
// analysing class to allow pluggability of other methods of
// analysis, and ease of code reading too - Pending further thought
String line = null;
while ((line = br.readLine()) != null) {
// get the log line object
LogLine logLine = getLogLine(line);
// if there are line segments get on with the analysis
if (logLine != null) {
// first find out if we are constraining by date and
// if so apply the restrictions
if ((startDate != null) && (!logLine.afterDate(startDate))) {
continue;
}
if ((endDate != null) && (!logLine.beforeDate(endDate))) {
break;
}
// count the number of lines parsed
lineCount++;
// if we are not constrained by date, register the date
// as the start/end date if it is the earliest/latest so far
// FIXME: this should probably have a method of its own
if (startDate == null) {
if (logStartDate != null) {
if (logLine.beforeDate(logStartDate)) {
logStartDate = logLine.getDate();
}
} else {
logStartDate = logLine.getDate();
}
}
if (endDate == null) {
if (logEndDate != null) {
if (logLine.afterDate(logEndDate)) {
logEndDate = logLine.getDate();
}
} else {
logEndDate = logLine.getDate();
}
}
// count the warnings
if (logLine.isLevel("WARN")) {
// FIXME: really, this ought to be some kind of level
// aggregator
warnCount++;
}
// count the exceptions
if (logLine.isLevel("ERROR")) {
excCount++;
}
if (null == logLine.getAction()) {
continue;
}
// is the action a search?
if (logLine.isAction("search")) {
// get back all the valid search words from the query
String[] words = analyseQuery(logLine.getParams());
// for each search word add to the aggregator or
// increment the aggregator's counter
for (String word : words) {
// FIXME: perhaps aggregators ought to be objects
// themselves
searchAggregator.put(word,
increment(searchAggregator, word));
}
}
// is the action a login, and are we counting user logins?
if (logLine.isAction("login") && !userEmail.equals("off")) {
userAggregator.put(logLine.getUser(), increment(userAggregator, logLine.getUser()));
}
// is the action an item view?
if (logLine.isAction("view_item")) {
String handle = logLine.getParams();
// strip the handle string
Matcher matchHandle = handleRX.matcher(handle);
handle = matchHandle.replaceAll("");
// strip the item id string
Matcher matchItem = itemRX.matcher(handle);
handle = matchItem.replaceAll("").trim();
// either add the handle to the aggregator or
// increment its counter
itemAggregator.put(handle, increment(itemAggregator, handle));
}
// log all the activity
actionAggregator.put(logLine.getAction(), increment(actionAggregator, logLine.getAction()));
}
}
// close the file reading buffers
br.close();
fr.close();
}
}
// now do the database lookup for the current archive statistics
// (note: the myLookUp flag is currently unused here)
// FIXME: this is a kind of separate section. Would it be worth building
// the summary string separately and then inserting it into the real
// summary later? Especially if we make the archive analysis more complex
archiveStats.put("All Items", getNumItems(context));
for (i = 0; i < itemTypes.size(); i++) {
archiveStats.put(itemTypes.get(i), getNumItems(context, itemTypes.get(i)));
}
// now do the host name and url lookup
hostName = Utils.getHostName(configurationService.getProperty("dspace.ui.url"));
name = configurationService.getProperty("dspace.name").trim();
url = configurationService.getProperty("dspace.ui.url").trim();
if ((url != null) && (!url.endsWith("/"))) {
url = url + "/";
}
// do the average views analysis
if ((archiveStats.get("All Items")) != 0) {
// FIXME: this is dependent on there being a query on the db, which
// there might not always be if it becomes configurable
double avg = Math.ceil(
(actionAggregator.get("view_item")).doubleValue() /
(archiveStats.get("All Items")).doubleValue());
views = Math.round(avg);
}
// finally, write the output
return createOutput();
}
/**
* set the passed parameters up as global class variables. This has to
* be done in a separate method because the API permits running from
* the command line with args or calling the processLogs method statically
* from elsewhere
*
* @param myLogDir the log file directory to be analysed
* @param myFileTemplate regex for log file names
* @param myConfigFile config file to use for dstat
* @param myOutFile file to write the aggregation into
* @param myStartDate requested log reporting start date
* @param myEndDate requested log reporting end date
* @param myLookUp requested look up force flag
*/
public static void setParameters(String myLogDir, String myFileTemplate,
String myConfigFile, String myOutFile,
LocalDate myStartDate, LocalDate myEndDate,
boolean myLookUp) {
if (myLogDir != null) {
logDir = myLogDir;
} else {
logDir = configurationService.getProperty("log.report.dir");
}
if (myFileTemplate != null) {
fileTemplate = myFileTemplate;
}
if (myConfigFile != null) {
configFile = myConfigFile;
}
if (myStartDate != null) {
startDate = myStartDate;
}
if (myEndDate != null) {
endDate = myEndDate;
}
if (myOutFile != null) {
outFile = myOutFile;
} else {
outFile = configurationService.getProperty("log.report.dir") + File.separator + "dstat.dat";
}
}
/**
* generate the analyser's output to the specified out file
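*
* An illustrative excerpt of the aggregation format written here (all
* values are hypothetical):
* <pre>
* log_lines=1024
* warnings=3
* exceptions=1
* start_date=01/01/2024
* end_date=31/01/2024
* action.view_item=256
* search_floor=2
* item_floor=5
* avg_item_views=4
* </pre>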
*
* @return output
*/
public static String createOutput() {
// start a string buffer to hold the final output
StringBuilder summary = new StringBuilder();
// define an iterator that will be used to go over the hashmap keys
Iterator<String> keys = null;
// output the number of lines parsed
summary.append("log_lines=").append(Integer.toString(lineCount)).append("\n");
// output the number of warnings encountered
summary.append("warnings=").append(Integer.toString(warnCount)).append("\n");
summary.append("exceptions=").append(Integer.toString(excCount)).append("\n");
// set the general summary config up in the aggregator file
for (int i = 0; i < generalSummary.size(); i++) {
summary.append("general_summary=").append(generalSummary.get(i)).append("\n");
}
// output the host name
summary.append("server_name=").append(hostName).append("\n");
// output the service name
summary.append("service_name=").append(name).append("\n");
// output the date information if necessary
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("dd'/'MM'/'yyyy");
if (startDate != null) {
summary.append("start_date=").append(formatter.format(startDate)).append("\n");
} else if (logStartDate != null) {
summary.append("start_date=").append(formatter.format(logStartDate)).append("\n");
}
if (endDate != null) {
summary.append("end_date=").append(formatter.format(endDate)).append("\n");
} else if (logEndDate != null) {
summary.append("end_date=").append(formatter.format(logEndDate)).append("\n");
}
// write out the archive stats
keys = archiveStats.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
summary.append("archive.").append(key).append("=").append(archiveStats.get(key)).append("\n");
}
// write out the action aggregation results
keys = actionAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
summary.append("action.").append(key).append("=").append(actionAggregator.get(key)).append("\n");
}
// depending on the config settings for reporting on emails output the
// login information
summary.append("user_email=").append(userEmail).append("\n");
int address = 1;
keys = userAggregator.keySet().iterator();
// for each email address either write out the address and the count
// or alias it with an "Address X" label, to keep the data confidential
// FIXME: the users reporting should also have a floor value
while (keys.hasNext()) {
String key = keys.next();
summary.append("user.");
if (userEmail.equals("on")) {
summary.append(key).append("=").append(userAggregator.get(key)).append("\n");
} else if (userEmail.equals("alias")) {
summary.append("Address ").append(Integer.toString(address++))
.append("=").append(userAggregator.get(key)).append("\n");
}
}
// FIXME: all values which have floors set should provide an "other"
// record which counts how many other things which didn't make it into
// the listing there are
// output the search word information
summary.append("search_floor=").append(searchFloor).append("\n");
keys = searchAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
if ((searchAggregator.get(key)) >= searchFloor) {
summary.append("search.").append(key).append("=")
.append(searchAggregator.get(key)).append("\n");
}
}
// FIXME: we should do a lot more with the search aggregator
// Possible feature list:
// - constrain by collection/community perhaps?
// - we should consider building our own aggregator class which can
// be full of rich data. Perhaps this and the Stats class should
// be the same thing.
// item viewing information
summary.append("item_floor=").append(itemFloor).append("\n");
summary.append("host_url=").append(url).append("\n");
summary.append("item_lookup=").append(itemLookup).append("\n");
// write out the item access information
keys = itemAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
if ((itemAggregator.get(key)) >= itemFloor) {
summary.append("item.").append(key).append("=")
.append(itemAggregator.get(key)).append("\n");
}
}
// output the average views per item
if (views > 0) {
summary.append("avg_item_views=").append(views).append("\n");
}
// insert the analysis processing time information
long timeInMillis = Instant.now().toEpochMilli() - startTime.toEpochMilli();
summary.append("analysis_process_time=")
.append(Long.toString(timeInMillis / 1000)).append("\n");
// finally write the string into the output file
try (BufferedWriter out = new BufferedWriter(new FileWriter(outFile))) {
out.write(summary.toString());
out.flush();
} catch (IOException e) {
System.out.println("Unable to write to output file " + outFile);
System.exit(0);
}
return summary.toString();
}
/**
* get an array of file objects representing the passed log directory
*
* @param logDir the log directory in which to pick up files
* @return an array of file objects representing the given logDir
*/
public static File[] getLogFiles(String logDir) {
// open the log files directory, read in the files, check that they
// match the passed regular expression then analyse the content
File logs = new File(logDir);
// if log dir is not a directory throw an error and exit
if (!logs.isDirectory()) {
System.out.println("Passed log directory is not a directory");
System.exit(0);
}
// get the files in the directory
return logs.listFiles();
}
/**
* set up the regular expressions to be used by this analyser. Mostly this
* exists to provide a degree of segregation and readability to the code
* and to ensure that you only need to set up the regular expressions to
* be used once
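*
* For example (illustrative), with exclude characters "?" and "&" the
* character-exclusion pattern built here is the regex [\?\&].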
*
* @param fileTemplate the regex to be used to identify dspace log files
*/
public static void setRegex(String fileTemplate) {
// build the exclude characters regular expression
StringBuilder charRegEx = new StringBuilder();
charRegEx.append("[");
for (int i = 0; i < excludeChars.size(); i++) {
charRegEx.append("\\").append(excludeChars.get(i));
}
charRegEx.append("]");
excludeCharRX = Pattern.compile(charRegEx.toString());
// regular expression to find handle indicators in strings
handleRX = Pattern.compile("handle=");
// regular expression to find item_id indicators in strings
itemRX = Pattern.compile(",item_id=.*$");
// regular expression to find query indicators in strings
queryRX = Pattern.compile("query=");
// regular expression to find collections in strings
collectionRX = Pattern.compile("collection_id=[0-9]*,");
// regular expression to find communities in strings
communityRX = Pattern.compile("community_id=[0-9]*,");
// regular expression to find search result sets
resultsRX = Pattern.compile(",results=(.*)");
// regular expressions to find single characters anywhere in the string
singleRX = Pattern.compile("( . |^. | .$)");
// set up the standard log file line regular expression
String logLineBase = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ (.*)";
//date time LEVEL class @ whatever
String logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+)" +
":[^:]+:([^:]+):(.*)";
String logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+)" +
":[^:]+:[^:]+:([^:]+):(.*)";
valid13 = Pattern.compile(logLine13);
valid14 = Pattern.compile(logLine14);
validBase = Pattern.compile(logLineBase);
// set up the pattern for validating log file names
logRegex = Pattern.compile(fileTemplate);
// set up the pattern for matching any of the query types
StringBuilder typeRXString = new StringBuilder();
typeRXString.append("(");
for (int i = 0; i < excludeTypes.size(); i++) {
if (i > 0) {
typeRXString.append("|");
}
typeRXString.append(excludeTypes.get(i));
}
typeRXString.append(")");
typeRX = Pattern.compile(typeRXString.toString());
// set up the pattern for matching any of the words to exclude
StringBuilder wordRXString = new StringBuilder();
wordRXString.append("(");
for (int i = 0; i < excludeWords.size(); i++) {
if (i > 0) {
wordRXString.append("|");
}
wordRXString.append(" ").append(excludeWords.get(i)).append(" ");
wordRXString.append("|");
wordRXString.append("^").append(excludeWords.get(i)).append(" ");
wordRXString.append("|");
wordRXString.append(" ").append(excludeWords.get(i)).append("$");
}
wordRXString.append(")");
wordRX = Pattern.compile(wordRXString.toString());
}
/**
* get the current config file name
*
* @return The name of the config file
*/
public static String getConfigFile() {
return configFile;
}
/**
* Read in the current config file and populate the class globals.
*
* @throws IOException if IO error
*/
public static void readConfig() throws IOException {
readConfig(configFile);
}
/**
* Read in the given config file and populate the class globals.
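*
* An illustrative excerpt of a dstat.cfg file using the keys this parser
* recognises (the values shown are hypothetical):
* <pre>
* # comment lines start with a hash
* general.summary=view_item
* exclude.word=the
* exclude.type=author:
* exclude.character=?
* item.type=article
* item.floor=5
* search.floor=2
* item.lookup=10
* user.email=off
* </pre>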
*
* @param configFile the config file to read in
* @throws IOException if IO error
*/
public static void readConfig(String configFile) throws IOException {
//instantiate aggregators
actionAggregator = new HashMap<>();
searchAggregator = new HashMap<>();
userAggregator = new HashMap<>();
itemAggregator = new HashMap<>();
archiveStats = new HashMap<>();
//instantiate lists
generalSummary = new ArrayList<>();
excludeWords = new ArrayList<>();
excludeTypes = new ArrayList<>();
excludeChars = new ArrayList<>();
itemTypes = new ArrayList<>();
// prepare our standard file readers and buffered readers
FileReader fr = null;
BufferedReader br = null;
String record = null;
try {
fr = new FileReader(configFile);
br = new BufferedReader(fr);
} catch (IOException e) {
System.out.println("Failed to read config file: " + configFile);
System.exit(0);
}
// read in the config file and set up our instance variables
while ((record = br.readLine()) != null) {
// check to see what kind of line we have (find() is used for the
// comment pattern because matches() would only match a line that is
// exactly "#")
Matcher matchComment = COMMENT.matcher(record);
Matcher matchReal = REAL.matcher(record);
// if the line is not a comment and is real, read it in
if (!matchComment.find() && matchReal.matches()) {
// lift the values out of the matcher's result groups
String key = matchReal.group(1).trim();
String value = matchReal.group(2).trim();
// read the config values into our instance variables (see
// documentation for more info on config params)
if (key.equals("general.summary")) {
actionAggregator.put(value, 0);
generalSummary.add(value);
}
if (key.equals("exclude.word")) {
excludeWords.add(value);
}
if (key.equals("exclude.type")) {
excludeTypes.add(value);
}
if (key.equals("exclude.character")) {
excludeChars.add(value);
}
if (key.equals("item.type")) {
itemTypes.add(value);
}
if (key.equals("item.floor")) {
itemFloor = Integer.parseInt(value);
}
if (key.equals("search.floor")) {
searchFloor = Integer.parseInt(value);
}
if (key.equals("item.lookup")) {
itemLookup = Integer.parseInt(value);
}
if (key.equals("user.email")) {
userEmail = value;
}
}
}
// close the inputs
br.close();
fr.close();
}
/**
* increment the value of the given map at the given key by one.
*
* @param map the map whose value we want to increase
* @param key the key of the map whose value to increase
* @return an integer object containing the new value
*/
public static Integer increment(Map<String, Integer> map, String key) {
Integer newValue = null;
if (map.containsKey(key)) {
// FIXME: this seems like a ridiculous way to add Integers
newValue = (map.get(key)) + 1;
} else {
newValue = 1;
}
return newValue;
}
/**
* Take the standard date string requested at the command line and convert
* it into a LocalDate object. Prints an error and exits if the date does
* not parse
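*
* For example, "2024-01-31" parses successfully; "31/01/2024" does not,
* since the expected format is ISO local date (YYYY-MM-DD).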
*
* @param date the string representation of the date
* @return a date object containing the date, with the time set to
* 00:00:00
*/
public static LocalDate parseDate(String date) {
DateTimeFormatter formatter = DateTimeFormatter.ISO_LOCAL_DATE;
LocalDate parsedDate = null;
try {
parsedDate = LocalDate.parse(date, formatter);
} catch (DateTimeParseException e) {
System.out.println("The date is not in the correct format");
System.exit(0);
}
return parsedDate;
}
/**
* Take the date object and convert it into a string of the form YYYY-MM-DD
*
* @param date the date to be converted
* @return A string of the form YYYY-MM-DD
*/
public static String unParseDate(LocalDate date) {
return DateTimeFormatter.ISO_LOCAL_DATE.format(date);
}
/**
* Take a search query string and pull out all of the meaningful information
* from it, giving the results in the form of a String array, a single word
* to each element
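*
* For example (illustrative, assuming none of the words involved are
* configured for exclusion), the parameter string
* "query=Dublin Core metadata,results=42" reduces to the array
* {"dublin", "core", "metadata"}.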
*
* @param query the search query to be analysed
* @return the string array containing meaningful search terms
*/
public static String[] analyseQuery(String query) {
// register our standard loop counter
int i = 0;
// make the query string totally lower case, to ensure we don't miss out
// on matches due to capitalisation
query = query.toLowerCase();
// now perform successive find and replace operations using pre-defined
// global regular expressions
Matcher matchQuery = queryRX.matcher(query);
query = matchQuery.replaceAll(" ");
Matcher matchCollection = collectionRX.matcher(query);
query = matchCollection.replaceAll(" ");
Matcher matchCommunity = communityRX.matcher(query);
query = matchCommunity.replaceAll(" ");
Matcher matchResults = resultsRX.matcher(query);
query = matchResults.replaceAll(" ");
Matcher matchTypes = typeRX.matcher(query);
query = matchTypes.replaceAll(" ");
Matcher matchChars = excludeCharRX.matcher(query);
query = matchChars.replaceAll(" ");
Matcher matchWords = wordRX.matcher(query);
query = matchWords.replaceAll(" ");
Matcher single = singleRX.matcher(query);
query = single.replaceAll(" ");
// split the remaining string by whitespace, trim and stuff into an
// array to be returned
StringTokenizer st = new StringTokenizer(query);
String[] words = new String[st.countTokens()];
for (i = 0; i < words.length; i++) {
words[i] = st.nextToken().trim();
}
// FIXME: some single characters are still slipping through the net;
// why? and how do we fix it?
return words;
}
/**
* split the given line into its relevant segments if applicable (i.e. the
* line matches the required regular expression).
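*
* Illustrative (hypothetical) lines in the two supported formats; a
* 1.4-style line is recognised by the presence of ":ip_addr":
* <pre>
* 2024-01-31 12:00:00,000 INFO  org.dspace.SomeClass @ anonymous:session_id=ABC123:view_item:handle=123456789/1
* 2024-01-31 12:00:00,000 INFO  org.dspace.SomeClass @ anonymous:session_id=ABC123:ip_addr=127.0.0.1:view_item:handle=123456789/1
* </pre>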
*
* @param line the line to be segmented
* @return a Log Line object for the given line
*/
public static LogLine getLogLine(String line) {
// FIXME: consider moving this code into the LogLine class. To do this
// we need to much more carefully define the structure and behaviour
// of the LogLine class
Matcher match;
if (line.indexOf(":ip_addr") > 0) {
match = valid14.matcher(line);
} else {
match = valid13.matcher(line);
}
if (match.matches()) {
// set up a new log line object
LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
LogHelper.unescapeLogField(match.group(2)).trim(),
LogHelper.unescapeLogField(match.group(3)).trim(),
LogHelper.unescapeLogField(match.group(4)).trim(),
LogHelper.unescapeLogField(match.group(5)).trim());
return logLine;
} else {
match = validBase.matcher(line);
if (match.matches()) {
LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
LogHelper.unescapeLogField(match.group(2)).trim(),
null,
null,
null
);
return logLine;
}
return null;
}
}
/**
* get the number of items in the archive which were accessioned between
* the provided start and end dates, with the given value for the DC field
* 'type' (unqualified)
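*
* For a start date of 2024-01-01 and no end date, the assembled
* accession-date filter query would be (illustrative):
* <pre>
* dc.date.accessioned_dt:[2024-01-01 TO *]
* </pre>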
*
* @param context the DSpace context for the action
* @param type value for DC field 'type' (unqualified)
* @return an integer containing the relevant count
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static Integer getNumItems(Context context, String type)
throws SQLException, SearchServiceException {
// FIXME: this method is clearly not optimised
// FIXME: we don't yet collect total statistics, such as number of items
// withdrawn, number in process of submission etc. We should probably do
// that
DiscoverQuery discoverQuery = new DiscoverQuery();
if (StringUtils.isNotBlank(type)) {
discoverQuery.addFilterQueries("dc.type=" + type + "*");
}
StringBuilder accessionedQuery = new StringBuilder();
accessionedQuery.append("dc.date.accessioned_dt:[");
if (startDate != null) {
accessionedQuery.append(unParseDate(startDate));
} else {
accessionedQuery.append("*");
}
accessionedQuery.append(" TO ");
if (endDate != null) {
accessionedQuery.append(unParseDate(endDate));
} else {
accessionedQuery.append("*");
}
accessionedQuery.append("]");
discoverQuery.addFilterQueries(accessionedQuery.toString());
discoverQuery.addFilterQueries("withdrawn: false");
discoverQuery.addFilterQueries("archived: true");
return (int) SearchUtils.getSearchService().search(context, discoverQuery).getTotalSearchResults();
}
/**
* get the total number of items in the archive at time of execution,
* ignoring all other constraints
*
* @param context the DSpace context the action is being performed in
* @return an Integer containing the number of items in the
* archive
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static Integer getNumItems(Context context)
throws SQLException, SearchServiceException {
return getNumItems(context, null);
}
/**
* print out the usage information for this class to the standard out
*/
public static void usage() {
String usage = "Usage Information:\n" +
"LogAnalyser [options [parameters]]\n" +
"-log [log directory]\n" +
"\tOptional\n" +
"\tSpecify a directory containing log files\n" +
"\tDefault uses [dspace.dir]/log from dspace.cfg\n" +
"-file [file name regex]\n" +
"\tOptional\n" +
"\tSpecify a regular expression as the file name template.\n" +
"\tCurrently this needs to be correctly escaped for Java string handling (FIXME)\n" +
"\tDefault uses dspace.log*\n" +
"-cfg [config file path]\n" +
"\tOptional\n" +
"\tSpecify a config file to be used\n" +
"\tDefault uses dstat.cfg in dspace config directory\n" +
"-out [output file path]\n" +
"\tOptional\n" +
"\tSpecify an output file to write results into\n" +
"\tDefault uses dstat.dat in dspace log directory\n" +
"-start [YYYY-MM-DD]\n" +
"\tOptional\n" +
"\tSpecify the start date of the analysis\n" +
"\tIf a start date is specified then no attempt to gather \n" +
"\tcurrent database statistics will be made unless -lookup is\n" +
"\talso passed\n" +
"\tDefault is to start from the earliest date records exist for\n" +
"-end [YYYY-MM-DD]\n" +
"\tOptional\n" +
"\tSpecify the end date of the analysis\n" +
"\tIf an end date is specified then no attempt to gather \n" +
"\tcurrent database statistics will be made unless -lookup is\n" +
"\talso passed\n" +
"\tDefault is to work up to the last date records exist for\n" +
"-lookup\n" +
"\tOptional\n" +
"\tForce a lookup of the current database statistics\n" +
"\tOnly needs to be used if date constraints are also in place\n" +
"-help\n" +
"\tdisplay this usage information\n";
System.out.println(usage);
}
}