
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.statistics;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.time.Instant;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.lang3.StringUtils;
import org.dspace.core.Context;
import org.dspace.core.LogHelper;
import org.dspace.core.Utils;
import org.dspace.discovery.DiscoverQuery;
import org.dspace.discovery.SearchServiceException;
import org.dspace.discovery.SearchUtils;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
/**
* This class performs all the actual analysis of a given set of DSpace log
* files. Most input can be configured; use the -help flag for a full list
* of usage information.
*
* The output of this class is a plain text "aggregation" file which can
* then be used for display purposes by the related ReportGenerator class.
*
* @author Richard Jones
*/
public class LogAnalyser {
// set up our class globals
// FIXME: there are so many of these perhaps they should exist in a static
// object of their own
/////////////////
// aggregators
/////////////////
/**
* aggregator for all actions performed in the system
*/
private static Map<String, Integer> actionAggregator;
/**
* aggregator for all searches performed
*/
private static Map<String, Integer> searchAggregator;
/**
* aggregator for user logins
*/
private static Map<String, Integer> userAggregator;
/**
* aggregator for item views
*/
private static Map<String, Integer> itemAggregator;
/**
* aggregator for current archive state statistics
*/
private static Map<String, Integer> archiveStats;
/**
* warning counter
*/
private static int warnCount = 0;
/**
* exception counter
*/
private static int excCount = 0;
/**
* log line counter
*/
private static int lineCount = 0;
//////////////////
// config data
//////////////////
/**
* list of actions to be included in the general summary
*/
private static List<String> generalSummary;
/**
* list of words not to be aggregated
*/
private static List<String> excludeWords;
/**
* list of search types to be ignored, such as "author:"
*/
private static List<String> excludeTypes;
/**
* list of characters to be excluded
*/
private static List<String> excludeChars;
/**
* list of item types to be reported on in the current state
*/
private static List<String> itemTypes;
/**
* bottom limit to output for search word analysis
*/
private static int searchFloor;
/**
* bottom limit to output for item view analysis
*/
private static int itemFloor;
/**
* number of items from most popular to be looked up in the database
*/
private static int itemLookup;
/**
* mode to use for user email display
*/
private static String userEmail;
/**
* URL of the service being analysed
*/
private static String url;
/**
* Name of the service being analysed
*/
private static String name;
/**
* Hostname of the service being analysed
*/
private static String hostName;
/**
* the average number of views per item
*/
private static long views = 0;
///////////////////////
// regular expressions
///////////////////////
/**
* Exclude characters regular expression pattern
*/
private static Pattern excludeCharRX = null;
/**
* handle indicator string regular expression pattern
*/
private static Pattern handleRX = null;
/**
* item id indicator string regular expression pattern
*/
private static Pattern itemRX = null;
/**
* query string indicator regular expression pattern
*/
private static Pattern queryRX = null;
/**
* collection indicator regular expression pattern
*/
private static Pattern collectionRX = null;
/**
* community indicator regular expression pattern
*/
private static Pattern communityRX = null;
/**
* results indicator regular expression pattern
*/
private static Pattern resultsRX = null;
/**
* single character regular expression pattern
*/
private static Pattern singleRX = null;
/**
* a pattern to match a valid version 1.3 log file line
*/
private static Pattern valid13 = null;
/**
* basic log line
*/
private static Pattern validBase = null;
/**
* a pattern to match a valid version 1.4 log file line
*/
private static Pattern valid14 = null;
/**
* pattern to match valid log file names
*/
private static Pattern logRegex = null;
/**
* pattern to match commented out lines from the config file
*/
private static final Pattern COMMENT = Pattern.compile("^#");
/**
* pattern to match genuine lines from the config file
*/
private static final Pattern REAL = Pattern.compile("^(.+)=(.+)");
/**
* pattern to match all search types
*/
private static Pattern typeRX = null;
/**
* pattern to match all excluded words
*/
private static Pattern wordRX = null;
//////////////////////////
// Miscellaneous variables
//////////////////////////
/**
* process timing clock
*/
private static Instant startTime = null;
/////////////////////////
// command line options
////////////////////////
/**
* the log directory to be analysed
*/
private static String logDir;
/**
* the regex to describe the file name format
*/
private static String fileTemplate = "dspace\\.log.*";
private static final ConfigurationService configurationService =
DSpaceServicesFactory.getInstance().getConfigurationService();
/**
* the configuration file from which to configure the analyser
*/
private static String configFile = configurationService.getProperty("dspace.dir")
+ File.separator + "config" + File.separator + "dstat.cfg";
/**
* the output file to which to write aggregation data
*/
private static String outFile;
/**
* the starting date of the report
*/
private static LocalDate startDate = null;
/**
* the end date of the report
*/
private static LocalDate endDate = null;
/**
* the starting date of the report as obtained from the log files
*/
private static LocalDate logStartDate = null;
/**
* the end date of the report as obtained from the log files
*/
private static LocalDate logEndDate = null;
/**
* Default constructor
*/
private LogAnalyser() { }
/**
* main method to be run from command line. See usage information for
* details as to how to use the command line flags (-help)
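*
* An illustrative invocation via the DSpace script launcher (a sketch;
* the paths and dates shown are hypothetical):
* <pre>
* [dspace]/bin/dspace dsrun org.dspace.app.statistics.LogAnalyser \
*     -start 2024-01-01 -end 2024-01-31 -out /tmp/dstat.dat
* </pre>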
*
* @param argv the command line arguments given
* @throws Exception if error
* @throws SQLException if database error
*/
public static void main(String[] argv)
throws Exception, SQLException {
// first, start the processing clock
startTime = Instant.now();
// create context as super user
Context context = new Context();
context.turnOffAuthorisationSystem();
// set up our command line variables
String myLogDir = null;
String myFileTemplate = null;
String myConfigFile = null;
String myOutFile = null;
LocalDate myStartDate = null;
LocalDate myEndDate = null;
boolean myLookUp = false;
// Define command line options.
Options options = new Options();
Option option;
option = Option.builder().longOpt("log").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("file").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("cfg").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("out").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("help").build();
options.addOption(option);
option = Option.builder().longOpt("start").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("end").hasArg().build();
options.addOption(option);
option = Option.builder().longOpt("lookup").build();
options.addOption(option);
// Parse the command.
DefaultParser cmdParser = new DefaultParser();
CommandLine cmd = cmdParser.parse(options, argv);
// Analyze the command.
if (cmd.hasOption("help")) {
LogAnalyser.usage();
System.exit(0);
}
if (cmd.hasOption("log")) {
myLogDir = cmd.getOptionValue("log");
}
if (cmd.hasOption("file")) {
myFileTemplate = cmd.getOptionValue("file");
}
if (cmd.hasOption("cfg")) {
myConfigFile = cmd.getOptionValue("cfg");
}
if (cmd.hasOption("out")) {
myOutFile = cmd.getOptionValue("out");
}
if (cmd.hasOption("start")) {
myStartDate = parseDate(cmd.getOptionValue("start"));
}
if (cmd.hasOption("end")) {
myEndDate = parseDate(cmd.getOptionValue("end"));
}
myLookUp = cmd.hasOption("lookup");
// now call the method which actually processes the logs
processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile,
myStartDate, myEndDate, myLookUp);
}
/**
* using the pre-configuration information passed here, analyse the logs
* and produce the aggregation file
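*
* A minimal programmatic call (a sketch mirroring what main() does;
* passing null selects the configured default for each parameter, as
* described below):
* <pre>
* Context context = new Context();
* context.turnOffAuthorisationSystem();
* String summary = LogAnalyser.processLogs(context, null, null, null,
*                                          null, null, null, false);
* </pre>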
*
* @param context the DSpace context object this occurs under
* @param myLogDir the passed log directory. Uses default if null
* @param myFileTemplate the passed file name regex. Uses default if null
* @param myConfigFile the DStat config file. Uses default if null
* @param myOutFile the file to which to output aggregation data. Uses default if null
* @param myStartDate the desired start of the analysis. Starts from the beginning otherwise
* @param myEndDate the desired end of the analysis. Goes to the end otherwise
* @param myLookUp force a lookup of the database
* @return aggregate output
* @throws IOException if IO error
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static String processLogs(Context context, String myLogDir,
String myFileTemplate, String myConfigFile,
String myOutFile, LocalDate myStartDate,
LocalDate myEndDate, boolean myLookUp)
throws IOException, SQLException, SearchServiceException {
// FIXME: perhaps we should have all parameters and aggregators put
// together in a single aggregating object
// start (or restart) the processing clock
startTime = Instant.now();
//instantiate aggregators
actionAggregator = new HashMap<>();
searchAggregator = new HashMap<>();
userAggregator = new HashMap<>();
itemAggregator = new HashMap<>();
archiveStats = new HashMap<>();
//instantiate lists
generalSummary = new ArrayList<>();
excludeWords = new ArrayList<>();
excludeTypes = new ArrayList<>();
excludeChars = new ArrayList<>();
itemTypes = new ArrayList<>();
// set the parameters for this analysis
setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile, myStartDate, myEndDate, myLookUp);
// prepare our standard file readers and buffered readers
FileReader fr = null;
BufferedReader br = null;
// read in the config information, throwing an error if we fail to open
// the given config file
readConfig(configFile);
// assemble the regular expressions for later use (requires the file
// template to build the regex to match it)
setRegex(fileTemplate);
// get the log files
File[] logFiles = getLogFiles(logDir);
// standard loop counter
int i = 0;
// for every log file do analysis
// FIXME: it is easy to implement not processing log files after the
// dates exceed the end boundary, but is there an easy way to do it
// for the start of the file? Note that we can assume that the contents
// of the log file are sequential, but can we assume the files are
// provided in a date sequence?
for (i = 0; i < logFiles.length; i++) {
// check to see if this file is a log file against the global regex
Matcher matchRegex = logRegex.matcher(logFiles[i].getName());
if (matchRegex.matches()) {
// if it is a log file, open it up and let's have a look at the
// contents.
try {
fr = new FileReader(logFiles[i].toString());
br = new BufferedReader(fr);
} catch (IOException e) {
System.out.println("Failed to read log file " + logFiles[i].toString());
System.exit(0);
}
// for each line in the file do the analysis
// FIXME: perhaps each section needs to be doled out to an
// analysing class to allow pluggability of other methods of
// analysis, and ease of code reading too - Pending further thought
String line = null;
while ((line = br.readLine()) != null) {
// get the log line object
LogLine logLine = getLogLine(line);
// if there are line segments get on with the analysis
if (logLine != null) {
// first find out if we are constraining by date and
// if so apply the restrictions
if ((startDate != null) && (!logLine.afterDate(startDate))) {
continue;
}
if ((endDate != null) && (!logLine.beforeDate(endDate))) {
break;
}
// count the number of lines parsed
lineCount++;
// if we are not constrained by date, register the date
// as the start/end date if it is the earliest/latest so far
// FIXME: this should probably have a method of its own
if (startDate == null) {
if (logStartDate != null) {
if (logLine.beforeDate(logStartDate)) {
logStartDate = logLine.getDate();
}
} else {
logStartDate = logLine.getDate();
}
}
if (endDate == null) {
if (logEndDate != null) {
if (logLine.afterDate(logEndDate)) {
logEndDate = logLine.getDate();
}
} else {
logEndDate = logLine.getDate();
}
}
// count the warnings
if (logLine.isLevel("WARN")) {
// FIXME: really, this ought to be some kind of level
// aggregator
warnCount++;
}
// count the exceptions
if (logLine.isLevel("ERROR")) {
excCount++;
}
if (null == logLine.getAction()) {
continue;
}
// is the action a search?
if (logLine.isAction("search")) {
// get back all the valid search words from the query
String[] words = analyseQuery(logLine.getParams());
// for each search word add to the aggregator or
// increment the aggregator's counter
for (String word : words) {
// FIXME: perhaps aggregators ought to be objects
// themselves
searchAggregator.put(word,
increment(searchAggregator, word));
}
}
// is the action a login, and are we counting user logins?
if (logLine.isAction("login") && !userEmail.equals("off")) {
userAggregator.put(logLine.getUser(), increment(userAggregator, logLine.getUser()));
}
// is the action an item view?
if (logLine.isAction("view_item")) {
String handle = logLine.getParams();
// strip the handle string
Matcher matchHandle = handleRX.matcher(handle);
handle = matchHandle.replaceAll("");
// strip the item id string
Matcher matchItem = itemRX.matcher(handle);
handle = matchItem.replaceAll("").trim();
// either add the handle to the aggregator or
// increment its counter
itemAggregator.put(handle, increment(itemAggregator, handle));
}
// log all the activity
actionAggregator.put(logLine.getAction(), increment(actionAggregator, logLine.getAction()));
}
}
// close the file reading buffers
br.close();
fr.close();
}
}
// now do the database lookup for the current archive statistics
// (note: the myLookUp flag is currently unused here)
// FIXME: this is a kind of separate section. Would it be worth building
// the summary string separately and then inserting it into the real
// summary later? Especially if we make the archive analysis more complex
archiveStats.put("All Items", getNumItems(context));
for (i = 0; i < itemTypes.size(); i++) {
archiveStats.put(itemTypes.get(i), getNumItems(context, itemTypes.get(i)));
}
// now do the host name and url lookup
hostName = Utils.getHostName(configurationService.getProperty("dspace.ui.url"));
name = configurationService.getProperty("dspace.name").trim();
url = configurationService.getProperty("dspace.ui.url").trim();
if ((url != null) && (!url.endsWith("/"))) {
url = url + "/";
}
// do the average views analysis
if ((archiveStats.get("All Items")) != 0) {
// FIXME: this is dependent on there being a query on the db, which
// there might not always be if it becomes configurable
double avg = Math.ceil(
(actionAggregator.get("view_item")).doubleValue() /
(archiveStats.get("All Items")).doubleValue());
views = Math.round(avg);
}
// finally, write the output
return createOutput();
}
/**
* set the passed parameters up as global class variables. This has to
* be done in a separate method because the API permits running from
* the command line with args or calling the processLogs method statically
* from elsewhere
*
* @param myLogDir the log file directory to be analysed
* @param myFileTemplate regex for log file names
* @param myConfigFile config file to use for dstat
* @param myOutFile file to write the aggregation into
* @param myStartDate requested log reporting start date
* @param myEndDate requested log reporting end date
* @param myLookUp requested look up force flag
*/
public static void setParameters(String myLogDir, String myFileTemplate,
String myConfigFile, String myOutFile,
LocalDate myStartDate, LocalDate myEndDate,
boolean myLookUp) {
if (myLogDir != null) {
logDir = myLogDir;
} else {
logDir = configurationService.getProperty("log.report.dir");
}
if (myFileTemplate != null) {
fileTemplate = myFileTemplate;
}
if (myConfigFile != null) {
configFile = myConfigFile;
}
if (myStartDate != null) {
startDate = myStartDate;
}
if (myEndDate != null) {
endDate = myEndDate;
}
if (myOutFile != null) {
outFile = myOutFile;
} else {
outFile = configurationService.getProperty("log.report.dir") + File.separator + "dstat.dat";
}
}
/**
* generate the analyser's output to the specified out file
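*
* An illustrative excerpt of the aggregation format written here (all
* values are hypothetical):
* <pre>
* log_lines=1024
* warnings=3
* exceptions=1
* start_date=01/01/2024
* end_date=31/01/2024
* action.view_item=256
* search_floor=2
* item_floor=5
* avg_item_views=4
* </pre>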
*
* @return output
*/
public static String createOutput() {
// start a string buffer to hold the final output
StringBuilder summary = new StringBuilder();
// define an iterator that will be used to go over the hashmap keys
Iterator<String> keys = null;
// output the number of lines parsed
summary.append("log_lines=").append(Integer.toString(lineCount)).append("\n");
// output the number of warnings encountered
summary.append("warnings=").append(Integer.toString(warnCount)).append("\n");
summary.append("exceptions=").append(Integer.toString(excCount)).append("\n");
// set the general summary config up in the aggregator file
for (int i = 0; i < generalSummary.size(); i++) {
summary.append("general_summary=").append(generalSummary.get(i)).append("\n");
}
// output the host name
summary.append("server_name=").append(hostName).append("\n");
// output the service name
summary.append("service_name=").append(name).append("\n");
// output the date information if necessary
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("dd'/'MM'/'yyyy");
if (startDate != null) {
summary.append("start_date=").append(formatter.format(startDate)).append("\n");
} else if (logStartDate != null) {
summary.append("start_date=").append(formatter.format(logStartDate)).append("\n");
}
if (endDate != null) {
summary.append("end_date=").append(formatter.format(endDate)).append("\n");
} else if (logEndDate != null) {
summary.append("end_date=").append(formatter.format(logEndDate)).append("\n");
}
// write out the archive stats
keys = archiveStats.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
summary.append("archive.").append(key).append("=").append(archiveStats.get(key)).append("\n");
}
// write out the action aggregation results
keys = actionAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
summary.append("action.").append(key).append("=").append(actionAggregator.get(key)).append("\n");
}
// depending on the config settings for reporting on emails output the
// login information
summary.append("user_email=").append(userEmail).append("\n");
int address = 1;
keys = userAggregator.keySet().iterator();
// for each email address either write out the address and the count
// or alias it with an "Address X" label, to keep the data confidential
// FIXME: the users reporting should also have a floor value
while (keys.hasNext()) {
String key = keys.next();
summary.append("user.");
if (userEmail.equals("on")) {
summary.append(key).append("=").append(userAggregator.get(key)).append("\n");
} else if (userEmail.equals("alias")) {
summary.append("Address ").append(Integer.toString(address++))
.append("=").append(userAggregator.get(key)).append("\n");
}
}
// FIXME: all values which have floors set should provide an "other"
// record which counts how many other things which didn't make it into
// the listing there are
// output the search word information
summary.append("search_floor=").append(searchFloor).append("\n");
keys = searchAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
if ((searchAggregator.get(key)) >= searchFloor) {
summary.append("search.").append(key).append("=")
.append(searchAggregator.get(key)).append("\n");
}
}
// FIXME: we should do a lot more with the search aggregator
// Possible feature list:
// - constrain by collection/community perhaps?
// - we should consider building our own aggregator class which can
// be full of rich data. Perhaps this and the Stats class should
// be the same thing.
// item viewing information
summary.append("item_floor=").append(itemFloor).append("\n");
summary.append("host_url=").append(url).append("\n");
summary.append("item_lookup=").append(itemLookup).append("\n");
// write out the item access information
keys = itemAggregator.keySet().iterator();
while (keys.hasNext()) {
String key = keys.next();
if ((itemAggregator.get(key)) >= itemFloor) {
summary.append("item.").append(key).append("=")
.append(itemAggregator.get(key)).append("\n");
}
}
// output the average views per item
if (views > 0) {
summary.append("avg_item_views=").append(views).append("\n");
}
// insert the analysis processing time information
long timeInMillis = Instant.now().toEpochMilli() - startTime.toEpochMilli();
summary.append("analysis_process_time=")
.append(Long.toString(timeInMillis / 1000)).append("\n");
// finally write the string into the output file
try (BufferedWriter out = new BufferedWriter(new FileWriter(outFile))) {
out.write(summary.toString());
out.flush();
} catch (IOException e) {
System.out.println("Unable to write to output file " + outFile);
System.exit(0);
}
return summary.toString();
}
/**
* get an array of file objects representing the passed log directory
*
* @param logDir the log directory in which to pick up files
* @return an array of file objects representing the given logDir
*/
public static File[] getLogFiles(String logDir) {
// open the log files directory, read in the files, check that they
// match the passed regular expression then analyse the content
File logs = new File(logDir);
// if log dir is not a directory throw an error and exit
if (!logs.isDirectory()) {
System.out.println("Passed log directory is not a directory");
System.exit(0);
}
// get the files in the directory
return logs.listFiles();
}
/**
* set up the regular expressions to be used by this analyser. Mostly this
* exists to provide a degree of segregation and readability to the code
* and to ensure that you only need to set up the regular expressions to
* be used once
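*
* For example (illustrative), with exclude characters "?" and "&" the
* character-exclusion pattern built here is the regex [\?\&].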
*
* @param fileTemplate the regex to be used to identify dspace log files
*/
public static void setRegex(String fileTemplate) {
// build the exclude characters regular expression
StringBuilder charRegEx = new StringBuilder();
charRegEx.append("[");
for (int i = 0; i < excludeChars.size(); i++) {
charRegEx.append("\\").append(excludeChars.get(i));
}
charRegEx.append("]");
excludeCharRX = Pattern.compile(charRegEx.toString());
// regular expression to find handle indicators in strings
handleRX = Pattern.compile("handle=");
// regular expression to find item_id indicators in strings
itemRX = Pattern.compile(",item_id=.*$");
// regular expression to find query indicators in strings
queryRX = Pattern.compile("query=");
// regular expression to find collections in strings
collectionRX = Pattern.compile("collection_id=[0-9]*,");
// regular expression to find communities in strings
communityRX = Pattern.compile("community_id=[0-9]*,");
// regular expression to find search result sets
resultsRX = Pattern.compile(",results=(.*)");
// regular expressions to find single characters anywhere in the string
singleRX = Pattern.compile("( . |^. | .$)");
// set up the standard log file line regular expression
String logLineBase = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ (.*)";
//date time LEVEL class @ whatever
String logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+)" +
":[^:]+:([^:]+):(.*)";
String logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+)" +
":[^:]+:[^:]+:([^:]+):(.*)";
valid13 = Pattern.compile(logLine13);
valid14 = Pattern.compile(logLine14);
validBase = Pattern.compile(logLineBase);
// set up the pattern for validating log file names
logRegex = Pattern.compile(fileTemplate);
// set up the pattern for matching any of the query types
StringBuilder typeRXString = new StringBuilder();
typeRXString.append("(");
for (int i = 0; i < excludeTypes.size(); i++) {
if (i > 0) {
typeRXString.append("|");
}
typeRXString.append(excludeTypes.get(i));
}
typeRXString.append(")");
typeRX = Pattern.compile(typeRXString.toString());
// set up the pattern for matching any of the words to exclude
StringBuilder wordRXString = new StringBuilder();
wordRXString.append("(");
for (int i = 0; i < excludeWords.size(); i++) {
if (i > 0) {
wordRXString.append("|");
}
wordRXString.append(" ").append(excludeWords.get(i)).append(" ");
wordRXString.append("|");
wordRXString.append("^").append(excludeWords.get(i)).append(" ");
wordRXString.append("|");
wordRXString.append(" ").append(excludeWords.get(i)).append("$");
}
wordRXString.append(")");
wordRX = Pattern.compile(wordRXString.toString());
}
/**
* get the current config file name
*
* @return The name of the config file
*/
public static String getConfigFile() {
return configFile;
}
/**
* Read in the current config file and populate the class globals.
*
* @throws IOException if IO error
*/
public static void readConfig() throws IOException {
readConfig(configFile);
}
/**
* Read in the given config file and populate the class globals.
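*
* An illustrative excerpt of a dstat.cfg file using the keys this parser
* recognises (the values shown are hypothetical):
* <pre>
* # comment lines start with a hash
* general.summary=view_item
* exclude.word=the
* exclude.type=author:
* exclude.character=?
* item.type=article
* item.floor=5
* search.floor=2
* item.lookup=10
* user.email=off
* </pre>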
*
* @param configFile the config file to read in
* @throws IOException if IO error
*/
public static void readConfig(String configFile) throws IOException {
//instantiate aggregators
actionAggregator = new HashMap<>();
searchAggregator = new HashMap<>();
userAggregator = new HashMap<>();
itemAggregator = new HashMap<>();
archiveStats = new HashMap<>();
//instantiate lists
generalSummary = new ArrayList<>();
excludeWords = new ArrayList<>();
excludeTypes = new ArrayList<>();
excludeChars = new ArrayList<>();
itemTypes = new ArrayList<>();
// prepare our standard file readers and buffered readers
FileReader fr = null;
BufferedReader br = null;
String record = null;
try {
fr = new FileReader(configFile);
br = new BufferedReader(fr);
} catch (IOException e) {
System.out.println("Failed to read config file: " + configFile);
System.exit(0);
}
// read in the config file and set up our instance variables
while ((record = br.readLine()) != null) {
// check to see what kind of line we have (find() is used for the
// comment pattern because matches() would only match a line that is
// exactly "#")
Matcher matchComment = COMMENT.matcher(record);
Matcher matchReal = REAL.matcher(record);
// if the line is not a comment and is real, read it in
if (!matchComment.find() && matchReal.matches()) {
// lift the values out of the matcher's result groups
String key = matchReal.group(1).trim();
String value = matchReal.group(2).trim();
// read the config values into our instance variables (see
// documentation for more info on config params)
if (key.equals("general.summary")) {
actionAggregator.put(value, 0);
generalSummary.add(value);
}
if (key.equals("exclude.word")) {
excludeWords.add(value);
}
if (key.equals("exclude.type")) {
excludeTypes.add(value);
}
if (key.equals("exclude.character")) {
excludeChars.add(value);
}
if (key.equals("item.type")) {
itemTypes.add(value);
}
if (key.equals("item.floor")) {
itemFloor = Integer.parseInt(value);
}
if (key.equals("search.floor")) {
searchFloor = Integer.parseInt(value);
}
if (key.equals("item.lookup")) {
itemLookup = Integer.parseInt(value);
}
if (key.equals("user.email")) {
userEmail = value;
}
}
}
// close the inputs
br.close();
fr.close();
}
/**
* increment the value of the given map at the given key by one.
*
* @param map the map whose value we want to increase
* @param key the key of the map whose value to increase
* @return an integer object containing the new value
*/
public static Integer increment(Map<String, Integer> map, String key) {
Integer newValue = null;
if (map.containsKey(key)) {
// FIXME: this seems like a ridiculous way to add Integers
newValue = (map.get(key)) + 1;
} else {
newValue = 1;
}
return newValue;
}
/**
* Take the standard date string requested at the command line and convert
* it into a LocalDate object. Prints an error and exits if the date does
* not parse
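*
* For example, "2024-01-31" parses successfully; "31/01/2024" does not,
* since the expected format is ISO local date (YYYY-MM-DD).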
*
* @param date the string representation of the date
* @return a date object containing the date, with the time set to
* 00:00:00
*/
public static LocalDate parseDate(String date) {
DateTimeFormatter formatter = DateTimeFormatter.ISO_LOCAL_DATE;
LocalDate parsedDate = null;
try {
parsedDate = LocalDate.parse(date, formatter);
} catch (DateTimeParseException e) {
System.out.println("The date is not in the correct format");
System.exit(0);
}
return parsedDate;
}
/**
* Take the date object and convert it into a string of the form YYYY-MM-DD
*
* @param date the date to be converted
* @return A string of the form YYYY-MM-DD
*/
public static String unParseDate(LocalDate date) {
return DateTimeFormatter.ISO_LOCAL_DATE.format(date);
}
/**
* Take a search query string and pull out all of the meaningful information
* from it, giving the results in the form of a String array, a single word
* to each element
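*
* For example (illustrative, assuming none of the words involved are
* configured for exclusion), the parameter string
* "query=Dublin Core metadata,results=42" reduces to the array
* {"dublin", "core", "metadata"}.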
*
* @param query the search query to be analysed
* @return the string array containing meaningful search terms
*/
public static String[] analyseQuery(String query) {
// register our standard loop counter
int i = 0;
// make the query string totally lower case, to ensure we don't miss out
// on matches due to capitalisation
query = query.toLowerCase();
// now perform successive find and replace operations using pre-defined
// global regular expressions
Matcher matchQuery = queryRX.matcher(query);
query = matchQuery.replaceAll(" ");
Matcher matchCollection = collectionRX.matcher(query);
query = matchCollection.replaceAll(" ");
Matcher matchCommunity = communityRX.matcher(query);
query = matchCommunity.replaceAll(" ");
Matcher matchResults = resultsRX.matcher(query);
query = matchResults.replaceAll(" ");
Matcher matchTypes = typeRX.matcher(query);
query = matchTypes.replaceAll(" ");
Matcher matchChars = excludeCharRX.matcher(query);
query = matchChars.replaceAll(" ");
Matcher matchWords = wordRX.matcher(query);
query = matchWords.replaceAll(" ");
Matcher single = singleRX.matcher(query);
query = single.replaceAll(" ");
// split the remaining string by whitespace, trim and stuff into an
// array to be returned
StringTokenizer st = new StringTokenizer(query);
String[] words = new String[st.countTokens()];
for (i = 0; i < words.length; i++) {
words[i] = st.nextToken().trim();
}
// FIXME: some single characters are still slipping through the net;
// why? and how do we fix it?
return words;
}
/**
* split the given line into its relevant segments if applicable (i.e. the
* line matches the required regular expression).
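*
* Illustrative (hypothetical) lines in the two supported formats; a
* 1.4-style line is recognised by the presence of ":ip_addr":
* <pre>
* 2024-01-31 12:00:00,000 INFO  org.dspace.SomeClass @ anonymous:session_id=ABC123:view_item:handle=123456789/1
* 2024-01-31 12:00:00,000 INFO  org.dspace.SomeClass @ anonymous:session_id=ABC123:ip_addr=127.0.0.1:view_item:handle=123456789/1
* </pre>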
*
* @param line the line to be segmented
* @return a Log Line object for the given line
*/
public static LogLine getLogLine(String line) {
// FIXME: consider moving this code into the LogLine class. To do this
// we need to much more carefully define the structure and behaviour
// of the LogLine class
Matcher match;
if (line.indexOf(":ip_addr") > 0) {
match = valid14.matcher(line);
} else {
match = valid13.matcher(line);
}
if (match.matches()) {
// set up a new log line object
LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
LogHelper.unescapeLogField(match.group(2)).trim(),
LogHelper.unescapeLogField(match.group(3)).trim(),
LogHelper.unescapeLogField(match.group(4)).trim(),
LogHelper.unescapeLogField(match.group(5)).trim());
return logLine;
} else {
match = validBase.matcher(line);
if (match.matches()) {
LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
LogHelper.unescapeLogField(match.group(2)).trim(),
null,
null,
null
);
return logLine;
}
return null;
}
}
/**
* get the number of items in the archive which were accessioned between
* the provided start and end dates, with the given value for the DC field
* 'type' (unqualified)
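*
* For a start date of 2024-01-01 and no end date, the assembled
* accession-date filter query would be (illustrative):
* <pre>
* dc.date.accessioned_dt:[2024-01-01 TO *]
* </pre>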
*
* @param context the DSpace context for the action
* @param type value for DC field 'type' (unqualified)
* @return an integer containing the relevant count
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static Integer getNumItems(Context context, String type)
throws SQLException, SearchServiceException {
// FIXME: this method is clearly not optimised
// FIXME: we don't yet collect total statistics, such as number of items
// withdrawn, number in process of submission etc. We should probably do
// that
DiscoverQuery discoverQuery = new DiscoverQuery();
if (StringUtils.isNotBlank(type)) {
discoverQuery.addFilterQueries("dc.type=" + type + "*");
}
StringBuilder accessionedQuery = new StringBuilder();
accessionedQuery.append("dc.date.accessioned_dt:[");
if (startDate != null) {
accessionedQuery.append(unParseDate(startDate));
} else {
accessionedQuery.append("*");
}
accessionedQuery.append(" TO ");
if (endDate != null) {
accessionedQuery.append(unParseDate(endDate));
} else {
accessionedQuery.append("*");
}
accessionedQuery.append("]");
discoverQuery.addFilterQueries(accessionedQuery.toString());
discoverQuery.addFilterQueries("withdrawn: false");
discoverQuery.addFilterQueries("archived: true");
return (int) SearchUtils.getSearchService().search(context, discoverQuery).getTotalSearchResults();
}
/**
* get the total number of items in the archive at time of execution,
* ignoring all other constraints
*
* @param context the DSpace context the action is being performed in
* @return an Integer containing the number of items in the
* archive
* @throws SQLException if database error
* @throws SearchServiceException if search error
*/
public static Integer getNumItems(Context context)
throws SQLException, SearchServiceException {
return getNumItems(context, null);
}
/**
* print out the usage information for this class to the standard out
*/
public static void usage() {
String usage = "Usage Information:\n" +
"LogAnalyser [options [parameters]]\n" +
"-log [log directory]\n" +
"\tOptional\n" +
"\tSpecify a directory containing log files\n" +
"\tDefault uses [dspace.dir]/log from dspace.cfg\n" +
"-file [file name regex]\n" +
"\tOptional\n" +
"\tSpecify a regular expression as the file name template.\n" +
"\tCurrently this needs to be correctly escaped for Java string handling (FIXME)\n" +
"\tDefault uses dspace.log*\n" +
"-cfg [config file path]\n" +
"\tOptional\n" +
"\tSpecify a config file to be used\n" +
"\tDefault uses dstat.cfg in dspace config directory\n" +
"-out [output file path]\n" +
"\tOptional\n" +
"\tSpecify an output file to write results into\n" +
"\tDefault uses dstat.dat in dspace log directory\n" +
"-start [YYYY-MM-DD]\n" +
"\tOptional\n" +
"\tSpecify the start date of the analysis\n" +
"\tIf a start date is specified then no attempt to gather \n" +
"\tcurrent database statistics will be made unless -lookup is\n" +
"\talso passed\n" +
"\tDefault is to start from the earliest date records exist for\n" +
"-end [YYYY-MM-DD]\n" +
"\tOptional\n" +
"\tSpecify the end date of the analysis\n" +
"\tIf an end date is specified then no attempt to gather \n" +
"\tcurrent database statistics will be made unless -lookup is\n" +
"\talso passed\n" +
"\tDefault is to work up to the last date records exist for\n" +
"-lookup\n" +
"\tOptional\n" +
"\tForce a lookup of the current database statistics\n" +
"\tOnly needs to be used if date constraints are also in place\n" +
"-help\n" +
"\tdisplay this usage information\n";
System.out.println(usage);
}
}