org.wikibrain.pageview.PageViewLoader Maven / Gradle / Ivy
package org.wikibrain.pageview;
import org.apache.commons.cli.*;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.joda.time.Hours;
import org.joda.time.Interval;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.LanguageSet;
import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Java "script" to load pageviews.
* By default it loads two hours worth of pageviews from exactly one week ago.
*/
public class PageViewLoader {
private static final Logger LOG = LoggerFactory.getLogger(PageViewLoader.class);
private final LanguageSet languageSet;
private final PageViewDao dao;
public PageViewLoader(LanguageSet languageSet, PageViewDao dao) {
this.languageSet = languageSet;
this.dao = dao;
}
public PageViewDao getDao() {
return dao;
}
public void load(List intervals) throws DaoException {
dao.ensureLoaded(intervals, languageSet);
}
/**
* first arg: comma-separated lang codes
* start date and end date (second and third args) must be in UTC time
* dates must be entered as four_digit_year-numeric_month_1-12-numeric_day_1-31-numeric_hour_0-23
* can enter as many start date to end date combinations as desired in order to download multiple days
* @param args
* @throws ClassNotFoundException
* @throws SQLException
* @throws IOException
* @throws ConfigurationException
* @throws WikiBrainException
* @throws DaoException
*/
public static void main(String args[]) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, WikiBrainException, DaoException {
Options options = new Options();
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("drop-tables")
.withDescription("drop and recreate all tables")
.create("d"));
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("start-date")
.withDescription("date at which to start loading page view files")
.hasArgs()
.create("s"));
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("end-date")
.withDescription("date at which to stop loading page view files")
.hasArgs()
.create("e"));
options.addOption(
new DefaultOptionBuilder()
.withLongOpt("random-hours")
.withDescription("select a certain number of random hours between the specified start and end date")
.hasArgs()
.create("r"));
EnvBuilder.addStandardOptions(options);
CommandLineParser parser = new PosixParser();
CommandLine cmd;
try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.err.println( "Invalid option usage: " + e.getMessage());
new HelpFormatter().printHelp("PageViewLoader", options);
System.exit(1);
return;
}
List intervals = new ArrayList();
if (cmd.hasOption("s")) {
String [] startStrings = cmd.getOptionValues("s");
String [] endStrings = cmd.hasOption("e") ? cmd.getOptionValues("e") : new String[0];
for (int i = 0; i < startStrings.length; i++) {
DateTime start = parseDateOrDie(startStrings[i]);
DateTime end = (endStrings.length >= i)
? parseDateOrDie(endStrings[i])
: start.plusMinutes(60 * 2 - 1);
intervals.add(new Interval(start, end));
}
if (cmd.hasOption("r")) {
if (startStrings.length != 1) {
System.err.println("when using -r, you can only specify one start/end interval");
new HelpFormatter().printHelp("PageViewLoader", options);
System.exit(1);
}
intervals = selectRandomHours(intervals.get(0), Integer.valueOf(cmd.getOptionValue("r")));
}
} else if (cmd.hasOption("r")) {
// Default to 12 month date range.
DateTime start = DateTime.now().minusMonths(13);
DateTime end = DateTime.now().minusMonths(1);
intervals = selectRandomHours(new Interval(start, end), Integer.valueOf(cmd.getOptionValue("r")));
} else {
// Default to two hours of views two week ago.
DateTime start = DateTime.now().minusWeeks(2);
DateTime end = start.plusMinutes(60 * 2 - 1);
intervals.add(new Interval(start, end));
}
Env env = new EnvBuilder(cmd).build();
Configurator conf = env.getConfigurator();
PageViewDao dao = conf.get(PageViewDao.class);
PageViewLoader loader = new PageViewLoader(env.getLanguages(), dao);
if (cmd.hasOption("d")) {
LOG.info("Clearing pageview data");
dao.clear();
}
LOG.info("loading pageview data for intervals " + intervals);
loader.load(intervals);
LOG.info("DONE");
}
public static String[] DATE_FORMATS = new String[] {"yyyy-MM-dd", "yyyy-MM-dd:HH"};
private static DateTime parseDateOrDie(String dateString) throws WikiBrainException {
DateTime result = null;
for (String format : DATE_FORMATS) {
DateTimeFormatter fmt = DateTimeFormat.forPattern(format);
try {
result = fmt.parseDateTime(dateString);
} catch (IllegalArgumentException e) {
// Try the next format
}
}
if (result == null) {
System.err.format("Invalid date format '%s'. Must be one of %s.\n", dateString, StringUtils.join(DATE_FORMATS, ", "));
System.exit(1);
}
return result;
}
private static List selectRandomHours(Interval interval, int n) {
Hours hours = interval.toDuration().toStandardHours();
List result = new ArrayList();
Random random = new Random();
for (int i = 0; i < n; i++) {
int begOffset = random.nextInt(hours.getHours());
DateTime start = interval.getStart().plusHours(begOffset);
DateTime end = start.plusHours(1);
result.add(new Interval(start, end));
}
return result;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy