All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikibrain.pageview.PageViewLoader Maven / Gradle / Ivy

There is a newer version: 0.9.1
Show newest version
package org.wikibrain.pageview;

import org.apache.commons.cli.*;
import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.joda.time.Hours;
import org.joda.time.Interval;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.conf.DefaultOptionBuilder;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.LanguageSet;

import java.io.IOException;
import java.sql.SQLException;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Java "script" to load pageviews.
 * By default it loads two hours worth of pageviews from exactly one week ago.
 */
public class PageViewLoader {
    private static final Logger LOG = LoggerFactory.getLogger(PageViewLoader.class);
    private final LanguageSet languageSet;
    private final PageViewDao dao;

    public PageViewLoader(LanguageSet languageSet, PageViewDao dao) {
        this.languageSet = languageSet;
        this.dao = dao;
    }

    public PageViewDao getDao() {
        return dao;
    }

    public void load(List intervals) throws DaoException {
        dao.ensureLoaded(intervals, languageSet);
    }

    /**
     * first arg: comma-separated lang codes
     * start date and end date (second and third args) must be in UTC time
     * dates must be entered as four_digit_year-numeric_month_1-12-numeric_day_1-31-numeric_hour_0-23
     * can enter as many start date to end date combinations as desired in order to download multiple days
     * @param args
     * @throws ClassNotFoundException
     * @throws SQLException
     * @throws IOException
     * @throws ConfigurationException
     * @throws WikiBrainException
     * @throws DaoException
     */
    public static void main(String args[]) throws ClassNotFoundException, SQLException, IOException, ConfigurationException, WikiBrainException, DaoException {
        Options options = new Options();
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("drop-tables")
                        .withDescription("drop and recreate all tables")
                        .create("d"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("start-date")
                        .withDescription("date at which to start loading page view files")
                        .hasArgs()
                        .create("s"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("end-date")
                        .withDescription("date at which to stop loading page view files")
                        .hasArgs()
                        .create("e"));
        options.addOption(
                new DefaultOptionBuilder()
                        .withLongOpt("random-hours")
                        .withDescription("select a certain number of random hours between the specified start and end date")
                        .hasArgs()
                        .create("r"));
        EnvBuilder.addStandardOptions(options);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, args);
        } catch (ParseException e) {
            System.err.println( "Invalid option usage: " + e.getMessage());
            new HelpFormatter().printHelp("PageViewLoader", options);
            System.exit(1);
            return;
        }

        List intervals = new ArrayList();
        if (cmd.hasOption("s")) {
            String [] startStrings = cmd.getOptionValues("s");
            String [] endStrings = cmd.hasOption("e") ? cmd.getOptionValues("e") : new String[0];
            for (int i = 0; i < startStrings.length; i++) {
                DateTime start = parseDateOrDie(startStrings[i]);
                DateTime end = (endStrings.length >= i)
                        ? parseDateOrDie(endStrings[i])
                        : start.plusMinutes(60 * 2 - 1);
                intervals.add(new Interval(start, end));
            }
            if (cmd.hasOption("r")) {
                if (startStrings.length != 1) {
                    System.err.println("when using -r, you can only specify one start/end interval");
                    new HelpFormatter().printHelp("PageViewLoader", options);
                    System.exit(1);
                }
                intervals = selectRandomHours(intervals.get(0), Integer.valueOf(cmd.getOptionValue("r")));
            }
        } else if (cmd.hasOption("r")) {
            // Default to 12 month date range.
            DateTime start = DateTime.now().minusMonths(13);
            DateTime end = DateTime.now().minusMonths(1);
            intervals = selectRandomHours(new Interval(start, end), Integer.valueOf(cmd.getOptionValue("r")));
        } else {
            // Default to two hours of views two week ago.
            DateTime start = DateTime.now().minusWeeks(2);
            DateTime end = start.plusMinutes(60 * 2 - 1);
            intervals.add(new Interval(start, end));
        }

        Env env = new EnvBuilder(cmd).build();
        Configurator conf = env.getConfigurator();
        PageViewDao dao = conf.get(PageViewDao.class);
        PageViewLoader loader = new PageViewLoader(env.getLanguages(), dao);

        if (cmd.hasOption("d")) {
            LOG.info("Clearing pageview data");
            dao.clear();
        }

        LOG.info("loading pageview data for intervals " + intervals);

        loader.load(intervals);

        LOG.info("DONE");
    }

    public static String[] DATE_FORMATS = new String[] {"yyyy-MM-dd", "yyyy-MM-dd:HH"};


    private static DateTime parseDateOrDie(String dateString) throws WikiBrainException {
        DateTime result = null;
        for (String format : DATE_FORMATS) {
            DateTimeFormatter fmt = DateTimeFormat.forPattern(format);
            try {
                result = fmt.parseDateTime(dateString);
            } catch (IllegalArgumentException e) {
                // Try the next format
            }
        }
        if (result == null) {
            System.err.format("Invalid date format '%s'. Must be one of %s.\n", dateString, StringUtils.join(DATE_FORMATS, ", "));
            System.exit(1);
        }
        return result;
    }

    private static List selectRandomHours(Interval interval, int n) {
        Hours hours = interval.toDuration().toStandardHours();
        List result = new ArrayList();
        Random random = new Random();
        for (int i = 0; i < n; i++) {
            int begOffset = random.nextInt(hours.getHours());
            DateTime start = interval.getStart().plusHours(begOffset);
            DateTime end = start.plusHours(1);
            result.add(new Interval(start, end));
        }
        return result;
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy