package org.fbk.cit.hlt.thewikimachine.wikipedia;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.thewikimachine.util.CharacterTable;
import org.fbk.cit.hlt.thewikimachine.util.GenericFileUtils;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
* Downloads the Wikimedia hourly page-view ("pagecounts") dumps for the given
* year and month and aggregates per-page traffic counts for a fixed set of
* Wikipedia languages.
*
* User: giuliano
* Date: 8/29/13
* Time: 2:45 PM
*
* Usage example:
*
* time java -Dfile.encoding=UTF-8 -mx62G -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader -m 06 -y 2013 -o tmp2/
*
* Dump listings are available at, e.g., http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-12/
*
* @see org.fbk.cit.hlt.thewikimachine.index.PageTrafficIndexer
*/
public class WikipediaTrafficDownloader {
/**
* Define a static logger variable so that it references the
* Logger instance named WikipediaTrafficDownloader.
*/
static Logger logger = Logger.getLogger(WikipediaTrafficDownloader.class.getName());
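/**
* Wikipedia languages for which traffic statistics are collected.
*/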
public static final String[] languages = {"lt", "sq", "be", "fi", "lv", "sr", "bg", "fr", "nl", "sv", "ca", "hr", "no", "tr", "cs", "hu", "pl", "uk", "da", "id", "pt", "de", "is", "ro", "it", "ru", "es", "sk", "et", "sl", "en"};
private static Pattern spacePattern = Pattern.compile(" ");
private static Pattern tabPattern = Pattern.compile("\t");
private static DecimalFormat tf = new DecimalFormat("###,###,###");
static final int SLEEP_TIME = 10000;
static final int DEFAULT_NOTIFICATION_POINT = 10000;
public static final String BASE_URL = "http://dumps.wikimedia.org/other/pagecounts-raw/";
private String year;
private String month;
private Map<String, PageMap> redirectMap;
private Map<String, Map<String, Counter>> trafficMap;
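/**
* Scrapes the dump index page for the given year and month, downloads every
* linked pagecounts file, accumulates its counts, writes partial results to
* the output directory after each file, and sleeps between downloads.
*/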
public WikipediaTrafficDownloader(String baseUrl, String outputDir, int sleep, String year, String month, String rootDir) {
long begin = System.currentTimeMillis();
logger.info("process started " + new Date());
if (month.length() == 1) {
month = "0" + month;
}
this.year = year;
this.month = month;
// a URL path always uses '/', not the platform-dependent File.separator
if (!baseUrl.endsWith("/")) {
baseUrl += "/";
}
baseUrl += year + "/" + year + CharacterTable.HYPHEN_MINUS + month + "/";
logger.debug(baseUrl);
if (!outputDir.endsWith(File.separator)) {
outputDir += File.separator;
}
outputDir += year + CharacterTable.HYPHEN_MINUS + month + File.separator;
if (!rootDir.endsWith(File.separator)) {
rootDir += File.separator;
}
File outputDirFile = new File(outputDir);
if (!outputDirFile.exists()) {
outputDirFile.mkdirs();
}
logger.debug(outputDir);
logger.debug(rootDir);
init(languages, rootDir);
Document doc;
try {
doc = Jsoup.connect(baseUrl).get();
} catch (IOException e) {
logger.error(e);
return;
}
int count = 0;
Elements links = doc.select("a");
for (Element e : links) {
/*if (count >= 3) {
break;
}*/
String text = e.html();
if (text.startsWith("pagecounts")) {
String fileToProcess = e.attr("href");
try {
URL url = new URL(baseUrl + fileToProcess);
logger.info(tf.format(count) + "/" + tf.format(links.size()) + "\t" + url);
process(url);
write(outputDir, fileToProcess);
} catch (Exception ex) {
logger.error(ex);
} finally {
try {
logger.info("waiting " + tf.format(sleep) + "ms...");
Thread.sleep(sleep);
} catch (InterruptedException ex) {
logger.error(ex);
}
count++;
}
}
}
long end = System.currentTimeMillis();
logger.info(tf.format(count) + " files processed in " + tf.format(end - begin) + " ms " + new Date());
}
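/**
* Loads, for each language, the page-frequency file (used to initialize the
* per-page counters) and the redirect file (used to resolve redirects while
* counting) from the language subfolder of the given root directory.
*/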
private void init(String[] languages, String rootDir) {
trafficMap = new HashMap<String, Map<String, Counter>>();
redirectMap = new HashMap<String, PageMap>();
for (String language : languages) {
try {
Map<String, String> resourceMap = GenericFileUtils.searchForFilesInTheSameFolder(rootDir + language, "page-freq.csv", "redirect.csv");
//todo: use title id... or load the redirection pages
File f = new File(resourceMap.get("page-freq.csv"));
logger.debug("initializing " + language + "...");
Map<String, Counter> localMap = initFromPageFreq(f);
trafficMap.put(language, localMap);
File r = new File(resourceMap.get("redirect.csv"));
PageMap pageMap = new PageMap(r);
redirectMap.put(language, pageMap);
} catch (IOException e) {
logger.error(e);
}
}
}
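/**
* Reads a page-freq.csv file (two tab-separated columns, presumably the
* frequency and the page title) and returns a map from page title to a
* zero-initialized counter.
*/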
private Map<String, Counter> initFromPageFreq(File f) throws IOException {
logger.info("reading " + f + " (" + tf.format(f.length()) + ")...");
String line = null;
int c = 0;
Map<String, Counter> map = new HashMap<String, Counter>();
LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while ((line = lr.readLine()) != null) {
String[] array = tabPattern.split(line);
//logger.info(array.length);
if (array.length == 2) {
map.put(array[1], new Counter(0));
}
c++;
}
logger.info(tf.format(map.size()) + " pages read " + new Date());
lr.close();
return map;
}
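/**
* Writes the partial results accumulated so far: for each language, appends a
* progress line to the per-language log file and dumps the current counters to
* the per-language CSV file.
*/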
private void write(String outputDir, String fileToProcess) {
logger.info("writing partial result after " + fileToProcess + "...");
logger.info("writing " + languages.length + " languages in " + outputDir + "...");
for (String language : trafficMap.keySet()) {
//logger.info("writing " + language + "...");
Map<String, Counter> localTrafficMap = trafficMap.get(language);
try {
File logFile = new File(outputDir + language + CharacterTable.HYPHEN_MINUS + year + CharacterTable.HYPHEN_MINUS + month + ".log");
PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(logFile, true), "UTF-8")));
pw.println(language + "\t" + fileToProcess + "\t" + localTrafficMap.size() + "\t" + new Date());
pw.close();
File outputFile = new File(outputDir + language + CharacterTable.HYPHEN_MINUS + year + CharacterTable.HYPHEN_MINUS + month + ".csv");
write(localTrafficMap, language, outputFile);
} catch (IOException e) {
logger.error(e);
}
}
}
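/**
* Writes the given traffic counters as tab-separated count/title pairs, one
* page per line.
*/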
void write(Map<String, Counter> map, String language, File file) throws IOException {
long begin = System.currentTimeMillis();
logger.info("writing " + tf.format(map.size()) + " " + language + " pages in " + file + "...");
PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")));
for (Map.Entry<String, Counter> entry : map.entrySet()) {
pw.print(entry.getValue().get());
pw.print("\t");
pw.print(entry.getKey());
pw.print("\n");
}
pw.close();
long end = System.currentTimeMillis();
logger.info(tf.format(map.size()) + " pages written in " + tf.format(end - begin) + " ms " + new Date());
}
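/**
* Streams one gzipped pagecounts file from the given URL and adds the view
* counts of known pages to the per-language traffic maps, resolving up to two
* levels of redirects.
*/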
void process(URL url) throws IOException {
long begin = System.currentTimeMillis();
logger.info("processing " + url + "...");
URLConnection connection = url.openConnection();
LineNumberReader lr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream()), "UTF-8"));
int count = 0;
String line;
while ((line = lr.readLine()) != null) {
//logger.debug(count + "\t" + line);
//long begin = System.currentTimeMillis();
String[] array = spacePattern.split(line);
//logger.debug(Arrays.toString(array));
if (array.length > 3) {
Map<String, Counter> localTrafficMap = trafficMap.get(array[0]);
if (localTrafficMap != null) {
try {
String page = URLDecoder.decode(array[1], "UTF-8");
PageMap localRedirectMap = redirectMap.get(array[0]);
String redirectPage = localRedirectMap.get(page);
//todo: check multiple redirects
if (redirectPage != null) {
String secondRedirectPage = localRedirectMap.get(redirectPage);
if (secondRedirectPage != null) {
//logger.warn(page + " ==>" + secondRedirectPage);
page = secondRedirectPage;
}
else {
//logger.warn(page + " -->" + redirectPage);
page = redirectPage;
}
}
int freq = Integer.parseInt(array[2]);
//logger.debug(count + "\t" + array[0] + "\t" + page + "\t" + freq + "\t" + array[3]);
Counter counter = localTrafficMap.get(page);
if (counter != null) {
//logger.debug("inc\t" + page + "\t" + freq + "\t" + c);
counter.inc(freq);
}
/*else {
// uncomment to add unseen pages
//logger.debug("new\t " + page + "\t" + freq + "\t0");
counter = new Counter(freq);
localTrafficMap.put(page, counter);
}*/
} catch (IllegalArgumentException ignored) {
}
}
}
if ((count % DEFAULT_NOTIFICATION_POINT) == 0) {
//logger.debug(sw.toString());
//long end = System.currentTimeMillis();
//logger.debug(count + " line processed in " + tf.format(end - begin) + " " + new Date());
System.out.print(".");
}
count++;
}
System.out.print("\n");
lr.close();
long end = System.currentTimeMillis();
logger.info(url + " processed in " + tf.format(end - begin) + " ms " + new Date());
}
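/**
* Command-line entry point; see the class comment for a usage example.
*/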
public static void main(String[] args) {
String logConfig = System.getProperty("log-config");
if (logConfig == null) {
logConfig = "configuration/log-config.txt";
}
PropertyConfigurator.configure(logConfig);
Options options = new Options();
options.addOption(OptionBuilder.withArgName("url").hasArg().withDescription("base url from which to process the traffic statistics (default " + BASE_URL + ")").withLongOpt("base-url").create("u"));
//Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default is " + DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("b");
options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("year for which traffic statistics are processed").isRequired().withLongOpt("year").create("y"));
options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("month for which traffic statistics are processed").isRequired().withLongOpt("month").create("m"));
options.addOption(OptionBuilder.withArgName("dir").withDescription("output folder in which to store the traffic statistics").isRequired().hasArg().withLongOpt("output-dir").create("o"));
options.addOption(OptionBuilder.withArgName("dir").withDescription("root folder (model folder) from which to read the page frequency and redirect files").isRequired().hasArg().withLongOpt("root-dir").create("r"));
options.addOption(OptionBuilder.withArgName("milliseconds").withDescription("sleep time between queries (default " + SLEEP_TIME + ")").hasArg().withLongOpt("sleep").create("s"));
//options.addOption("c", "clean", false, "Clean the output folder before writing on it");
options.addOption("h", "help", false, "Print this message");
try {
CommandLineParser parser = new PosixParser();
CommandLine line = parser.parse(options, args);
String output = line.getOptionValue("output-dir");
String baseUrl = BASE_URL;
if (line.hasOption("base-url")) {
baseUrl = line.getOptionValue("base-url");
}
/*int notificationPoint = DEFAULT_NOTIFICATION_POINT;
if (line.hasOption("notification-point")) {
notificationPoint = Integer.parseInt(line.getOptionValue("notification-point"));
} */
int sleep = SLEEP_TIME;
if (line.hasOption("sleep")) {
try {
sleep = Integer.parseInt(line.getOptionValue("sleep"));
} catch (Exception ignored) {
}
}
String year = line.getOptionValue("year");
String month = line.getOptionValue("month");
String rootDir = line.getOptionValue("root-dir");
new WikipediaTrafficDownloader(baseUrl, output, sleep, year, month, rootDir);
} catch (ParseException e) {
// oops, something went wrong
System.out.println("Parsing failed: " + e.getMessage() + "\n");
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader", "\n", options, "\n", true);
}
}
/*class Counter {
int count;
public Counter(int count) {
this.count = count;
}
public void inc() {
count++;
}
public void inc(int l) {
count += l;
}
public int get() {
return count;
}
public String toString() {
return Integer.toString(count);
}
}*/
/*String read(URL website) throws Exception {
logger.debug("reading " + website + "...");
char[] buffer = new char[1024];
URLConnection connection = website.openConnection();
BufferedReader in = new BufferedReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream())));
StringWriter sw = new StringWriter();
int len;
int count = 0;
while ((len = in.read(buffer)) > 0) {
long begin = System.currentTimeMillis();
sw.write(buffer, 0, len);
count++;
if ((count % 1024) == 0) {
//logger.debug(sw.toString());
long end = System.currentTimeMillis();
logger.debug(count + "K downloaded in " + tf.format(end - begin));
}
}
in.close();
return sw.toString();
} */
}