package org.fbk.cit.hlt.thewikimachine.wikipedia;

import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.thewikimachine.util.CharacterTable;
import org.fbk.cit.hlt.thewikimachine.util.GenericFileUtils;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Created with IntelliJ IDEA.
 * User: giuliano
 * Date: 8/29/13
 * Time: 2:45 PM
 * To change this template use File | Settings | File Templates.
 *
 * time java -Dfile.encoding=UTF-8 -mx62G -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader -m 06 -y 2013 -o tmp2/
 *
 * http://dumps.wikimedia.org/other/pagecounts-raw/2014/2014-12/
 *
 * @see org.fbk.cit.hlt.thewikimachine.index.PageTrafficIndexer
 */
public class WikipediaTrafficDownloader {

    /**
     * Define a static logger variable so that it references the
     * Logger instance named WikipediaTrafficDownloader.
     */
    static Logger logger = Logger.getLogger(WikipediaTrafficDownloader.class.getName());

    public static final String[] languages = {
            "lt", "sq", "be", "fi", "lv", "sr", "bg", "fr", "nl", "sv", "ca",
            "hr", "no", "tr", "cs", "hu", "pl", "uk", "da", "id", "pt", "de",
            "is", "ro", "it", "ru", "es", "sk", "et", "sl", "en"
    };

    private static Pattern spacePattern = Pattern.compile(" ");

    //private Map<String, Map<String, Counter>> trafficMap;

    private static Pattern tabPattern = Pattern.compile("\t");

    private static DecimalFormat tf = new DecimalFormat("###,###,###");

    static final int SLEEP_TIME = 10000;

    static final int DEFAULT_NOTIFICATION_POINT = 10000;

    public static final String BASE_URL = "http://dumps.wikimedia.org/other/pagecounts-raw/";

    private String year;

    private String month;

    // language code -> redirect table (redirect title -> target title)
    private Map<String, PageMap> redirectMap;

    // language code -> (page title -> aggregated view counter)
    private Map<String, Map<String, Counter>> trafficMap;

    public WikipediaTrafficDownloader(String baseUrl, String outputDir, int sleep, String year, String month, String rootDir) {
        long begin = System.currentTimeMillis();
        logger.info("process started " + new Date());
        if (month.length() == 1) {
            month = "0" + month;
        }
        this.year = year;
        this.month = month;
        // URLs always use '/', regardless of the platform file separator
        if (!baseUrl.endsWith("/")) {
            baseUrl += "/";
        }
        baseUrl += year + "/" + year + CharacterTable.HYPHEN_MINUS + month + "/";
        logger.debug(baseUrl);
        if (!outputDir.endsWith(File.separator)) {
            outputDir += File.separator;
        }
        outputDir += year + CharacterTable.HYPHEN_MINUS + month + File.separator;
        if (!rootDir.endsWith(File.separator)) {
            rootDir += File.separator;
        }
        File outputDirFile = new File(outputDir);
        if (!outputDirFile.exists()) {
            outputDirFile.mkdirs();
        }
        logger.debug(outputDir);
        logger.debug(rootDir);
        init(languages, rootDir);

        Document doc = null;
        try {
            doc = Jsoup.connect(baseUrl).get();
        } catch (IOException e) {
            logger.error(e);
        }
        if (doc == null) {
            // the directory listing could not be fetched; nothing to do
            return;
        }
        int count = 0;
        // the monthly index page is a plain directory listing: every
        // pagecounts-* link points to one hourly, gzipped dump file
        Elements newsHeadlines = doc.select("a");
        for (Element e : newsHeadlines) {
            /*if (count >= 3) {
                break;
            }*/
            String text = e.html();
            if (text.startsWith("pagecounts")) {
                String fileToProcess = e.attr("href");
                try {
                    URL url = new URL(baseUrl + fileToProcess);
                    logger.info(tf.format(count) + "/" + tf.format(newsHeadlines.size()) + "\t" + url);
                    process(url);
                    write(outputDir, fileToProcess);
                } catch (Exception ex) {
                    logger.error(ex);
                } finally {
                    try {
                        // be polite to the dump server between downloads
                        logger.info("waiting " + tf.format(sleep) + " ms...");
                        Thread.sleep(sleep);
                    } catch (InterruptedException ex) {
                        logger.error(ex);
                    }
                    count++;
                }
            }
        }
        long end = System.currentTimeMillis();
        logger.info(tf.format(count) + " files processed in " + tf.format(end - begin) + " ms " + new Date());
    }
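    /*
     * Model layout assumed by init() below (the exact folder structure is an
     * assumption; it is resolved at runtime by
     * GenericFileUtils.searchForFilesInTheSameFolder): under <root-dir> there is
     * one folder per language code, e.g. en/, holding a "page-freq.csv" (two
     * tab-separated fields per line, the second one being the page title) and a
     * "redirect.csv" that is loaded into a PageMap. Only titles listed in
     * page-freq.csv accumulate traffic counts in process().
     */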
    private void init(String[] languages, String rootDir) {
        trafficMap = new HashMap<String, Map<String, Counter>>();
        redirectMap = new HashMap<String, PageMap>();
        for (int i = 0; i < languages.length; i++) {
            try {
                Map<String, String> resourceMap = GenericFileUtils.searchForFilesInTheSameFolder(rootDir + languages[i], "page-freq.csv", "redirect.csv");
                //todo: use title id... or load the redirection pages
                File f = new File(resourceMap.get("page-freq.csv"));
                logger.debug("initializing " + languages[i] + "...");
                Map<String, Counter> localMap = initFromPageFreq(f);
                trafficMap.put(languages[i], localMap);
                File r = new File(resourceMap.get("redirect.csv"));
                PageMap pageMap = new PageMap(r);
                redirectMap.put(languages[i], pageMap);
            } catch (IOException e) {
                logger.error(e);
            }
        }
    }

    private Map<String, Counter> initFromPageFreq(File f) throws IOException {
        logger.info("reading " + f + " (" + tf.format(f.length()) + " bytes)...");
        Map<String, Counter> map = new HashMap<String, Counter>();
        LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        String line;
        while ((line = lr.readLine()) != null) {
            String[] array = tabPattern.split(line);
            if (array.length == 2) {
                // the second field is the page title; every known title starts at zero
                map.put(array[1], new Counter(0));
            }
        }
        logger.info(tf.format(map.size()) + " pages read " + new Date());
        lr.close();
        return map;
    }

    private void write(String outputDir, String fileToProcess) {
        logger.info("writing partial result after " + fileToProcess + "...");
        logger.info("writing " + languages.length + " languages in " + outputDir + "...");
        for (String language : trafficMap.keySet()) {
            //logger.info("writing " + language + "...");
            Map<String, Counter> localTrafficMap = trafficMap.get(language);
            try {
                // append to the per-language log so interrupted runs can be traced
                File logFile = new File(outputDir + language + CharacterTable.HYPHEN_MINUS + year + CharacterTable.HYPHEN_MINUS + month + ".log");
                PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(logFile, true), "UTF-8")));
                pw.println(language + "\t" + fileToProcess + "\t" + localTrafficMap.size() + "\t" + new Date());
                pw.close();
                File outputFile = new File(outputDir + language + CharacterTable.HYPHEN_MINUS + year + CharacterTable.HYPHEN_MINUS + month + ".csv");
                write(localTrafficMap, language, outputFile);
            } catch (IOException e) {
                logger.error(e);
            }
        }
    }

    void write(Map<String, Counter> map, String language, File file) throws IOException {
        long begin = System.currentTimeMillis();
        logger.info("writing " + tf.format(map.size()) + " " + language + " pages in " + file + "...");
        PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")));
        for (Map.Entry<String, Counter> entry : map.entrySet()) {
            pw.print(entry.getValue().get());
            pw.print("\t");
            pw.print(entry.getKey());
            pw.print("\n");
        }
        pw.close();
        long end = System.currentTimeMillis();
        logger.info(tf.format(map.size()) + " pages written in " + tf.format(end - begin) + " ms " + new Date());
    }
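    /*
     * Output format (grounded in write() above): one CSV file per language and
     * month, named e.g. en-2013-06.csv, with one "<total views>\t<page title>"
     * line per known page; the companion .log file records which hourly
     * pagecounts files have already been folded into those totals.
     */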
    void process(URL url) throws IOException {
        long begin = System.currentTimeMillis();
        logger.info("processing " + url + "...");
        URLConnection connection = url.openConnection();
        LineNumberReader lr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream())));
        int count = 0;
        String line;
        while ((line = lr.readLine()) != null) {
            String[] array = spacePattern.split(line);
            if (array.length > 3) {
                Map<String, Counter> localTrafficMap = trafficMap.get(array[0]);
                if (localTrafficMap != null) {
                    try {
                        String page = URLDecoder.decode(array[1], "UTF-8");
                        PageMap localRedirectMap = redirectMap.get(array[0]);
                        String redirectPage = localRedirectMap.get(page);
                        //todo: check multiple redirects
                        if (redirectPage != null) {
                            // follow at most two redirect hops
                            String secondRedirectPage = localRedirectMap.get(redirectPage);
                            if (secondRedirectPage != null) {
                                //logger.warn(page + " ==> " + secondRedirectPage);
                                page = secondRedirectPage;
                            } else {
                                //logger.warn(page + " --> " + redirectPage);
                                page = redirectPage;
                            }
                        }
                        int freq = Integer.parseInt(array[2]);
                        Counter counter = localTrafficMap.get(page);
                        if (counter != null) {
                            counter.inc(freq);
                        }
                        /*else { // uncomment to add unseen pages
                            counter = new Counter(freq);
                            localTrafficMap.put(page, counter);
                        }*/
                    } catch (IllegalArgumentException ignored) {
                        // malformed URL encoding or a non-numeric count: skip the line
                    }
                }
            }
            if ((count % DEFAULT_NOTIFICATION_POINT) == 0) {
                System.out.print(".");
            }
            count++;
        }
        System.out.print("\n");
        lr.close();
        long end = System.currentTimeMillis();
        logger.info(url + " processed in " + tf.format(end - begin) + " ms " + new Date());
    }
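    /*
     * Input format handled by process(): each gzipped pagecounts-raw file holds
     * one line per (project, page) with four space-separated fields,
     *
     *   <project> <page title> <view count> <bytes transferred>
     *
     * e.g. "en Barack_Obama 997 123456789" (the numbers here are illustrative).
     * Only the project code, the URL-encoded title and the view count are used;
     * lines for projects outside the languages array are ignored.
     */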
    public static void main(String[] args) {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("url").hasArg().withDescription("base url from which to process the traffic statistics (default " + BASE_URL + ")").withLongOpt("base-url").create("u"));
        //options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default is " + DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("b"));
        options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("year for which traffic statistics are processed").isRequired().withLongOpt("year").create("y"));
        options.addOption(OptionBuilder.withArgName("int").hasArg().withDescription("month for which traffic statistics are processed").isRequired().withLongOpt("month").create("m"));
        options.addOption(OptionBuilder.withArgName("dir").withDescription("output folder in which to store the traffic statistics").isRequired().hasArg().withLongOpt("output-dir").create("o"));
        options.addOption(OptionBuilder.withArgName("dir").withDescription("root folder (model folder) from which to read the page frequency and redirect files").isRequired().hasArg().withLongOpt("root-dir").create("r"));
        options.addOption(OptionBuilder.withArgName("milliseconds").withDescription("sleep time between queries (default " + SLEEP_TIME + ")").hasArg().withLongOpt("sleep").create("s"));
        //options.addOption("c", "clean", false, "Clean the output folder before writing on it");
        options.addOption("h", "help", false, "Print this message");
        try {
            CommandLineParser parser = new PosixParser();
            CommandLine line = parser.parse(options, args);
            String output = line.getOptionValue("output-dir");
            String baseUrl = BASE_URL;
            if (line.hasOption("base-url")) {
                baseUrl = line.getOptionValue("base-url");
            }
            /*int notificationPoint = DEFAULT_NOTIFICATION_POINT;
            if (line.hasOption("notification-point")) {
                notificationPoint = Integer.parseInt(line.getOptionValue("notification-point"));
            }*/
            int sleep = SLEEP_TIME;
            if (line.hasOption("sleep")) {
                try {
                    sleep = Integer.parseInt(line.getOptionValue("sleep"));
                } catch (Exception ignored) {
                    // keep the default sleep time if the value is not a number
                }
            }
            String year = line.getOptionValue("year");
            String month = line.getOptionValue("month");
            String rootDir = line.getOptionValue("root-dir");
            new WikipediaTrafficDownloader(baseUrl, output, sleep, year, month, rootDir);
        } catch (ParseException e) {
            // oops, something went wrong
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader", "\n", options, "\n", true);
        }
    }

    /*class Counter {
        int count;

        public Counter(int count) {
            this.count = count;
        }

        public void inc() {
            count++;
        }

        public void inc(int l) {
            count += l;
        }

        public int get() {
            return count;
        }

        public String toString() {
            return Integer.toString(count);
        }
    }*/

    /*String read(URL website) throws Exception {
        logger.debug("reading " + website + "...");
        char[] buffer = new char[1024];
        URLConnection connection = website.openConnection();
        BufferedReader in = new BufferedReader(new InputStreamReader(new GZIPInputStream(connection.getInputStream())));
        StringWriter sw = new StringWriter();
        int len;
        int count = 0;
        while ((len = in.read(buffer)) > 0) {
            long begin = System.currentTimeMillis();
            sw.write(buffer, 0, len);
            count++;
            if ((count % 1024) == 0) {
                long end = System.currentTimeMillis();
                logger.debug(count + "K downloaded in " + tf.format(end - begin));
            }
        }
        in.close();
        return sw.toString();
    }*/
}
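/*
 * Example invocation (illustrative): -y, -m, -o and -r are all required by the
 * option definitions in main(), so a complete command line looks like
 *
 *   java -Dfile.encoding=UTF-8 -cp dist/thewikimachine.jar \
 *       org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader \
 *       -y 2013 -m 06 -o tmp2/ -r models/
 *
 * where models/ is a hypothetical root directory with one folder per language
 * containing the page-freq.csv and redirect.csv files described above.
 */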