package eu.fbk.twm.wiki.wikipedia;

import eu.fbk.twm.utils.Defaults;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.PageMap;
import eu.fbk.utils.core.io.FolderScanner;
import eu.fbk.utils.core.io.GZFilter;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.*;
import java.net.URLDecoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Extracts per-page traffic (page-view) statistics from gzipped Wikimedia
 * page-view dump files, resolving redirects and accumulating view counts
 * for the pages of a given Wikipedia edition.
 * <p>
 * Created with IntelliJ IDEA by giuliano on 8/24/13 at 6:38 PM.
 */
@Deprecated
public class WikipediaTrafficExtractor {
	/**
	 * Static logger referencing the Logger instance named
	 * WikipediaTrafficExtractor.
	 */
	static Logger logger = Logger.getLogger(WikipediaTrafficExtractor.class.getName());

	private static final int DEFAULT_NUM_FILES = 1000;

	private static final int PAGE_COLUMN = 0;

	private static final int TRAFFIC_COLUMN = 1;

	private static Pattern spacePattern = Pattern.compile(" ");

	private Map<String, Counter> trafficMap;

	private static Pattern tabPattern = Pattern.compile("\t");

	private PageMap redirectPageMap;

	String trafficSourceName;

	int numFiles;

	public WikipediaTrafficExtractor(String trafficSourceName, int numFiles) throws IOException {
		this.trafficSourceName = trafficSourceName;
		this.numFiles = numFiles;
		trafficMap = new HashMap<String, Counter>();
	}

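	/**
	 * Runs the extraction: loads the redirect map, seeds the traffic map from
	 * an existing traffic file (or, failing that, from the page-frequency
	 * file), processes the gzipped traffic source file(s), and writes the
	 * updated counts to the page-traffic output file.
	 */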
	public void start(ExtractorParameters extractorParameters) {
		try {
			redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
			logger.info(redirectPageMap.size() + " redirect pages");

			File trafficFile = new File(extractorParameters.getWikipediaPageTrafficFileName());
			if (trafficFile.exists() && trafficFile.length() > 0) {
				initFromTraffic(trafficFile);
			}
			else {
				initFromPageFreq(new File(extractorParameters.getWikipediaPageFreqFileName()));
			}


			File f = new File(trafficSourceName);
			if (!f.exists()) {
				logger.error(f + " does not exist");
				return;
			}
			logger.info("updating traffic statistics from " + f + "...");
			if (f.isFile()) {
				long begin = System.currentTimeMillis();
				process(f, extractorParameters.getLang());
				long end = System.currentTimeMillis();
				logger.info(f + " processed in " + (end - begin) + " ms, " + trafficMap.size() + " pages\t" + new Date());
			}
			else {
				FolderScanner fs = new FolderScanner(f);
				fs.setFilter(new GZFilter());

				int total = 0;
				int count = 1;
				scan:
				while (fs.hasNext()) {
					Object[] files = fs.next();
					//logger.info(count + " : " + files.length);
					for (int i = 0; i < files.length; i++) {
						if (count > numFiles) {
							// stop scanning entirely once the file limit is reached
							// (a plain break only exited the inner loop)
							break scan;
						}
						long begin = System.currentTimeMillis();
						File fi = (File) files[i];
						logger.debug(fi);
						process(fi, extractorParameters.getLang());
						long end = System.currentTimeMillis();
						logger.info(count + " - " + fi + " processed in " + (end - begin) + " ms, " + trafficMap.size() + " pages (" + (trafficMap.size() - total) + ")\t" + new Date());
						total = trafficMap.size();
						count++;
					}
				}
			}

			write(extractorParameters.getWikipediaPageTrafficFileName());

		} catch (IOException e) {
			logger.error("I/O error during traffic extraction", e);
		}

	}

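	/**
	 * Writes the accumulated counts as tab-separated {@code count<TAB>page}
	 * lines, one page per line, in UTF-8.
	 */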
	private void write(String out) throws IOException {
		logger.info("writing " + trafficMap.size() + " pages...");
		PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8")));

		Iterator<String> it = trafficMap.keySet().iterator();
		while (it.hasNext()) {
			String page = it.next();
			Counter c = trafficMap.get(page);
			//logger.info(c);
			pw.print(c.get());
			pw.print("\t");
			pw.print(page);
			pw.print("\n");
		}
		pw.close();
		logger.info(trafficMap.size() + " pages written (" + new Date() + ")");
	}

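	/**
	 * Seeds the traffic map from a tab-separated page-frequency file
	 * (frequency in the first column, page title in the second),
	 * initializing every page counter to zero.
	 */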
	private void initFromPageFreq(File f) throws IOException {
		logger.info("initializing from page freq " + f + " (" + f.length() + " bytes)...");
		String line;
		LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
		while ((line = lr.readLine()) != null) {
			String[] array = tabPattern.split(line);
			//logger.info(array.length);
			if (array.length == 2) {
				trafficMap.put(array[1], new Counter(0));
			}
		}
		logger.info(trafficMap.size() + " pages read\t" + new Date());
		lr.close();
	}

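	/**
	 * Seeds the traffic map from a previously written traffic file
	 * ({@code count<TAB>page}), restoring each page counter to its saved
	 * count.
	 */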
	private void initFromTraffic(File f) throws IOException {
		logger.info("initializing from traffic " + f + " (" + f.length() + " bytes)...");
		String line;
		LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
		while ((line = lr.readLine()) != null) {
			String[] array = tabPattern.split(line);
			//logger.info(array.length);
			if (array.length == 2) {
				trafficMap.put(array[1], new Counter(Integer.parseInt(array[0])));
			}
		}
		logger.info(trafficMap.size() + " pages read\t" + new Date());
		lr.close();
	}

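	/**
	 * Aggregates page views from one gzipped page-view dump file. Each line
	 * is expected to follow the Wikimedia pagecounts layout, e.g.
	 * {@code en Main_Page 42 123456} (project code, URL-encoded title, view
	 * count, bytes transferred); only lines matching {@code lang} are kept,
	 * titles are decoded and capitalized, redirects are resolved, and the
	 * count is added to the page's counter if the page is already known.
	 */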
	private void process(File f, String lang) throws IOException {
		logger.info("processing " + f + " (" + lang + ")...");
		String line = null;
		int freq = 0;
		LineNumberReader lr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(f)), "UTF-8"));
		String[] array = null;
		char fc = 0;
		Counter c = null;
		int tot = 0;
		while ((line = lr.readLine()) != null) {

			array = spacePattern.split(line);
			if (array[0].equals(lang)) {
				//logger.debug(tot + "\t" + line);
				//Counter c = map.get(array[1]);
				// capitalize the first char, if needed
				try {
					String page = URLDecoder.decode(array[1], "UTF-8");
					fc = page.charAt(0);
					if (Character.isLowerCase(fc)) {
						page = Character.toUpperCase(fc) + page.substring(1);
						//logger.info(line + "\t--->\t" + array[1]);
					}

					freq = Integer.parseInt(array[2]);
					if (freq > 0) {
						String targetPage = redirectPageMap.get(page);
						if (targetPage != null) {
							//logger.debug("\t" + page + " -> " + targetPage);
							page = targetPage;
						}

						c = trafficMap.get(page);
						if (c != null) {
							//logger.debug("inc\t" + page + "\t" + freq + "\t" + c);
							c.inc(freq);
						}
						/*else {
							//logger.debug("new\t " + page + "\t" + freq + "\t0");
							c = new Counter(freq);
							trafficMap.put(page, c);
						} */
					}
				} catch (Exception e) {
					logger.error("Error at line " + tot + ": " + line, e);
				}


			}
			tot++;
		}
		lr.close();
	}

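	/**
	 * Command-line entry point. A typical invocation (file and directory
	 * names are illustrative) might look like:
	 * <pre>
	 * java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.wikipedia.WikipediaTrafficExtractor \
	 *   -d enwiki-pages-articles.xml --stats-dir pagecounts/ -o out/
	 * </pre>
	 */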
	public static void main(String[] args) throws Exception {
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);

		/*if (args.length != 6)
		{
			//logger.info("java -mx4G org.fbk.cit.hlt.wikify.wikistats.WikipediaTrafficExtractor in-dir out-file lang in-redirectFileName");
			logger.info("java -mx4G org.fbk.cit.hlt.wikify.wikistats.WikipediaTrafficExtractor in-dir out-file lang in-redirectFileName in-init size");
			System.exit(-1);
		}

		new WikipediaTrafficExtractor(new File(args[0]), new File(args[1]), args[2], new File(args[3]), new File(args[4]), Integer.parseInt(args[5]));
		//new WikipediaTrafficExtractor(new File(args[0]), new File(args[1]), args[2], new File(args[3]));
    */
		Options options = new Options();
		try {
			Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
			Option wikipediaStatisticsOpt = OptionBuilder.withArgName("stats-dir").hasArg().withDescription("wikipedia statistics directory").isRequired().withLongOpt("stats-dir").create();
			Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
			//Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
			Option numFilesOpt = OptionBuilder.withArgName("num-files").hasArg().withDescription("number of files to process (default all)").withLongOpt("num-files").create();
			Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");
			//Option maximumFormFreqOpt = OptionBuilder.withArgName("max-freq").hasArg().withDescription("maximum frequency of wanted forms (default is " + WikipediaExtractor.DEFAULT_MAXIMUM_FORM_FREQ + ")").withLongOpt("max-freq").create("m");
			options.addOption("h", "help", false, "print this message");
			options.addOption("v", "version", false, "output version information and exit");
			Option baseDirOpt = OptionBuilder.withDescription("if set, use the output folder as base dir").withLongOpt("base-dir").create();

			options.addOption(wikipediaDumpOpt);
			options.addOption(wikipediaStatisticsOpt);
			options.addOption(outputDirOpt);
			//options.addOption(numThreadOpt);
			options.addOption(numFilesOpt);
			options.addOption(notificationPointOpt);
			//options.addOption(maximumFormFreqOpt);
			options.addOption(baseDirOpt);
			CommandLineParser parser = new PosixParser();
			CommandLine line = parser.parse(options, args);

			int numFiles = DEFAULT_NUM_FILES;
			if (line.hasOption("num-files")) {
				numFiles = Integer.parseInt(line.getOptionValue("num-files"));
			}

			int notificationPoint = Defaults.DEFAULT_NOTIFICATION_POINT;
			if (line.hasOption("notification-point")) {
				notificationPoint = Integer.parseInt(line.getOptionValue("notification-point"));
			}
			ExtractorParameters extractorParameters;
			if (line.hasOption("base-dir")) {
				extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"), true);
			}
			else {
				extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
			}

			logger.debug("extracting statistics (" + extractorParameters.getWikipediaPageTrafficFileName() + ")...");
			WikipediaTrafficExtractor wikipediaTrafficExtractor = new WikipediaTrafficExtractor(line.getOptionValue("stats-dir"), numFiles);
			wikipediaTrafficExtractor.start(extractorParameters);

			logger.info("extraction ended " + new Date());

		} catch (org.apache.commons.cli.ParseException e) {
			// oops, something went wrong
			System.out.println("Parsing failed: " + e.getMessage() + "\n");
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(400, "java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.wikipedia.WikipediaTrafficExtractor", "\n", options, "\n", true);
		}
	}


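	/**
	 * Minimal mutable integer counter used as the map value, avoiding
	 * repeated boxing when accumulating page views.
	 */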
	static class Counter {

		int count;

		public Counter(int count) {
			this.count = count;
		}

		public void inc() {
			count++;
		}

		public void inc(int l) {
			count += l;
		}

		public int get() {
			return count;
		}

		public String toString() {
			return Integer.toString(count);
		}

	}

}