
package eu.fbk.twm.wiki.wikipedia;
import eu.fbk.twm.utils.Defaults;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.PageMap;
import eu.fbk.utils.core.io.FolderScanner;
import eu.fbk.utils.core.io.GZFilter;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import java.io.*;
import java.net.URLDecoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
 * Extracts page-traffic statistics for a Wikipedia edition from gzipped
 * pageview-count files and merges them with previously collected counts.
 *
 * Created by giuliano on 8/24/13.
 */
@Deprecated public class WikipediaTrafficExtractor {
/**
 * Define a static logger variable so that it references the
 * Logger instance named WikipediaTrafficExtractor.
 */
static Logger logger = Logger.getLogger(WikipediaTrafficExtractor.class.getName());
private static final int DEFAULT_NUM_FILES = 1000;
private static final int PAGE_COLUMN = 0;
private static final int TRAFFIC_COLUMN = 1;
private static final Pattern spacePattern = Pattern.compile(" ");
private static final Pattern tabPattern = Pattern.compile("\t");
private Map<String, Counter> trafficMap;
private PageMap redirectPageMap;
String trafficSourceName;
int numFiles;
public WikipediaTrafficExtractor(String trafficSourceName, int numFiles) throws IOException {
this.trafficSourceName = trafficSourceName;
this.numFiles = numFiles;
trafficMap = new HashMap<String, Counter>();
}
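/**
 * Loads the redirect map, initializes the per-page counts (from the traffic
 * file if it exists, otherwise from the page-frequency file), then scans the
 * traffic source, either a single file or a folder of .gz files, and updates
 * the counts before writing them back out.
 */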
public void start(ExtractorParameters extractorParameters) {
try {
redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
logger.info(redirectPageMap.size() + " redirect pages");
File trafficFile = new File(extractorParameters.getWikipediaPageTrafficFileName());
if (trafficFile.exists() && trafficFile.length() > 0) {
initFromTraffic(trafficFile);
}
else {
initFromPageFreq(new File(extractorParameters.getWikipediaPageFreqFileName()));
}
File f = new File(trafficSourceName);
if (!f.exists()) {
logger.error(f + " does not exist");
return;
}
logger.info("updating traffic statistics from " + f + "...");
if (f.isFile()) {
long begin = System.currentTimeMillis();
process(f, extractorParameters.getLang());
long end = System.currentTimeMillis();
logger.info(f + " processed in " + (end - begin) + " ms, " + trafficMap.size() + " pages\t" + new Date());
}
else {
FolderScanner fs = new FolderScanner(f);
fs.setFilter(new GZFilter());
int total = 0;
int count = 1;
// stop once numFiles files have been processed (also ends the folder scan)
while (fs.hasNext() && count <= numFiles) {
Object[] files = fs.next();
//logger.info(count + " : " + files.length);
for (int i = 0; i < files.length && count <= numFiles; i++) {
long begin = System.currentTimeMillis();
File fi = (File) files[i];
logger.debug(fi);
process(fi, extractorParameters.getLang());
long end = System.currentTimeMillis();
logger.info(count + " - " + fi + " processed in " + (end - begin) + " ms, " + trafficMap.size() + " pages (" + (trafficMap.size() - total) + ")\t" + new Date());
total = trafficMap.size();
count++;
}
}
}
write(extractorParameters.getWikipediaPageTrafficFileName());
} catch (IOException e) {
logger.error(e);
}
}
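/**
 * Writes the traffic map to the given file as UTF-8 text, one tab-separated
 * count/page pair per line.
 */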
private void write(String out) throws IOException {
logger.info("writing " + trafficMap.size() + " pages...");
PrintWriter pw = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8")));
for (Map.Entry<String, Counter> entry : trafficMap.entrySet()) {
pw.print(entry.getValue().get());
pw.print("\t");
pw.print(entry.getKey());
pw.print("\n");
}
pw.close();
logger.info(trafficMap.size() + " pages wrote (" + new Date() + ")");
}
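/**
 * Initializes the traffic map from a page-frequency file: one tab-separated
 * pair per line, with the page title in the second column. Every page starts
 * with a count of zero.
 */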
private void initFromPageFreq(File f) throws IOException {
logger.info("initializing from page freq " + f + "(" + f.length() + ")...");
String line = null;
int c = 0;
LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while ((line = lr.readLine()) != null) {
String[] array = tabPattern.split(line);
//logger.info(array.length);
if (array.length == 2) {
trafficMap.put(array[1], new Counter(0));
}
c++;
}
logger.info(trafficMap.size() + " pages read\t" + new Date());
lr.close();
}
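/**
 * Initializes the traffic map from a previously written traffic file: one
 * tab-separated count/page pair per line, as produced by write().
 */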
private void initFromTraffic(File f) throws IOException {
logger.info("initializing from traffic " + f + "(" + f.length() + ")...");
String line = null;
int c = 0;
LineNumberReader lr = new LineNumberReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
while ((line = lr.readLine()) != null) {
String[] array = tabPattern.split(line);
//logger.info(array.length);
if (array.length == 2) {
trafficMap.put(array[1], new Counter(Integer.parseInt(array[0])));
}
c++;
}
logger.info(trafficMap.size() + " pages read\t" + new Date());
lr.close();
}
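/**
 * Processes a single gzipped traffic file for the given language. Each line
 * is expected to follow the Wikimedia pagecount layout: space-separated
 * project code, URL-encoded page title and request count. Titles are
 * decoded, capitalized on the first character and resolved through the
 * redirect map; counts are accumulated only for pages already present in
 * the traffic map.
 */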
private void process(File f, String lang) throws IOException {
logger.info("processing " + f + " (" + lang + ")...");
String line = null;
int freq = 0;
LineNumberReader lr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(f)), "UTF-8"));
String[] array = null;
char fc = 0;
Counter c = null;
int tot = 0;
while ((line = lr.readLine()) != null) {
array = spacePattern.split(line);
if (array[0].equals(lang)) {
//logger.debug(tot + "\t" + line);
//Counter c = map.get(array[1]);
// capitalize the first char, if needed
try {
String page = URLDecoder.decode(array[1], "UTF-8");
fc = page.charAt(0);
if (Character.isLowerCase(fc)) {
page = Character.toUpperCase(fc) + page.substring(1);
//logger.info(line + "\t--->\t" + array[1]);
}
freq = Integer.parseInt(array[2]);
if (freq > 0) {
String targetPage = redirectPageMap.get(page);
if (targetPage != null) {
//logger.debug("\t" + page + " -> " + targetPage);
page = targetPage;
}
c = trafficMap.get(page);
if (c != null) {
//logger.debug("inc\t" + page + "\t" + freq + "\t" + c);
c.inc(freq);
}
/*else {
//logger.debug("new\t " + page + "\t" + freq + "\t0");
c = new Counter(freq);
trafficMap.put(page, c);
} */
}
} catch (Exception e) {
logger.error("Error at line " + tot);
}
}
tot++;
}
lr.close();
}
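/**
 * Command-line entry point. Illustrative invocation (the file names below
 * are examples, not real paths):
 *
 * java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.wikipedia.WikipediaTrafficExtractor \
 *   -d enwiki-pages-articles.xml --stats-dir pagecounts/ -o output/
 */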
public static void main(String args[]) throws Exception {
String logConfig = System.getProperty("log-config");
if (logConfig == null) {
logConfig = "log-config.txt";
}
PropertyConfigurator.configure(logConfig);
/*if (args.length != 6)
{
//logger.info("java -mx4G org.fbk.cit.hlt.wikify.wikistats.WikipediaTrafficExtractor in-dir out-file lang in-redirectFileName");
logger.info("java -mx4G org.fbk.cit.hlt.wikify.wikistats.WikipediaTrafficExtractor in-dir out-file lang in-redirectFileName in-init size");
System.exit(-1);
}
new WikipediaTrafficExtractor(new File(args[0]), new File(args[1]), args[2], new File(args[3]), new File(args[4]), Integer.parseInt(args[5]));
//new WikipediaTrafficExtractor(new File(args[0]), new File(args[1]), args[2], new File(args[3]));
*/
Options options = new Options();
try {
Option wikipediaDumpOpt = OptionBuilder.withArgName("file").hasArg().withDescription("wikipedia xml dump file").isRequired().withLongOpt("wikipedia-dump").create("d");
Option wikipediaStatisticsOpt = OptionBuilder.withArgName("stats-dir").hasArg().withDescription("wikipedia statistics directory").isRequired().withLongOpt("stats-dir").create();
Option outputDirOpt = OptionBuilder.withArgName("dir").hasArg().withDescription("output directory in which to store output files").isRequired().withLongOpt("output-dir").create("o");
//Option numThreadOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of threads (default " + Defaults.DEFAULT_THREADS_NUMBER + ")").withLongOpt("num-threads").create("t");
Option numFilesOpt = OptionBuilder.withArgName("int").hasArg().withDescription("number of files to process (default " + DEFAULT_NUM_FILES + ")").withLongOpt("num-files").create();
Option notificationPointOpt = OptionBuilder.withArgName("int").hasArg().withDescription("receive notification every n pages (default " + Defaults.DEFAULT_NOTIFICATION_POINT + ")").withLongOpt("notification-point").create("n");
//Option maximumFormFreqOpt = OptionBuilder.withArgName("max-freq").hasArg().withDescription("maximum frequency of wanted forms (default is " + WikipediaExtractor.DEFAULT_MAXIMUM_FORM_FREQ + ")").withLongOpt("max-freq").create("m");
options.addOption("h", "help", false, "print this message");
options.addOption("v", "version", false, "output version information and exit");
Option baseDirOpt = OptionBuilder.withDescription("if set, use the output folder as base dir").withLongOpt("base-dir").create();
options.addOption(wikipediaDumpOpt);
options.addOption(wikipediaStatisticsOpt);
options.addOption(outputDirOpt);
//options.addOption(numThreadOpt);
options.addOption(numFilesOpt);
options.addOption(notificationPointOpt);
//options.addOption(maximumFormFreqOpt);
options.addOption(baseDirOpt);
CommandLineParser parser = new PosixParser();
CommandLine line = parser.parse(options, args);
//int numThreads = Defaults.DEFAULT_THREADS_NUMBER;
//if (line.hasOption("num-threads")) {
//numThreads = Integer.parseInt(line.getOptionValue("num-threads"));
//}
int numFiles = DEFAULT_NUM_FILES;
if (line.hasOption("num-files")) {
numFiles = Integer.parseInt(line.getOptionValue("num-files"));
}
int notificationPoint = Defaults.DEFAULT_NOTIFICATION_POINT;
if (line.hasOption("notification-point")) {
notificationPoint = Integer.parseInt(line.getOptionValue("notification-point"));
}
ExtractorParameters extractorParameters;
if (line.hasOption("base-dir")) {
extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"), true);
}
else {
extractorParameters = new ExtractorParameters(line.getOptionValue("wikipedia-dump"), line.getOptionValue("output-dir"));
}
logger.debug("extracting statistics (" + extractorParameters.getWikipediaPageTrafficFileName() + ")...");
WikipediaTrafficExtractor wikipediaTrafficExtractor = new WikipediaTrafficExtractor(line.getOptionValue("stats-dir"), numFiles);
wikipediaTrafficExtractor.start(extractorParameters);
logger.info("extraction ended " + new Date());
} catch (org.apache.commons.cli.ParseException e) {
// oops, something went wrong
System.out.println("Parsing failed: " + e.getMessage() + "\n");
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(400, "java -cp dist/thewikimachine.jar eu.fbk.twm.wiki.wikipedia.WikipediaTrafficExtractor", "\n", options, "\n", true);
}
}
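/**
 * Simple mutable integer counter used as the value type of the traffic map.
 */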
class Counter {
int count;
public Counter(int count) {
this.count = count;
}
public void inc() {
count++;
}
public void inc(int l) {
count += l;
}
public int get() {
return count;
}
public String toString() {
return Integer.toString(count);
}
}
}