All downloads are free. Search and download functionality uses the official Maven repository.

eu.fbk.twm.wiki.wikipedia.StatisticsIndexer Maven / Gradle / Ivy

package eu.fbk.twm.wiki.wikipedia;

import eu.fbk.twm.index.util.SerialUtils;
import eu.fbk.twm.utils.FrequencyHashSet;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import java.io.*;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Created with IntelliJ IDEA.
 * User: aprosio
 * Date: 1/31/13
 * Time: 12:51 PM
 * To change this template use File | Settings | File Templates.
 *
 * http://stats.grok.se/it/201308/Atene
 *
 * http://dumps.wikimedia.org/other/pagecounts-raw/2013/2013-08/
 */
@Deprecated public class StatisticsIndexer {

	public static final String PAGE_FIELD_NAME = "page";

	public static final String TRAFFIC_FIELD_NAME = "num";

	public static void main(String[] args) {
		Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[1].getClassName());
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "configuration/log-config.txt";
		}
		PropertyConfigurator.configure(logConfig);

		CommandLineParser parser = new PosixParser();
		Options options = new Options();
		options.addOption(OptionBuilder.withDescription("Language").isRequired().hasArg().withArgName("iso-code").create("l"));
		options.addOption(OptionBuilder.withDescription("Output folder where to save the Lucene index").isRequired().hasArg().withArgName("folder").create("o"));
		options.addOption(OptionBuilder.withDescription("Redirect file").isRequired().hasArg().withArgName("file").create("r"));
		options.addOption(OptionBuilder.withDescription("Input folder with files").isRequired().hasArg().withArgName("folder").create("i"));
		options.addOption(OptionBuilder.withLongOpt("pattern").withDescription("Starting pattern for file names (default pagecounts)").hasArg().withArgName("pattern").create("p"));
		options.addOption(OptionBuilder.withLongOpt("stop").withDescription("Stop after  files").hasArg().withArgName("num").create());
		options.addOption("c", "clean", false, "Clean the output folder before writing on it");
		options.addOption("h", "help", false, "Print this message");

		CommandLine commandLine = null;

		try {
			commandLine = parser.parse(options, args);
			if (commandLine.hasOption("help")) {
				throw new ParseException("");
			}
		} catch (ParseException exp) {
			System.out.println();
			if (exp.getMessage().length() > 0) {
				System.out.println("ERR: " + exp.getMessage());
				System.out.println();
			}
			HelpFormatter formatter = new HelpFormatter();
			formatter.printHelp(400, "java -mx4g " + Thread.currentThread().getStackTrace()[1].getClassName(), "\n", options, "\n", true);
			System.out.println();
			System.exit(0);
		}

		String outLucene = commandLine.getOptionValue('o');
		String lang = commandLine.getOptionValue('l');
		String redirectFile = commandLine.getOptionValue('r');
		String inFolder = commandLine.getOptionValue('i');

		if (!inFolder.endsWith(File.separator)) {
			inFolder += File.separator;
		}
		if (!outLucene.endsWith(File.separator)) {
			outLucene += File.separator;
		}

		boolean clean = false;
		if (commandLine.hasOption('c')) {
			clean = true;
		}

		String pattern = "pagecounts";
		if (commandLine.hasOption('p')) {
			pattern = commandLine.getOptionValue('p');
		}

		int stop = 0;
		if (commandLine.hasOption("stop")) {
			stop = Integer.parseInt(commandLine.getOptionValue("stop"));
		}

		String dir = outLucene;
		if (clean) {
			System.out.println("Cleaning the folder");

			File d2 = new File(dir);
			if (!d2.exists()) {
				if (!d2.mkdirs()) {
					System.out.println("Unable to create directory " + dir);
					System.exit(1);
				}
			}
			else {
				String a2[] = d2.list();
				for (int j = 0; j < a2.length; j++) {
					String fileName = dir + a2[j];
					File f = new File(fileName);
					f.delete();
				}
			}
		}

		logger.info("Loading redirect file " + redirectFile);
		HashMap redir = new HashMap();
		Pattern p = Pattern.compile("\\s+");

		try {
			BufferedReader reader = new BufferedReader(new FileReader(redirectFile));

			String line;
			while ((line = reader.readLine()) != null) {
				String[] parts = p.split(line);
				if (parts.length < 2) {
					continue;
				}
				String s = parts[1];
				for (int i = 2; i < parts.length; i++) {
					s += "_" + parts[i];
				}
				redir.put(parts[0], s);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		logger.info("Redirect file loaded");

		FrequencyHashSet frequencies = new FrequencyHashSet();

		File inFolderFile = new File(inFolder);
		if (!inFolderFile.exists() || !inFolderFile.isDirectory()) {
			logger.error("Invalid input folder");
			System.exit(0);
		}
		File[] listOfFiles = inFolderFile.listFiles();
		int i = 0;
		for (File f : listOfFiles) {
			String simpleName = f.getName();
			String fileName = f.getAbsolutePath();
			if (!simpleName.startsWith(pattern)) {
				continue;
			}
			//todo: inserire un parametro per stabilire se i file sono gzippati o no
			if (!simpleName.endsWith("gz")) {
				continue;
			}
			logger.info(i +" downloading " + fileName + "...");
			try {
				OutputStream out = new ByteArrayOutputStream();
				GZIPInputStream z = new GZIPInputStream(new FileInputStream(fileName));
				byte[] buf = new byte[1024];
				int len;
				while ((len = z.read(buf)) > 0) {
					out.write(buf, 0, len);
				}

				z.close();

				BufferedReader reader = new BufferedReader(new StringReader(out.toString()));
				String line;
				while ((line = reader.readLine()) != null) {
					String[] parts = p.split(line);
					if (parts.length < 3) {
						continue;
					}
					if (!parts[0].equals(lang)) {
						continue;
					}
					String page;
					try {
						page = URLDecoder.decode(parts[1], "UTF-8");
					} catch (Exception e) {
						continue;
					}
					if (redir.get(page) != null) {
						page = redir.get(page);
					}
					frequencies.add(page, Integer.parseInt(parts[2]));
				}
				out.close();
			} catch (IOException e) {
				e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
				System.exit(0);
			}
			if (stop > 0 && ++i >= stop) {
				break;
			}
		}

		logger.info("Writing Lucene index");
		IndexWriter w;
		try {
			w = new IndexWriter(outLucene, new WhitespaceAnalyzer());
			for (String page : frequencies.keySet()) {
				Document doc = new Document();
				doc.add(new Field(PAGE_FIELD_NAME, lang + ":" + page, Field.Store.YES, Field.Index.NOT_ANALYZED));
				doc.add(new Field(TRAFFIC_FIELD_NAME, SerialUtils.toByteArray(frequencies.get(page)), Field.Store.YES));
				w.addDocument(doc);
			}
			logger.info("Optimizing and closing");
			w.optimize();
			w.close();
		} catch (IOException e) {
			e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
			System.exit(0);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy