// ![JAR search and dependency download from the Maven repository](/logo.png)
// eu.fbk.twm.wiki.wikipedia.StatisticsIndexer Maven / Gradle / Ivy
package eu.fbk.twm.wiki.wikipedia;
import eu.fbk.twm.index.util.SerialUtils;
import eu.fbk.twm.utils.FrequencyHashSet;
import org.apache.commons.cli.*;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import java.io.*;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
/**
 * Command-line tool that aggregates page-count dumps for one language and stores
 * the per-page totals in a Lucene index.
 *
 * <p>Processing steps, in order:
 * <ol>
 *   <li>load a redirect table (source title followed by target title, whitespace separated);</li>
 *   <li>read every gzipped file in the input folder whose name starts with the given
 *       pattern and ends with {@code gz};</li>
 *   <li>for each line whose first column equals the requested language code, URL-decode
 *       the second column, resolve it through the redirect table and add the third
 *       column (an integer) to that page's total;</li>
 *   <li>write one Lucene document per page with the fields {@link #PAGE_FIELD_NAME}
 *       and {@link #TRAFFIC_FIELD_NAME}.</li>
 * </ol>
 *
 * <p>Example data sources noted by the original author:
 * <ul>
 *   <li>http://stats.grok.se/it/201308/Atene</li>
 *   <li>http://dumps.wikimedia.org/other/pagecounts-raw/2013/2013-08/</li>
 * </ul>
 *
 * <p>Created by aprosio on 1/31/13, 12:51 PM.
 *
 * @deprecated marked deprecated by the original author; this file does not name a replacement.
 */
@Deprecated
public class StatisticsIndexer {

    /** Name of the stored, non-analyzed field holding {@code <lang>:<page>}. */
    public static final String PAGE_FIELD_NAME = "page";

    /** Name of the stored binary field holding the serialized total for the page. */
    public static final String TRAFFIC_FIELD_NAME = "num";

    /** Column separator for both the redirect file and the dump files; compiled once. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Charset used for URL-decoding titles and for reading the text inputs. */
    private static final String TEXT_CHARSET = "UTF-8";

    private static final Logger logger = Logger.getLogger(StatisticsIndexer.class);

    /**
     * Entry point. Exits with status 0 on success or when help is requested, and with
     * status 1 on invalid arguments or I/O failure.
     *
     * @param args command-line arguments; see the option descriptions printed by {@code -h}
     */
    public static void main(String[] args) {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = buildOptions();
        CommandLine commandLine = parseOrExit(options, args);

        String lang = commandLine.getOptionValue('l');
        String redirectFile = commandLine.getOptionValue('r');
        String inFolder = withTrailingSeparator(commandLine.getOptionValue('i'));
        String outLucene = withTrailingSeparator(commandLine.getOptionValue('o'));

        String pattern = "pagecounts";
        if (commandLine.hasOption('p')) {
            pattern = commandLine.getOptionValue('p');
        }

        int stop = 0;
        if (commandLine.hasOption("stop")) {
            String rawStop = commandLine.getOptionValue("stop");
            try {
                stop = Integer.parseInt(rawStop);
            } catch (NumberFormatException e) {
                // A non-numeric --stop used to abort with an uncaught exception.
                System.out.println();
                System.out.println("ERR: invalid number for --stop: " + rawStop);
                System.out.println();
                printUsage(options);
                System.exit(1);
                return;
            }
        }

        if (commandLine.hasOption('c')) {
            cleanOutputFolder(outLucene);
        }

        logger.info("Loading redirect file " + redirectFile);
        Map<String, String> redirects = loadRedirects(redirectFile);
        logger.info("Redirect file loaded");

        File inFolderFile = new File(inFolder);
        // listFiles() returns null for a non-directory or on an I/O error.
        File[] dumps = inFolderFile.isDirectory() ? inFolderFile.listFiles() : null;
        if (dumps == null) {
            logger.error("Invalid input folder");
            System.exit(1);
            return;
        }
        // listFiles() order is unspecified; sorting makes --stop pick a stable subset.
        Arrays.sort(dumps);

        FrequencyHashSet<String> frequencies = new FrequencyHashSet<String>();
        int processed = 0;
        for (File dump : dumps) {
            String simpleName = dump.getName();
            if (!simpleName.startsWith(pattern)) {
                continue;
            }
            // todo: add a parameter to decide whether the files are gzipped or not
            if (!simpleName.endsWith("gz")) {
                continue;
            }
            if (!dump.isFile()) {
                continue;
            }

            logger.info(processed + " downloading " + dump.getAbsolutePath() + "...");
            try {
                accumulateCounts(dump, lang, redirects, frequencies);
            } catch (IOException e) {
                logger.error("Unable to read " + dump.getAbsolutePath(), e);
                System.exit(1);
                return;
            }

            // Count every processed file so the progress log is meaningful without --stop.
            processed++;
            if (stop > 0 && processed >= stop) {
                break;
            }
        }

        logger.info("Writing Lucene index");
        try {
            writeIndex(outLucene, lang, frequencies);
        } catch (IOException e) {
            logger.error("Unable to write the Lucene index in " + outLucene, e);
            System.exit(1);
        }
    }

    /** Declares the command-line options accepted by {@link #main(String[])}. */
    private static Options buildOptions() {
        Options options = new Options();
        options.addOption(OptionBuilder.withDescription("Language").isRequired().hasArg().withArgName("iso-code").create("l"));
        options.addOption(OptionBuilder.withDescription("Output folder where to save the Lucene index").isRequired().hasArg().withArgName("folder").create("o"));
        options.addOption(OptionBuilder.withDescription("Redirect file").isRequired().hasArg().withArgName("file").create("r"));
        options.addOption(OptionBuilder.withDescription("Input folder with files").isRequired().hasArg().withArgName("folder").create("i"));
        options.addOption(OptionBuilder.withLongOpt("pattern").withDescription("Starting pattern for file names (default pagecounts)").hasArg().withArgName("pattern").create("p"));
        options.addOption(OptionBuilder.withLongOpt("stop").withDescription("Stop after files").hasArg().withArgName("num").create());
        options.addOption("c", "clean", false, "Clean the output folder before writing on it");
        options.addOption("h", "help", false, "Print this message");
        return options;
    }

    /**
     * Parses the arguments; prints usage and terminates the JVM when help is requested
     * (status 0) or the arguments are invalid (status 1).
     */
    private static CommandLine parseOrExit(Options options, String[] args) {
        // The parser rejects a lone -h because of the required options, so look for it first.
        for (String arg : args) {
            if ("-h".equals(arg) || "--help".equals(arg)) {
                System.out.println();
                printUsage(options);
                System.exit(0);
                return null;
            }
        }

        CommandLine commandLine;
        try {
            commandLine = new PosixParser().parse(options, args);
        } catch (ParseException exp) {
            System.out.println();
            String message = exp.getMessage();
            if (message != null && message.length() > 0) {
                System.out.println("ERR: " + message);
                System.out.println();
            }
            printUsage(options);
            System.exit(1);
            return null;
        }

        // Covers forms the pre-scan does not match, such as a clustered "-ch".
        if (commandLine.hasOption("help")) {
            System.out.println();
            printUsage(options);
            System.exit(0);
            return null;
        }
        return commandLine;
    }

    /** Prints the usage text followed by a blank line on standard output. */
    private static void printUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(400, "java -mx4g " + StatisticsIndexer.class.getName(), "\n", options, "\n", true);
        System.out.println();
    }

    /** Returns {@code path} with {@link File#separator} appended unless it already ends with it. */
    private static String withTrailingSeparator(String path) {
        return path.endsWith(File.separator) ? path : path + File.separator;
    }

    /**
     * Creates the output folder if it is missing, otherwise deletes its direct entries.
     * Terminates the JVM with status 1 when the folder can be neither created nor listed.
     */
    private static void cleanOutputFolder(String dir) {
        System.out.println("Cleaning the folder");
        File folder = new File(dir);
        if (!folder.exists()) {
            if (!folder.mkdirs()) {
                System.out.println("Unable to create directory " + dir);
                System.exit(1);
            }
            return;
        }

        // list() returns null when the path is not a directory or cannot be read.
        String[] names = folder.list();
        if (names == null) {
            System.out.println("Unable to list directory " + dir);
            System.exit(1);
            return;
        }
        for (String name : names) {
            File entry = new File(folder, name);
            if (!entry.delete()) {
                logger.warn("Unable to delete " + entry.getPath());
            }
        }
    }

    /**
     * Loads the redirect table. Each line is split on whitespace: the first token is the
     * key and the remaining tokens, joined with {@code _}, are the value. Lines with fewer
     * than two tokens are skipped.
     *
     * <p>A read failure is logged and whatever was loaded so far is returned, so the run
     * continues as it did before.
     *
     * @param redirectFile path of the redirect file
     * @return map from source title to target title; possibly empty, never {@code null}
     */
    private static Map<String, String> loadRedirects(String redirectFile) {
        Map<String, String> redirects = new HashMap<String, String>();
        BufferedReader reader = null;
        try {
            // NOTE(review): assumes the redirect file is UTF-8, matching the UTF-8 decoding
            // of page titles used for lookups - confirm against the file's producer.
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(redirectFile), TEXT_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = WHITESPACE.split(line);
                if (parts.length < 2) {
                    continue;
                }
                StringBuilder target = new StringBuilder(parts[1]);
                for (int k = 2; k < parts.length; k++) {
                    target.append('_').append(parts[k]);
                }
                redirects.put(parts[0], target.toString());
            }
        } catch (IOException e) {
            logger.error("Unable to read redirect file " + redirectFile
                    + "; continuing with " + redirects.size() + " redirects", e);
        } finally {
            closeQuietly(reader);
        }
        return redirects;
    }

    /**
     * Streams one gzipped dump and adds the counts of the requested language to
     * {@code frequencies}. Lines with fewer than three columns, an undecodable title or a
     * non-integer count are skipped.
     *
     * @param dump        gzipped dump file
     * @param lang        value the first column must equal
     * @param redirects   title replacements applied after URL-decoding
     * @param frequencies accumulator updated in place
     * @throws IOException if the file cannot be opened or decompressed
     */
    private static void accumulateCounts(File dump, String lang, Map<String, String> redirects,
                                         FrequencyHashSet<String> frequencies) throws IOException {
        Closeable resource = null;
        try {
            InputStream raw = new FileInputStream(dump);
            resource = raw;
            // Decompress and decode on the fly instead of buffering the whole file in memory.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(raw), TEXT_CHARSET));
            resource = reader;

            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = WHITESPACE.split(line);
                if (parts.length < 3) {
                    continue;
                }
                if (!parts[0].equals(lang)) {
                    continue;
                }
                String page = decodeTitle(parts[1]);
                if (page == null) {
                    continue;
                }
                int count;
                try {
                    count = Integer.parseInt(parts[2]);
                } catch (NumberFormatException e) {
                    // One malformed count must not abort the whole run.
                    continue;
                }
                String target = redirects.get(page);
                if (target != null) {
                    page = target;
                }
                frequencies.add(page, count);
            }
        } finally {
            closeQuietly(resource);
        }
    }

    /** URL-decodes a title as UTF-8; returns {@code null} when it cannot be decoded. */
    private static String decodeTitle(String encoded) {
        try {
            return URLDecoder.decode(encoded, TEXT_CHARSET);
        } catch (IllegalArgumentException e) {
            return null;
        } catch (UnsupportedEncodingException e) {
            return null;
        }
    }

    /**
     * Writes one document per page, optimizes the index and closes the writer. The writer
     * is closed even when adding documents fails.
     *
     * @throws IOException if the index cannot be opened, written or closed
     */
    private static void writeIndex(String outLucene, String lang, FrequencyHashSet<String> frequencies)
            throws IOException {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(outLucene, new WhitespaceAnalyzer());
            for (String page : frequencies.keySet()) {
                Document doc = new Document();
                doc.add(new Field(PAGE_FIELD_NAME, lang + ":" + page, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field(TRAFFIC_FIELD_NAME, SerialUtils.toByteArray(frequencies.get(page)), Field.Store.YES));
                writer.addDocument(doc);
            }
            logger.info("Optimizing and closing");
            writer.optimize();
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    /** Closes {@code resource} if it is non-null, logging instead of propagating a failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Unable to close resource", e);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy