
// eu.fbk.twm.wiki.xmldump.util.PruneTemplates Maven / Gradle / Ivy
/*
* Copyright 2012 Fondazione Bruno Kessler (FBK)
*
* FBK reserves all rights in the Program as delivered.
* The Program or any portion thereof may not be reproduced
* in any form whatsoever except as provided by license
* without the written consent of FBK. A license under FBK's
* rights in the Program may be available directly from FBK.
*/
package eu.fbk.twm.wiki.xmldump.util;
import org.apache.log4j.Logger;
import java.io.*;
import java.util.HashMap;
public class PruneTemplates {
static Logger logger = Logger.getLogger(PruneTemplates.class.getName());
public PruneTemplates(String inFile, String outComplete, String outGood, double range, int min_n) throws IOException {
File outCompleteFile = new File(outComplete);
if (outCompleteFile.exists()) {
outCompleteFile.delete();
}
if (!outCompleteFile.createNewFile()) {
logger.error("File " + outComplete + " not writeable!");
System.exit(1);
}
File outGoodFile = new File(outGood);
if (outGoodFile.exists()) {
outGoodFile.delete();
}
if (!outGoodFile.createNewFile()) {
logger.error("File " + outGood + " not writeable!");
System.exit(1);
}
if (!(new File(inFile)).exists()) {
logger.error("File " + inFile + " does not exist!");
System.exit(1);
}
HashMap> count = new HashMap>();
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outComplete), "UTF-8"));
BufferedWriter writer_range = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outGood), "UTF-8"));
BufferedReader in = new BufferedReader(new FileReader(inFile));
String line;
int i = 0;
while ((line = in.readLine()) != null) {
i++;
if (i % 10000 == 0) {
System.out.print(".");
}
if (i % 1000000 == 0) {
System.out.println(" " + i);
}
String[] parts = line.split("\t");
if (parts.length < 2) {
continue;
}
String page = parts[0];
String template = parts[1];
/*
template = template.replaceAll("_", "");
template = template.replaceAll("", "");
*/
HashMap val;
if (!count.containsKey(template)) {
val = new HashMap();
}
else {
val = count.get(template);
}
int value;
if (!val.containsKey(page)) {
value = 1;
}
else {
value = val.get(page) + 1;
}
val.put(page, value);
count.put(template, val);
/*
System.out.println(Arrays.toString(parts));
System.exit(1);
*/
}
System.out.println("");
// System.out.println(count.size());
for (String key : count.keySet()) {
int n = count.get(key).size();
int tot = 0;
for (String key2 : count.get(key).keySet()) {
tot += count.get(key).get(key2);
}
double ratio = tot * 1.0 / n;
String toBeWritten = key + "\t" + n + "\t" + tot + "\t" + ratio;
writer.write(toBeWritten + "\n");
if (ratio < range && n >= min_n) {
writer_range.write(toBeWritten + "\n");
}
}
writer.flush();
writer.close();
writer_range.flush();
writer_range.close();
//System.out.println(count);
}
public static void main(String... args) {
if (args.length != 5) {
logger.error("Wrong number of parameters " + args.length);
logger.error("Usage: java -mx4g main.java.org.fbk.cit.hlt.moschitti.PruneTemplates" +
" in-template-map-rep-file out-tpl-complete out-tpl-good range min");
// valori 1.01 50
System.exit(-1);
}
String inFile = args[0];
String outComplete = args[1];
String outGood = args[2];
double range = Double.parseDouble(args[3]);
int min_n = Integer.parseInt(args[4]);
try {
new PruneTemplates(inFile, outComplete, outGood, range, min_n);
}
catch (Exception e) {
e.printStackTrace();
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy