All downloads are free. The search and download functionality uses the official Maven repository.

eu.fbk.twm.wiki.xmldump.util.PruneTemplates Maven / Gradle / Ivy

/*
 * Copyright 2012 Fondazione Bruno Kessler (FBK)
 * 
 * FBK reserves all rights in the Program as delivered.
 * The Program or any portion thereof may not be reproduced
 * in any form whatsoever except as provided by license
 * without the written consent of FBK.  A license under FBK's
 * rights in the Program may be available directly from FBK.
 */

package eu.fbk.twm.wiki.xmldump.util;

import org.apache.log4j.Logger;

import java.io.*;
import java.util.HashMap;

public class PruneTemplates {
    
    static Logger logger = Logger.getLogger(PruneTemplates.class.getName());

	public PruneTemplates(String inFile, String outComplete, String outGood, double range, int min_n) throws IOException {
		File outCompleteFile = new File(outComplete);
		if (outCompleteFile.exists()) {
			outCompleteFile.delete();
		}
		if (!outCompleteFile.createNewFile()) {
			logger.error("File " + outComplete + " not writeable!");
			System.exit(1);
		}

		File outGoodFile = new File(outGood);
		if (outGoodFile.exists()) {
			outGoodFile.delete();
		}
		if (!outGoodFile.createNewFile()) {
			logger.error("File " + outGood + " not writeable!");
			System.exit(1);
		}

		if (!(new File(inFile)).exists()) {
			logger.error("File " + inFile + " does not exist!");
			System.exit(1);
		}

		HashMap> count = new HashMap>();

		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outComplete), "UTF-8"));
		BufferedWriter writer_range = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outGood), "UTF-8"));

		BufferedReader in = new BufferedReader(new FileReader(inFile));
		String line;
		int i = 0;
		while ((line = in.readLine()) != null) {
			i++;
			if (i % 10000 == 0) {
				System.out.print(".");
			}
			if (i % 1000000 == 0) {
				System.out.println(" " + i);
			}
			String[] parts = line.split("\t");
			if (parts.length < 2) {
				continue;
			}

			String page = parts[0];
			String template = parts[1];
/*
                template = template.replaceAll("_", "");
                template = template.replaceAll("", "");
*/

			HashMap val;
			if (!count.containsKey(template)) {
				val = new HashMap();
			}
			else {
				val = count.get(template);
			}

			int value;
			if (!val.containsKey(page)) {
				value = 1;
			}
			else {
				value = val.get(page) + 1;
			}

			val.put(page, value);
			count.put(template, val);
		        /*
                System.out.println(Arrays.toString(parts));
                System.exit(1);
                */
		}

		System.out.println("");
		// System.out.println(count.size());

		for (String key : count.keySet()) {
			int n = count.get(key).size();
			int tot = 0;
			for (String key2 : count.get(key).keySet()) {
				tot += count.get(key).get(key2);
			}

			double ratio = tot * 1.0 / n;

			String toBeWritten = key + "\t" + n + "\t" + tot + "\t" + ratio;
			writer.write(toBeWritten + "\n");
			if (ratio < range && n >= min_n) {
				writer_range.write(toBeWritten + "\n");
			}
		}

		writer.flush();
		writer.close();
		writer_range.flush();
		writer_range.close();

		//System.out.println(count);
	}
    
    public static void main(String... args) {
        
        if (args.length != 5) {
            logger.error("Wrong number of parameters " + args.length);
            logger.error("Usage: java -mx4g main.java.org.fbk.cit.hlt.moschitti.PruneTemplates" +
                    " in-template-map-rep-file out-tpl-complete out-tpl-good range min");
            // valori 1.01 50
            System.exit(-1);
        }

        String inFile = args[0];
        String outComplete = args[1];
        String outGood = args[2];
        
        double range = Double.parseDouble(args[3]);
        int min_n = Integer.parseInt(args[4]);

        try {
			new PruneTemplates(inFile, outComplete, outGood, range, min_n);
		}
		catch (Exception e) {
		    e.printStackTrace();
	    }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy