All downloads are free. Search and download functionality uses the official Maven repository.

org.fbk.cit.hlt.thewikimachine.util.FreqSet Maven / Gradle / Ivy

package org.fbk.cit.hlt.thewikimachine.util;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.*;
import java.util.*;

/**
 * A set of strings in which each string is associated with
 * the frequency countPageCounter
 *
 * @author Claudio Giuliano
 * @version %I%, %G%
 * @since 1.0
 */
public class FreqSet {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named FreqSet.
	 */
	static Logger logger = Logger.getLogger(FreqSet.class.getName());

	protected Map map;

	protected int total;

	public FreqSet() {
		map = new TreeMap();
	}

	/*public FreqSet(boolean threadSafe) {
		if (threadSafe) {
			map = Collections.synchronizedMap(new TreeMap());
		}
		else {
			map = new TreeMap();
		}
	}*/

	public int total() {
		return total;
	}

	public void addAll(Set set) {
		Iterator it = set.iterator();
		while (it.hasNext()) {
			add(it.next());
		}
	}

	public int add(String ngram, int count) {
		//todo: this is not thread safe
		total += count;
		Counter c = (Counter) map.get(ngram);
		if (c == null) {
			map.put(ngram, new Counter(count));
			return count;
		}
		c.inc(count);
		return c.count;
	}

	public int add(String ngram) {
		//todo: this is not thread safe
		total++;
		Counter c = (Counter) map.get(ngram);
		if (c == null) {
			map.put(ngram, new Counter(1));
			return 1;
		}
		c.inc();
		return c.count;
	}

	public boolean contains(String ngram) {
		Counter c = (Counter) map.get(ngram);
		if (c == null) {
			return false;
		}

		return true;
	} // end contains

	//
	public Collection values() {
		return map.values();
	} // end values

	public Object[] toArray() {
		return map.keySet().toArray();
	} // end toArray

	//
	public Iterator iterator() {
		return map.keySet().iterator();
	} // end iterator

	//
	public int get(String ngram) {
		//logger.debug("get: " + ngram + ", " + toChar(ngram));
		Counter c = (Counter) map.get(ngram);
		if (c == null) {
			return 0;
		}

		return c.get();

	} // end get

	//
	public static String toChar(String w) {
		StringBuilder sb = new StringBuilder();
		int ch = 0;
		for (int i = 0; i < w.length(); i++) {
			ch = w.charAt(i);
			if (i > 0) {
				sb.append(" ");
			}
			sb.append(ch);
		}

		sb.append("\n");

		for (int i = 0; i < w.length(); i++) {
			ch = w.charAt(i);
			if (i > 0) {
				sb.append(" ");
			}
			sb.append((char) ch);
		}
		return sb.toString();
	} // end toChar


	//
	public int size() {
		return map.size();
	} // end size

	public void write(Writer out) throws IOException {

		Iterator it = map.entrySet().iterator();
		//logger.info("writing freq set " + map.entrySet().size());
		//logger.info("writing freq set " + map.size());

		int i = 0;
		while (it.hasNext()) {
			Map.Entry entry = (Map.Entry) it.next();

			// freq key
			out.write(entry.getValue().toString());
			out.write("\t");
			out.write(entry.getKey().toString());
			out.write("\n");

			if ((i % 100000) == 0) {
				out.flush();
			}
		} // end while
	} // end write

	public void write(Writer writer, boolean sort) throws IOException {
		SortedMap> sortedMap = toSortedMap();
		Iterator it = sortedMap.keySet().iterator();
		for (int i = 0; it.hasNext(); i++) {
			Integer freq = it.next();
			List list = sortedMap.get(freq);
			for (int j = 0; j < list.size(); j++) {
				writer.write(freq.toString());
				writer.write(StringTable.HORIZONTAL_TABULATION);
				writer.write(list.get(j));
				writer.write(StringTable.LINE_FEED);
			}
		}
		writer.flush();
	}

	/**
	 * Reads the frequency set from the specified input stream.
	 * 

* This method processes input in terms of lines. A natural * line of input is terminated either by a set of line * terminator characters (\n or \r or \r\n) or by the end * of the filePageCounter. A natural line may be either a blank line, * a comment line, or hold some part of a id-feature pair. * Lines are read from the input stream until end of filePageCounter * is reached. *

* A natural line that contains only white space characters * is considered blank and is ignored. A comment line has * an ASCII '#' as its first non-white space character; * comment lines are also ignored and do not encode id-feature * information. *

* The id contains all of the characters in the line starting * with the first non-white space character and up to, but * not including, the first '\t'. All remaining characters * on the line become part of the associated feature string; * if there are no remaining characters, the feature is the * empty string "". * * @param in a Reader object to * provide the underlying stream. * @throws IOException if reading this feature termIndex * from the specified input stream * throws an IOException. */ public void read(Reader in) throws IOException { logger.info("reading vocabulary..."); LineNumberReader lnr = new LineNumberReader(in); String line; String[] s; Integer id; int count = 0; while ((line = lnr.readLine()) != null) { line = line.trim(); //logger.debug(line); if (!line.startsWith("#")) { s = line.split("\t"); // token index //logger.debug(line); if (s.length == 2) { /* SynchronizedCounter c = map.get(s[0]); if (c == null) { map.put(s[0], new SynchronizedCounter(Integer.parseLong(s[1]))); } else { c.inc(Integer.parseLong(s[1])); } */ int freq = Integer.parseInt(s[0]); total += freq; //if (freq > 5) { //logger.debug("added: " + toChar(s[0])); map.put(s[1], new Counter(freq)); } } // end if } // end if } lnr.close(); logger.info(map.size() + " n-grams read"); } // end read // public String getMaxValue() { int max = 0; String maxs = null; int f = 0; String s = null; Counter c = null; Iterator it = map.keySet().iterator(); while (it.hasNext()) { s = it.next(); c = map.get(s); f = c.get(); if (f > max) { max = f; maxs = s; } // end if } // end return maxs; } // end getMaxValue // public SortedMap> toSortedMap() { SortedMap> smap = new TreeMap>(new Comparator() { public int compare(Integer e1, Integer e2) { return e2.compareTo(e1); } }); Iterator it = map.keySet().iterator(); while (it.hasNext()) { String s = it.next(); Counter c = map.get(s); List list = smap.get(c.get()); if (list == null) { list = new ArrayList(); list.add(s); smap.put(c.get(), list); } else { 
list.add(s); } } return smap; } // end toSortedMap // public String toString(boolean b) { StringBuilder sb = new StringBuilder(); String s = null; Counter c = null; Iterator it = map.keySet().iterator(); for (int i = 0; it.hasNext(); i++) { if (i > 0) { sb.append("\t"); } s = it.next(); c = map.get(s); sb.append(s); sb.append("\t"); sb.append(c); } return sb.toString(); } // end toString // public String toString() { StringBuilder sb = new StringBuilder(); sb.append("("); Iterator it = map.keySet().iterator(); while (it.hasNext()) { String s = it.next(); Counter c = map.get(s); sb.append(s); sb.append("\t"); sb.append(c); sb.append(", "); } sb.append("...)"); return sb.toString(); } // end toString class Counter { int count; public Counter(int count) { this.count = count; } public void inc() { count++; } public void inc(int l) { count += l; } public int get() { return count; } public String toString() { return Integer.toString(count); } } public static void main(String args[]) throws Exception { String logConfig = System.getProperty("log-config"); if (logConfig == null) { logConfig = "log-config.txt"; } PropertyConfigurator.configure(logConfig); if (args.length != 2) { //logger.info("java -mx1024M org.fbk.irst.tcc.web1t.FreqSet in-ngram-filePageCounter out-ngram-filePageCounter"); logger.info("java -mx1024M org.fbk.irst.tcc.web1t.FreqSet in-ngram-filePageCounter term"); System.exit(-1); } FreqSet set = new FreqSet(); //set.read(new FileReader(new File(args[0]))); InputStreamReader reader = new InputStreamReader(new FileInputStream(new File(args[0])), "UTF-8"); set.read(reader); //logger.info(args[1] + ": " + set.get(args[1])); //logger.info(args[1] + ": " + set.get("pluralita'")); //set.write(new FileWriter(new File(args[1]))); OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(args[1])), "ISO-8859-1"); //set.write(new FileWriter(new File(args[1]))); set.write(writer); } } // end class FreqSet





© 2015 - 2025 Weber Informatics LLC | Privacy Policy